-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AArch64] Use indexed dup for 128b segmented splat #144688
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s | ||
|
||
;; Patterns that lower to concat_vectors where all incoming operands are the same. | ||
|
||
;; Repeat a 128-bit <16 x i8> value twice (mask is lanes 0-15, then 0-15 again) to form a <32 x i8> vector and store it. | ||
;; Uses attribute group #0; the CHECK lines expect a single `mov z0.q, q0` followed by one `str z0`. | ||
define void @concat_i8q_256(<16 x i8> %data, ptr %addr) #0 { | ||
; CHECK-LABEL: concat_i8q_256: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For the tests, could you add some cases that result in an indexed dup with an index other than 0? e.g. a load of a 256bit vector (which would result in a z-register), a shuffle vector that splats the top 128-bits of that vector over the low and high parts, which should result in a indexed dup instruction with index of 1. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that's more for a followup patch -- this PR is focused on concat_vectors. |
||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 | ||
; CHECK-NEXT: mov z0.q, q0 | ||
; CHECK-NEXT: str z0, [x0] | ||
; CHECK-NEXT: ret | ||
%splat = shufflevector <16 x i8> %data, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, | ||
i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> | ||
store <32 x i8> %splat, ptr %addr, align 1 | ||
ret void | ||
} | ||
|
||
;; Repeat a 128-bit <8 x i16> value twice (mask is lanes 0-7, then 0-7 again) to form a <16 x i16> vector and store it. | ||
;; Uses attribute group #0; the CHECK lines expect a single `mov z0.q, q0` followed by one `str z0`. | ||
define void @concat_i16q_256(<8 x i16> %data, ptr %addr) #0 { | ||
; CHECK-LABEL: concat_i16q_256: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 | ||
; CHECK-NEXT: mov z0.q, q0 | ||
; CHECK-NEXT: str z0, [x0] | ||
; CHECK-NEXT: ret | ||
%splat = shufflevector <8 x i16> %data, <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, | ||
i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> | ||
store <16 x i16> %splat, ptr %addr, align 1 | ||
ret void | ||
} | ||
|
||
;; Repeat a 128-bit <4 x i32> value twice (mask is lanes 0-3, then 0-3 again) to form an <8 x i32> vector and store it. | ||
;; Uses attribute group #0; the CHECK lines expect a single `mov z0.q, q0` followed by one `str z0`. | ||
define void @concat_i32q_256(<4 x i32> %data, ptr %addr) #0 { | ||
; CHECK-LABEL: concat_i32q_256: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 | ||
; CHECK-NEXT: mov z0.q, q0 | ||
; CHECK-NEXT: str z0, [x0] | ||
; CHECK-NEXT: ret | ||
%splat = shufflevector <4 x i32> %data, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, | ||
i32 0, i32 1, i32 2, i32 3> | ||
store <8 x i32> %splat, ptr %addr, align 1 | ||
ret void | ||
} | ||
|
||
;; Repeat a 128-bit <2 x i64> value twice (mask is lanes 0-1, then 0-1 again) to form a <4 x i64> vector and store it. | ||
;; Uses attribute group #0; the CHECK lines expect a single `mov z0.q, q0` followed by one `str z0`. | ||
define void @concat_i64q_256(<2 x i64> %data, ptr %addr) #0 { | ||
; CHECK-LABEL: concat_i64q_256: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 | ||
; CHECK-NEXT: mov z0.q, q0 | ||
; CHECK-NEXT: str z0, [x0] | ||
; CHECK-NEXT: ret | ||
%splat = shufflevector <2 x i64> %data, <2 x i64> poison, <4 x i32> <i32 0, i32 1, | ||
i32 0, i32 1> | ||
store <4 x i64> %splat, ptr %addr, align 1 | ||
ret void | ||
} | ||
|
||
;; Repeat a 128-bit <8 x half> value twice (mask is lanes 0-7, then 0-7 again) to form a <16 x half> vector and store it. | ||
;; Uses attribute group #0; the CHECK lines expect a single `mov z0.q, q0` followed by one `str z0`. | ||
define void @concat_f16q_256(<8 x half> %data, ptr %addr) #0 { | ||
; CHECK-LABEL: concat_f16q_256: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 | ||
; CHECK-NEXT: mov z0.q, q0 | ||
; CHECK-NEXT: str z0, [x0] | ||
; CHECK-NEXT: ret | ||
%splat = shufflevector <8 x half> %data, <8 x half> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, | ||
i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> | ||
store <16 x half> %splat, ptr %addr, align 1 | ||
ret void | ||
} | ||
|
||
;; Repeat a 128-bit <8 x bfloat> value twice (mask is lanes 0-7, then 0-7 again) to form a <16 x bfloat> vector and store it. | ||
;; Unlike the other element types in this file, the CHECK lines here expect a plain `stp q0, q0` | ||
;; rather than `mov z0.q, q0` + `str z0`. | ||
;; NOTE(review): the reason bfloat takes a different path is not visible in this test -- confirm it is intentional. | ||
define void @concat_bf16q_256(<8 x bfloat> %data, ptr %addr) #0 { | ||
; CHECK-LABEL: concat_bf16q_256: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: stp q0, q0, [x0] | ||
; CHECK-NEXT: ret | ||
%splat = shufflevector <8 x bfloat> %data, <8 x bfloat> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, | ||
i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> | ||
store <16 x bfloat> %splat, ptr %addr, align 1 | ||
ret void | ||
} | ||
|
||
;; Repeat a 128-bit <4 x float> value twice (mask is lanes 0-3, then 0-3 again) to form an <8 x float> vector and store it. | ||
;; Uses attribute group #0; the CHECK lines expect a single `mov z0.q, q0` followed by one `str z0`. | ||
define void @concat_f32q_256(<4 x float> %data, ptr %addr) #0 { | ||
; CHECK-LABEL: concat_f32q_256: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 | ||
; CHECK-NEXT: mov z0.q, q0 | ||
; CHECK-NEXT: str z0, [x0] | ||
; CHECK-NEXT: ret | ||
%splat = shufflevector <4 x float> %data, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, | ||
i32 0, i32 1, i32 2, i32 3> | ||
store <8 x float> %splat, ptr %addr, align 1 | ||
ret void | ||
} | ||
|
||
;; Repeat a 128-bit <2 x double> value twice (mask is lanes 0-1, then 0-1 again) to form a <4 x double> vector and store it. | ||
;; Uses attribute group #0; the CHECK lines expect a single `mov z0.q, q0` followed by one `str z0`. | ||
define void @concat_f64q_256(<2 x double> %data, ptr %addr) #0 { | ||
; CHECK-LABEL: concat_f64q_256: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 | ||
; CHECK-NEXT: mov z0.q, q0 | ||
; CHECK-NEXT: str z0, [x0] | ||
; CHECK-NEXT: ret | ||
%splat = shufflevector <2 x double> %data, <2 x double> poison, <4 x i32> <i32 0, i32 1, | ||
i32 0, i32 1> | ||
store <4 x double> %splat, ptr %addr, align 1 | ||
ret void | ||
} | ||
|
||
;; Test a wider vector | ||
|
||
;; Repeat a 128-bit <4 x i32> value four times (mask is lanes 0-3 repeated 4x) to form a <16 x i32> vector and store it. | ||
;; Uses attribute group #0 (vscale_range(2,2)), so the result is wider than the vectors named in the function name. | ||
;; The CHECK lines expect one `mov z0.q, q0` and two `str z0` stores: one at [x0, #1, mul vl] and one at [x0]. | ||
define void @concat_i32q_512_with_256_vectors(<4 x i32> %data, ptr %addr) #0 { | ||
; CHECK-LABEL: concat_i32q_512_with_256_vectors: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 | ||
; CHECK-NEXT: mov z0.q, q0 | ||
; CHECK-NEXT: str z0, [x0, #1, mul vl] | ||
; CHECK-NEXT: str z0, [x0] | ||
; CHECK-NEXT: ret | ||
%splat = shufflevector <4 x i32> %data, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, | ||
i32 0, i32 1, i32 2, i32 3, | ||
i32 0, i32 1, i32 2, i32 3, | ||
i32 0, i32 1, i32 2, i32 3> | ||
store <16 x i32> %splat, ptr %addr, align 1 | ||
ret void | ||
} | ||
|
||
;; Repeat a 128-bit <4 x i32> value four times (mask is lanes 0-3 repeated 4x) to form a <16 x i32> vector and store it. | ||
;; Uses attribute group #1 (vscale_range(4,4)) instead of #0. | ||
;; The CHECK lines expect one `mov z0.q, q0` and a single `str z0` store. | ||
define void @concat_i32q_512_with_512_vectors(<4 x i32> %data, ptr %addr) #1 { | ||
; CHECK-LABEL: concat_i32q_512_with_512_vectors: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 | ||
; CHECK-NEXT: mov z0.q, q0 | ||
; CHECK-NEXT: str z0, [x0] | ||
; CHECK-NEXT: ret | ||
%splat = shufflevector <4 x i32> %data, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, | ||
i32 0, i32 1, i32 2, i32 3, | ||
i32 0, i32 1, i32 2, i32 3, | ||
i32 0, i32 1, i32 2, i32 3> | ||
store <16 x i32> %splat, ptr %addr, align 1 | ||
ret void | ||
} | ||
|
||
;; Attribute groups referenced by the functions in this file. Both enable the +sve and +bf16 target features | ||
;; and set identical min/max vscale bounds: #0 uses vscale_range(2,2) and #1 uses vscale_range(4,4). | ||
;; The test names refer to these as the "256_vectors" (#0) and "512_vectors" (#1) configurations. | ||
attributes #0 = { vscale_range(2,2) "target-features"="+sve,+bf16" } | ||
attributes #1 = { vscale_range(4,4) "target-features"="+sve,+bf16" } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Is this check actually required? (if it got to the point of lowering the operation, I would think this condition is always true)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Apparently not. I did include a test (
concat_i32q_512_with_256_vectors
) to exercise this, expecting it to be split into a concat of concats, then to see each turned into a duplane128 here, but it seems that legalization might kick in to split it first. This function is only called if a check with useSVEForFixedLengthVectorVT
succeeds, which does exclude wider VTs. Will remove.