[AArch64] Remove EXT instr before UZP when extracting elements from vector #91328
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: None (Lukacma)

Changes

The assembly generated for extracting odd/even elements from a vector contained an extra EXT instruction. This was due to the way LLVM constructs DAGs when vector_shuffling from a larger type to a smaller one. This patch optimises the DAG in these situations, allowing the expected assembly to be emitted. (A minimal before/after illustration is sketched after the diff below.)

Full diff: https://github.com/llvm/llvm-project/pull/91328.diff

7 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2af679e0755b54..4cf3c4fb007faf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21454,6 +21454,27 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
SDValue Op1 = N->getOperand(1);
EVT ResVT = N->getValueType(0);
+ // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
+ if(Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Op0.getOperand(0) == Op1.getOperand(0)){
+
+ SDValue SourceVec = Op0.getOperand(0);
+ uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
+ uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
+ uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
+ if(ExtIdx0 == 0 && ExtIdx1 == NumElements/2){
+ EVT OpVT = Op0.getOperand(1).getValueType();
+ EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
+ SDValue uzp2 = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec, SourceVec);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, uzp2, DAG.getConstant(0, DL, OpVT));
+ }
+ }
+
+ // following optimization only work with uzp1
+ if (N->getOpcode() == AArch64ISD::UZP2)
+ return SDValue();
+
// uzp1(x, undef) -> concat(truncate(x), undef)
if (Op1.getOpcode() == ISD::UNDEF) {
EVT BCVT = MVT::Other, HalfVT = MVT::Other;
@@ -24670,6 +24691,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case AArch64ISD::UUNPKHI:
return performUnpackCombine(N, DAG, Subtarget);
case AArch64ISD::UZP1:
+ case AArch64ISD::UZP2:
return performUzpCombine(N, DAG, Subtarget);
case AArch64ISD::SETCC_MERGE_ZERO:
return performSetccMergeZeroCombine(N, DCI);
diff --git a/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll b/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
index ba1ad9ba989c6f..4f76a2dd5947a7 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
@@ -3,7 +3,7 @@
declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>)
; CHECK-LABEL: fun1:
-; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
define i32 @fun1() {
entry:
%vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> <i8 0, i8 16, i8 19, i8 4, i8 -65, i8 -65, i8 -71, i8 -71, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> undef)
@@ -14,7 +14,7 @@ entry:
}
; CHECK-LABEL: fun2:
-; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
define i32 @fun2() {
entry:
%vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> <i8 0, i8 16, i8 19, i8 4, i8 -65, i8 -65, i8 -71, i8 -71, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> undef)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
index dae8d9f89e9954..3157257083e52d 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
@@ -7,18 +7,18 @@ target triple = "aarch64"
define <vscale x 4 x half> @complex_add_v4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
; CHECK-LABEL: complex_add_v4f16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uzp1 z2.s, z0.s, z0.s
+; CHECK-NEXT: uzp2 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z2.d, z2.s
; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: uunpkhi z3.d, z1.s
+; CHECK-NEXT: uunpklo z3.d, z3.s
; CHECK-NEXT: uunpklo z1.d, z1.s
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: uzp1 z4.d, z0.d, z2.d
-; CHECK-NEXT: uzp2 z0.d, z0.d, z2.d
-; CHECK-NEXT: uzp2 z2.d, z1.d, z3.d
-; CHECK-NEXT: uzp1 z1.d, z1.d, z3.d
; CHECK-NEXT: fsubr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z4.h
+; CHECK-NEXT: movprfx z1, z3
+; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z2.h
; CHECK-NEXT: zip2 z2.d, z0.d, z1.d
; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
index c09ec616b015df..15e9b903db5066 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
@@ -7,23 +7,22 @@ target triple = "aarch64"
define <vscale x 4 x half> @complex_mul_v4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
; CHECK-LABEL: complex_mul_v4f16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z2.d, z2.s
; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: uunpkhi z3.d, z1.s
+; CHECK-NEXT: uunpklo z3.d, z3.s
; CHECK-NEXT: uunpklo z1.d, z1.s
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: uzp2 z4.d, z0.d, z2.d
-; CHECK-NEXT: uzp1 z0.d, z0.d, z2.d
-; CHECK-NEXT: uzp2 z2.d, z1.d, z3.d
-; CHECK-NEXT: uzp1 z1.d, z1.d, z3.d
-; CHECK-NEXT: movprfx z5, z2
-; CHECK-NEXT: fmul z5.h, p0/m, z5.h, z0.h
-; CHECK-NEXT: fmul z2.h, p0/m, z2.h, z4.h
-; CHECK-NEXT: movprfx z3, z5
-; CHECK-NEXT: fmla z3.h, p0/m, z1.h, z4.h
-; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: zip2 z1.d, z0.d, z3.d
-; CHECK-NEXT: zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT: movprfx z4, z3
+; CHECK-NEXT: fmul z4.h, p0/m, z4.h, z0.h
+; CHECK-NEXT: fmul z3.h, p0/m, z3.h, z2.h
+; CHECK-NEXT: fmad z2.h, p0/m, z1.h, z4.h
+; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: zip2 z1.d, z0.d, z2.d
+; CHECK-NEXT: zip1 z0.d, z0.d, z2.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index c58db8290c87ab..5bd680ed489389 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -30,21 +30,13 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
}
define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec) {
-; CHECK-SD-LABEL: vector_deinterleave_v4f16_v8f16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: uzp1 v2.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: uzp2 v1.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: fmov d0, d2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: vector_deinterleave_v4f16_v8f16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: uzp1 v2.8h, v0.8h, v0.8h
-; CHECK-GI-NEXT: uzp2 v1.8h, v0.8h, v0.8h
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1
-; CHECK-GI-NEXT: fmov d0, d2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: vector_deinterleave_v4f16_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uzp1 v2.8h, v0.8h, v0.8h
+; CHECK-NEXT: uzp2 v1.8h, v0.8h, v0.8h
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT: fmov d0, d2
+; CHECK-NEXT: ret
%retval = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec)
ret {<4 x half>, <4 x half>} %retval
}
diff --git a/llvm/test/CodeGen/AArch64/neon-perm.ll b/llvm/test/CodeGen/AArch64/neon-perm.ll
index 26ffa2727a1cd1..645eb6b8f430e8 100644
--- a/llvm/test/CodeGen/AArch64/neon-perm.ll
+++ b/llvm/test/CodeGen/AArch64/neon-perm.ll
@@ -3888,9 +3888,9 @@ entry:
define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) {
; CHECK-LABEL: test_uzp:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: uzp1 v2.8b, v0.8b, v1.8b
-; CHECK-NEXT: uzp2 v1.8b, v0.8b, v1.8b
+; CHECK-NEXT: uzp1 v2.16b, v0.16b, v0.16b
+; CHECK-NEXT: uzp2 v1.16b, v0.16b, v0.16b
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
; CHECK-NEXT: fmov d0, d2
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index 478f4a689d3c7a..fd1365d56fee47 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -4,10 +4,10 @@
define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv4f16(<vscale x 4 x half> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: uunpkhi z1.d, z0.s
-; CHECK-NEXT: uunpklo z2.d, z0.s
-; CHECK-NEXT: uzp1 z0.d, z2.d, z1.d
-; CHECK-NEXT: uzp2 z1.d, z2.d, z1.d
+; CHECK-NEXT: uzp1 z1.s, z0.s, z0.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z0.s
+; CHECK-NEXT: uunpklo z0.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z2.s
; CHECK-NEXT: ret
%retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
@@ -16,10 +16,10 @@ define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_n
define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv8f16(<vscale x 8 x half> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: uunpkhi z1.s, z0.h
-; CHECK-NEXT: uunpklo z2.s, z0.h
-; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s
-; CHECK-NEXT: uzp2 z1.s, z2.s, z1.s
+; CHECK-NEXT: uzp1 z1.h, z0.h, z0.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z2.h
; CHECK-NEXT: ret
%retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
@@ -39,10 +39,10 @@ define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_n
define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv4f32(<vscale x 4 x float> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv4f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: uunpkhi z1.d, z0.s
-; CHECK-NEXT: uunpklo z2.d, z0.s
-; CHECK-NEXT: uzp1 z0.d, z2.d, z1.d
-; CHECK-NEXT: uzp2 z1.d, z2.d, z1.d
+; CHECK-NEXT: uzp1 z1.s, z0.s, z0.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z0.s
+; CHECK-NEXT: uunpklo z0.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z2.s
; CHECK-NEXT: ret
%retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
@@ -131,10 +131,10 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
define {<vscale x 8 x i1>, <vscale x 8 x i1>} @vector_deinterleave_nxv8i1_nxv16i1(<vscale x 16 x i1> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv8i1_nxv16i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: punpkhi p1.h, p0.b
-; CHECK-NEXT: punpklo p2.h, p0.b
-; CHECK-NEXT: uzp1 p0.h, p2.h, p1.h
-; CHECK-NEXT: uzp2 p1.h, p2.h, p1.h
+; CHECK-NEXT: uzp1 p1.b, p0.b, p0.b
+; CHECK-NEXT: uzp2 p2.b, p0.b, p0.b
+; CHECK-NEXT: punpklo p0.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p2.b
; CHECK-NEXT: ret
%retval = call {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1> %vec)
ret {<vscale x 8 x i1>, <vscale x 8 x i1>} %retval
@@ -143,10 +143,10 @@ define {<vscale x 8 x i1>, <vscale x 8 x i1>} @vector_deinterleave_nxv8i1_nxv16i
define {<vscale x 4 x i1>, <vscale x 4 x i1>} @vector_deinterleave_nxv4i1_nxv8i1(<vscale x 8 x i1> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv4i1_nxv8i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: punpkhi p1.h, p0.b
-; CHECK-NEXT: punpklo p2.h, p0.b
-; CHECK-NEXT: uzp1 p0.s, p2.s, p1.s
-; CHECK-NEXT: uzp2 p1.s, p2.s, p1.s
+; CHECK-NEXT: uzp1 p1.h, p0.h, p0.h
+; CHECK-NEXT: uzp2 p2.h, p0.h, p0.h
+; CHECK-NEXT: punpklo p0.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p2.b
; CHECK-NEXT: ret
%retval = call {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1> %vec)
ret {<vscale x 4 x i1>, <vscale x 4 x i1>} %retval
@@ -155,10 +155,10 @@ define {<vscale x 4 x i1>, <vscale x 4 x i1>} @vector_deinterleave_nxv4i1_nxv8i1
define {<vscale x 2 x i1>, <vscale x 2 x i1>} @vector_deinterleave_nxv2i1_nxv4i1(<vscale x 4 x i1> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv2i1_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: punpkhi p1.h, p0.b
-; CHECK-NEXT: punpklo p2.h, p0.b
-; CHECK-NEXT: uzp1 p0.d, p2.d, p1.d
-; CHECK-NEXT: uzp2 p1.d, p2.d, p1.d
+; CHECK-NEXT: uzp1 p1.s, p0.s, p0.s
+; CHECK-NEXT: uzp2 p2.s, p0.s, p0.s
+; CHECK-NEXT: punpklo p0.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p2.b
; CHECK-NEXT: ret
%retval = call {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1> %vec)
ret {<vscale x 2 x i1>, <vscale x 2 x i1>} %retval
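As a minimal before/after illustration of the combine (a hypothetical function mirroring test_uzp in neon-perm.ll above; the assembly is taken from that test's diff, not newly generated): deinterleaving the bytes of a <16 x i8> previously split the source into two halves with an EXT before the 64-bit UZP instructions, whereas the combine lets the UZPs run on the full 128-bit vector.

define { <8 x i8>, <8 x i8> } @deinterleave_bytes(<16 x i8> %v) {
  ; even and odd byte lanes of %v
  %even = shufflevector <16 x i8> %v, <16 x i8> undef,
          <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %odd = shufflevector <16 x i8> %v, <16 x i8> undef,
         <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r0 = insertvalue { <8 x i8>, <8 x i8> } poison, <8 x i8> %even, 0
  %r1 = insertvalue { <8 x i8>, <8 x i8> } %r0, <8 x i8> %odd, 1
  ret { <8 x i8>, <8 x i8> } %r1
}
; before: ext  v1.16b, v0.16b, v0.16b, #8
;         uzp1 v2.8b, v0.8b, v1.8b
;         uzp2 v1.8b, v0.8b, v1.8b
;         fmov d0, d2
; after:  uzp1 v2.16b, v0.16b, v0.16b
;         uzp2 v1.16b, v0.16b, v0.16b
;         fmov d0, d2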
✅ With the latest revision this PR passed the C/C++ code formatter.
Looks like a nice addition.
if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
  EVT OpVT = Op0.getOperand(1).getValueType();
  EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
  SDValue uzp2 =
uzp2 -> Uzp2 (and maybe without the 2? UZP perhaps?)
Can the second SourceVec be Undef if we are only using the bottom half? Or does that produce worse code in places?
It certainly produces different code. But based on what I have seen from the current test cases, it doesn't seem to degrade performance. The only difference is that the compiler now sometimes prefers the XTN instruction over UZP1 for extracting even elements, but according to the software optimization guides for the V3 and N3 cores, both instructions have the same latency and throughput. So unless something changes in the future, this should be fine.
I changed the code to use UNDEF now. Thanks for the suggestions!
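To illustrate the XTN point (a hypothetical single-result function; which instruction gets selected depends on the surrounding DAG): with UNDEF as the second UZP1 operand, the existing uzp1(x, undef) -> truncate combine can fire, so an even-element extract may come out as a narrowing truncate.

define <8 x i8> @even_bytes(<16 x i8> %v) {
  ; keep only the even byte lanes
  %r = shufflevector <16 x i8> %v, <16 x i8> undef,
       <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  ret <8 x i8> %r
}
; may now select:  xtn v0.8b, v0.8h   (truncate each halfword to its low byte)
; instead of:      uzp1 v0.16b, v0.16b, v0.16b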
Thanks, that does sound good. The Cortex-A55 issues Neon operations in 64-bit chunks, and I believe an xtn counts as a 64-bit operation, so it can do two per cycle instead of one for 128-bit operations. That core is getting older now, but it might be a tiny bit nicer in places.
One of the problems it can cause is that, because the undef is replaced by an arbitrary register, it can create false dependencies at times. Hopefully this won't be too much of a problem anywhere, though; we can adjust it if needed.
Variable names should be Capitalized though!
  }
}

// following optimization only work with uzp1
-> The following optimizations only work with uzp1
Done
The coding standard prefers Capitalization and full sentences. https://llvm.org/docs/CodingStandards.html#commenting
Thanks. LGTM