Skip to content

Commit 91ebc4e

Browse files
committed
[ARM] VMOVN undef folding
If we insert undef using a VMOVN, we can just use the original value in three out of the four possible combinations. Using a VMOVNT into an undef vector will still require the lanes to be moved, but otherwise the non-undef value can be used.
1 parent 860e862 commit 91ebc4e

File tree

2 files changed

+149
-0
lines changed

2 files changed

+149
-0
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15424,6 +15424,14 @@ static SDValue PerformVMOVNCombine(SDNode *N,
1542415424
SDValue Op1 = N->getOperand(1);
1542515425
unsigned IsTop = N->getConstantOperandVal(2);
1542615426

15427+
// VMOVNT a undef -> a
15428+
// VMOVNB a undef -> a
15429+
// VMOVNB undef a -> a
15430+
if (Op1->isUndef())
15431+
return Op0;
15432+
if (Op0->isUndef() && !IsTop)
15433+
return Op1;
15434+
1542715435
// VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
1542815436
// VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
1542915437
if ((Op1->getOpcode() == ARMISD::VQMOVNs ||

llvm/test/CodeGen/Thumb2/mve-vmovn.ll

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -801,3 +801,144 @@ entry:
801801
%out = shufflevector <16 x i8> %src1, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
802802
ret <16 x i8> %out
803803
}
804+
805+
806+
; Truncating interleave where the second (odd-lane) input is undef: the
; VMOVNT of an undef value folds away and %a is returned as-is (just bx lr).
define arm_aapcs_vfpcc <8 x i16> @vmovn32trunct_undef2(<8 x i16> %a) {
807+
; CHECK-LABEL: vmovn32trunct_undef2:
808+
; CHECK: @ %bb.0: @ %entry
809+
; CHECK-NEXT: bx lr
810+
;
811+
; CHECKBE-LABEL: vmovn32trunct_undef2:
812+
; CHECKBE: @ %bb.0: @ %entry
813+
; CHECKBE-NEXT: bx lr
814+
entry:
815+
%c1 = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)
816+
%c2 = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> undef)
817+
%strided.vec = shufflevector <4 x i32> %c1, <4 x i32> %c2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
818+
%out = trunc <8 x i32> %strided.vec to <8 x i16>
819+
ret <8 x i16> %out
820+
}
821+
822+
; Truncating interleave where the first (even-lane) input is undef: per the
; CHECK lines, a VMOVNT into an undef destination still needs the lane move
; (vmovnt.i32 q0, q0), so this does not fold to a plain return.
define arm_aapcs_vfpcc <8 x i16> @vmovn32trunct_undef1(<8 x i16> %a) {
823+
; CHECK-LABEL: vmovn32trunct_undef1:
824+
; CHECK: @ %bb.0: @ %entry
825+
; CHECK-NEXT: vmovnt.i32 q0, q0
826+
; CHECK-NEXT: bx lr
827+
;
828+
; CHECKBE-LABEL: vmovn32trunct_undef1:
829+
; CHECKBE: @ %bb.0: @ %entry
830+
; CHECKBE-NEXT: vrev64.16 q1, q0
831+
; CHECKBE-NEXT: vmovnt.i32 q1, q1
832+
; CHECKBE-NEXT: vrev64.16 q0, q1
833+
; CHECKBE-NEXT: bx lr
834+
entry:
835+
%c1 = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> undef)
836+
%c2 = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)
837+
%strided.vec = shufflevector <4 x i32> %c1, <4 x i32> %c2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
838+
%out = trunc <8 x i32> %strided.vec to <8 x i16>
839+
ret <8 x i16> %out
840+
}
841+
842+
; Non-truncating lane merge taking even lanes from %a and odd lanes from an
; undef vector: folds to returning %a unchanged on little-endian (only the
; vrev reinterpret shuffle remains for big-endian).
define arm_aapcs_vfpcc <8 x i16> @vmovn16b_undef2(<16 x i8> %a) {
843+
; CHECK-LABEL: vmovn16b_undef2:
844+
; CHECK: @ %bb.0: @ %entry
845+
; CHECK-NEXT: bx lr
846+
;
847+
; CHECKBE-LABEL: vmovn16b_undef2:
848+
; CHECKBE: @ %bb.0: @ %entry
849+
; CHECKBE-NEXT: vrev64.8 q1, q0
850+
; CHECKBE-NEXT: vrev64.16 q0, q1
851+
; CHECKBE-NEXT: bx lr
852+
entry:
853+
%c1 = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
854+
%c2 = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> undef)
855+
%out = shufflevector <8 x i16> %c1, <8 x i16> %c2, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
856+
ret <8 x i16> %out
857+
}
858+
859+
; Same lane merge but with the even-lane input undef: only the odd lanes of
; %a are live, so this also folds to returning %a (bx lr on little-endian).
define arm_aapcs_vfpcc <8 x i16> @vmovn16b_undef1(<16 x i8> %a) {
860+
; CHECK-LABEL: vmovn16b_undef1:
861+
; CHECK: @ %bb.0: @ %entry
862+
; CHECK-NEXT: bx lr
863+
;
864+
; CHECKBE-LABEL: vmovn16b_undef1:
865+
; CHECKBE: @ %bb.0: @ %entry
866+
; CHECKBE-NEXT: vrev64.8 q1, q0
867+
; CHECKBE-NEXT: vrev64.16 q0, q1
868+
; CHECKBE-NEXT: bx lr
869+
entry:
870+
%c1 = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> undef)
871+
%c2 = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
872+
%out = shufflevector <8 x i16> %c1, <8 x i16> %c2, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
873+
ret <8 x i16> %out
874+
}
875+
876+
; Negative test: the shuffle mask reuses lane 1 (<4,0,5,1,6,1,7,2>), so the
; lanes do not form a VMOVN pattern and codegen falls back to scalar
; vmov-based lane inserts instead of a vmovnt/vmovnb.
define arm_aapcs_vfpcc <8 x i16> @vmovn32_badlanes(<4 x i32> %src1) {
877+
; CHECK-LABEL: vmovn32_badlanes:
878+
; CHECK: @ %bb.0: @ %entry
879+
; CHECK-NEXT: vmov r0, s0
880+
; CHECK-NEXT: vmov.16 q1[1], r0
881+
; CHECK-NEXT: vmov r0, s1
882+
; CHECK-NEXT: vmov.16 q1[3], r0
883+
; CHECK-NEXT: vmov.16 q1[5], r0
884+
; CHECK-NEXT: vmov r0, s2
885+
; CHECK-NEXT: vmov.16 q1[7], r0
886+
; CHECK-NEXT: vmov q0, q1
887+
; CHECK-NEXT: bx lr
888+
;
889+
; CHECKBE-LABEL: vmovn32_badlanes:
890+
; CHECKBE: @ %bb.0: @ %entry
891+
; CHECKBE-NEXT: vrev64.32 q1, q0
892+
; CHECKBE-NEXT: vmov r0, s4
893+
; CHECKBE-NEXT: vmov.16 q2[1], r0
894+
; CHECKBE-NEXT: vmov r0, s5
895+
; CHECKBE-NEXT: vmov.16 q2[3], r0
896+
; CHECKBE-NEXT: vmov.16 q2[5], r0
897+
; CHECKBE-NEXT: vmov r0, s6
898+
; CHECKBE-NEXT: vmov.16 q2[7], r0
899+
; CHECKBE-NEXT: vrev64.16 q0, q2
900+
; CHECKBE-NEXT: bx lr
901+
entry:
902+
%strided.vec = shufflevector <4 x i32> %src1, <4 x i32> undef, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 1, i32 7, i32 2>
903+
%out = trunc <8 x i32> %strided.vec to <8 x i16>
904+
ret <8 x i16> %out
905+
}
906+
907+
; i16->i8 truncating interleave with the odd-lane input undef: folds to
; returning %a unchanged (bx lr on both endiannesses).
define arm_aapcs_vfpcc <16 x i8> @vmovn16trunct_undef2(<16 x i8> %a) {
908+
; CHECK-LABEL: vmovn16trunct_undef2:
909+
; CHECK: @ %bb.0: @ %entry
910+
; CHECK-NEXT: bx lr
911+
;
912+
; CHECKBE-LABEL: vmovn16trunct_undef2:
913+
; CHECKBE: @ %bb.0: @ %entry
914+
; CHECKBE-NEXT: bx lr
915+
entry:
916+
%c1 = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
917+
%c2 = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> undef)
918+
%strided.vec = shufflevector <8 x i16> %c1, <8 x i16> %c2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
919+
%out = trunc <16 x i16> %strided.vec to <16 x i8>
920+
ret <16 x i8> %out
921+
}
922+
923+
; i16->i8 truncating interleave with the even-lane input undef: per the CHECK
; lines, the VMOVNT lane move is still required (vmovnt.i16 q0, q0).
define arm_aapcs_vfpcc <16 x i8> @vmovn16trunct_undef1(<16 x i8> %a) {
924+
; CHECK-LABEL: vmovn16trunct_undef1:
925+
; CHECK: @ %bb.0: @ %entry
926+
; CHECK-NEXT: vmovnt.i16 q0, q0
927+
; CHECK-NEXT: bx lr
928+
;
929+
; CHECKBE-LABEL: vmovn16trunct_undef1:
930+
; CHECKBE: @ %bb.0: @ %entry
931+
; CHECKBE-NEXT: vrev64.8 q1, q0
932+
; CHECKBE-NEXT: vmovnt.i16 q1, q1
933+
; CHECKBE-NEXT: vrev64.8 q0, q1
934+
; CHECKBE-NEXT: bx lr
935+
entry:
936+
%c1 = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> undef)
937+
%c2 = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
938+
%strided.vec = shufflevector <8 x i16> %c1, <8 x i16> %c2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
939+
%out = trunc <16 x i16> %strided.vec to <16 x i8>
940+
ret <16 x i8> %out
941+
}
942+
943+
declare <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16>)
944+
declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8>)

0 commit comments

Comments
 (0)