Commit 0f8ca20

[X86] Add tests for cmp-zero + and/trunc + or-reduction patterns
Expanding on the original PR44781 test case, show the failure to fold cmp-all-zero patterns when a demanded-bits-limiting and/trunc is in the way.
1 parent c37d25f
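The new tests exercise an or-reduction whose scalar result is narrowed by a trunc or and before being compared against zero; the narrowing limits the demanded bits and currently prevents the cmp-all-zero fold from firing. A minimal sketch of the IR shape, mirroring the trunc_v2i64 case added below (value names are illustrative):

  %r = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %a0) ; or-reduction
  %t = trunc i64 %r to i16                                               ; demanded-bits-limiting narrowing
  %c = icmp eq i16 %t, 0                                                 ; compare-all-zero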

llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll

Lines changed: 305 additions & 0 deletions
@@ -816,6 +816,311 @@ define i1 @test_v128i8(<128 x i8> %a0) {
   ret i1 %2
 }
 
+;
+; Compare Truncated/Masked OR Reductions
+;
+
+define i1 @trunc_v2i64(<2 x i64> %a0) {
+; SSE-LABEL: trunc_v2i64:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: testw %ax, %ax
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX-LABEL: trunc_v2i64:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: testw %ax, %ax
+; AVX-NEXT: sete %al
+; AVX-NEXT: retq
+  %1 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %a0)
+  %2 = trunc i64 %1 to i16
+  %3 = icmp eq i16 %2, 0
+  ret i1 %3
+}
+
+define i1 @mask_v8i32(<8 x i32> %a0) {
+; SSE-LABEL: mask_v8i32:
+; SSE: # %bb.0:
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: testl $-2147483648, %eax # imm = 0x80000000
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: mask_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: testl $-2147483648, %eax # imm = 0x80000000
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mask_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: testl $-2147483648, %eax # imm = 0x80000000
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mask_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: testl $-2147483648, %eax # imm = 0x80000000
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %1 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a0)
+  %2 = and i32 %1, 2147483648
+  %3 = icmp eq i32 %2, 0
+  ret i1 %3
+}
+
+define i1 @trunc_v16i16(<16 x i16> %a0) {
+; SSE-LABEL: trunc_v16i16:
+; SSE: # %bb.0:
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: testb %al, %al
+; SSE-NEXT: setne %al
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_v16i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: testb %al, %al
+; AVX1-NEXT: setne %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_v16i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_v16i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: testb %al, %al
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %1 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %a0)
+  %2 = trunc i16 %1 to i8
+  %3 = icmp ne i8 %2, 0
+  ret i1 %3
+}
+
+define i1 @mask_v128i8(<128 x i8> %a0) {
+; SSE-LABEL: mask_v128i8:
+; SSE: # %bb.0:
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: por %xmm5, %xmm3
+; SSE-NEXT: por %xmm1, %xmm3
+; SSE-NEXT: por %xmm4, %xmm2
+; SSE-NEXT: por %xmm3, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: testb $1, %al
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: mask_v128i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mask_v128i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mask_v128i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %1 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> %a0)
+  %2 = and i8 %1, 1
+  %3 = icmp eq i8 %2, 0
+  ret i1 %3
+}
+
+%struct.Box = type { i32, i32, i32, i32 }
+define zeroext i1 @PR44781(%struct.Box* %0) {
+; SSE-LABEL: PR44781:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqu (%rdi), %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: testb $15, %al
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: PR44781:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqu (%rdi), %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: testb $15, %al
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR44781:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm0
+; AVX2-NEXT: vpor (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: testb $15, %al
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: PR44781:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastq 8(%rdi), %xmm0
+; AVX512-NEXT: vpor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: testb $15, %al
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: retq
+  %2 = bitcast %struct.Box* %0 to <4 x i32>*
+  %3 = load <4 x i32>, <4 x i32>* %2, align 4
+  %4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %3)
+  %5 = and i32 %4, 15
+  %6 = icmp eq i32 %5, 0
+  ret i1 %6
+}
+
 declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>)
 declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>)
 declare i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64>)
