@@ -882,6 +882,7 @@ define i64 @shuf64i1_zero(i64 %a) {
ret i64 %d
}

+ ; OR(KSHIFTL(X,8),Y) -> KUNPCKBW
define void @PR32547(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, float* %p) {
; AVX512F-LABEL: PR32547:
; AVX512F: # %bb.0: # %entry
@@ -933,5 +934,58 @@ define void @PR32547(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float>
tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> zeroinitializer, <16 x float>* %2, i32 64, <16 x i1> %3) #4
ret void
}
+
+ ; OR(X, KSHIFTL(Y,8)) -> KUNPCKBW
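+ ; Same pattern as PR32547 but with the OR operands swapped: the unshifted
+ ; mask (%conv.i18) supplies bits 0-7 and the first mask shifted left by 8
+ ; (%shl) supplies bits 8-15 of the combined 16-bit store mask.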
+ define void @PR32547_swap(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, float* %p) {
+ ; AVX512F-LABEL: PR32547_swap:
+ ; AVX512F: # %bb.0: # %entry
+ ; AVX512F-NEXT: # kill: def $ymm3 killed $ymm3 def $zmm3
+ ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
+ ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+ ; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k0
+ ; AVX512F-NEXT: vcmpltps %zmm3, %zmm2, %k1
+ ; AVX512F-NEXT: kshiftlw $8, %k0, %k0
+ ; AVX512F-NEXT: kshiftlw $8, %k1, %k1
+ ; AVX512F-NEXT: kshiftrw $8, %k1, %k1
+ ; AVX512F-NEXT: korw %k0, %k1, %k1
+ ; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
+ ; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+ ; AVX512F-NEXT: vzeroupper
+ ; AVX512F-NEXT: retq
+ ;
+ ; AVX512VL-LABEL: PR32547_swap:
+ ; AVX512VL: # %bb.0: # %entry
+ ; AVX512VL-NEXT: vcmpltps %ymm1, %ymm0, %k0
+ ; AVX512VL-NEXT: vcmpltps %ymm3, %ymm2, %k1
+ ; AVX512VL-NEXT: kshiftlw $8, %k0, %k0
+ ; AVX512VL-NEXT: korw %k0, %k1, %k1
+ ; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+ ; AVX512VL-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+ ; AVX512VL-NEXT: vzeroupper
+ ; AVX512VL-NEXT: retq
+ ;
+ ; VL_BW_DQ-LABEL: PR32547_swap:
+ ; VL_BW_DQ: # %bb.0: # %entry
+ ; VL_BW_DQ-NEXT: vcmpltps %ymm1, %ymm0, %k0
+ ; VL_BW_DQ-NEXT: vcmpltps %ymm3, %ymm2, %k1
+ ; VL_BW_DQ-NEXT: kshiftlw $8, %k0, %k0
+ ; VL_BW_DQ-NEXT: korw %k0, %k1, %k1
+ ; VL_BW_DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
+ ; VL_BW_DQ-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+ ; VL_BW_DQ-NEXT: vzeroupper
+ ; VL_BW_DQ-NEXT: retq
+ entry:
+ %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, i8 -1)
+ %1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %c, <8 x float> %d, i32 1, i8 -1)
+ %conv.i = zext i8 %0 to i16
+ %conv.i18 = zext i8 %1 to i16
+ %shl = shl nuw i16 %conv.i, 8
+ %or = or i16 %conv.i18, %shl
+ %2 = bitcast float* %p to <16 x float>*
+ %3 = bitcast i16 %or to <16 x i1>
+ tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> zeroinitializer, <16 x float>* %2, i32 64, <16 x i1> %3) #4
+ ret void
+ }
declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)
declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)