@@ -991,36 +991,92 @@ define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
  ret void
}

- define void @test22(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
- ; AVX1-LABEL: test22:
- ; AVX1: ## BB#0:
- ; AVX1-NEXT: movl $-1, %eax
- ; AVX1-NEXT: vmovd %eax, %xmm0
- ; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
- ; AVX1-NEXT: retq
+ ; When only one element of the mask is set, reduce to a scalar store.
+
+ define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
+ ; AVX-LABEL: one_mask_bit_set1:
+ ; AVX: ## BB#0:
+ ; AVX-NEXT: vmovd %xmm0, (%rdi)
+ ; AVX-NEXT: retq
;
- ; AVX2-LABEL: test22:
- ; AVX2: ## BB#0:
- ; AVX2-NEXT: movl $-1, %eax
- ; AVX2-NEXT: vmovd %eax, %xmm0
- ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
- ; AVX2-NEXT: retq
+ ; AVX512-LABEL: one_mask_bit_set1:
+ ; AVX512: ## BB#0:
+ ; AVX512-NEXT: vmovd %xmm0, (%rdi)
+ ; AVX512-NEXT: retq
+   call void @llvm.masked.store.v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
+   ret void
+ }
+
+ ; Choose a different element to show that the correct address offset is produced.
+
+ define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
+ ; AVX-LABEL: one_mask_bit_set2:
+ ; AVX: ## BB#0:
+ ; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi)
+ ; AVX-NEXT: retq
;
- ; AVX512F-LABEL: test22:
- ; AVX512F: ## BB#0:
- ; AVX512F-NEXT: movl $-1, %eax
- ; AVX512F-NEXT: vmovd %eax, %xmm0
- ; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
- ; AVX512F-NEXT: retq
+ ; AVX512-LABEL: one_mask_bit_set2:
+ ; AVX512: ## BB#0:
+ ; AVX512-NEXT: vextractps $2, %xmm0, 8(%rdi)
+ ; AVX512-NEXT: retq
+   call void @llvm.masked.store.v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>)
+   ret void
+ }
+
+ ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+
+ define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
+ ; AVX-LABEL: one_mask_bit_set3:
+ ; AVX: ## BB#0:
+ ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+ ; AVX-NEXT: vmovlps %xmm0, 16(%rdi)
+ ; AVX-NEXT: vzeroupper
+ ; AVX-NEXT: retq
;
- ; SKX-LABEL: test22:
- ; SKX: ## BB#0:
- ; SKX-NEXT: movb $1, %al
- ; SKX-NEXT: kmovw %eax, %k1
- ; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
- ; SKX-NEXT: retq
-   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-   call void @llvm.masked.store.v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
+ ; AVX512-LABEL: one_mask_bit_set3:
+ ; AVX512: ## BB#0:
+ ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+ ; AVX512-NEXT: vmovq %xmm0, 16(%rdi)
+ ; AVX512-NEXT: retq
+   call void @llvm.masked.store.v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>)
+   ret void
+ }
+
+ ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+
+ define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
+ ; AVX-LABEL: one_mask_bit_set4:
+ ; AVX: ## BB#0:
+ ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+ ; AVX-NEXT: vmovhpd %xmm0, 24(%rdi)
+ ; AVX-NEXT: vzeroupper
+ ; AVX-NEXT: retq
+ ;
+ ; AVX512-LABEL: one_mask_bit_set4:
+ ; AVX512: ## BB#0:
+ ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+ ; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi)
+ ; AVX512-NEXT: retq
+   call void @llvm.masked.store.v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>)
+   ret void
+ }
+
+ ; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
+
+ define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
+ ; AVX-LABEL: one_mask_bit_set5:
+ ; AVX: ## BB#0:
+ ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
+ ; AVX-NEXT: vmovlps %xmm0, 48(%rdi)
+ ; AVX-NEXT: vzeroupper
+ ; AVX-NEXT: retq
+ ;
+ ; AVX512-LABEL: one_mask_bit_set5:
+ ; AVX512: ## BB#0:
+ ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+ ; AVX512-NEXT: vmovlpd %xmm0, 48(%rdi)
+ ; AVX512-NEXT: retq
+   call void @llvm.masked.store.v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
  ret void
}
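The rule the new tests check can be written out directly in IR: when exactly one lane of a constant mask is true, the masked store is equivalent to extracting that lane and storing it with a plain scalar store at the lane's byte offset. A minimal sketch of that equivalence for the one_mask_bit_set2 shape follows; the function name @one_mask_bit_set2_scalar is illustrative and not part of this commit:

; Hypothetical scalar equivalent of one_mask_bit_set2: the only true
; mask lane is element 2, so that element is extracted and stored at
; byte offset 8 (lane 2, 4 bytes per float).
define void @one_mask_bit_set2_scalar(<4 x float>* %addr, <4 x float> %val) {
  %elt = extractelement <4 x float> %val, i32 2
  %ptr = getelementptr inbounds <4 x float>, <4 x float>* %addr, i32 0, i32 2
  store float %elt, float* %ptr, align 4
  ret void
}

The checked assembly folds both steps into a single instruction: vextractps $2, %xmm0, 8(%rdi).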
@@ -1030,8 +1086,10 @@ declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+ declare void @llvm.masked.store.v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
+ declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
@@ -1043,6 +1101,7 @@ declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x
declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
+ declare void @llvm.masked.store.v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
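The AVX, AVX512, and SKX check prefixes are selected by RUN lines at the top of the test file, which lie outside these hunks. Assuming the usual llc/FileCheck pattern for x86 masked-op tests (the exact triple and CPU attributes are not visible in this diff), they would look something like:

; RUN: llc -mtriple=x86_64-apple-darwin -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX512

The Darwin-style "## BB#0:" basic-block markers in the expected assembly are consistent with such a triple.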