@@ -4989,3 +4989,257 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
4989
4989
%ext = shufflevector <2 x i32 > %cvt , <2 x i32 > zeroinitializer , <4 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 >
4990
4990
ret <4 x i32 > %ext
4991
4991
}
4992
+
4993
; fptosi_4f16_to_4i32 - fptosi of a full <4 x half> vector to <4 x i32>.
; Expected code per check prefix, as recorded below:
;   AVX prefix     - four calls to __extendhfsf2 (one per element, with the
;                    operands spilled to the stack around the calls), then two
;                    vcvttps2dq results merged with vunpcklpd.
;   F16C / AVX512  - vcvtph2ps into ymm0, vcvttps2dq on ymm0, and the low xmm0
;                    part is what gets returned.
; NOTE(review): these assertions look like update_llc_test_checks.py output -
; confirm against the file header and regenerate instead of hand-editing.
+ define <4 x i32 > @fptosi_4f16_to_4i32 (<4 x half > %a ) nounwind {
4994
+ ; AVX-LABEL: fptosi_4f16_to_4i32:
4995
+ ; AVX: # %bb.0:
4996
+ ; AVX-NEXT: subq $72, %rsp
4997
+ ; AVX-NEXT: vmovdqa %xmm0, %xmm1
4998
+ ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4999
+ ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
5000
+ ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5001
+ ; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
5002
+ ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
5003
+ ; AVX-NEXT: vpsrlq $48, %xmm1, %xmm0
5004
+ ; AVX-NEXT: callq __extendhfsf2@PLT
5005
+ ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5006
+ ; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
5007
+ ; AVX-NEXT: callq __extendhfsf2@PLT
5008
+ ; AVX-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5009
+ ; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
5010
+ ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
5011
+ ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
5012
+ ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5013
+ ; AVX-NEXT: callq __extendhfsf2@PLT
5014
+ ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5015
+ ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5016
+ ; AVX-NEXT: callq __extendhfsf2@PLT
5017
+ ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5018
+ ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
5019
+ ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
5020
+ ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
5021
+ ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
5022
+ ; AVX-NEXT: addq $72, %rsp
5023
+ ; AVX-NEXT: retq
5024
+ ;
5025
+ ; F16C-LABEL: fptosi_4f16_to_4i32:
5026
+ ; F16C: # %bb.0:
5027
+ ; F16C-NEXT: vcvtph2ps %xmm0, %ymm0
5028
+ ; F16C-NEXT: vcvttps2dq %ymm0, %ymm0
5029
+ ; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
5030
+ ; F16C-NEXT: vzeroupper
5031
+ ; F16C-NEXT: retq
5032
+ ;
5033
+ ; AVX512-LABEL: fptosi_4f16_to_4i32:
5034
+ ; AVX512: # %bb.0:
5035
+ ; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0
5036
+ ; AVX512-NEXT: vcvttps2dq %ymm0, %ymm0
5037
+ ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
5038
+ ; AVX512-NEXT: vzeroupper
5039
+ ; AVX512-NEXT: retq
5040
+ %cvt = fptosi <4 x half > %a to <4 x i32 >
5041
+ ret <4 x i32 > %cvt
5042
+ }
5043
+
5044
; fptoui_2f16_to_4i32 - fptoui of <2 x half> to <2 x i32>, then widened to
; <4 x i32> by a shufflevector against zeroinitializer.
; Expected code per check prefix, as recorded below:
;   AVX1 / AVX2      - two __extendhfsf2 calls, then a vcvttps2dq / vsubps /
;                      vcvttps2dq / vpand / vpor sequence. AVX1 takes the
;                      vsubps operand from a RIP-relative LCPI memory operand,
;                      AVX2 from a vbroadcastss into xmm3.
;   F16C             - the same sequence, fed by vcvtph2ps instead of the calls.
;   AVX512F          - vcvtph2ps, then vcvttps2udq on zmm0 (with a vzeroupper
;                      before the return).
;   AVX512-FASTLANE  - vcvtph2ps, then vcvttps2udq directly on xmm0.
; Every variant finishes with a vmovq annotated "xmm0 = xmm0[0],zero".
; NOTE(review): these assertions look like update_llc_test_checks.py output -
; confirm against the file header and regenerate instead of hand-editing.
+ define <4 x i32 > @fptoui_2f16_to_4i32 (<2 x half > %a ) nounwind {
5045
+ ; AVX1-LABEL: fptoui_2f16_to_4i32:
5046
+ ; AVX1: # %bb.0:
5047
+ ; AVX1-NEXT: subq $40, %rsp
5048
+ ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
5049
+ ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5050
+ ; AVX1-NEXT: callq __extendhfsf2@PLT
5051
+ ; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
5052
+ ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5053
+ ; AVX1-NEXT: callq __extendhfsf2@PLT
5054
+ ; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
5055
+ ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
5056
+ ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1
5057
+ ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
5058
+ ; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
5059
+ ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
5060
+ ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
5061
+ ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
5062
+ ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5063
+ ; AVX1-NEXT: addq $40, %rsp
5064
+ ; AVX1-NEXT: retq
5065
+ ;
5066
+ ; AVX2-LABEL: fptoui_2f16_to_4i32:
5067
+ ; AVX2: # %bb.0:
5068
+ ; AVX2-NEXT: subq $40, %rsp
5069
+ ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
5070
+ ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5071
+ ; AVX2-NEXT: callq __extendhfsf2@PLT
5072
+ ; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
5073
+ ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5074
+ ; AVX2-NEXT: callq __extendhfsf2@PLT
5075
+ ; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
5076
+ ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
5077
+ ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm1
5078
+ ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
5079
+ ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
5080
+ ; AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0
5081
+ ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
5082
+ ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
5083
+ ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
5084
+ ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5085
+ ; AVX2-NEXT: addq $40, %rsp
5086
+ ; AVX2-NEXT: retq
5087
+ ;
5088
+ ; F16C-LABEL: fptoui_2f16_to_4i32:
5089
+ ; F16C: # %bb.0:
5090
+ ; F16C-NEXT: vpsrld $16, %xmm0, %xmm1
5091
+ ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
5092
+ ; F16C-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
5093
+ ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
5094
+ ; F16C-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
5095
+ ; F16C-NEXT: vcvttps2dq %xmm0, %xmm1
5096
+ ; F16C-NEXT: vpsrad $31, %xmm1, %xmm2
5097
+ ; F16C-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
5098
+ ; F16C-NEXT: vcvttps2dq %xmm0, %xmm0
5099
+ ; F16C-NEXT: vpand %xmm2, %xmm0, %xmm0
5100
+ ; F16C-NEXT: vpor %xmm0, %xmm1, %xmm0
5101
+ ; F16C-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5102
+ ; F16C-NEXT: retq
5103
+ ;
5104
+ ; AVX512F-LABEL: fptoui_2f16_to_4i32:
5105
+ ; AVX512F: # %bb.0:
5106
+ ; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1
5107
+ ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
5108
+ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
5109
+ ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
5110
+ ; AVX512F-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
5111
+ ; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
5112
+ ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5113
+ ; AVX512F-NEXT: vzeroupper
5114
+ ; AVX512F-NEXT: retq
5115
+ ;
5116
+ ; AVX512-FASTLANE-LABEL: fptoui_2f16_to_4i32:
5117
+ ; AVX512-FASTLANE: # %bb.0:
5118
+ ; AVX512-FASTLANE-NEXT: vpsrld $16, %xmm0, %xmm1
5119
+ ; AVX512-FASTLANE-NEXT: vcvtph2ps %xmm1, %xmm1
5120
+ ; AVX512-FASTLANE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
5121
+ ; AVX512-FASTLANE-NEXT: vcvtph2ps %xmm0, %xmm0
5122
+ ; AVX512-FASTLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
5123
+ ; AVX512-FASTLANE-NEXT: vcvttps2udq %xmm0, %xmm0
5124
+ ; AVX512-FASTLANE-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5125
+ ; AVX512-FASTLANE-NEXT: retq
5126
+ %cvt = fptoui <2 x half > %a to <2 x i32 >
5127
; Mask indices 0-1 pick the two converted lanes; indices 2-3 pick from the
; zeroinitializer operand, so the upper two lanes of the result are zero.
+ %ext = shufflevector <2 x i32 > %cvt , <2 x i32 > zeroinitializer , <4 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 >
5128
+ ret <4 x i32 > %ext
5129
+ }
5130
+
5131
; fptoui_4f16_to_4i32 - fptoui of a full <4 x half> vector to <4 x i32>.
; Expected code per check prefix, as recorded below:
;   AVX1 / AVX2      - four __extendhfsf2 calls; each pair of extended lanes
;                      goes through vcvttps2dq / vsubps / vcvttps2dq / vpand /
;                      vpor, and the two halves are joined with vpunpcklqdq.
;                      AVX1 takes the vsubps operand from a RIP-relative LCPI
;                      memory operand, AVX2 from a vbroadcastss into xmm3.
;   F16C             - vcvtph2ps into ymm0, then vcvttps2dq / vsubps /
;                      vcvttps2dq / vorps / vblendvps on ymm registers.
;   AVX512F          - vcvtph2ps into ymm0, then vcvttps2udq on zmm0.
;   AVX512-FASTLANE  - vcvtph2ps into ymm0, then vcvttps2udq on ymm0.
; NOTE(review): these assertions look like update_llc_test_checks.py output -
; confirm against the file header and regenerate instead of hand-editing.
+ define <4 x i32 > @fptoui_4f16_to_4i32 (<4 x half > %a ) nounwind {
5132
+ ; AVX1-LABEL: fptoui_4f16_to_4i32:
5133
+ ; AVX1: # %bb.0:
5134
+ ; AVX1-NEXT: subq $72, %rsp
5135
+ ; AVX1-NEXT: vmovdqa %xmm0, %xmm1
5136
+ ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5137
+ ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
5138
+ ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5139
+ ; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
5140
+ ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
5141
+ ; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm0
5142
+ ; AVX1-NEXT: callq __extendhfsf2@PLT
5143
+ ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5144
+ ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
5145
+ ; AVX1-NEXT: callq __extendhfsf2@PLT
5146
+ ; AVX1-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5147
+ ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
5148
+ ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1
5149
+ ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
5150
+ ; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
5151
+ ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
5152
+ ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
5153
+ ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
5154
+ ; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
5155
+ ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5156
+ ; AVX1-NEXT: callq __extendhfsf2@PLT
5157
+ ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5158
+ ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5159
+ ; AVX1-NEXT: callq __extendhfsf2@PLT
5160
+ ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5161
+ ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
5162
+ ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1
5163
+ ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
5164
+ ; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
5165
+ ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
5166
+ ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
5167
+ ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
5168
+ ; AVX1-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
5169
+ ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
5170
+ ; AVX1-NEXT: addq $72, %rsp
5171
+ ; AVX1-NEXT: retq
5172
+ ;
5173
+ ; AVX2-LABEL: fptoui_4f16_to_4i32:
5174
+ ; AVX2: # %bb.0:
5175
+ ; AVX2-NEXT: subq $72, %rsp
5176
+ ; AVX2-NEXT: vmovdqa %xmm0, %xmm1
5177
+ ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5178
+ ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
5179
+ ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5180
+ ; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
5181
+ ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
5182
+ ; AVX2-NEXT: vpsrlq $48, %xmm1, %xmm0
5183
+ ; AVX2-NEXT: callq __extendhfsf2@PLT
5184
+ ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5185
+ ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
5186
+ ; AVX2-NEXT: callq __extendhfsf2@PLT
5187
+ ; AVX2-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5188
+ ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
5189
+ ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm1
5190
+ ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
5191
+ ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
5192
+ ; AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0
5193
+ ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
5194
+ ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
5195
+ ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
5196
+ ; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
5197
+ ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5198
+ ; AVX2-NEXT: callq __extendhfsf2@PLT
5199
+ ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5200
+ ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5201
+ ; AVX2-NEXT: callq __extendhfsf2@PLT
5202
+ ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5203
+ ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
5204
+ ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm1
5205
+ ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
5206
+ ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
5207
+ ; AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0
5208
+ ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
5209
+ ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
5210
+ ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
5211
+ ; AVX2-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
5212
+ ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
5213
+ ; AVX2-NEXT: addq $72, %rsp
5214
+ ; AVX2-NEXT: retq
5215
+ ;
5216
+ ; F16C-LABEL: fptoui_4f16_to_4i32:
5217
+ ; F16C: # %bb.0:
5218
+ ; F16C-NEXT: vcvtph2ps %xmm0, %ymm0
5219
+ ; F16C-NEXT: vcvttps2dq %ymm0, %ymm1
5220
+ ; F16C-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
5221
+ ; F16C-NEXT: vcvttps2dq %ymm0, %ymm0
5222
+ ; F16C-NEXT: vorps %ymm0, %ymm1, %ymm0
5223
+ ; F16C-NEXT: vblendvps %ymm1, %ymm0, %ymm1, %ymm0
5224
+ ; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
5225
+ ; F16C-NEXT: vzeroupper
5226
+ ; F16C-NEXT: retq
5227
+ ;
5228
+ ; AVX512F-LABEL: fptoui_4f16_to_4i32:
5229
+ ; AVX512F: # %bb.0:
5230
+ ; AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0
5231
+ ; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
5232
+ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
5233
+ ; AVX512F-NEXT: vzeroupper
5234
+ ; AVX512F-NEXT: retq
5235
+ ;
5236
+ ; AVX512-FASTLANE-LABEL: fptoui_4f16_to_4i32:
5237
+ ; AVX512-FASTLANE: # %bb.0:
5238
+ ; AVX512-FASTLANE-NEXT: vcvtph2ps %xmm0, %ymm0
5239
+ ; AVX512-FASTLANE-NEXT: vcvttps2udq %ymm0, %ymm0
5240
+ ; AVX512-FASTLANE-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
5241
+ ; AVX512-FASTLANE-NEXT: vzeroupper
5242
+ ; AVX512-FASTLANE-NEXT: retq
5243
+ %cvt = fptoui <4 x half > %a to <4 x i32 >
5244
+ ret <4 x i32 > %cvt
5245
+ }
0 commit comments