Skip to content

Commit cdca178

Browse files
committed
[CostModel][X86] Adjust uitofp(vXi64) SSE/AVX legalized costs based on llvm-mca reports.
Update v4i64 -> v4f32/v4f64 uitofp costs based on the worst-case costs from the script in D103695. Fixes a few regressions before we start adding AVX costs for legalized types.
1 parent 28ac873 commit cdca178

File tree

4 files changed

+29
-84
lines changed

4 files changed

+29
-84
lines changed

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1968,6 +1968,8 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
19681968
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
19691969
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
19701970
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
1971+
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
1972+
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
19711973
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
19721974
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
19731975
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
@@ -2069,6 +2071,9 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
20692071

20702072
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
20712073
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
2074+
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
2075+
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
2076+
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
20722077

20732078
{ ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 3 },
20742079
{ ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 3 },

llvm/test/Analysis/CostModel/X86/uitofp.ll

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -114,16 +114,16 @@ define i32 @uitofp_i64_double() {
114114
;
115115
; SSE42-LABEL: 'uitofp_i64_double'
116116
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
117-
; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
118-
; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
119-
; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
117+
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
118+
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
119+
; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
120120
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
121121
;
122122
; AVX-LABEL: 'uitofp_i64_double'
123123
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
124124
; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
125-
; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
126-
; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
125+
; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
126+
; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
127127
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
128128
;
129129
; AVX512F-LABEL: 'uitofp_i64_double'
@@ -265,24 +265,24 @@ define i32 @uitofp_i64_float() {
265265
;
266266
; SSE42-LABEL: 'uitofp_i64_float'
267267
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f32 = uitofp i64 undef to float
268-
; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
269-
; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
270-
; SSE42-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
271-
; SSE42-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
268+
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
269+
; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
270+
; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
271+
; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
272272
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
273273
;
274274
; AVX-LABEL: 'uitofp_i64_float'
275275
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f32 = uitofp i64 undef to float
276276
; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
277-
; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
278-
; AVX-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
279-
; AVX-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
277+
; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
278+
; AVX-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
279+
; AVX-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
280280
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
281281
;
282282
; AVX512F-LABEL: 'uitofp_i64_float'
283283
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = uitofp i64 undef to float
284284
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
285-
; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
285+
; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
286286
; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
287287
; AVX512F-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
288288
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef

llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ target triple = "x86_64-apple-macosx10.8.0"
77

88
; CHECK: cost of 4 for VF 1 For instruction: %conv = uitofp i64 %tmp to double
99
; CHECK: cost of 5 for VF 2 For instruction: %conv = uitofp i64 %tmp to double
10-
; CHECK: cost of 6 for VF 4 For instruction: %conv = uitofp i64 %tmp to double
10+
; CHECK: cost of 10 for VF 4 For instruction: %conv = uitofp i64 %tmp to double
1111
define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind {
1212
entry:
1313
br label %for.body

llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll

Lines changed: 10 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -575,38 +575,11 @@ define void @uitofp_2i64_2f32() #0 {
575575
}
576576

577577
define void @uitofp_4i64_4f32() #0 {
578-
; SSE-LABEL: @uitofp_4i64_4f32(
579-
; SSE-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
580-
; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
581-
; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
582-
; SSE-NEXT: ret void
583-
;
584-
; AVX256NODQ-LABEL: @uitofp_4i64_4f32(
585-
; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
586-
; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
587-
; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
588-
; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
589-
; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
590-
; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
591-
; AVX256NODQ-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
592-
; AVX256NODQ-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
593-
; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
594-
; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
595-
; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
596-
; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
597-
; AVX256NODQ-NEXT: ret void
598-
;
599-
; AVX512-LABEL: @uitofp_4i64_4f32(
600-
; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
601-
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
602-
; AVX512-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
603-
; AVX512-NEXT: ret void
604-
;
605-
; AVX256DQ-LABEL: @uitofp_4i64_4f32(
606-
; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
607-
; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
608-
; AVX256DQ-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
609-
; AVX256DQ-NEXT: ret void
578+
; CHECK-LABEL: @uitofp_4i64_4f32(
579+
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
580+
; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
581+
; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
582+
; CHECK-NEXT: ret void
610583
;
611584
%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
612585
%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
@@ -633,44 +606,11 @@ define void @uitofp_8i64_8f32() #0 {
633606
; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
634607
; SSE-NEXT: ret void
635608
;
636-
; AVX256NODQ-LABEL: @uitofp_8i64_8f32(
637-
; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
638-
; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
639-
; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
640-
; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
641-
; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
642-
; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
643-
; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
644-
; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
645-
; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
646-
; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
647-
; AVX256NODQ-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
648-
; AVX256NODQ-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
649-
; AVX256NODQ-NEXT: [[CVT4:%.*]] = uitofp i64 [[LD4]] to float
650-
; AVX256NODQ-NEXT: [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
651-
; AVX256NODQ-NEXT: [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
652-
; AVX256NODQ-NEXT: [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
653-
; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
654-
; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
655-
; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
656-
; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
657-
; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
658-
; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
659-
; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
660-
; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
661-
; AVX256NODQ-NEXT: ret void
662-
;
663-
; AVX512-LABEL: @uitofp_8i64_8f32(
664-
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
665-
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
666-
; AVX512-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
667-
; AVX512-NEXT: ret void
668-
;
669-
; AVX256DQ-LABEL: @uitofp_8i64_8f32(
670-
; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
671-
; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
672-
; AVX256DQ-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
673-
; AVX256DQ-NEXT: ret void
609+
; AVX-LABEL: @uitofp_8i64_8f32(
610+
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
611+
; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
612+
; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
613+
; AVX-NEXT: ret void
674614
;
675615
%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
676616
%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8

0 commit comments

Comments
 (0)