@@ -2063,9 +2063,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2063
2063
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
2064
2064
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
2065
2065
2066
- { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2067
2066
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2068
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
2069
2067
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
2070
2068
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2071
2069
@@ -2084,39 +2082,36 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2084
2082
// These are somewhat magic numbers justified by looking at the output of
2085
2083
// Intel's IACA, running some kernels and making sure when we take
2086
2084
// legalization into account the throughput will be overestimated.
2087
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
2088
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16 *10 },
2089
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
2090
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8 *10 },
2091
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2092
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 *10 },
2093
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 *10 },
2094
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
2095
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2 *10 },
2096
-
2097
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16 *10 },
2098
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
2099
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
2100
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8 *10 },
2101
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 *10 },
2102
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2103
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
2104
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
2085
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
2086
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2087
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
2088
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2089
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2090
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
2091
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
2092
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
2093
+
2094
+ { ISD::UINT_TO_FP, MVT::f32 , MVT::i64 , 8 },
2095
+ { ISD::UINT_TO_FP, MVT::f64 , MVT::i64 , 9 },
2096
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2097
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
2098
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
2099
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2100
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
2101
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2102
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
2103
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
2105
2104
2106
2105
{ ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 4 },
2107
2106
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 2 },
2108
2107
{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
2109
2108
{ ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
2110
2109
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
2111
2110
{ ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 4 },
2112
-
2113
2111
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
2114
2112
2115
- { ISD::UINT_TO_FP, MVT::f32 , MVT::i64 , 6 },
2116
- { ISD::UINT_TO_FP, MVT::f64 , MVT::i64 , 6 },
2117
-
2118
2113
{ ISD::FP_TO_UINT, MVT::i64 , MVT::f32 , 4 },
2119
- { ISD::FP_TO_UINT, MVT::i64 , MVT::f64 , 4 },
2114
+ { ISD::FP_TO_UINT, MVT::i64 , MVT::f64 , 15 },
2120
2115
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 4 },
2121
2116
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 4 },
2122
2117
{ ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
@@ -2138,11 +2133,11 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2138
2133
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2139
2134
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
2140
2135
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 },
2141
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
2136
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
2142
2137
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
2143
2138
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 },
2144
2139
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2145
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 },
2140
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 },
2146
2141
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2147
2142
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
2148
2143
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
@@ -2250,12 +2245,12 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2250
2245
if (ST->hasSSE41 () && !ST->hasAVX ())
2251
2246
if (const auto *Entry = ConvertCostTableLookup (SSE41ConversionTbl, ISD,
2252
2247
LTDest.second , LTSrc.second ))
2253
- return AdjustCost (LTSrc.first * Entry->Cost );
2248
+ return AdjustCost (std::max ( LTSrc.first , LTDest. first ) * Entry->Cost );
2254
2249
2255
2250
if (ST->hasSSE2 () && !ST->hasAVX ())
2256
2251
if (const auto *Entry = ConvertCostTableLookup (SSE2ConversionTbl, ISD,
2257
2252
LTDest.second , LTSrc.second ))
2258
- return AdjustCost (LTSrc.first * Entry->Cost );
2253
+ return AdjustCost (std::max ( LTSrc.first , LTDest. first ) * Entry->Cost );
2259
2254
2260
2255
return AdjustCost (
2261
2256
BaseT::getCastInstrCost (Opcode, Dst, Src, CCH, CostKind, I));
0 commit comments