@@ -321,6 +321,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
321
321
{ ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
322
322
{ ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
323
323
{ ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
324
+
325
+ { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
326
+ { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
327
+ { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
328
+ { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
324
329
};
325
330
326
331
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -336,6 +341,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
336
341
{ ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
337
342
338
343
{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
344
+
345
+ { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
346
+ { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
347
+ { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
348
+ { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
339
349
};
340
350
341
351
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -353,6 +363,15 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
353
363
{ ISD::SHL, MVT::v32i8, 4 +2 }, // 2*(psllw + pand) + split.
354
364
{ ISD::SRL, MVT::v32i8, 4 +2 }, // 2*(psrlw + pand) + split.
355
365
{ ISD::SRA, MVT::v32i8, 8 +2 }, // 2*(psrlw, pand, pxor, psubb) + split.
366
+
367
+ { ISD::SDIV, MVT::v8i32, 12 +2 }, // 2*(pmuludq sequence) + split.
368
+ { ISD::SREM, MVT::v8i32, 16 +2 }, // 2*(pmuludq+mul+sub sequence) + split.
369
+ { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence
370
+ { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence
371
+ { ISD::UDIV, MVT::v8i32, 10 +2 }, // 2*(pmuludq sequence) + split.
372
+ { ISD::UREM, MVT::v8i32, 14 +2 }, // 2*(pmuludq+mul+sub sequence) + split.
373
+ { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence
374
+ { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence
356
375
};
357
376
358
377
// XOP has faster vXi8 shifts.
0 commit comments