Skip to content

Commit de25eba

Browse files
committed
[CostModel][X86] Add vXi32 division by uniform constant costs (PR47476)
Other types can be handled in future patches, but their uniform and non-uniform costs are more similar and don't appear to cause many vectorization issues.
1 parent 0aea3a7 commit de25eba

File tree

5 files changed

+269
-880
lines changed

5 files changed

+269
-880
lines changed

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -321,6 +321,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
321321
{ ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
322322
{ ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
323323
{ ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
324+
325+
{ ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
326+
{ ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
327+
{ ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
328+
{ ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
324329
};
325330

326331
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -336,6 +341,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
336341
{ ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
337342

338343
{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
344+
345+
{ ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
346+
{ ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
347+
{ ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
348+
{ ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
339349
};
340350

341351
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -353,6 +363,15 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
353363
{ ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
354364
{ ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
355365
{ ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
366+
367+
{ ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
368+
{ ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
369+
{ ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence
370+
{ ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence
371+
{ ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
372+
{ ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
373+
{ ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence
374+
{ ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence
356375
};
357376

358377
// XOP has faster vXi8 shifts.

0 commit comments

Comments
 (0)