
Commit a7d8d11

[X86] combineConcatVectorOps - constant fold vector load concatenation directly into a new load.
Create a new constant pool entry directly instead of going via a BUILD_VECTOR node, which makes constant pool reuse more difficult. Helps with some regressions in #73509
1 parent 7e761ba commit a7d8d11
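At the DAG level, the combine now gathers the constant bits of every concatenated operand and materialises one wide constant-pool load, instead of rebuilding the value as a BUILD_VECTOR via getConstVector (the path removed below). A minimal sketch of the new path, condensed from the hunk below and assuming the surrounding combineConcatVectorOps context (EltBits/UndefElts already collected from Ops, plus VT, DL, DAG, TLI, Ctx):

// Sketch only: condensed from the new code in combineConcatVectorOps;
// the surrounding context is assumed, this is not a standalone API.
if (EltBits.size() == VT.getVectorNumElements()) {
  // Old path: re-emit the constant as a BUILD_VECTOR node.
  //   return getConstVector(EltBits, UndefElts, VT, DAG, DL);

  // New path: create one wide constant-pool entry and load it, letting the
  // constant pool deduplicate identical entries across uses.
  Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
  SDValue CV = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
  MachinePointerInfo MPI =
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
  return DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
}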

11 files changed: +730 -711 lines

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 51 additions & 17 deletions
@@ -7040,6 +7040,31 @@ static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
                                      IsAfterLegalize);
 }
 
+static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
+                                   const APInt &Undefs, LLVMContext &C) {
+  unsigned ScalarSize = VT.getScalarSizeInBits();
+  Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
+
+  auto getConstantScalar = [&](const APInt &Val) -> Constant * {
+    if (VT.isFloatingPoint()) {
+      if (ScalarSize == 16)
+        return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
+      if (ScalarSize == 32)
+        return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
+      assert(ScalarSize == 64 && "Unsupported floating point scalar size");
+      return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
+    }
+    return Constant::getIntegerValue(Ty, Val);
+  };
+
+  SmallVector<Constant *, 32> ConstantVec;
+  for (unsigned I = 0, E = Bits.size(); I != E; ++I)
+    ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
+                                    : getConstantScalar(Bits[I]));
+
+  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
+}
+
 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
                                    unsigned SplatBitSize, LLVMContext &C) {
   unsigned ScalarSize = VT.getScalarSizeInBits();
@@ -54978,6 +55003,32 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     }
   }
 
+  // Attempt to fold target constant loads.
+  if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
+    SmallVector<APInt> EltBits;
+    APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
+    for (unsigned I = 0; I != NumOps; ++I) {
+      APInt OpUndefElts;
+      SmallVector<APInt> OpEltBits;
+      if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
+                                         OpEltBits, true, false))
+        break;
+      EltBits.append(OpEltBits);
+      UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
+    }
+    if (EltBits.size() == VT.getVectorNumElements()) {
+      Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
+      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
+      SDValue CV = DAG.getConstantPool(C, PVT);
+      MachineFunction &MF = DAG.getMachineFunction();
+      MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
+      SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
+      SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
+      DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
+      return Ld;
+    }
+  }
+
   // If this simple subvector or scalar/subvector broadcast_load is inserted
   // into both halves, use a larger broadcast_load. Update other uses to use
   // an extracted subvector.
@@ -55000,23 +55051,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     }
   }
 
-  // Attempt to fold target constant loads.
-  if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
-    SmallVector<APInt> EltBits;
-    APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
-    for (unsigned I = 0; I != NumOps; ++I) {
-      APInt OpUndefElts;
-      SmallVector<APInt> OpEltBits;
-      if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
-                                         OpEltBits, true, false))
-        break;
-      EltBits.append(OpEltBits);
-      UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
-    }
-    if (EltBits.size() == VT.getVectorNumElements())
-      return getConstVector(EltBits, UndefElts, VT, DAG, DL);
-  }
-
   // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
   if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
       Subtarget.useAVX512Regs()) {

llvm/test/CodeGen/X86/combine-concatvectors.ll

Lines changed: 1 addition & 2 deletions
@@ -48,8 +48,7 @@ define void @concat_of_broadcast_v2f64_v4f64() {
 ; AVX1-NEXT:    movl $1091567616, 30256(%rax) # imm = 0x41100000
 ; AVX1-NEXT:    movabsq $4294967297, %rcx # imm = 0x100000001
 ; AVX1-NEXT:    movq %rcx, 46348(%rax)
-; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [7.812501848093234E-3,7.812501848093234E-3,7.812501848093234E-3,7.812501848093234E-3]
-; AVX1-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm0 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216]
 ; AVX1-NEXT:    vmovups %ymm0, 48296(%rax)
 ; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX1-NEXT:    vmovsd %xmm0, 47372(%rax)

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll

Lines changed: 1 addition & 2 deletions
@@ -652,8 +652,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,0,0,u,1,1,u,2>
 ; AVX512F-NEXT:    vpermd %ymm0, %ymm2, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512F-NEXT:    vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm0[10,11],zero,zero,zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18,19],zero,zero,zero,zero
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2
