[AMDGPU] Legalize 64bit elements for BUILD_VECTOR on gfx942 #145052

Status: Open. Wants to merge 1 commit into base: main.
22 changes: 15 additions & 7 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -440,6 +440,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
EVT EltVT = VT.getVectorElementType();
SDLoc DL(N);
SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
unsigned NumRegs = EltVT.getSizeInBits() / 32;
bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
Contributor suggested change:
-  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
+  bool IsGCN = TM.getTargetTriple().isAMDGCN();


if (NumVectorElts == 1) {
CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
@@ -449,12 +451,13 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {

assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
"supported yet");
assert((IsGCN || (!IsGCN && NumRegs == 1)) &&
"R600 does not support 64-bit reg_seq elements");
// 32 = Max Num Vector Elements
// 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
// 1 = Vector Register Class
SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
bool IsRegSeq = true;
unsigned NOps = N->getNumOperands();
@@ -464,8 +467,9 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
IsRegSeq = false;
break;
}
unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
: R600RegisterInfo::getSubRegFromChannel(i);
unsigned Sub =
IsGCN ? SIRegisterInfo::getSubRegFromChannel(i * NumRegs, NumRegs)
: R600RegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
}
@@ -475,8 +479,9 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
DL, EltVT);
for (unsigned i = NOps; i < NumVectorElts; ++i) {
unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
: R600RegisterInfo::getSubRegFromChannel(i);
unsigned Sub =
IsGCN ? SIRegisterInfo::getSubRegFromChannel(i * NumRegs, NumRegs)
: R600RegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
RegSeqArgs[1 + (2 * i) + 1] =
CurDAG->getTargetConstant(Sub, DL, MVT::i32);
@@ -644,9 +649,12 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
break;
}

assert(VT.getVectorElementType().bitsEq(MVT::i32));
EVT VET = VT.getVectorElementType();
assert(VET.bitsEq(MVT::i32) || VET.bitsEq(MVT::i64));
unsigned EltSize = VET.getSizeInBits();
unsigned RegClassID =
SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * EltSize)
->getID();
SelectBuildVector(N, RegClassID);
return;
}
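Note: the crux of this file's change is the second argument to getSubRegFromChannel. A 64-bit element occupies two 32-bit channels, so element i of the BUILD_VECTOR must map to the paired sub-register starting at channel i * NumRegs. A minimal standalone sketch of the index arithmetic (illustrative code, not LLVM API):

#include <cstdio>

// Mirrors the patched mapping: element EltIdx of a vector with EltBits-wide
// elements starts at 32-bit channel EltIdx * (EltBits / 32) and spans
// EltBits / 32 channels. For i64 elements this selects a paired
// sub-register (sub0_sub1, sub2_sub3, ...) rather than sub0, sub1, ...
void printChannels(unsigned EltIdx, unsigned EltBits) {
  unsigned NumRegs = EltBits / 32; // 1 for i32/f32, 2 for i64/f64
  unsigned First = EltIdx * NumRegs;
  std::printf("elt %u -> channels [%u..%u]\n", EltIdx, First,
              First + NumRegs - 1);
}

int main() {
  for (unsigned I = 0; I < 4; ++I) // e.g. v4i64: two channels per element
    printChannels(I, 64);
  return 0;
}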
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5206,6 +5206,14 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::BITCAST: {
EVT DestVT = N->getValueType(0);

// Avoid undoing build_vector with 64b elements if subtarget supports 64b
// movs (i.e., avoid inf loop through combines).
if (Subtarget->isGCN()) {
const GCNSubtarget &ST = DAG.getSubtarget<GCNSubtarget>();
if (ST.hasMovB64())
break;
}
Comment on lines +5209 to +5215
Contributor:
Move this into the SITargetLowering one and avoid calling the base class implementation instead
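A rough sketch of what the reviewer may be suggesting (assumed shape, not part of this patch): do the subtarget check in the SITargetLowering switch so the base-class BITCAST combine is never invoked on targets with 64-bit movs. SITargetLowering::PerformDAGCombine already ends by delegating to the base class, so an early return there covers it:

// Sketch only; exact placement in the existing switch is assumed.
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  case ISD::BITCAST:
    // With v_mov_b64, keep 64-bit-element build_vectors intact and skip
    // the generic cast-through-build_vector combine in the base class.
    if (getSubtarget()->hasMovB64())
      return SDValue();
    break;
  // ... existing cases ...
  default:
    break;
  }
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}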


// Push casts through vector builds. This helps avoid emitting a large
// number of copies when materializing floating point vector constants.
//
114 changes: 99 additions & 15 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -357,9 +357,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// Most operations are naturally 32-bit vector operations. We only support
// load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

if (STI.hasMovB64())
setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
else {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
}
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

@@ -371,9 +374,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}

for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);

if (STI.hasMovB64())
setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
else {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
}
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);

@@ -385,9 +391,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}

for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);

if (STI.hasMovB64())
setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
else {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
}
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);

@@ -399,9 +408,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}

for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);

if (STI.hasMovB64())
setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
else {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
}
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);

@@ -413,9 +425,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}

for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);

if (STI.hasMovB64())
setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
Comment on lines +428 to +429
Contributor:
This might be the default already?

else {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
}
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
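Since the same Legal-versus-Promote decision now repeats for five vector widths, the blocks could be table-driven. A compressed sketch (not part of the patch) covering just the integer BUILD_VECTOR actions; the f64 twins and the EXTRACT_VECTOR_ELT promotions would be handled the same way:

// Sketch: one loop over (64-bit-element type, promoted 32-bit type).
static const std::pair<MVT, MVT> Vec64Promotions[] = {
    {MVT::v2i64, MVT::v4i32},  {MVT::v3i64, MVT::v6i32},
    {MVT::v4i64, MVT::v8i32},  {MVT::v8i64, MVT::v16i32},
    {MVT::v16i64, MVT::v32i32}};
for (const auto &[Vec64, PromotedVT] : Vec64Promotions) {
  if (STI.hasMovB64()) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
  } else {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, PromotedVT);
  }
}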

@@ -945,6 +960,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}

setTargetDAGCombine({ISD::ADD,
ISD::BUILD_VECTOR,
ISD::UADDO_CARRY,
ISD::SUB,
ISD::USUBO_CARRY,
@@ -15486,6 +15502,72 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
return SDValue(CSrc, 0);
}

SDValue
SITargetLowering::performBuildVectorCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
const GCNSubtarget *ST = getSubtarget();
if (DCI.Level < AfterLegalizeDAG || !ST->hasMovB64())
return SDValue();
Comment on lines +15509 to +15510
Contributor:
This is still worthwhile without v_mov_b64 if we are going to use s_mov_b64 for the final use


SelectionDAG &DAG = DCI.DAG;
SDLoc SL(N);
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N);

EVT VT = N->getValueType(0);
EVT EltVT = VT.getVectorElementType();
unsigned SizeBits = VT.getSizeInBits();
unsigned EltSize = EltVT.getSizeInBits();

// Skip if:
// - Value type isn't multiplication of 64 bit (e.g., v3i32), or
Contributor suggested change:
-  // - Value type isn't multiplication of 64 bit (e.g., v3i32), or
+  // - Value type isn't multiple of 64 bit (e.g., v3i32), or

Still can handle the v3 case

// - BuildVector instruction has non-constants, or
// - Element type has already been combined into i64 elements
if ((SizeBits % 64) != 0 || !BVN->isConstant() || EltVT == MVT::i64)
Contributor:
Also f64?

return SDValue();
Comment on lines +15525 to +15526
Contributor:
Probably should drop the isConstant check, you're necessarily performing the same check below anyway


// Construct the 64b values.
SmallVector<uint64_t, 8> ImmVals;
uint64_t ImmVal = 0;
uint64_t ImmSize = 0;
for (SDValue Opand : N->ops()) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Opand);
if (!C)
return SDValue();

ImmVal |= C->getZExtValue() << ImmSize;
ImmSize += EltSize;
Contributor:
I don't understand what ImmSize is for. All the sizes are exactly computable from the type and number of operands, you shouldn't need to sum anything?
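A sketch folding in the review points above: drop the separate isConstant() pre-check, drop the running ImmSize accumulator, and derive the grouping from the type (EltSize is 8, 16, or 32 at this point, so it divides 64 evenly; names match the surrounding patch):

// Sketch only: pack a fixed number of elements per 64-bit immediate.
unsigned EltsPer64 = 64 / EltSize; // 2 for i32, 4 for i16, 8 for i8
for (unsigned I = 0, E = N->getNumOperands(); I != E; I += EltsPer64) {
  uint64_t Val = 0;
  for (unsigned J = 0; J != EltsPer64; ++J) {
    auto *C = dyn_cast<ConstantSDNode>(N->getOperand(I + J));
    if (!C) // subsumes the BVN->isConstant() pre-check
      return SDValue();
    Val |= C->getZExtValue() << (J * EltSize);
  }
  if (!isUInt<32>(Val)) // keep the original inline-immediate guard
    return SDValue();
  ImmVals.push_back(Val);
}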

if (ImmSize > 64)
return SDValue();
Comment on lines +15539 to +15540
Contributor:
I guess this only handles v2i32 and maybe v4i16? Arbitrary width should work

if (ImmSize == 64) {
if (!isUInt<32>(ImmVal))
return SDValue();
ImmVals.push_back(ImmVal);
ImmVal = 0;
ImmSize = 0;
}
}

// Avoid emitting build_vector with 1 element and directly emit value.
if (ImmVals.size() == 1) {
SDValue Val = DAG.getConstant(ImmVals[0], SL, MVT::i64);
return DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Val);
}

// Construct and return build_vector with 64b elements.
if (!ImmVals.empty()) {
SmallVector<SDValue, 8> VectorConsts;
for (uint64_t I : ImmVals)
VectorConsts.push_back(DAG.getConstant(I, SL, MVT::i64));
unsigned NewNumElts = SizeBits / 64;
LLVMContext &Ctx = *DAG.getContext();
EVT NewVT = EVT::getVectorVT(Ctx, MVT::i64, NewNumElts);
SDValue BV = DAG.getBuildVector(
NewVT, SL, ArrayRef(VectorConsts.begin(), VectorConsts.end()));
Contributor suggested change:
-        NewVT, SL, ArrayRef(VectorConsts.begin(), VectorConsts.end()));
+        NewVT, SL, VectorConsts));

return DAG.getBitcast(VT, BV);
}
return SDValue();
}
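For concreteness, a worked example of the transform this function performs (schematic DAG notation; values chosen so both packed immediates pass the isUInt<32> guard):

// Before: four 32-bit constants, element 0 in the low half of each pair.
//   t0: v4i32 = BUILD_VECTOR Constant:i32<1>, Constant:i32<0>,
//                            Constant:i32<2>, Constant:i32<0>
// Packing pairwise gives 0x0000000000000001 and 0x0000000000000002, so:
//   t1: v2i64 = BUILD_VECTOR Constant:i64<1>, Constant:i64<2>
//   t2: v4i32 = bitcast t1
// SelectBuildVector can then build t1 from two 64-bit moves on gfx942
// instead of four 32-bit moves.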

SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
Expand Down Expand Up @@ -15573,6 +15655,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performFCanonicalizeCombine(N, DCI);
case AMDGPUISD::RCP:
return performRcpCombine(N, DCI);
case ISD::BUILD_VECTOR:
return performBuildVectorCombine(N, DCI);
case ISD::FLDEXP:
case AMDGPUISD::FRACT:
case AMDGPUISD::RSQ:
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -231,6 +231,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const;

bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;

28 changes: 14 additions & 14 deletions llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -152,24 +152,24 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
; GFX-950: ; %bb.0:
; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[2:3]
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]|
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[0:1], |v[2:3]|, |v[4:5]|
; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5]
; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3]
; GFX-950-NEXT: v_add_u32_e32 v2, v6, v2
; GFX-950-NEXT: v_and_b32_e32 v2, 1, v6
; GFX-950-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[0:1]
; GFX-950-NEXT: v_cvt_f32_f64_e32 v8, v[0:1]
; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2
; GFX-950-NEXT: v_add_u32_e32 v7, v6, v7
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX-950-NEXT: v_cvt_f32_f64_e32 v5, v[0:1]
; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
; GFX-950-NEXT: v_and_b32_e32 v6, 1, v5
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
; GFX-950-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[0:1], |v[0:1]|, |v[2:3]|
; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3]
; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v6
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
; GFX-950-NEXT: v_add_u32_e32 v0, v5, v0
; GFX-950-NEXT: v_and_b32_e32 v0, 1, v8
; GFX-950-NEXT: v_cndmask_b32_e64 v5, -1, 1, s[0:1]
; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
; GFX-950-NEXT: v_add_u32_e32 v5, v8, v5
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4
; GFX-950-NEXT: ; return to shader part epilog
%res = fptrunc <2 x double> %src to <2 x bfloat>