Skip to content

AMDGPU: Make vector_shuffle legal for v2i32 with v_pk_mov_b32 #123684

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,95 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

/// Select a VECTOR_SHUFFLE node.
///
/// Only shuffles producing a two-element vector of 32-bit elements are handled
/// here, and only when the subtarget reports hasPkMovB32(); everything else is
/// forwarded to SelectCode(). A handled shuffle becomes either a single
/// V_PK_MOV_B32 or a REG_SEQUENCE built from two subregister extracts.
///
/// \param N the VECTOR_SHUFFLE node to select; it is morphed in place.
void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
EVT VT = N->getValueType(0);
EVT EltVT = VT.getVectorElementType();

// TODO: Handle 16-bit element vectors with even aligned masks.
// Bail out to the generic selector unless this is a 2 x 32-bit result on a
// subtarget that has v_pk_mov_b32.
if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
VT.getVectorNumElements() != 2) {
SelectCode(N);
return;
}

auto *SVN = cast<ShuffleVectorSDNode>(N);

SDValue Src0 = SVN->getOperand(0);
SDValue Src1 = SVN->getOperand(1);
ArrayRef<int> Mask = SVN->getMask();
SDLoc DL(N);

// Both inputs are expected to be two-element vectors, so every mask entry
// indexes one of the four concatenated input lanes (or is negative).
assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
Mask[0] < 4 && Mask[1] < 4);

// Mask entries 0-1 pick from Src0 and 2-3 pick from Src1; the low bit of the
// entry picks the lane (sub0 or sub1) within the chosen source.
SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;

// A negative mask entry (presumably an undef lane) has no real source: feed
// an IMPLICIT_DEF instead and reuse the subregister index chosen for the
// other lane.
if (Mask[0] < 0) {
Src0SubReg = Src1SubReg;
MachineSDNode *ImpDef =
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
VSrc0 = SDValue(ImpDef, 0);
}

if (Mask[1] < 0) {
Src1SubReg = Src0SubReg;
MachineSDNode *ImpDef =
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
VSrc1 = SDValue(ImpDef, 0);
}

// SGPR case needs to lower to copies.
//
// Also use subregister extract when we can directly blend the registers with
// a simple subregister copy.
//
// TODO: Maybe we should fold this out earlier
//
// V_PK_MOV_B32 is therefore only emitted for divergent nodes whose low result
// lane reads sub1 and whose high result lane reads sub0; every other case
// falls through to the REG_SEQUENCE path below.
if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
Src1SubReg == AMDGPU::sub0) {
// The low element of the result always comes from src0.
// The high element of the result always comes from src1.
// op_sel selects the high half of src0.
// op_sel_hi selects the high half of src1.

// Given the guard above, Src0OpSel always evaluates to OP_SEL_0 and
// Src1OpSel to NONE here.
unsigned Src0OpSel =
Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
unsigned Src1OpSel =
Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
Comment on lines +545 to +548
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure this is correctly encoded. I'm confused by how op_sel and op_sel_hi are supposed to be represented. We set fields in the source modifiers. I guess this should probably be OP_SEL_1?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is written in a very confusing way in the docs, but I think you have it correct in the code. Out of the 6 bits (op_sel[0-2] and op_sel_hi[0-2]) only op_sel[0] and op_sel[1] do anything iiuc.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably should set op_sel_hi to 1 to avoid the spurious printing of it in every test, with the ridiculous op_sel syntax


// Enable op_sel_hi to avoid printing it. This should have no effect on the
// result.
Src0OpSel |= SISrcMods::OP_SEL_1;
Src1OpSel |= SISrcMods::OP_SEL_1;

SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);

// Operands: src0 modifiers, src0, src1 modifiers, src1, followed by the
// instruction-level fields, which are all left at zero.
CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
{Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
ZeroMods, // clamp
ZeroMods, // op_sel
ZeroMods, // op_sel_hi
ZeroMods, // neg_lo
ZeroMods}); // neg_hi
return;
}

// Fallback: extract the chosen 32-bit lane from each source and glue the two
// values back together as sub0/sub1 of a REG_SEQUENCE.
SDValue ResultElt0 =
CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
SDValue ResultElt1 =
CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);

// REG_SEQUENCE operands: register class ID, then (value, subreg index) pairs.
const SDValue Ops[] = {
CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
}

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
Expand Down Expand Up @@ -562,6 +651,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectBuildVector(N, RegClassID);
return;
}
case ISD::VECTOR_SHUFFLE:
SelectVectorShuffle(N);
return;
case ISD::BUILD_PAIR: {
SDValue RC, SubReg0, SubReg1;
SDLoc DL(N);
Expand Down Expand Up @@ -3101,6 +3193,33 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
}

Mods = VecMods;
} else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
Src.getNumOperands() == 2) {

// TODO: Repeat the build_vector source check above for vector_shuffle, so
// that negates and casts of individual elements are also handled.

auto *SVN = cast<ShuffleVectorSDNode>(Src);
ArrayRef<int> Mask = SVN->getMask();

if (Mask[0] < 2 && Mask[1] < 2) {
// src1 should be undef.
SDValue ShuffleSrc = SVN->getOperand(0);

if (ShuffleSrc.getOpcode() == ISD::FNEG) {
ShuffleSrc = ShuffleSrc.getOperand(0);
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
}

if (Mask[0] == 1)
Mods |= SISrcMods::OP_SEL_0;
if (Mask[1] == 1)
Mods |= SISrcMods::OP_SEL_1;

Src = ShuffleSrc;
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
}

// Packed instructions do not have abs modifiers.
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {

protected:
void SelectBuildVector(SDNode *N, unsigned RegClassID);
void SelectVectorShuffle(SDNode *N);

private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
{MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
Expand);

if (Subtarget->hasPkMovB32()) {
// TODO: 16-bit element vectors should be legal with even aligned elements.
// TODO: Can be legal with wider source types than the result with
// subregister extracts.
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
}

setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
Custom);

Expand Down
49 changes: 21 additions & 28 deletions llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -171,31 +171,30 @@ define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
Expand Down Expand Up @@ -274,27 +273,24 @@ define void @v_shuffle_v2f32_v2f32__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2f32_v2f32__3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_2:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_mov_b32_e32 v2, v1
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
Expand Down Expand Up @@ -447,27 +443,24 @@ define void @v_shuffle_v2f32_v2f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2f32_v2f32__1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v2f32_v2f32__1_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_mov_b32_e32 v2, v1
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
Expand Down
40 changes: 17 additions & 23 deletions llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -632,10 +632,9 @@ define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -645,10 +644,9 @@ define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: v_mov_b32_e32 v2, v1
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v3, 0
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
Expand Down Expand Up @@ -765,13 +763,12 @@ define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -786,9 +783,8 @@ define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
Expand Down Expand Up @@ -1480,10 +1476,9 @@ define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -1493,10 +1488,9 @@ define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: v_mov_b32_e32 v2, v1
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v3, 0
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
Expand Down
Loading
Loading