Skip to content

Commit e28e935

Browse files
authored
AMDGPU: Make vector_shuffle legal for v2i32 with v_pk_mov_b32 (#123684)
For VALU shuffles, this saves an instruction in some cases.
1 parent 92b839e commit e28e935

17 files changed

+728
-833
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,6 +489,95 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
489489
CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
490490
}
491491

492+
/// Select a vector shuffle node \p N.
///
/// Nodes are handed to SelectCode() unless the subtarget reports
/// hasPkMovB32() and the result is a 2-element vector whose element type has
/// the same bit width as i32. For the handled case, a divergent shuffle that
/// takes sub1 of its first chosen source and sub0 of its second chosen source
/// is selected to V_PK_MOV_B32; every other combination is selected to a
/// REG_SEQUENCE built from two subregister extracts.
void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
493+
EVT VT = N->getValueType(0);
494+
EVT EltVT = VT.getVectorElementType();
495+
496+
// TODO: Handle 16-bit element vectors with even aligned masks.
497+
if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
498+
VT.getVectorNumElements() != 2) {
499+
SelectCode(N);
500+
return;
501+
}
502+
503+
auto *SVN = cast<ShuffleVectorSDNode>(N);
504+
505+
SDValue Src0 = SVN->getOperand(0);
506+
SDValue Src1 = SVN->getOperand(1);
507+
ArrayRef<int> Mask = SVN->getMask();
508+
SDLoc DL(N);
509+
510+
// Both sources have two elements, so a mask entry indexes the concatenation
// of Src0 and Src1 and must be below 4. Negative entries are handled further
// down.
assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
511+
Mask[0] < 4 && Mask[1] < 4);
512+
513+
// Mask entries 0-1 pick from Src0 and 2-3 pick from Src1; the low bit of the
// entry picks the subregister (sub0 or sub1) within the chosen source.
SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
514+
SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
515+
unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
516+
unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
517+
518+
// A negative entry for result element 0 is sourced from an IMPLICIT_DEF, and
// its subregister index is copied from the one chosen for element 1.
if (Mask[0] < 0) {
519+
Src0SubReg = Src1SubReg;
520+
MachineSDNode *ImpDef =
521+
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
522+
VSrc0 = SDValue(ImpDef, 0);
523+
}
524+
525+
// Same treatment for result element 1. Src0SubReg may already have been
// rewritten by the block above when both entries are negative.
if (Mask[1] < 0) {
526+
Src1SubReg = Src0SubReg;
527+
MachineSDNode *ImpDef =
528+
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
529+
VSrc1 = SDValue(ImpDef, 0);
530+
}
531+
532+
// SGPR case needs to lower to copies.
533+
//
534+
// Also use subregister extract when we can directly blend the registers with
535+
// a simple subregister copy.
536+
//
537+
// TODO: Maybe we should fold this out earlier
538+
// Only the divergent (sub1, sub0) combination is emitted as V_PK_MOV_B32;
// every other case falls through to the REG_SEQUENCE path below.
if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
539+
Src1SubReg == AMDGPU::sub0) {
540+
// The low element of the result always comes from src0.
541+
// The high element of the result always comes from src1.
542+
// op_sel selects the high half of src0.
543+
// op_sel_hi selects the high half of src1.
544+
545+
// Given the guard above, Src0SubReg is always sub1 and Src1SubReg is always
// sub0 here, so these evaluate to OP_SEL_0 and NONE respectively.
unsigned Src0OpSel =
546+
Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
547+
unsigned Src1OpSel =
548+
Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
549+
550+
// Enable op_sel_hi to avoid printing it. This should have no effect on the
551+
// result.
552+
Src0OpSel |= SISrcMods::OP_SEL_1;
553+
Src1OpSel |= SISrcMods::OP_SEL_1;
554+
555+
SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
556+
SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
557+
SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);
558+
559+
// Operand order: src0 modifiers, src0, src1 modifiers, src1, followed by
// the zeroed modifier operands named in the trailing comments.
CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
560+
{Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
561+
ZeroMods, // clamp
562+
ZeroMods, // op_sel
563+
ZeroMods, // op_sel_hi
564+
ZeroMods, // neg_lo
565+
ZeroMods}); // neg_hi
566+
return;
567+
}
568+
569+
// Fallback: extract the selected subregister from each chosen source and
// recombine the two values as sub0/sub1 of the result.
SDValue ResultElt0 =
570+
CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
571+
SDValue ResultElt1 =
572+
CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);
573+
574+
// NOTE(review): SReg_64 is used as the register class even though divergent
// nodes can reach this path; presumably a later pass fixes up the register
// class - confirm.
const SDValue Ops[] = {
575+
CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
576+
ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
577+
ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
578+
CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
579+
}
580+
492581
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
493582
unsigned int Opc = N->getOpcode();
494583
if (N->isMachineOpcode()) {
@@ -562,6 +651,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
562651
SelectBuildVector(N, RegClassID);
563652
return;
564653
}
654+
case ISD::VECTOR_SHUFFLE:
655+
SelectVectorShuffle(N);
656+
return;
565657
case ISD::BUILD_PAIR: {
566658
SDValue RC, SubReg0, SubReg1;
567659
SDLoc DL(N);
@@ -3101,6 +3193,33 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
31013193
}
31023194

31033195
Mods = VecMods;
3196+
} else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3197+
Src.getNumOperands() == 2) {
3198+
3199+
// TODO: We should repeat the build_vector source check above for the
3200+
// vector_shuffle for negates and casts of individual elements.
3201+
3202+
auto *SVN = cast<ShuffleVectorSDNode>(Src);
3203+
ArrayRef<int> Mask = SVN->getMask();
3204+
3205+
if (Mask[0] < 2 && Mask[1] < 2) {
3206+
// src1 should be undef.
3207+
SDValue ShuffleSrc = SVN->getOperand(0);
3208+
3209+
if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3210+
ShuffleSrc = ShuffleSrc.getOperand(0);
3211+
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3212+
}
3213+
3214+
if (Mask[0] == 1)
3215+
Mods |= SISrcMods::OP_SEL_0;
3216+
if (Mask[1] == 1)
3217+
Mods |= SISrcMods::OP_SEL_1;
3218+
3219+
Src = ShuffleSrc;
3220+
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3221+
return true;
3222+
}
31043223
}
31053224

31063225
// Packed instructions do not have abs modifiers.

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
8686

8787
protected:
8888
void SelectBuildVector(SDNode *N, unsigned RegClassID);
89+
void SelectVectorShuffle(SDNode *N);
8990

9091
private:
9192
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
422422
{MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
423423
Expand);
424424

425+
if (Subtarget->hasPkMovB32()) {
426+
// TODO: 16-bit element vectors should be legal with even aligned elements.
427+
// TODO: Can be legal with wider source types than the result with
428+
// subregister extracts.
429+
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
430+
}
431+
425432
setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
426433
Custom);
427434

llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll

Lines changed: 21 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -171,31 +171,30 @@ define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) {
171171
; GFX90A: ; %bb.0:
172172
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173173
; GFX90A-NEXT: ;;#ASMSTART
174-
; GFX90A-NEXT: ; def v[2:3]
174+
; GFX90A-NEXT: ; def v[0:1]
175175
; GFX90A-NEXT: ;;#ASMEND
176176
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
177177
; GFX90A-NEXT: ;;#ASMSTART
178-
; GFX90A-NEXT: ; def v[0:1]
178+
; GFX90A-NEXT: ; def v[2:3]
179179
; GFX90A-NEXT: ;;#ASMEND
180-
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
181-
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
182-
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
180+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
181+
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
183182
; GFX90A-NEXT: s_waitcnt vmcnt(0)
184183
; GFX90A-NEXT: s_setpc_b64 s[30:31]
185184
;
186185
; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_0:
187186
; GFX940: ; %bb.0:
188187
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189188
; GFX940-NEXT: ;;#ASMSTART
190-
; GFX940-NEXT: ; def v[2:3]
189+
; GFX940-NEXT: ; def v[0:1]
191190
; GFX940-NEXT: ;;#ASMEND
192191
; GFX940-NEXT: v_mov_b32_e32 v4, 0
193192
; GFX940-NEXT: ;;#ASMSTART
194-
; GFX940-NEXT: ; def v[0:1]
193+
; GFX940-NEXT: ; def v[2:3]
195194
; GFX940-NEXT: ;;#ASMEND
196-
; GFX940-NEXT: v_mov_b32_e32 v2, v3
197-
; GFX940-NEXT: v_mov_b32_e32 v3, v0
198-
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
195+
; GFX940-NEXT: s_nop 0
196+
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
197+
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
199198
; GFX940-NEXT: s_waitcnt vmcnt(0)
200199
; GFX940-NEXT: s_setpc_b64 s[30:31]
201200
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -274,27 +273,24 @@ define void @v_shuffle_v2f32_v2f32__3_2(ptr addrspace(1) inreg %ptr) {
274273
; GFX90A-LABEL: v_shuffle_v2f32_v2f32__3_2:
275274
; GFX90A: ; %bb.0:
276275
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
277-
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
278276
; GFX90A-NEXT: ;;#ASMSTART
279277
; GFX90A-NEXT: ; def v[0:1]
280278
; GFX90A-NEXT: ;;#ASMEND
281-
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
282-
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
283-
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
279+
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
280+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
281+
; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
284282
; GFX90A-NEXT: s_waitcnt vmcnt(0)
285283
; GFX90A-NEXT: s_setpc_b64 s[30:31]
286284
;
287285
; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_2:
288286
; GFX940: ; %bb.0:
289287
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
290-
; GFX940-NEXT: v_mov_b32_e32 v4, 0
291288
; GFX940-NEXT: ;;#ASMSTART
292289
; GFX940-NEXT: ; def v[0:1]
293290
; GFX940-NEXT: ;;#ASMEND
294-
; GFX940-NEXT: s_nop 0
295-
; GFX940-NEXT: v_mov_b32_e32 v2, v1
296-
; GFX940-NEXT: v_mov_b32_e32 v3, v0
297-
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
291+
; GFX940-NEXT: v_mov_b32_e32 v2, 0
292+
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
293+
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
298294
; GFX940-NEXT: s_waitcnt vmcnt(0)
299295
; GFX940-NEXT: s_setpc_b64 s[30:31]
300296
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -447,27 +443,24 @@ define void @v_shuffle_v2f32_v2f32__1_0(ptr addrspace(1) inreg %ptr) {
447443
; GFX90A-LABEL: v_shuffle_v2f32_v2f32__1_0:
448444
; GFX90A: ; %bb.0:
449445
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
450-
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
451446
; GFX90A-NEXT: ;;#ASMSTART
452447
; GFX90A-NEXT: ; def v[0:1]
453448
; GFX90A-NEXT: ;;#ASMEND
454-
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
455-
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
456-
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
449+
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
450+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
451+
; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
457452
; GFX90A-NEXT: s_waitcnt vmcnt(0)
458453
; GFX90A-NEXT: s_setpc_b64 s[30:31]
459454
;
460455
; GFX940-LABEL: v_shuffle_v2f32_v2f32__1_0:
461456
; GFX940: ; %bb.0:
462457
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
463-
; GFX940-NEXT: v_mov_b32_e32 v4, 0
464458
; GFX940-NEXT: ;;#ASMSTART
465459
; GFX940-NEXT: ; def v[0:1]
466460
; GFX940-NEXT: ;;#ASMEND
467-
; GFX940-NEXT: s_nop 0
468-
; GFX940-NEXT: v_mov_b32_e32 v2, v1
469-
; GFX940-NEXT: v_mov_b32_e32 v3, v0
470-
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
461+
; GFX940-NEXT: v_mov_b32_e32 v2, 0
462+
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
463+
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
471464
; GFX940-NEXT: s_waitcnt vmcnt(0)
472465
; GFX940-NEXT: s_setpc_b64 s[30:31]
473466
%vec0 = call <2 x float> asm "; def $0", "=v"()

llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -632,10 +632,9 @@ define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) {
632632
; GFX90A-NEXT: ;;#ASMSTART
633633
; GFX90A-NEXT: ; def v[0:2]
634634
; GFX90A-NEXT: ;;#ASMEND
635-
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
636-
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
637-
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
638-
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
635+
; GFX90A-NEXT: v_mov_b32_e32 v3, 0
636+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
637+
; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
639638
; GFX90A-NEXT: s_waitcnt vmcnt(0)
640639
; GFX90A-NEXT: s_setpc_b64 s[30:31]
641640
;
@@ -645,10 +644,9 @@ define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) {
645644
; GFX940-NEXT: ;;#ASMSTART
646645
; GFX940-NEXT: ; def v[0:2]
647646
; GFX940-NEXT: ;;#ASMEND
648-
; GFX940-NEXT: v_mov_b32_e32 v4, 0
649-
; GFX940-NEXT: v_mov_b32_e32 v2, v1
650-
; GFX940-NEXT: v_mov_b32_e32 v3, v0
651-
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
647+
; GFX940-NEXT: v_mov_b32_e32 v3, 0
648+
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
649+
; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1
652650
; GFX940-NEXT: s_waitcnt vmcnt(0)
653651
; GFX940-NEXT: s_setpc_b64 s[30:31]
654652
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -765,13 +763,12 @@ define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) {
765763
; GFX90A-NEXT: ;;#ASMSTART
766764
; GFX90A-NEXT: ; def v[0:2]
767765
; GFX90A-NEXT: ;;#ASMEND
766+
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
768767
; GFX90A-NEXT: ;;#ASMSTART
769768
; GFX90A-NEXT: ; def v[2:4]
770769
; GFX90A-NEXT: ;;#ASMEND
771-
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
772-
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
773-
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
774-
; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
770+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
771+
; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17]
775772
; GFX90A-NEXT: s_waitcnt vmcnt(0)
776773
; GFX90A-NEXT: s_setpc_b64 s[30:31]
777774
;
@@ -786,9 +783,8 @@ define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) {
786783
; GFX940-NEXT: ; def v[2:4]
787784
; GFX940-NEXT: ;;#ASMEND
788785
; GFX940-NEXT: s_nop 0
789-
; GFX940-NEXT: v_mov_b32_e32 v2, v3
790-
; GFX940-NEXT: v_mov_b32_e32 v3, v0
791-
; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1
786+
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
787+
; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1
792788
; GFX940-NEXT: s_waitcnt vmcnt(0)
793789
; GFX940-NEXT: s_setpc_b64 s[30:31]
794790
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1480,10 +1476,9 @@ define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) {
14801476
; GFX90A-NEXT: ;;#ASMSTART
14811477
; GFX90A-NEXT: ; def v[0:2]
14821478
; GFX90A-NEXT: ;;#ASMEND
1483-
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
1484-
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
1485-
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
1486-
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
1479+
; GFX90A-NEXT: v_mov_b32_e32 v3, 0
1480+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
1481+
; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
14871482
; GFX90A-NEXT: s_waitcnt vmcnt(0)
14881483
; GFX90A-NEXT: s_setpc_b64 s[30:31]
14891484
;
@@ -1493,10 +1488,9 @@ define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) {
14931488
; GFX940-NEXT: ;;#ASMSTART
14941489
; GFX940-NEXT: ; def v[0:2]
14951490
; GFX940-NEXT: ;;#ASMEND
1496-
; GFX940-NEXT: v_mov_b32_e32 v4, 0
1497-
; GFX940-NEXT: v_mov_b32_e32 v2, v1
1498-
; GFX940-NEXT: v_mov_b32_e32 v3, v0
1499-
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
1491+
; GFX940-NEXT: v_mov_b32_e32 v3, 0
1492+
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
1493+
; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1
15001494
; GFX940-NEXT: s_waitcnt vmcnt(0)
15011495
; GFX940-NEXT: s_setpc_b64 s[30:31]
15021496
%vec0 = call <3 x float> asm "; def $0", "=v"()

0 commit comments

Comments
 (0)