Skip to content

Commit 391249d

Browse files
committed
[AMDGPU] Allow 8,16 bit sources in calculateSrcByte
This is required for many trees produced in practice for i8 CodeGen. Differential Revision: https://reviews.llvm.org/D155864 Change-Id: Iac01d183d9998b15138bdc7a5051e3bed338e7d9
1 parent 27f39ad commit 391249d

File tree

6 files changed

+114
-65
lines changed

6 files changed

+114
-65
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 55 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -10428,10 +10428,12 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
1042810428
if (Depth >= 6)
1042910429
return std::nullopt;
1043010430

10431+
auto ValueSize = Op.getValueSizeInBits();
10432+
if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32)
10433+
return std::nullopt;
10434+
1043110435
switch (Op->getOpcode()) {
1043210436
case ISD::TRUNCATE: {
10433-
if (Op->getOperand(0).getScalarValueSizeInBits() != 32)
10434-
return std::nullopt;
1043510437
return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
1043610438
}
1043710439

@@ -10451,9 +10453,6 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
1045110453
}
1045210454

1045310455
default: {
10454-
if (Op.getScalarValueSizeInBits() != 32)
10455-
return std::nullopt;
10456-
1045710456
return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
1045810457
}
1045910458
}
@@ -10595,6 +10594,17 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
1059510594
return std::nullopt;
1059610595
}
1059710596

10597+
case ISD::CopyFromReg: {
10598+
auto BitWidth = Op.getScalarValueSizeInBits();
10599+
if (BitWidth % 8)
10600+
llvm_unreachable("Invalid type in CopyFromReg");
10601+
10602+
if (BitWidth / 8 > Index)
10603+
return calculateSrcByte(Op, StartingIndex, Index);
10604+
10605+
return std::nullopt;
10606+
}
10607+
1059810608
case ISD::LOAD: {
1059910609
auto L = cast<LoadSDNode>(Op.getNode());
1060010610
unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
@@ -10631,7 +10641,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
1063110641
}
1063210642

1063310643
// Returns true if the Operand is a scalar and is 16 bits
10634-
static bool is16BitScalarOp(SDValue &Operand) {
10644+
static bool isExtendedFrom16Bits(SDValue &Operand) {
10645+
1063510646
switch (Operand.getOpcode()) {
1063610647
case ISD::ANY_EXTEND:
1063710648
case ISD::SIGN_EXTEND:
@@ -10647,7 +10658,7 @@ static bool is16BitScalarOp(SDValue &Operand) {
1064710658
auto MemVT = L->getMemoryVT();
1064810659
return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
1064910660
}
10650-
return false;
10661+
return L->getMemoryVT().getSizeInBits() == 16;
1065110662
}
1065210663
default:
1065310664
return false;
@@ -10675,29 +10686,29 @@ static bool addresses16Bits(int Mask) {
1067510686
// Do not lower into v_perm if the operands are actually 16 bit
1067610687
// and the selected bits (based on PermMask) correspond with two
1067710688
// easily addressable 16 bit operands.
10678-
static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op,
10689+
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
1067910690
SDValue &OtherOp) {
1068010691
int Low16 = PermMask & 0xffff;
1068110692
int Hi16 = (PermMask & 0xffff0000) >> 16;
1068210693

10683-
// ByteProvider only accepts 32 bit operands
10684-
assert(Op.getValueType().getSizeInBits() == 32);
10685-
assert(OtherOp.getValueType().getSizeInBits() == 32);
10694+
assert(Op.getValueType().isByteSized());
10695+
assert(OtherOp.getValueType().isByteSized());
1068610696

10687-
auto OpIs16Bit = is16BitScalarOp(Op);
10688-
auto OtherOpIs16Bit = is16BitScalarOp(Op);
10697+
auto TempOp = peekThroughBitcasts(Op);
10698+
auto TempOtherOp = peekThroughBitcasts(OtherOp);
1068910699

10690-
// If there is a size mismatch, then we must use masking on at least one
10691-
// operand
10692-
if (OpIs16Bit != OtherOpIs16Bit)
10700+
auto OpIs16Bit =
10701+
TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
10702+
if (!OpIs16Bit)
1069310703
return true;
1069410704

10695-
// If both operands are 16 bit, return whether or not we cleanly address both
10696-
if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp))
10697-
return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
10705+
auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
10706+
isExtendedFrom16Bits(TempOtherOp);
10707+
if (!OtherOpIs16Bit)
10708+
return true;
1069810709

10699-
// Both are 32 bit operands
10700-
return true;
10710+
// Do we cleanly address both
10711+
return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
1070110712
}
1070210713

1070310714
SDValue SITargetLowering::performOrCombine(SDNode *N,
@@ -10822,8 +10833,9 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
1082210833
std::optional<ByteProvider<SDValue>> P =
1082310834
calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
1082410835
// TODO support constantZero
10825-
if (!P || P->isConstantZero())
10836+
if (!P || P->isConstantZero()) {
1082610837
return SDValue();
10838+
}
1082710839

1082810840
PermNodes.push_back(*P);
1082910841
}
@@ -10832,7 +10844,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
1083210844

1083310845
int FirstSrc = 0;
1083410846
std::optional<int> SecondSrc;
10835-
uint64_t permMask = 0x00000000;
10847+
uint64_t PermMask = 0x00000000;
1083610848
for (size_t i = 0; i < PermNodes.size(); i++) {
1083710849
auto PermOp = PermNodes[i];
1083810850
// Since the mask is applied to Src1:Src2, Src1 bytes must be offset
@@ -10843,15 +10855,15 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
1084310855
if (SecondSrc.has_value())
1084410856
if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
1084510857
return SDValue();
10858+
1084610859
// Set the index of the second distinct Src node
1084710860
SecondSrc = i;
10848-
assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() ==
10849-
32);
10861+
assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
1085010862
SrcByteAdjust = 0;
1085110863
}
1085210864
assert(PermOp.SrcOffset + SrcByteAdjust < 8);
1085310865
assert(!DAG.getDataLayout().isBigEndian());
10854-
permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
10866+
PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
1085510867
}
1085610868

1085710869
SDValue Op = *PermNodes[FirstSrc].Src;
@@ -10860,8 +10872,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
1086010872

1086110873
// Check that we are not just extracting the bytes in order from an op
1086210874
if (Op == OtherOp) {
10863-
int Low16 = permMask & 0xffff;
10864-
int Hi16 = (permMask & 0xffff0000) >> 16;
10875+
int Low16 = PermMask & 0xffff;
10876+
int Hi16 = (PermMask & 0xffff0000) >> 16;
1086510877

1086610878
bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
1086710879
bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
@@ -10871,10 +10883,23 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
1087110883
return Op;
1087210884
}
1087310885

10874-
if (hasEightBitAccesses(permMask, Op, OtherOp)) {
10886+
if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
1087510887
SDLoc DL(N);
10888+
assert(Op.getValueType().isByteSized() &&
10889+
OtherOp.getValueType().isByteSized());
10890+
if (Op.getValueSizeInBits() < 32)
10891+
// If the ultimate src is less than 32 bits, then we will only be
10892+
// using bytes 0 through Op.getValueSizeInBytes() - 1 in the or.
10893+
// calculateByteProvider would not have returned Op as source if we
10894+
// used a byte that is outside its ValueType. Thus, we are free to
10895+
// ANY_EXTEND as the extended bits are don't-cares.
10896+
Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op);
10897+
10898+
if (OtherOp.getValueSizeInBits() < 32)
10899+
OtherOp = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
10900+
1087610901
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
10877-
DAG.getConstant(permMask, DL, MVT::i32));
10902+
DAG.getConstant(PermMask, DL, MVT::i32));
1087810903
}
1087910904
}
1088010905
}

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1263,18 +1263,13 @@ define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
12631263
; GFX9-LABEL: test_ret_v3bf16:
12641264
; GFX9: ; %bb.0: ; %entry
12651265
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1266-
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
1267-
; GFX9-NEXT: s_mov_b32 s4, 0xffff
1268-
; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2
12691266
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
12701267
; GFX9-NEXT: s_setpc_b64 s[30:31]
12711268
;
12721269
; GFX10-LABEL: test_ret_v3bf16:
12731270
; GFX10: ; %bb.0: ; %entry
12741271
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1275-
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
12761272
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
1277-
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
12781273
; GFX10-NEXT: s_setpc_b64 s[30:31]
12791274
entry:
12801275
ret <3 x bfloat> %in
@@ -1802,9 +1797,6 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
18021797
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
18031798
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
18041799
; GFX9-NEXT: s_addk_i32 s32, 0x400
1805-
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
1806-
; GFX9-NEXT: s_mov_b32 s4, 0xffff
1807-
; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v4
18081800
; GFX9-NEXT: s_getpc_b64 s[4:5]
18091801
; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
18101802
; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
@@ -1841,11 +1833,9 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
18411833
; GFX10-NEXT: s_getpc_b64 s[4:5]
18421834
; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
18431835
; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
1844-
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
1845-
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
18461836
; GFX10-NEXT: v_writelane_b32 v3, s30, 0
1837+
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
18471838
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
1848-
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v4
18491839
; GFX10-NEXT: v_writelane_b32 v3, s31, 1
18501840
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
18511841
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]

llvm/test/CodeGen/AMDGPU/load-hi16.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -266,9 +266,9 @@ define <2 x i16> @load_local_hi_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
266266
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
267267
; GFX803-NEXT: s_mov_b32 m0, -1
268268
; GFX803-NEXT: ds_read_u16 v0, v0
269+
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
269270
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
270-
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
271-
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
271+
; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
272272
; GFX803-NEXT: s_setpc_b64 s[30:31]
273273
;
274274
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo:
@@ -311,9 +311,9 @@ define void @load_local_hi_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
311311
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312312
; GFX803-NEXT: s_mov_b32 m0, -1
313313
; GFX803-NEXT: ds_read_u16 v0, v0
314+
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
314315
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
315-
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
316-
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
316+
; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
317317
; GFX803-NEXT: flat_store_dword v[0:1], v0
318318
; GFX803-NEXT: s_waitcnt vmcnt(0)
319319
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -696,9 +696,9 @@ define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) #0
696696
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
697697
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
698698
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
699+
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
699700
; GFX803-NEXT: s_waitcnt vmcnt(0)
700-
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
701-
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
701+
; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
702702
; GFX803-NEXT: flat_store_dword v[0:1], v0
703703
; GFX803-NEXT: s_waitcnt vmcnt(0)
704704
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1006,9 +1006,9 @@ define void @load_flat_hi_v2i16_reglo_vreg(ptr %in, i16 %reg) #0 {
10061006
; GFX803: ; %bb.0: ; %entry
10071007
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10081008
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
1009+
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
10091010
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1010-
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1011-
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1011+
; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
10121012
; GFX803-NEXT: flat_store_dword v[0:1], v0
10131013
; GFX803-NEXT: s_waitcnt vmcnt(0)
10141014
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1300,9 +1300,9 @@ define void @load_private_hi_v2i16_reglo_vreg(ptr addrspace(5) byval(i16) %in, i
13001300
; GFX803: ; %bb.0: ; %entry
13011301
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13021302
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1303+
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
13031304
; GFX803-NEXT: s_waitcnt vmcnt(0)
1304-
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1305-
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1305+
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
13061306
; GFX803-NEXT: flat_store_dword v[0:1], v0
13071307
; GFX803-NEXT: s_waitcnt vmcnt(0)
13081308
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1399,8 +1399,8 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff(ptr addrspace(5) byval(i16)
13991399
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14001400
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
14011401
; GFX803-NEXT: s_waitcnt vmcnt(0)
1402-
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1403-
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1402+
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
1403+
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
14041404
; GFX803-NEXT: flat_store_dword v[0:1], v0
14051405
; GFX803-NEXT: s_waitcnt vmcnt(0)
14061406
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1851,9 +1851,9 @@ define void @load_constant_hi_v2i16_reglo_vreg(ptr addrspace(4) %in, i16 %reg) #
18511851
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
18521852
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
18531853
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
1854+
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
18541855
; GFX803-NEXT: s_waitcnt vmcnt(0)
1855-
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1856-
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1856+
; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
18571857
; GFX803-NEXT: flat_store_dword v[0:1], v0
18581858
; GFX803-NEXT: s_waitcnt vmcnt(0)
18591859
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2069,9 +2069,9 @@ define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, ptr addrspace(
20692069
; GFX803-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
20702070
; GFX803-NEXT: s_waitcnt vmcnt(0)
20712071
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4058
2072+
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
20722073
; GFX803-NEXT: s_waitcnt vmcnt(0)
2073-
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2074-
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2074+
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
20752075
; GFX803-NEXT: flat_store_dword v[0:1], v0
20762076
; GFX803-NEXT: s_waitcnt vmcnt(0)
20772077
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2678,10 +2678,10 @@ define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, ptr addrspace(3)
26782678
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26792679
; GFX803-NEXT: s_mov_b32 m0, -1
26802680
; GFX803-NEXT: ds_read_u16 v2, v1
2681+
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
26812682
; GFX803-NEXT: ds_write_b16 v1, v0
26822683
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
2683-
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2684-
; GFX803-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2684+
; GFX803-NEXT: v_perm_b32 v2, v0, v2, s4
26852685
; GFX803-NEXT: v_mov_b32_e32 v0, v2
26862686
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
26872687
; GFX803-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/load-lo16.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -621,10 +621,10 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(ptr addrspace(3) %in, <
621621
; GFX803-NEXT: s_mov_b32 m0, -1
622622
; GFX803-NEXT: ds_read_u16 v0, v0
623623
; GFX803-NEXT: v_mov_b32_e32 v2, 0
624-
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
624+
; GFX803-NEXT: s_mov_b32 s4, 0x3020504
625625
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
626626
; GFX803-NEXT: ds_write_b16 v2, v0
627-
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
627+
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
628628
; GFX803-NEXT: flat_store_dword v[0:1], v0
629629
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
630630
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -734,12 +734,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(ptr addrspace(3) noal
734734
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
735735
; GFX803-NEXT: s_mov_b32 m0, -1
736736
; GFX803-NEXT: ds_read_u16 v0, v0
737+
; GFX803-NEXT: s_mov_b32 s4, 0x3020504
737738
; GFX803-NEXT: v_lshrrev_b32_e32 v4, 16, v1
738-
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
739739
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
740740
; GFX803-NEXT: ds_write_b16 v2, v0
741741
; GFX803-NEXT: ds_write_b16 v3, v4
742-
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
742+
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
743743
; GFX803-NEXT: flat_store_dword v[0:1], v0
744744
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
745745
; GFX803-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/permute_i8.ll

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2717,3 +2717,37 @@ define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
27172717
store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
27182718
ret void
27192719
}
2720+
2721+
define void @Source16Bit(i16 %in, <2 x i16> %reg) {
2722+
; GFX10-LABEL: Source16Bit:
2723+
; GFX10: ; %bb.0: ; %entry
2724+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2725+
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3050204
2726+
; GFX10-NEXT: global_store_dword v[0:1], v0, off
2727+
; GFX10-NEXT: s_setpc_b64 s[30:31]
2728+
;
2729+
; GFX9-LABEL: Source16Bit:
2730+
; GFX9: ; %bb.0: ; %entry
2731+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2732+
; GFX9-NEXT: s_mov_b32 s4, 0x3050204
2733+
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
2734+
; GFX9-NEXT: global_store_dword v[0:1], v0, off
2735+
; GFX9-NEXT: s_waitcnt vmcnt(0)
2736+
; GFX9-NEXT: s_setpc_b64 s[30:31]
2737+
entry:
2738+
%elt0 = extractelement <2 x i16> %reg, i32 1
2739+
%e0b0 = and i16 %elt0, 255
2740+
%e0b1 = and i16 %elt0, -256
2741+
%e1b0 = and i16 %in, 255
2742+
%e1b1 = and i16 %in, -256
2743+
%tmp0 = shl i16 %e0b0, 8
2744+
%byte0 = or i16 %tmp0, %e1b0
2745+
%tmp2 = lshr i16 %e1b1, 8
2746+
%byte1 = or i16 %e0b1, %tmp2
2747+
%ext0 = zext i16 %byte0 to i32
2748+
%ext1 = zext i16 %byte1 to i32
2749+
%shifted = shl i32 %ext1, 16
2750+
%result = or i32 %shifted, %ext0
2751+
store i32 %result, ptr addrspace(1) undef
2752+
ret void
2753+
}

0 commit comments

Comments
 (0)