Skip to content

Commit 7af3b30

Browse files
committed
Cleanup to avoid typesize-only operations.
1 parent 4917d46 commit 7af3b30

File tree

11 files changed

+973
-437
lines changed

11 files changed

+973
-437
lines changed

llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp

Lines changed: 30 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -296,8 +296,21 @@ void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) {
296296
MachinePreds[Edge].push_back(NewPred);
297297
}
298298

299+
/// Returns true if the result type of U, or the type of any of its operands,
/// has bfloat as its scalar type. Every type is inspected through
/// getScalarType() before the isBFloatTy() test is applied.
static bool containsBF16Type(const User &U) {
300+
// BF16 cannot currently be represented by LLT; to avoid miscompiles we
301+
// prevent any instructions using them. FIXME: This can be removed once LLT
302+
// supports bfloat.
303+
return U.getType()->getScalarType()->isBFloatTy() ||
304+
any_of(U.operands(), [](Value *V) {
305+
return V->getType()->getScalarType()->isBFloatTy();
306+
});
307+
}
308+
299309
bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,
300310
MachineIRBuilder &MIRBuilder) {
311+
if (containsBF16Type(U))
312+
return false;
313+
301314
// Get or create a virtual register for each value.
302315
// Unless the value is a Constant => loadimm cst?
303316
// or inline constant each time?
@@ -317,6 +330,9 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,
317330

318331
bool IRTranslator::translateUnaryOp(unsigned Opcode, const User &U,
319332
MachineIRBuilder &MIRBuilder) {
333+
if (containsBF16Type(U))
334+
return false;
335+
320336
Register Op0 = getOrCreateVReg(*U.getOperand(0));
321337
Register Res = getOrCreateVReg(U);
322338
uint32_t Flags = 0;
@@ -334,6 +350,9 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) {
334350

335351
bool IRTranslator::translateCompare(const User &U,
336352
MachineIRBuilder &MIRBuilder) {
353+
if (containsBF16Type(U))
354+
return false;
355+
337356
auto *CI = cast<CmpInst>(&U);
338357
Register Op0 = getOrCreateVReg(*U.getOperand(0));
339358
Register Op1 = getOrCreateVReg(*U.getOperand(1));
@@ -1553,6 +1572,9 @@ bool IRTranslator::translateBitCast(const User &U,
15531572

15541573
bool IRTranslator::translateCast(unsigned Opcode, const User &U,
15551574
MachineIRBuilder &MIRBuilder) {
1575+
if (containsBF16Type(U))
1576+
return false;
1577+
15561578
uint32_t Flags = 0;
15571579
if (const Instruction *I = dyn_cast<Instruction>(&U))
15581580
Flags = MachineInstr::copyFlagsFromInstruction(*I);
@@ -2643,6 +2665,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
26432665

26442666
bool IRTranslator::translateInlineAsm(const CallBase &CB,
26452667
MachineIRBuilder &MIRBuilder) {
2668+
if (containsBF16Type(CB))
2669+
return false;
26462670

26472671
const InlineAsmLowering *ALI = MF->getSubtarget().getInlineAsmLowering();
26482672

@@ -2732,6 +2756,9 @@ bool IRTranslator::translateCallBase(const CallBase &CB,
27322756
}
27332757

27342758
bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
2759+
if (containsBF16Type(U))
2760+
return false;
2761+
27352762
const CallInst &CI = cast<CallInst>(U);
27362763
auto TII = MF->getTarget().getIntrinsicInfo();
27372764
const Function *F = CI.getCalledFunction();
@@ -3367,6 +3394,9 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U,
33673394

33683395
bool IRTranslator::translateAtomicRMW(const User &U,
33693396
MachineIRBuilder &MIRBuilder) {
3397+
if (containsBF16Type(U))
3398+
return false;
3399+
33703400
const AtomicRMWInst &I = cast<AtomicRMWInst>(U);
33713401
auto Flags = TLI->getAtomicMemOperandFlags(I, *DL);
33723402

@@ -3614,15 +3644,6 @@ bool IRTranslator::translate(const Instruction &Inst) {
36143644
CurBuilder->setPCSections(Inst.getMetadata(LLVMContext::MD_pcsections));
36153645
CurBuilder->setMMRAMetadata(Inst.getMetadata(LLVMContext::MD_mmra));
36163646

3617-
// BF16 cannot currently be represented by LLT, to avoid miscompiles we
3618-
// prevent any instructions using them. FIXME: This can be removed once LLT
3619-
// supports bfloat.
3620-
if (Inst.getType()->getScalarType()->isBFloatTy() ||
3621-
any_of(Inst.operands(), [](Value *V) {
3622-
return V->getType()->getScalarType()->isBFloatTy();
3623-
}))
3624-
return false;
3625-
36263647
if (TLI->fallBackToDAGISel(Inst))
36273648
return false;
36283649

llvm/test/CodeGen/AArch64/bf16-instructions.ll

Lines changed: 15 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -9,15 +9,11 @@
99
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fmadd
1010
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fdiv
1111
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_frem
12-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_store
13-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_load
1412
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_call
1513
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_call_flipped
1614
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_tailcall_flipped
17-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_select
1815
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_select_cc
1916
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_select_cc_f32_f16
20-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_select_cc_f16_f32
2117
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_une
2218
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ueq
2319
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ugt
@@ -34,7 +30,6 @@
3430
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ord
3531
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fccmp
3632
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_br_cc
37-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_phi
3833
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i32
3934
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i64
4035
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fptoui_i32
@@ -49,8 +44,6 @@
4944
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fptrunc_double
5045
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fpext_float
5146
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fpext_double
52-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_bitcast_bfloattoi16
53-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_bitcast_i16tobfloat
5447
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sqrt
5548
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_powi
5649
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sin
@@ -373,8 +366,14 @@ define bfloat @test_select(bfloat %a, bfloat %b, i1 zeroext %c) #0 {
373366
;
374367
; CHECK-GI-LABEL: test_select:
375368
; CHECK-GI: // %bb.0:
376-
; CHECK-GI-NEXT: cmp w0, #0
377-
; CHECK-GI-NEXT: fcsel h0, h0, h1, ne
369+
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $s0
370+
; CHECK-GI-NEXT: // kill: def $h1 killed $h1 def $s1
371+
; CHECK-GI-NEXT: fmov w8, s0
372+
; CHECK-GI-NEXT: fmov w9, s1
373+
; CHECK-GI-NEXT: tst w0, #0x1
374+
; CHECK-GI-NEXT: csel w8, w8, w9, ne
375+
; CHECK-GI-NEXT: fmov s0, w8
376+
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $s0
378377
; CHECK-GI-NEXT: ret
379378
%r = select i1 %c, bfloat %a, bfloat %b
380379
ret bfloat %r
@@ -457,8 +456,14 @@ define bfloat @test_select_cc_f16_f32(bfloat %a, bfloat %b, float %c, float %d)
457456
;
458457
; CHECK-GI-LABEL: test_select_cc_f16_f32:
459458
; CHECK-GI: // %bb.0:
459+
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $s0
460+
; CHECK-GI-NEXT: // kill: def $h1 killed $h1 def $s1
460461
; CHECK-GI-NEXT: fcmp s2, s3
461-
; CHECK-GI-NEXT: fcsel h0, h0, h1, ne
462+
; CHECK-GI-NEXT: fmov w8, s0
463+
; CHECK-GI-NEXT: fmov w9, s1
464+
; CHECK-GI-NEXT: csel w8, w8, w9, ne
465+
; CHECK-GI-NEXT: fmov s0, w8
466+
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $s0
462467
; CHECK-GI-NEXT: ret
463468
%cc = fcmp une float %c, %d
464469
%r = select i1 %cc, bfloat %a, bfloat %b

llvm/test/CodeGen/AArch64/concat-vector.ll

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3-
; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
4-
5-
; CHECK-GI: warning: Instruction selection used fallback path for concat_high_high_v8bf16
3+
; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
64

75
define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) {
86
; CHECK-SD-LABEL: concat1:

llvm/test/CodeGen/AArch64/dup.ll

Lines changed: 25 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -5,21 +5,6 @@
55
; CHECK-GI: warning: Instruction selection used fallback path for dup_v2i8
66
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v2i8
77
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v2i8
8-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v2bfloat
9-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v2bfloat
10-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v2bfloat
11-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v3bfloat
12-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v3bfloat
13-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v3bfloat
14-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v4bfloat
15-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v4bfloat
16-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v4bfloat
17-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v8bfloat
18-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v8bfloat
19-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v8bfloat
20-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v16bfloat
21-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v16bfloat
22-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v16bfloat
238

249
define <2 x i8> @dup_v2i8(i8 %a) {
2510
; CHECK-LABEL: dup_v2i8:
@@ -1263,12 +1248,20 @@ entry:
12631248
}
12641249

12651250
define <16 x bfloat> @dup_v16bfloat(bfloat %a) {
1266-
; CHECK-LABEL: dup_v16bfloat:
1267-
; CHECK: // %bb.0: // %entry
1268-
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
1269-
; CHECK-NEXT: dup v0.8h, v0.h[0]
1270-
; CHECK-NEXT: mov v1.16b, v0.16b
1271-
; CHECK-NEXT: ret
1251+
; CHECK-SD-LABEL: dup_v16bfloat:
1252+
; CHECK-SD: // %bb.0: // %entry
1253+
; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $q0
1254+
; CHECK-SD-NEXT: dup v0.8h, v0.h[0]
1255+
; CHECK-SD-NEXT: mov v1.16b, v0.16b
1256+
; CHECK-SD-NEXT: ret
1257+
;
1258+
; CHECK-GI-LABEL: dup_v16bfloat:
1259+
; CHECK-GI: // %bb.0: // %entry
1260+
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0
1261+
; CHECK-GI-NEXT: dup v2.8h, v0.h[0]
1262+
; CHECK-GI-NEXT: dup v1.8h, v0.h[0]
1263+
; CHECK-GI-NEXT: mov v0.16b, v2.16b
1264+
; CHECK-GI-NEXT: ret
12721265
entry:
12731266
%b = insertelement <16 x bfloat> poison, bfloat %a, i64 0
12741267
%c = shufflevector <16 x bfloat> %b, <16 x bfloat> poison, <16 x i32> zeroinitializer
@@ -1287,11 +1280,17 @@ entry:
12871280
}
12881281

12891282
define <16 x bfloat> @loaddup_v16bfloat(ptr %p) {
1290-
; CHECK-LABEL: loaddup_v16bfloat:
1291-
; CHECK: // %bb.0: // %entry
1292-
; CHECK-NEXT: ld1r { v0.8h }, [x0]
1293-
; CHECK-NEXT: mov v1.16b, v0.16b
1294-
; CHECK-NEXT: ret
1283+
; CHECK-SD-LABEL: loaddup_v16bfloat:
1284+
; CHECK-SD: // %bb.0: // %entry
1285+
; CHECK-SD-NEXT: ld1r { v0.8h }, [x0]
1286+
; CHECK-SD-NEXT: mov v1.16b, v0.16b
1287+
; CHECK-SD-NEXT: ret
1288+
;
1289+
; CHECK-GI-LABEL: loaddup_v16bfloat:
1290+
; CHECK-GI: // %bb.0: // %entry
1291+
; CHECK-GI-NEXT: ld1r { v0.8h }, [x0]
1292+
; CHECK-GI-NEXT: ld1r { v1.8h }, [x0]
1293+
; CHECK-GI-NEXT: ret
12951294
entry:
12961295
%a = load bfloat, ptr %p
12971296
%b = insertelement <16 x bfloat> poison, bfloat %a, i64 0

llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll

Lines changed: 15 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3-
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4-
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
2+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
55

66
define i8 @atomic_load_flat_monotonic_i8(ptr %ptr) {
77
; GCN-LABEL: atomic_load_flat_monotonic_i8:
@@ -109,27 +109,12 @@ define half @atomic_load_flat_monotonic_f16(ptr %ptr) {
109109
}
110110

111111
define bfloat @atomic_load_flat_monotonic_bf16(ptr %ptr) {
112-
; GFX7-LABEL: atomic_load_flat_monotonic_bf16:
113-
; GFX7: ; %bb.0:
114-
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115-
; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc
116-
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
117-
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
118-
; GFX7-NEXT: s_setpc_b64 s[30:31]
119-
;
120-
; GFX8-LABEL: atomic_load_flat_monotonic_bf16:
121-
; GFX8: ; %bb.0:
122-
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123-
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
124-
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
125-
; GFX8-NEXT: s_setpc_b64 s[30:31]
126-
;
127-
; GFX9-LABEL: atomic_load_flat_monotonic_bf16:
128-
; GFX9: ; %bb.0:
129-
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
130-
; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc
131-
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
132-
; GFX9-NEXT: s_setpc_b64 s[30:31]
112+
; GCN-LABEL: atomic_load_flat_monotonic_bf16:
113+
; GCN: ; %bb.0:
114+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115+
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
116+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
117+
; GCN-NEXT: s_setpc_b64 s[30:31]
133118
%load = load atomic bfloat, ptr %ptr monotonic, align 2
134119
ret bfloat %load
135120
}
@@ -148,28 +133,12 @@ define i32 @atomic_load_flat_monotonic_f16_zext_to_i32(ptr %ptr) {
148133
}
149134

150135
define i32 @atomic_load_flat_monotonic_bf16_zext_to_i32(ptr %ptr) {
151-
; GFX7-LABEL: atomic_load_flat_monotonic_bf16_zext_to_i32:
152-
; GFX7: ; %bb.0:
153-
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154-
; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc
155-
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
156-
; GFX7-NEXT: s_setpc_b64 s[30:31]
157-
;
158-
; GFX8-LABEL: atomic_load_flat_monotonic_bf16_zext_to_i32:
159-
; GFX8: ; %bb.0:
160-
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161-
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
162-
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
163-
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
164-
; GFX8-NEXT: s_setpc_b64 s[30:31]
165-
;
166-
; GFX9-LABEL: atomic_load_flat_monotonic_bf16_zext_to_i32:
167-
; GFX9: ; %bb.0:
168-
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
169-
; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc
170-
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
171-
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
172-
; GFX9-NEXT: s_setpc_b64 s[30:31]
136+
; GCN-LABEL: atomic_load_flat_monotonic_bf16_zext_to_i32:
137+
; GCN: ; %bb.0:
138+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
139+
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
140+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
141+
; GCN-NEXT: s_setpc_b64 s[30:31]
173142
%load = load atomic bfloat, ptr %ptr monotonic, align 2
174143
%cast = bitcast bfloat %load to i16
175144
%ext = zext i16 %cast to i32

llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll

Lines changed: 6 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3-
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
4-
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
5-
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
2+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
4+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
5+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
66

77
define i8 @atomic_load_global_monotonic_i8(ptr addrspace(1) %ptr) {
88
; GFX6-LABEL: atomic_load_global_monotonic_i8:
@@ -331,19 +331,16 @@ define bfloat @atomic_load_global_monotonic_bf16(ptr addrspace(1) %ptr) {
331331
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
332332
; GFX6-NEXT: s_mov_b32 s6, 0
333333
; GFX6-NEXT: s_mov_b32 s7, 0x100f000
334-
; GFX6-NEXT: s_mov_b32 s4, s6
335-
; GFX6-NEXT: s_mov_b32 s5, s6
334+
; GFX6-NEXT: s_mov_b64 s[4:5], 0
336335
; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc
337336
; GFX6-NEXT: s_waitcnt vmcnt(0)
338-
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
339337
; GFX6-NEXT: s_setpc_b64 s[30:31]
340338
;
341339
; GFX7-LABEL: atomic_load_global_monotonic_bf16:
342340
; GFX7: ; %bb.0:
343341
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344342
; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc
345343
; GFX7-NEXT: s_waitcnt vmcnt(0)
346-
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
347344
; GFX7-NEXT: s_setpc_b64 s[30:31]
348345
;
349346
; GFX8-LABEL: atomic_load_global_monotonic_bf16:
@@ -406,8 +403,7 @@ define i32 @atomic_load_global_monotonic_bf16_zext_to_i32(ptr addrspace(1) %ptr)
406403
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
407404
; GFX6-NEXT: s_mov_b32 s6, 0
408405
; GFX6-NEXT: s_mov_b32 s7, 0x100f000
409-
; GFX6-NEXT: s_mov_b32 s4, s6
410-
; GFX6-NEXT: s_mov_b32 s5, s6
406+
; GFX6-NEXT: s_mov_b64 s[4:5], 0
411407
; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc
412408
; GFX6-NEXT: s_waitcnt vmcnt(0)
413409
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -424,15 +420,13 @@ define i32 @atomic_load_global_monotonic_bf16_zext_to_i32(ptr addrspace(1) %ptr)
424420
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
425421
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
426422
; GFX8-NEXT: s_waitcnt vmcnt(0)
427-
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
428423
; GFX8-NEXT: s_setpc_b64 s[30:31]
429424
;
430425
; GFX9-LABEL: atomic_load_global_monotonic_bf16_zext_to_i32:
431426
; GFX9: ; %bb.0:
432427
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
433428
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
434429
; GFX9-NEXT: s_waitcnt vmcnt(0)
435-
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
436430
; GFX9-NEXT: s_setpc_b64 s[30:31]
437431
%load = load atomic bfloat, ptr addrspace(1) %ptr monotonic, align 2
438432
%cast = bitcast bfloat %load to i16

0 commit comments

Comments
 (0)