Skip to content

Commit 4917d46

Browse files
committed
[GISel] Explicitly disable BF16 tablegen patterns and codegen.
We currently have an issue where bf16 patterns can be used to match fp16 types, as GISel does not know about the difference between the two types. This patch explicitly disables them to make sure that they are never used. The opposite can also happen, where fp16 patterns are used for operators that should be bf16. So any operations with bf16 types now cause a fallback to SDAG. For the moment this includes data-processing only instructions (loads, stores, shuffles, etc). The pass setup for GISel has been slightly adjusted to make sure that a verify pass does not get added between AMD-SDAG and SIFixSGPRCopiesPass, which otherwise can cause verifier issues when falling back.
1 parent 6805d7e commit 4917d46

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+2483
-5032
lines changed

llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1553,10 +1553,6 @@ bool IRTranslator::translateBitCast(const User &U,
15531553

15541554
bool IRTranslator::translateCast(unsigned Opcode, const User &U,
15551555
MachineIRBuilder &MIRBuilder) {
1556-
if (U.getType()->getScalarType()->isBFloatTy() ||
1557-
U.getOperand(0)->getType()->getScalarType()->isBFloatTy())
1558-
return false;
1559-
15601556
uint32_t Flags = 0;
15611557
if (const Instruction *I = dyn_cast<Instruction>(&U))
15621558
Flags = MachineInstr::copyFlagsFromInstruction(*I);
@@ -3618,6 +3614,15 @@ bool IRTranslator::translate(const Instruction &Inst) {
36183614
CurBuilder->setPCSections(Inst.getMetadata(LLVMContext::MD_pcsections));
36193615
CurBuilder->setMMRAMetadata(Inst.getMetadata(LLVMContext::MD_mmra));
36203616

3617+
// BF16 cannot currently be represented by LLT, to avoid miscompiles we
3618+
// prevent any instructions using them. FIXME: This can be removed once LLT
3619+
// supports bfloat.
3620+
if (Inst.getType()->getScalarType()->isBFloatTy() ||
3621+
any_of(Inst.operands(), [](Value *V) {
3622+
return V->getType()->getScalarType()->isBFloatTy();
3623+
}))
3624+
return false;
3625+
36213626
if (TLI->fallBackToDAGISel(Inst))
36223627
return false;
36233628

llvm/lib/CodeGen/TargetPassConfig.cpp

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,7 +1017,7 @@ bool TargetPassConfig::addCoreISelPasses() {
10171017
if (Selector != SelectorType::GlobalISel || !isGlobalISelAbortEnabled())
10181018
DebugifyIsSafe = false;
10191019

1020-
// Add instruction selector passes.
1020+
// Add instruction selector passes for global isel if enabled.
10211021
if (Selector == SelectorType::GlobalISel) {
10221022
SaveAndRestore SavedAddingMachinePasses(AddingMachinePasses, true);
10231023
if (addIRTranslator())
@@ -1043,15 +1043,14 @@ bool TargetPassConfig::addCoreISelPasses() {
10431043
// Pass to reset the MachineFunction if the ISel failed.
10441044
addPass(createResetMachineFunctionPass(
10451045
reportDiagnosticWhenGlobalISelFallback(), isGlobalISelAbortEnabled()));
1046+
}
10461047

1047-
// Provide a fallback path when we do not want to abort on
1048-
// not-yet-supported input.
1049-
if (!isGlobalISelAbortEnabled() && addInstSelector())
1048+
// Run the SDAG InstSelector, providing a fallback path when we do not want to
1049+
// abort on not-yet-supported input.
1050+
if (Selector != SelectorType::GlobalISel || !isGlobalISelAbortEnabled())
1051+
if (addInstSelector())
10501052
return true;
10511053

1052-
} else if (addInstSelector())
1053-
return true;
1054-
10551054
// Expand pseudo-instructions emitted by ISel. Don't run the verifier before
10561055
// FinalizeISel.
10571056
addPass(&FinalizeISelID);

llvm/test/CodeGen/AArch64/bf16-instructions.ll

Lines changed: 597 additions & 1075 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AArch64/concat-vector.ll

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3-
; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
3+
; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
4+
5+
; CHECK-GI: warning: Instruction selection used fallback path for concat_high_high_v8bf16
46

57
define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) {
68
; CHECK-SD-LABEL: concat1:

llvm/test/CodeGen/AArch64/dup.ll

Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,21 @@
55
; CHECK-GI: warning: Instruction selection used fallback path for dup_v2i8
66
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v2i8
77
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v2i8
8+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v2bfloat
9+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v2bfloat
10+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v2bfloat
11+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v3bfloat
12+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v3bfloat
13+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v3bfloat
14+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v4bfloat
15+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v4bfloat
16+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v4bfloat
17+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v8bfloat
18+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v8bfloat
19+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v8bfloat
20+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v16bfloat
21+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v16bfloat
22+
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v16bfloat
823

924
define <2 x i8> @dup_v2i8(i8 %a) {
1025
; CHECK-LABEL: dup_v2i8:
@@ -1248,20 +1263,12 @@ entry:
12481263
}
12491264

12501265
define <16 x bfloat> @dup_v16bfloat(bfloat %a) {
1251-
; CHECK-SD-LABEL: dup_v16bfloat:
1252-
; CHECK-SD: // %bb.0: // %entry
1253-
; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $q0
1254-
; CHECK-SD-NEXT: dup v0.8h, v0.h[0]
1255-
; CHECK-SD-NEXT: mov v1.16b, v0.16b
1256-
; CHECK-SD-NEXT: ret
1257-
;
1258-
; CHECK-GI-LABEL: dup_v16bfloat:
1259-
; CHECK-GI: // %bb.0: // %entry
1260-
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0
1261-
; CHECK-GI-NEXT: dup v2.8h, v0.h[0]
1262-
; CHECK-GI-NEXT: dup v1.8h, v0.h[0]
1263-
; CHECK-GI-NEXT: mov v0.16b, v2.16b
1264-
; CHECK-GI-NEXT: ret
1266+
; CHECK-LABEL: dup_v16bfloat:
1267+
; CHECK: // %bb.0: // %entry
1268+
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
1269+
; CHECK-NEXT: dup v0.8h, v0.h[0]
1270+
; CHECK-NEXT: mov v1.16b, v0.16b
1271+
; CHECK-NEXT: ret
12651272
entry:
12661273
%b = insertelement <16 x bfloat> poison, bfloat %a, i64 0
12671274
%c = shufflevector <16 x bfloat> %b, <16 x bfloat> poison, <16 x i32> zeroinitializer
@@ -1280,17 +1287,11 @@ entry:
12801287
}
12811288

12821289
define <16 x bfloat> @loaddup_v16bfloat(ptr %p) {
1283-
; CHECK-SD-LABEL: loaddup_v16bfloat:
1284-
; CHECK-SD: // %bb.0: // %entry
1285-
; CHECK-SD-NEXT: ld1r { v0.8h }, [x0]
1286-
; CHECK-SD-NEXT: mov v1.16b, v0.16b
1287-
; CHECK-SD-NEXT: ret
1288-
;
1289-
; CHECK-GI-LABEL: loaddup_v16bfloat:
1290-
; CHECK-GI: // %bb.0: // %entry
1291-
; CHECK-GI-NEXT: ld1r { v0.8h }, [x0]
1292-
; CHECK-GI-NEXT: ld1r { v1.8h }, [x0]
1293-
; CHECK-GI-NEXT: ret
1290+
; CHECK-LABEL: loaddup_v16bfloat:
1291+
; CHECK: // %bb.0: // %entry
1292+
; CHECK-NEXT: ld1r { v0.8h }, [x0]
1293+
; CHECK-NEXT: mov v1.16b, v0.16b
1294+
; CHECK-NEXT: ret
12941295
entry:
12951296
%a = load bfloat, ptr %p
12961297
%b = insertelement <16 x bfloat> poison, bfloat %a, i64 0

llvm/test/CodeGen/AArch64/fptrunc.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
22
; RUN: llc -mtriple=aarch64 -global-isel=0 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
33
; RUN: llc -mtriple=aarch64 -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
4+
; RUN: llc -mtriple=aarch64 -global-isel=1 -mattr=+fullfp16,+bf16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
45

56
define float @fptrunc_f64_f32(double %a) {
67
; CHECK-LABEL: fptrunc_f64_f32:

llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll

Lines changed: 46 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
2+
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3+
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4+
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
55

66
define i8 @atomic_load_flat_monotonic_i8(ptr %ptr) {
77
; GCN-LABEL: atomic_load_flat_monotonic_i8:
@@ -109,12 +109,27 @@ define half @atomic_load_flat_monotonic_f16(ptr %ptr) {
109109
}
110110

111111
define bfloat @atomic_load_flat_monotonic_bf16(ptr %ptr) {
112-
; GCN-LABEL: atomic_load_flat_monotonic_bf16:
113-
; GCN: ; %bb.0:
114-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115-
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
116-
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
117-
; GCN-NEXT: s_setpc_b64 s[30:31]
112+
; GFX7-LABEL: atomic_load_flat_monotonic_bf16:
113+
; GFX7: ; %bb.0:
114+
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115+
; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc
116+
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
117+
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
118+
; GFX7-NEXT: s_setpc_b64 s[30:31]
119+
;
120+
; GFX8-LABEL: atomic_load_flat_monotonic_bf16:
121+
; GFX8: ; %bb.0:
122+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123+
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
124+
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
125+
; GFX8-NEXT: s_setpc_b64 s[30:31]
126+
;
127+
; GFX9-LABEL: atomic_load_flat_monotonic_bf16:
128+
; GFX9: ; %bb.0:
129+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
130+
; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc
131+
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
132+
; GFX9-NEXT: s_setpc_b64 s[30:31]
118133
%load = load atomic bfloat, ptr %ptr monotonic, align 2
119134
ret bfloat %load
120135
}
@@ -133,12 +148,28 @@ define i32 @atomic_load_flat_monotonic_f16_zext_to_i32(ptr %ptr) {
133148
}
134149

135150
define i32 @atomic_load_flat_monotonic_bf16_zext_to_i32(ptr %ptr) {
136-
; GCN-LABEL: atomic_load_flat_monotonic_bf16_zext_to_i32:
137-
; GCN: ; %bb.0:
138-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
139-
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
140-
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
141-
; GCN-NEXT: s_setpc_b64 s[30:31]
151+
; GFX7-LABEL: atomic_load_flat_monotonic_bf16_zext_to_i32:
152+
; GFX7: ; %bb.0:
153+
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154+
; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc
155+
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
156+
; GFX7-NEXT: s_setpc_b64 s[30:31]
157+
;
158+
; GFX8-LABEL: atomic_load_flat_monotonic_bf16_zext_to_i32:
159+
; GFX8: ; %bb.0:
160+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161+
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
162+
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
163+
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
164+
; GFX8-NEXT: s_setpc_b64 s[30:31]
165+
;
166+
; GFX9-LABEL: atomic_load_flat_monotonic_bf16_zext_to_i32:
167+
; GFX9: ; %bb.0:
168+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
169+
; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc
170+
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
171+
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
172+
; GFX9-NEXT: s_setpc_b64 s[30:31]
142173
%load = load atomic bfloat, ptr %ptr monotonic, align 2
143174
%cast = bitcast bfloat %load to i16
144175
%ext = zext i16 %cast to i32

llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
4-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
5-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
2+
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3+
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
4+
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
5+
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
66

77
define i8 @atomic_load_global_monotonic_i8(ptr addrspace(1) %ptr) {
88
; GFX6-LABEL: atomic_load_global_monotonic_i8:
@@ -331,16 +331,19 @@ define bfloat @atomic_load_global_monotonic_bf16(ptr addrspace(1) %ptr) {
331331
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
332332
; GFX6-NEXT: s_mov_b32 s6, 0
333333
; GFX6-NEXT: s_mov_b32 s7, 0x100f000
334-
; GFX6-NEXT: s_mov_b64 s[4:5], 0
334+
; GFX6-NEXT: s_mov_b32 s4, s6
335+
; GFX6-NEXT: s_mov_b32 s5, s6
335336
; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc
336337
; GFX6-NEXT: s_waitcnt vmcnt(0)
338+
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
337339
; GFX6-NEXT: s_setpc_b64 s[30:31]
338340
;
339341
; GFX7-LABEL: atomic_load_global_monotonic_bf16:
340342
; GFX7: ; %bb.0:
341343
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342344
; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc
343345
; GFX7-NEXT: s_waitcnt vmcnt(0)
346+
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
344347
; GFX7-NEXT: s_setpc_b64 s[30:31]
345348
;
346349
; GFX8-LABEL: atomic_load_global_monotonic_bf16:
@@ -403,7 +406,8 @@ define i32 @atomic_load_global_monotonic_bf16_zext_to_i32(ptr addrspace(1) %ptr)
403406
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
404407
; GFX6-NEXT: s_mov_b32 s6, 0
405408
; GFX6-NEXT: s_mov_b32 s7, 0x100f000
406-
; GFX6-NEXT: s_mov_b64 s[4:5], 0
409+
; GFX6-NEXT: s_mov_b32 s4, s6
410+
; GFX6-NEXT: s_mov_b32 s5, s6
407411
; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc
408412
; GFX6-NEXT: s_waitcnt vmcnt(0)
409413
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -420,13 +424,15 @@ define i32 @atomic_load_global_monotonic_bf16_zext_to_i32(ptr addrspace(1) %ptr)
420424
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
421425
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
422426
; GFX8-NEXT: s_waitcnt vmcnt(0)
427+
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
423428
; GFX8-NEXT: s_setpc_b64 s[30:31]
424429
;
425430
; GFX9-LABEL: atomic_load_global_monotonic_bf16_zext_to_i32:
426431
; GFX9: ; %bb.0:
427432
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
428433
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
429434
; GFX9-NEXT: s_waitcnt vmcnt(0)
435+
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
430436
; GFX9-NEXT: s_setpc_b64 s[30:31]
431437
%load = load atomic bfloat, ptr addrspace(1) %ptr monotonic, align 2
432438
%cast = bitcast bfloat %load to i16

llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
2+
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3+
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4+
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
55

66
; TODO: Merge with atomic_load_local.ll
77

@@ -261,6 +261,7 @@ define bfloat @atomic_load_local_monotonic_bf16(ptr addrspace(3) %ptr) {
261261
; GFX7-NEXT: s_mov_b32 m0, -1
262262
; GFX7-NEXT: ds_read_u16 v0, v0
263263
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
264+
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
264265
; GFX7-NEXT: s_setpc_b64 s[30:31]
265266
;
266267
; GFX8-LABEL: atomic_load_local_monotonic_bf16:
@@ -325,13 +326,15 @@ define i32 @atomic_load_local_monotonic_bf16_zext_to_i32(ptr addrspace(3) %ptr)
325326
; GFX8-NEXT: s_mov_b32 m0, -1
326327
; GFX8-NEXT: ds_read_u16 v0, v0
327328
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
329+
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
328330
; GFX8-NEXT: s_setpc_b64 s[30:31]
329331
;
330332
; GFX9-LABEL: atomic_load_local_monotonic_bf16_zext_to_i32:
331333
; GFX9: ; %bb.0:
332334
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
333335
; GFX9-NEXT: ds_read_u16 v0, v0
334336
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
337+
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
335338
; GFX9-NEXT: s_setpc_b64 s[30:31]
336339
%load = load atomic bfloat, ptr addrspace(3) %ptr monotonic, align 2
337340
%cast = bitcast bfloat %load to i16

0 commit comments

Comments
 (0)