Commit 170c0da

[AMDGPU] Fix edge case of buffer OOB handling (#115479)
Strengthen the out-of-bounds guarantees for buffer accesses by disallowing buffer accesses with an alignment lower than the natural alignment. This specifically addresses the edge case where an access starts out-of-bounds and then enters in-bounds, since the hardware treats the entire access as out-of-bounds.

Most users do not need the stricter behavior, but at least one graphics device extension (VK_EXT_robustness2) has very strict requirements: in-bounds accesses must return the correct value, and out-of-bounds accesses must return zero.

The direct consequence of the patch is that a buffer access at a negative address is no longer merged by the load-store-vectorizer with one at a positive address, which fixes a CTS test. Targets that do not care about the new behavior are advised to use the new target feature relaxed-buffer-oob-mode, which maintains the behavior from before this patch.
1 parent: a21cfca · commit: 170c0da
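The following sketch (not part of the commit) illustrates the consequence described above. Two adjacent i32 buffer loads that straddle offset zero only guarantee 4-byte alignment, so under the strict default the vectorizer can no longer merge them into one 8-byte access; with the relaxed feature enabled they may merge as before. The function name and attribute group below are hypothetical; "target-features"="+relaxed-buffer-oob-mode" is the usual way to request the feature per function.

; Illustrative IR, not taken from the patch.
; Strict mode: the two loads stay separate, because a merged 8-byte load
; starting at offset -4 would only be 4-byte aligned (natural alignment 8).
; Relaxed mode (+relaxed-buffer-oob-mode): they may be merged as before.
define amdgpu_kernel void @adjacent_loads(ptr addrspace(7) %p) #0 {
entry:
  %gep.m4 = getelementptr i8, ptr addrspace(7) %p, i32 -4
  %lo = load i32, ptr addrspace(7) %gep.m4, align 4   ; starts at a negative offset
  %hi = load i32, ptr addrspace(7) %p, align 4        ; starts at offset 0
  ret void
}

attributes #0 = { "target-features"="+relaxed-buffer-oob-mode" }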

File tree: 7 files changed, +225 -14 lines

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 6 additions & 0 deletions
@@ -119,6 +119,12 @@ def FeatureUnalignedDSAccess : SubtargetFeature<"unaligned-ds-access",
   "Hardware supports unaligned local and region loads and stores"
 >;
 
+def FeatureRelaxedBufferOOBMode : SubtargetFeature<"relaxed-buffer-oob-mode",
+  "RelaxedBufferOOBMode",
+  "true",
+  "Disable strict out-of-bounds buffer guarantees. An OOB access may potentially cause an adjacent access to be treated as if it were also OOB"
+>;
+
 def FeatureApertureRegs : SubtargetFeature<"aperture-regs",
   "HasApertureRegs",
   "true",

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
@@ -78,6 +78,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool BackOffBarrier = false;
   bool UnalignedScratchAccess = false;
   bool UnalignedAccessMode = false;
+  bool RelaxedBufferOOBMode = false;
   bool HasApertureRegs = false;
   bool SupportsXNACK = false;
   bool KernargPreload = false;
@@ -607,6 +608,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return UnalignedAccessMode;
   }
 
+  bool hasRelaxedBufferOOBMode() const { return RelaxedBufferOOBMode; }
+
   bool hasApertureRegs() const {
     return HasApertureRegs;
   }

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 14 additions & 0 deletions
@@ -1883,6 +1883,20 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
            Subtarget->hasUnalignedBufferAccessEnabled();
   }
 
+  // Ensure robust out-of-bounds guarantees for buffer accesses are met if
+  // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
+  // out-of-bounds behavior, but in the edge case where an access starts
+  // out-of-bounds and then enters in-bounds, the entire access would be
+  // treated as out-of-bounds. Prevent misaligned memory accesses by requiring
+  // the natural alignment of buffer accesses.
+  if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
+      AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
+      AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
+    if (!Subtarget->hasRelaxedBufferOOBMode() &&
+        Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
+      return false;
+  }
+
   // Smaller than dword value must be aligned.
   if (Size < 32)
     return false;
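For intuition on the new check (a sketch, not part of the patch): Size is in bits, so the required alignment is PowerOf2Ceil(divideCeil(Size, 8)) bytes, i.e. the natural alignment of the access, and a 64-bit buffer access therefore needs align 8 in strict mode. The IR below is illustrative only; the new tests later in this commit exercise the same rule.

; Illustrative IR, not taken from the patch.
define amdgpu_ps void @natural_alignment_example(ptr addrspace(7) inreg %p) {
entry:
  ; Size = 64 bits -> divideCeil(64, 8) = 8 bytes -> PowerOf2Ceil(8) = 8.
  ; align 4 < 8: fails the strict-mode check above, so the access is reported
  ; as unsupported and is expected to be split (see the FIXME in the new test).
  %under = load i64, ptr addrspace(7) %p, align 4
  ; align 8 satisfies the natural alignment and is still allowed as one access.
  %ok = load i64, ptr addrspace(7) %p, align 8
  ret void
}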
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GISEL %s

; Check that in strict OOB mode for buffers (relaxed-buffer-oob-mode attribute not set) the underaligned loads and stores get split.
; FIXME: The loads/stores do not get split (extend amdgpu-lower-buffer-fat-pointers?).

define amdgpu_ps void @split_underaligned_load(ptr addrspace(7) inreg %p, ptr addrspace(7) inreg %p2) #0 {
; CHECK-LABEL: split_underaligned_load:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v2, s9
; CHECK-NEXT: s_mov_b32 s15, s8
; CHECK-NEXT: s_mov_b32 s14, s7
; CHECK-NEXT: s_mov_b32 s13, s6
; CHECK-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
; CHECK-NEXT: s_mov_b32 s12, s5
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
; CHECK-NEXT: s_endpgm
; SDAG-LABEL: split_underaligned_load:
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: v_mov_b32_e32 v0, s4
; SDAG-NEXT: v_mov_b32_e32 v2, s9
; SDAG-NEXT: s_mov_b32 s15, s8
; SDAG-NEXT: s_mov_b32 s14, s7
; SDAG-NEXT: s_mov_b32 s13, s6
; SDAG-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
; SDAG-NEXT: s_mov_b32 s12, s5
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: split_underaligned_load:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: v_mov_b32_e32 v0, s4
; GISEL-NEXT: v_mov_b32_e32 v2, s9
; GISEL-NEXT: s_mov_b32 s12, s5
; GISEL-NEXT: s_mov_b32 s13, s6
; GISEL-NEXT: s_mov_b32 s14, s7
; GISEL-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
; GISEL-NEXT: s_mov_b32 s15, s8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
; GISEL-NEXT: s_endpgm
entry:
  %gep = getelementptr i8, ptr addrspace(7) %p, i32 0
  %ld = load i64, ptr addrspace(7) %gep, align 4

  %gep2 = getelementptr i8, ptr addrspace(7) %p2, i32 0
  store i64 %ld, ptr addrspace(7) %gep2, align 4
  ret void
}

; Check that in strict OOB mode for buffers (relaxed-buffer-oob-mode attribute not set) the naturally aligned loads and stores do not get split.

define amdgpu_ps void @do_not_split_aligned_load(ptr addrspace(7) inreg %p, ptr addrspace(7) inreg %p2) #0 {
; CHECK-LABEL: do_not_split_aligned_load:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v2, s9
; CHECK-NEXT: s_mov_b32 s15, s8
; CHECK-NEXT: s_mov_b32 s14, s7
; CHECK-NEXT: s_mov_b32 s13, s6
; CHECK-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
; CHECK-NEXT: s_mov_b32 s12, s5
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
; CHECK-NEXT: s_endpgm
; SDAG-LABEL: do_not_split_aligned_load:
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: v_mov_b32_e32 v0, s4
; SDAG-NEXT: v_mov_b32_e32 v2, s9
; SDAG-NEXT: s_mov_b32 s15, s8
; SDAG-NEXT: s_mov_b32 s14, s7
; SDAG-NEXT: s_mov_b32 s13, s6
; SDAG-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
; SDAG-NEXT: s_mov_b32 s12, s5
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: do_not_split_aligned_load:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: v_mov_b32_e32 v0, s4
; GISEL-NEXT: v_mov_b32_e32 v2, s9
; GISEL-NEXT: s_mov_b32 s12, s5
; GISEL-NEXT: s_mov_b32 s13, s6
; GISEL-NEXT: s_mov_b32 s14, s7
; GISEL-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
; GISEL-NEXT: s_mov_b32 s15, s8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
; GISEL-NEXT: s_endpgm
entry:
  %gep = getelementptr i8, ptr addrspace(7) %p, i32 0
  %ld = load i64, ptr addrspace(7) %gep, align 8

  %gep2 = getelementptr i8, ptr addrspace(7) %p2, i32 0
  store i64 %ld, ptr addrspace(7) %gep2, align 8
  ret void
}

llvm/test/CodeGen/AMDGPU/vectorize-buffer-fat-pointer.ll

Lines changed: 10 additions & 10 deletions
@@ -7,11 +7,11 @@ entry:
   %a2 = getelementptr i32, ptr addrspace(7) %out, i32 2
   %a3 = getelementptr i32, ptr addrspace(7) %out, i32 3
 
-; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(7) %out, align 4
-  store i32 0, ptr addrspace(7) %out
-  store i32 1, ptr addrspace(7) %a1
-  store i32 2, ptr addrspace(7) %a2
-  store i32 3, ptr addrspace(7) %a3
+; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(7) %out, align 16
+  store i32 0, ptr addrspace(7) %out, align 16
+  store i32 1, ptr addrspace(7) %a1, align 4
+  store i32 2, ptr addrspace(7) %a2, align 8
+  store i32 3, ptr addrspace(7) %a3, align 4
   ret void
 }
 
@@ -22,10 +22,10 @@ entry:
   %a2 = getelementptr i32, ptr addrspace(9) %out, i32 2
   %a3 = getelementptr i32, ptr addrspace(9) %out, i32 3
 
-; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(9) %out, align 4
-  store i32 0, ptr addrspace(9) %out
-  store i32 1, ptr addrspace(9) %a1
-  store i32 2, ptr addrspace(9) %a2
-  store i32 3, ptr addrspace(9) %a3
+; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(9) %out, align 16
+  store i32 0, ptr addrspace(9) %out, align 16
+  store i32 1, ptr addrspace(9) %a1, align 4
+  store i32 2, ptr addrspace(9) %a2, align 8
+  store i32 3, ptr addrspace(9) %a3, align 4
   ret void
 }

llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll

Lines changed: 10 additions & 4 deletions
@@ -1,4 +1,5 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-RELAXED %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-STRICT %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
 
@@ -72,9 +73,14 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @merge_fat_ptrs(
-; CHECK: load <4 x i16>
-; CHECK: store <4 x i16> zeroinitializer
+; CHECK-OOB-RELAXED-LABEL: @merge_fat_ptrs(
+; CHECK-OOB-RELAXED: load <4 x i16>
+; CHECK-OOB-RELAXED: store <4 x i16> zeroinitializer
+; CHECK-OOB-STRICT-LABEL: @merge_fat_ptrs(
+; CHECK-OOB-STRICT: load <2 x i16>
+; CHECK-OOB-STRICT: load <2 x i16>
+; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
+; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
 define amdgpu_kernel void @merge_fat_ptrs(ptr addrspace(7) nocapture %a, ptr addrspace(7) nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <2 x i16>, ptr addrspace(7) %a, i32 1
Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefix=OOB-STRICT %s
; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=OOB-RELAXED %s

; The test checks that relaxed-buffer-oob-mode allows merging loads even if the target load is not naturally aligned.

define amdgpu_kernel void @merge_align_4(ptr addrspace(7) captures(none) %p) #0 {
;
; OOB-STRICT-LABEL: define amdgpu_kernel void @merge_align_4(
; OOB-STRICT-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) {
; OOB-STRICT-NEXT: [[ENTRY:.*:]]
; OOB-STRICT-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; OOB-STRICT-NEXT: [[LD_M8:%.*]] = load i32, ptr addrspace(7) [[GEP_M8]], align 4
; OOB-STRICT-NEXT: [[GEP_M4:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -4
; OOB-STRICT-NEXT: [[LD_M4:%.*]] = load i32, ptr addrspace(7) [[GEP_M4]], align 4
; OOB-STRICT-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 0
; OOB-STRICT-NEXT: [[LD_0:%.*]] = load i32, ptr addrspace(7) [[GEP_0]], align 4
; OOB-STRICT-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i64 4
; OOB-STRICT-NEXT: [[LD_4:%.*]] = load i32, ptr addrspace(7) [[GEP_4]], align 4
; OOB-STRICT-NEXT: ret void
;
; OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_align_4(
; OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) #[[ATTR0:[0-9]+]] {
; OOB-RELAXED-NEXT: [[ENTRY:.*:]]
; OOB-RELAXED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 4
; OOB-RELAXED-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
; OOB-RELAXED-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
; OOB-RELAXED-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
; OOB-RELAXED-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; OOB-RELAXED-NEXT: ret void
;
entry:
  %gep_m8 = getelementptr i8, ptr addrspace(7) %p, i32 -8
  %ld_m8 = load i32, ptr addrspace(7) %gep_m8, align 4
  %gep_m4 = getelementptr i8, ptr addrspace(7) %p, i32 -4
  %ld_m4 = load i32, ptr addrspace(7) %gep_m4, align 4
  %gep_0 = getelementptr i8, ptr addrspace(7) %p, i32 0
  %ld_0 = load i32, ptr addrspace(7) %gep_0, align 4
  %gep_4 = getelementptr i8, ptr addrspace(7) %p, i64 4
  %ld_4 = load i32, ptr addrspace(7) %gep_4, align 4
  ret void
}

; The test checks that strict OOB mode (relaxed-buffer-oob-mode not set) allows merging loads if the target load is naturally aligned.

define amdgpu_kernel void @merge_align_16(ptr addrspace(7) captures(none) %p) #0 {
; OOB-STRICT-LABEL: define amdgpu_kernel void @merge_align_16(
; OOB-STRICT-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) {
; OOB-STRICT-NEXT: [[ENTRY:.*:]]
; OOB-STRICT-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; OOB-STRICT-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 16
; OOB-STRICT-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
; OOB-STRICT-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
; OOB-STRICT-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
; OOB-STRICT-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; OOB-STRICT-NEXT: ret void
;
; OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_align_16(
; OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) #[[ATTR0]] {
; OOB-RELAXED-NEXT: [[ENTRY:.*:]]
; OOB-RELAXED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 16
; OOB-RELAXED-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
; OOB-RELAXED-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
; OOB-RELAXED-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
; OOB-RELAXED-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; OOB-RELAXED-NEXT: ret void
;
entry:
  %gep_m8 = getelementptr i8, ptr addrspace(7) %p, i32 -8
  %ld_m8 = load i32, ptr addrspace(7) %gep_m8, align 16
  %gep_m4 = getelementptr i8, ptr addrspace(7) %p, i32 -4
  %ld_m4 = load i32, ptr addrspace(7) %gep_m4, align 4
  %gep_0 = getelementptr i8, ptr addrspace(7) %p, i32 0
  %ld_0 = load i32, ptr addrspace(7) %gep_0, align 8
  %gep_4 = getelementptr i8, ptr addrspace(7) %p, i64 4
  %ld_4 = load i32, ptr addrspace(7) %gep_4, align 4
  ret void
}
