[AMDGPU] Fix edge case of buffer OOB handling #115479

Merged (10 commits) on Mar 7, 2025
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -119,6 +119,12 @@ def FeatureUnalignedDSAccess : SubtargetFeature<"unaligned-ds-access",
"Hardware supports unaligned local and region loads and stores"
>;

def FeatureRelaxedBufferOOBMode : SubtargetFeature<"relaxed-buffer-oob-mode",
"RelaxedBufferOOBMode",
"true",
"Disable strict out-of-bounds buffer guarantees. An OOB access may potentially cause an adjacent access to be treated as if it were also OOB"
>;

def FeatureApertureRegs : SubtargetFeature<"aperture-regs",
"HasApertureRegs",
"true",
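The new subtarget feature can be enabled for a whole compilation with -mattr=+relaxed-buffer-oob-mode, as the RUN lines in the tests below do. As a minimal sketch (the function name and body here are hypothetical, not part of this PR), it should also be attachable per function through the "target-features" string attribute:

define amdgpu_ps void @relaxed_oob_example() "target-features"="+relaxed-buffer-oob-mode" {
  ret void
}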
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -78,6 +78,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool BackOffBarrier = false;
bool UnalignedScratchAccess = false;
bool UnalignedAccessMode = false;
bool RelaxedBufferOOBMode = false;
bool HasApertureRegs = false;
bool SupportsXNACK = false;
bool KernargPreload = false;
@@ -608,6 +609,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return UnalignedAccessMode;
}

bool hasRelaxedBufferOOBMode() const { return RelaxedBufferOOBMode; }

bool hasApertureRegs() const {
return HasApertureRegs;
}
14 changes: 14 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1877,6 +1877,20 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
Subtarget->hasUnalignedBufferAccessEnabled();
}

// Ensure robust out-of-bounds guarantees for buffer accesses are met if
// RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
// out-of-bounds behavior, but in the edge case where an access starts
// out-of-bounds and then enters in-bounds, the entire access would be treated
// as out-of-bounds. Prevent misaligned memory accesses by requiring the
// natural alignment of buffer accesses.
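// For example, an i64 access has Size == 64, so it requires
// Align(PowerOf2Ceil(divideCeil(64, 8))) == Align(8). An align(4) i64 buffer
// access therefore fails this check, and e.g. the load/store vectorizer will
// not form such an underaligned wide access (see merge-vectors.ll below).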
if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
if (!Subtarget->hasRelaxedBufferOOBMode() &&
Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
return false;
}

// Smaller than dword value must be aligned.
if (Size < 32)
return false;
102 changes: 102 additions & 0 deletions llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll
@@ -0,0 +1,102 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GISEL %s

; Check that in strict OOB mode for buffers (relaxed-buffer-oob-mode attribute not set) the underaligned loads and stores get split.
; FIXME: The loads/stores do not get split (extend amdgpu-lower-buffer-fat-pointers?).

define amdgpu_ps void @split_underaligned_load(ptr addrspace(7) inreg %p, ptr addrspace(7) inreg %p2) #0 {
; CHECK-LABEL: split_underaligned_load:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v2, s9
; CHECK-NEXT: s_mov_b32 s15, s8
; CHECK-NEXT: s_mov_b32 s14, s7
; CHECK-NEXT: s_mov_b32 s13, s6
; CHECK-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
; CHECK-NEXT: s_mov_b32 s12, s5
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
; CHECK-NEXT: s_endpgm
; SDAG-LABEL: split_underaligned_load:
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: v_mov_b32_e32 v0, s4
; SDAG-NEXT: v_mov_b32_e32 v2, s9
; SDAG-NEXT: s_mov_b32 s15, s8
; SDAG-NEXT: s_mov_b32 s14, s7
; SDAG-NEXT: s_mov_b32 s13, s6
; SDAG-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
; SDAG-NEXT: s_mov_b32 s12, s5
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: split_underaligned_load:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: v_mov_b32_e32 v0, s4
; GISEL-NEXT: v_mov_b32_e32 v2, s9
; GISEL-NEXT: s_mov_b32 s12, s5
; GISEL-NEXT: s_mov_b32 s13, s6
; GISEL-NEXT: s_mov_b32 s14, s7
; GISEL-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
; GISEL-NEXT: s_mov_b32 s15, s8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
; GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(7) %p, i32 0
%ld = load i64, ptr addrspace(7) %gep, align 4

%gep2 = getelementptr i8, ptr addrspace(7) %p2, i32 0
store i64 %ld, ptr addrspace(7) %gep2, align 4
ret void
}

; Check that in strict OOB mode for buffers (relaxed-buffer-oob-mode attribute not set) the naturally aligned loads and stores do not get split.

define amdgpu_ps void @do_not_split_aligned_load(ptr addrspace(7) inreg %p, ptr addrspace(7) inreg %p2) #0 {
; CHECK-LABEL: do_not_split_aligned_load:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v2, s9
; CHECK-NEXT: s_mov_b32 s15, s8
; CHECK-NEXT: s_mov_b32 s14, s7
; CHECK-NEXT: s_mov_b32 s13, s6
; CHECK-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
; CHECK-NEXT: s_mov_b32 s12, s5
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
; CHECK-NEXT: s_endpgm
; SDAG-LABEL: do_not_split_aligned_load:
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: v_mov_b32_e32 v0, s4
; SDAG-NEXT: v_mov_b32_e32 v2, s9
; SDAG-NEXT: s_mov_b32 s15, s8
; SDAG-NEXT: s_mov_b32 s14, s7
; SDAG-NEXT: s_mov_b32 s13, s6
; SDAG-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
; SDAG-NEXT: s_mov_b32 s12, s5
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: do_not_split_aligned_load:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: v_mov_b32_e32 v0, s4
; GISEL-NEXT: v_mov_b32_e32 v2, s9
; GISEL-NEXT: s_mov_b32 s12, s5
; GISEL-NEXT: s_mov_b32 s13, s6
; GISEL-NEXT: s_mov_b32 s14, s7
; GISEL-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
; GISEL-NEXT: s_mov_b32 s15, s8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
; GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(7) %p, i32 0
%ld = load i64, ptr addrspace(7) %gep, align 8

%gep2 = getelementptr i8, ptr addrspace(7) %p2, i32 0
store i64 %ld, ptr addrspace(7) %gep2, align 8
ret void
}
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/vectorize-buffer-fat-pointer.ll
@@ -7,11 +7,11 @@ entry:
%a2 = getelementptr i32, ptr addrspace(7) %out, i32 2
%a3 = getelementptr i32, ptr addrspace(7) %out, i32 3

-; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(7) %out, align 4
-store i32 0, ptr addrspace(7) %out
-store i32 1, ptr addrspace(7) %a1
-store i32 2, ptr addrspace(7) %a2
-store i32 3, ptr addrspace(7) %a3
+; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(7) %out, align 16
+store i32 0, ptr addrspace(7) %out, align 16
+store i32 1, ptr addrspace(7) %a1, align 4
+store i32 2, ptr addrspace(7) %a2, align 8
+store i32 3, ptr addrspace(7) %a3, align 4
ret void
}

@@ -22,10 +22,10 @@
%a2 = getelementptr i32, ptr addrspace(9) %out, i32 2
%a3 = getelementptr i32, ptr addrspace(9) %out, i32 3

-; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(9) %out, align 4
-store i32 0, ptr addrspace(9) %out
-store i32 1, ptr addrspace(9) %a1
-store i32 2, ptr addrspace(9) %a2
-store i32 3, ptr addrspace(9) %a3
+; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(9) %out, align 16
+store i32 0, ptr addrspace(9) %out, align 16
+store i32 1, ptr addrspace(9) %a1, align 4
+store i32 2, ptr addrspace(9) %a2, align 8
+store i32 3, ptr addrspace(9) %a3, align 4
ret void
}
14 changes: 10 additions & 4 deletions llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -1,4 +1,5 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-RELAXED %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-STRICT %s

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"

@@ -72,9 +73,14 @@ entry:
ret void
}

-; CHECK-LABEL: @merge_fat_ptrs(
-; CHECK: load <4 x i16>
-; CHECK: store <4 x i16> zeroinitializer
+; CHECK-OOB-RELAXED-LABEL: @merge_fat_ptrs(
+; CHECK-OOB-RELAXED: load <4 x i16>
+; CHECK-OOB-RELAXED: store <4 x i16> zeroinitializer
+; CHECK-OOB-STRICT-LABEL: @merge_fat_ptrs(
+; CHECK-OOB-STRICT: load <2 x i16>
+; CHECK-OOB-STRICT: load <2 x i16>
+; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
+; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
define amdgpu_kernel void @merge_fat_ptrs(ptr addrspace(7) nocapture %a, ptr addrspace(7) nocapture readonly %b) #0 {
entry:
%a.1 = getelementptr inbounds <2 x i16>, ptr addrspace(7) %a, i32 1
@@ -0,0 +1,80 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefix=OOB-STRICT %s
; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=OOB-RELAXED %s

; The test checks that relaxed-buffer-oob-mode allows merging loads even if the resulting merged load is not naturally aligned.

define amdgpu_kernel void @merge_align_4(ptr addrspace(7) captures(none) %p) #0 {
;
; OOB-STRICT-LABEL: define amdgpu_kernel void @merge_align_4(
; OOB-STRICT-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) {
; OOB-STRICT-NEXT: [[ENTRY:.*:]]
; OOB-STRICT-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; OOB-STRICT-NEXT: [[LD_M8:%.*]] = load i32, ptr addrspace(7) [[GEP_M8]], align 4
; OOB-STRICT-NEXT: [[GEP_M4:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -4
; OOB-STRICT-NEXT: [[LD_M4:%.*]] = load i32, ptr addrspace(7) [[GEP_M4]], align 4
; OOB-STRICT-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 0
; OOB-STRICT-NEXT: [[LD_0:%.*]] = load i32, ptr addrspace(7) [[GEP_0]], align 4
; OOB-STRICT-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i64 4
; OOB-STRICT-NEXT: [[LD_4:%.*]] = load i32, ptr addrspace(7) [[GEP_4]], align 4
; OOB-STRICT-NEXT: ret void
;
; OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_align_4(
; OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) #[[ATTR0:[0-9]+]] {
; OOB-RELAXED-NEXT: [[ENTRY:.*:]]
; OOB-RELAXED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 4
; OOB-RELAXED-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
; OOB-RELAXED-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
; OOB-RELAXED-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
; OOB-RELAXED-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; OOB-RELAXED-NEXT: ret void
;
entry:
%gep_m8 = getelementptr i8, ptr addrspace(7) %p, i32 -8
%ld_m8 = load i32, ptr addrspace(7) %gep_m8, align 4
%gep_m4 = getelementptr i8, ptr addrspace(7) %p, i32 -4
%ld_m4 = load i32, ptr addrspace(7) %gep_m4, align 4
%gep_0 = getelementptr i8, ptr addrspace(7) %p, i32 0
%ld_0 = load i32, ptr addrspace(7) %gep_0, align 4
%gep_4 = getelementptr i8, ptr addrspace(7) %p, i64 4
%ld_4 = load i32, ptr addrspace(7) %gep_4, align 4
ret void
}

; The test checks that strict OOB mode (relaxed-buffer-oob-mode not set) still allows merging loads when the resulting merged load is naturally aligned.

define amdgpu_kernel void @merge_align_16(ptr addrspace(7) captures(none) %p) #0 {
; OOB-STRICT-LABEL: define amdgpu_kernel void @merge_align_16(
; OOB-STRICT-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) {
; OOB-STRICT-NEXT: [[ENTRY:.*:]]
; OOB-STRICT-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; OOB-STRICT-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 16
; OOB-STRICT-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
; OOB-STRICT-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
; OOB-STRICT-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
; OOB-STRICT-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; OOB-STRICT-NEXT: ret void
;
; OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_align_16(
; OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) #[[ATTR0]] {
; OOB-RELAXED-NEXT: [[ENTRY:.*:]]
; OOB-RELAXED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 16
; OOB-RELAXED-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
; OOB-RELAXED-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
; OOB-RELAXED-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
; OOB-RELAXED-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; OOB-RELAXED-NEXT: ret void
;
entry:
%gep_m8 = getelementptr i8, ptr addrspace(7) %p, i32 -8
%ld_m8 = load i32, ptr addrspace(7) %gep_m8, align 16
%gep_m4 = getelementptr i8, ptr addrspace(7) %p, i32 -4
%ld_m4 = load i32, ptr addrspace(7) %gep_m4, align 4
%gep_0 = getelementptr i8, ptr addrspace(7) %p, i32 0
%ld_0 = load i32, ptr addrspace(7) %gep_0, align 8
%gep_4 = getelementptr i8, ptr addrspace(7) %p, i64 4
%ld_4 = load i32, ptr addrspace(7) %gep_4, align 4
ret void
}