Skip to content

Commit a6e0f11

Browse files
committed
[AMDGPU] - Add address space for strided buffers
This is an experimental address space for strided buffers. These buffers can have structs as elements and a stride > 1. These pointers allow the indexed access in units of stride, i.e., they point at `buffer[index * stride]`. Thus, we can use the `idxen` modifier for buffer loads. We assign address space 9 to 192-bit buffer pointers which contain a 128-bit descriptor, a 32-bit offset and a 32-bit index. Essentially, they are fat buffer pointers with an additional 32-bit index.
1 parent 2ce9a79 commit a6e0f11

File tree

9 files changed

+153
-39
lines changed

9 files changed

+153
-39
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 30 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -703,23 +703,24 @@ supported for the ``amdgcn`` target.
703703
.. table:: AMDGPU Address Spaces
704704
:name: amdgpu-address-spaces-table
705705

706-
================================= =============== =========== ================ ======= ============================
707-
.. 64-Bit Process Address Space
708-
--------------------------------- --------------- ----------- ---------------- ------------------------------------
709-
Address Space Name LLVM IR Address HSA Segment Hardware Address NULL Value
710-
Space Number Name Name Size
711-
================================= =============== =========== ================ ======= ============================
712-
Generic 0 flat flat 64 0x0000000000000000
713-
Global 1 global global 64 0x0000000000000000
714-
Region 2 N/A GDS 32 *not implemented for AMDHSA*
715-
Local 3 group LDS 32 0xFFFFFFFF
716-
Constant 4 constant *same as global* 64 0x0000000000000000
717-
Private 5 private scratch 32 0xFFFFFFFF
718-
Constant 32-bit 6 *TODO* 0x00000000
719-
Buffer Fat Pointer (experimental) 7 *TODO*
720-
Buffer Resource (experimental) 8 *TODO*
721-
Streamout Registers 128 N/A GS_REGS
722-
================================= =============== =========== ================ ======= ============================
706+
===================================== =============== =========== ================ ======= ============================
707+
.. 64-Bit Process Address Space
708+
------------------------------------- --------------- ----------- ---------------- ------------------------------------
709+
Address Space Name LLVM IR Address HSA Segment Hardware Address NULL Value
710+
Space Number Name Name Size
711+
===================================== =============== =========== ================ ======= ============================
712+
Generic 0 flat flat 64 0x0000000000000000
713+
Global 1 global global 64 0x0000000000000000
714+
Region 2 N/A GDS 32 *not implemented for AMDHSA*
715+
Local 3 group LDS 32 0xFFFFFFFF
716+
Constant 4 constant *same as global* 64 0x0000000000000000
717+
Private 5 private scratch 32 0xFFFFFFFF
718+
Constant 32-bit 6 *TODO* 0x00000000
719+
Buffer Fat Pointer (experimental) 7 *TODO*
720+
Buffer Resource (experimental) 8 *TODO*
721+
Buffer Strided Pointer (experimental) 9 *TODO*
722+
Streamout Registers 128 N/A GS_REGS
723+
===================================== =============== =========== ================ ======= ============================
723724

724725
**Generic**
725726
The generic address space is supported unless the *Target Properties* column
@@ -836,7 +837,7 @@ supported for the ``amdgcn`` target.
836837
the backend.
837838

838839
The buffer descriptor used to construct a buffer fat pointer must be *raw*:
839-
the stride must be 0, the "add tid" flag bust be 0, the swizzle enable bits
840+
the stride must be 0, the "add tid" flag must be 0, the swizzle enable bits
840841
must be off, and the extent must be measured in bytes. (On subtargets where
841842
bounds checking may be disabled, buffer fat pointers may choose to enable
842843
it or not).
@@ -864,6 +865,17 @@ supported for the ``amdgcn`` target.
864865
(bits `127:96`). The specific interpretation of these fields varies by the
865866
target architecture and is detailed in the ISA descriptions.
866867

868+
**Buffer Strided Pointer**
869+
The buffer strided pointer is an experimental address space. It is supposed to
870+
model a 128-bit buffer descriptor and a 32-bit offset, like the **Buffer Fat
871+
Pointer**. Additionally, it contains an index into the buffer, which
872+
allows the direct addressing of structured elements.
873+
874+
The buffer descriptor must meet the following requirements:
875+
the stride is the size of a structured element, the "add tid" flag must be 0, the
876+
swizzle enable bits must be off, and the extent (NumRecords) must be measured in
877+
elements.
878+
867879
**Streamout Registers**
868880
Dedicated registers used by the GS NGG Streamout Instructions. The register
869881
file is modelled as a memory in a distinct address space because it is indexed

llvm/include/llvm/Support/AMDGPUAddrSpace.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ enum : unsigned {
4242

4343
BUFFER_RESOURCE = 8, ///< Address space for 128-bit buffer resources.
4444

45+
BUFFER_STRIDED_POINTER = 9, ///< Address space for 192-bit fat buffer
46+
///< pointers with an additional index.
47+
4548
/// Internal address spaces. Can be freely renumbered.
4649
STREAMOUT_REGISTER = 128, ///< Address space for GS NGG Streamout registers.
4750
/// end Internal address spaces.

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -406,24 +406,25 @@ inline bool isExtendedGlobalAddrSpace(unsigned AS) {
406406
}
407407

408408
static inline bool addrspacesMayAlias(unsigned AS1, unsigned AS2) {
409-
static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 8, "Addr space out of range");
409+
static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 9, "Addr space out of range");
410410

411411
if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS)
412412
return true;
413413

414-
// This array is indexed by address space value enum elements 0 ... to 8
414+
// This array is indexed by address space value enum elements 0 ... to 9
415415
// clang-format off
416-
static const bool ASAliasRules[9][9] = {
417-
/* Flat Global Region Group Constant Private Const32 BufFatPtr BufRsrc */
418-
/* Flat */ {true, true, false, true, true, true, true, true, true},
419-
/* Global */ {true, true, false, false, true, false, true, true, true},
420-
/* Region */ {false, false, true, false, false, false, false, false, false},
421-
/* Group */ {true, false, false, true, false, false, false, false, false},
422-
/* Constant */ {true, true, false, false, false, false, true, true, true},
423-
/* Private */ {true, false, false, false, false, true, false, false, false},
424-
/* Constant 32-bit */ {true, true, false, false, true, false, false, true, true},
425-
/* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true},
426-
/* Buffer Resource */ {true, true, false, false, true, false, true, true, true},
416+
static const bool ASAliasRules[10][10] = {
417+
/* Flat Global Region Group Constant Private Const32 BufFatPtr BufRsrc BufStrdPtr */
418+
/* Flat */ {true, true, false, true, true, true, true, true, true, true},
419+
/* Global */ {true, true, false, false, true, false, true, true, true, true},
420+
/* Region */ {false, false, true, false, false, false, false, false, false, false},
421+
/* Group */ {true, false, false, true, false, false, false, false, false, false},
422+
/* Constant */ {true, true, false, false, false, false, true, true, true, true},
423+
/* Private */ {true, false, false, false, false, true, false, false, false, false},
424+
/* Constant 32-bit */ {true, true, false, false, true, false, false, true, true, true},
425+
/* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true, true},
426+
/* Buffer Resource */ {true, true, false, false, true, false, true, true, true, true},
427+
/* Buffer Strided Ptr */ {true, true, false, false, true, false, true, true, true, true},
427428
};
428429
// clang-format on
429430

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
633633
const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
634634
const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
635635
const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
636+
const LLT BufferStridedPtr =
637+
GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
636638

637639
const LLT CodePtr = FlatPtr;
638640

@@ -1103,7 +1105,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
11031105
}
11041106

11051107
getActionDefinitionsBuilder(G_PTR_ADD)
1106-
.unsupportedFor({BufferFatPtr, RsrcPtr})
1108+
.unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
11071109
.legalIf(all(isPointer(0), sameSize(0, 1)))
11081110
.scalarize(0)
11091111
.scalarSameSizeAs(1, 0);
@@ -1393,7 +1395,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
13931395
// The custom pointers (fat pointers, buffer resources) don't work with load
13941396
// and store at this level. Fat pointers should have been lowered to
13951397
// intrinsics before the translation to MIR.
1396-
Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr}));
1398+
Actions.unsupportedIf(
1399+
typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
13971400

13981401
// Address space 8 pointers are handled by a 4xs32 load, bitcast, and
13991402
// ptrtoint. This is needed to account for the fact that we can't have i128

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -540,7 +540,7 @@ static StringRef computeDataLayout(const Triple &TT) {
540540
return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
541541
"-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:"
542542
"128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
543-
"G1-ni:7:8";
543+
"G1-ni:7:8:9";
544544
}
545545

546546
LLVM_READNONE

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,8 @@ unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
368368
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
369369
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
370370
AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
371-
AddrSpace == AMDGPUAS::BUFFER_RESOURCE) {
371+
AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
372+
AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
372373
return 512;
373374
}
374375

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1040,12 +1040,20 @@ static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
10401040
MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
10411041
if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
10421042
return MVT::v5i32;
1043+
if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1044+
DL.getPointerSizeInBits(AS) == 192)
1045+
return MVT::v6i32;
10431046
return AMDGPUTargetLowering::getPointerTy(DL, AS);
10441047
}
10451048
/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
10461049
/// v8i32 when padding is added.
1050+
/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1051+
/// also v8i32 with padding.
10471052
MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1048-
if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1053+
if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1054+
DL.getPointerSizeInBits(AS) == 160) ||
1055+
(AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1056+
DL.getPointerSizeInBits(AS) == 192))
10491057
return MVT::v8i32;
10501058
return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
10511059
}
@@ -1405,7 +1413,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
14051413

14061414
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
14071415
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1408-
AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE) {
1416+
AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
1417+
AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
14091418
// If the offset isn't a multiple of 4, it probably isn't going to be
14101419
// correctly aligned.
14111420
// FIXME: Can we get the real alignment here?

llvm/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,3 +248,73 @@ define void @test_8_5(ptr %p) {
248248
load i8, ptr addrspace(3) @shm
249249
ret void
250250
}
251+
252+
; CHECK: MayAlias: i8 addrspace(9)* %p, i8* %p1
253+
define void @test_9_0(ptr addrspace(9) %p, ptr addrspace(0) %p1) {
254+
load i8, ptr addrspace(9) %p
255+
load i8, ptr addrspace(0) %p1
256+
ret void
257+
}
258+
259+
; CHECK: MayAlias: i8 addrspace(9)* %p, i8 addrspace(1)* %p1
260+
define void @test_9_1(ptr addrspace(9) %p, ptr addrspace(1) %p1) {
261+
load i8, ptr addrspace(9) %p
262+
load i8, ptr addrspace(1) %p1
263+
ret void
264+
}
265+
266+
; CHECK: NoAlias: i8 addrspace(9)* %p, i8 addrspace(2)* %p1
267+
define void @test_9_2(ptr addrspace(9) %p, ptr addrspace(2) %p1) {
268+
load i8, ptr addrspace(9) %p
269+
load i8, ptr addrspace(2) %p1
270+
ret void
271+
}
272+
273+
; CHECK: NoAlias: i8 addrspace(9)* %p, i8 addrspace(3)* %p1
274+
define void @test_9_3(ptr addrspace(9) %p, ptr addrspace(3) %p1) {
275+
load i8, ptr addrspace(9) %p
276+
load i8, ptr addrspace(3) %p1
277+
ret void
278+
}
279+
280+
; CHECK: MayAlias: i8 addrspace(9)* %p, i8 addrspace(4)* %p1
281+
define void @test_9_4(ptr addrspace(9) %p, ptr addrspace(4) %p1) {
282+
load i8, ptr addrspace(9) %p
283+
load i8, ptr addrspace(4) %p1
284+
ret void
285+
}
286+
287+
; CHECK: NoAlias: i8 addrspace(9)* %p, i8 addrspace(5)* %p1
288+
define void @test_9_5(ptr addrspace(9) %p, ptr addrspace(5) %p1) {
289+
load i8, ptr addrspace(9) %p
290+
load i8, ptr addrspace(5) %p1
291+
ret void
292+
}
293+
294+
; CHECK: MayAlias: i8 addrspace(9)* %p, i8 addrspace(6)* %p1
295+
define void @test_9_6(ptr addrspace(9) %p, ptr addrspace(6) %p1) {
296+
load i8, ptr addrspace(9) %p
297+
load i8, ptr addrspace(6) %p1
298+
ret void
299+
}
300+
301+
; CHECK: MayAlias: i8 addrspace(9)* %p, i8 addrspace(7)* %p1
302+
define void @test_9_7(ptr addrspace(9) %p, ptr addrspace(7) %p1) {
303+
load i8, ptr addrspace(9) %p
304+
load i8, ptr addrspace(7) %p1
305+
ret void
306+
}
307+
308+
; CHECK: MayAlias: i8 addrspace(9)* %p, i8 addrspace(8)* %p1
309+
define void @test_9_8(ptr addrspace(9) %p, ptr addrspace(8) %p1) {
310+
load i8, ptr addrspace(9) %p
311+
load i8, ptr addrspace(8) %p1
312+
ret void
313+
}
314+
315+
; CHECK: MayAlias: i8 addrspace(9)* %p, i8 addrspace(9)* %p1
316+
define void @test_9_9(ptr addrspace(9) %p, ptr addrspace(9) %p1) {
317+
load i8, ptr addrspace(9) %p
318+
load i8, ptr addrspace(9) %p1
319+
ret void
320+
}

llvm/test/CodeGen/AMDGPU/vectorize-buffer-fat-pointer.ll

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; RUN: opt -S -mtriple=amdgcn-- -passes=load-store-vectorizer < %s | FileCheck -check-prefix=OPT %s
22

3-
; OPT-LABEL: @func(
4-
define void @func(ptr addrspace(7) %out) {
3+
; OPT-LABEL: @buffer_fat_ptrs(
4+
define void @buffer_fat_ptrs(ptr addrspace(7) %out) {
55
entry:
66
%a1 = getelementptr i32, ptr addrspace(7) %out, i32 1
77
%a2 = getelementptr i32, ptr addrspace(7) %out, i32 2
@@ -14,3 +14,18 @@ entry:
1414
store i32 3, ptr addrspace(7) %a3
1515
ret void
1616
}
17+
18+
; OPT-LABEL: @buffer_strided_ptrs(
19+
define void @buffer_strided_ptrs(ptr addrspace(9) %out) {
20+
entry:
21+
%a1 = getelementptr i32, ptr addrspace(9) %out, i32 1
22+
%a2 = getelementptr i32, ptr addrspace(9) %out, i32 2
23+
%a3 = getelementptr i32, ptr addrspace(9) %out, i32 3
24+
25+
; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(9) %out, align 4
26+
store i32 0, ptr addrspace(9) %out
27+
store i32 1, ptr addrspace(9) %a1
28+
store i32 2, ptr addrspace(9) %a2
29+
store i32 3, ptr addrspace(9) %a3
30+
ret void
31+
}

0 commit comments

Comments
 (0)