Skip to content

Commit 1f5f381

Browse files
authored
AMDGPU: Implement intrinsic/builtins for gfx1250 load transpose instructions (#146289)
1 parent b822a32 commit 1f5f381

File tree

11 files changed

+622
-3
lines changed

11 files changed

+622
-3
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -640,6 +640,19 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
640640
// GFX1250+ only builtins.
641641
//===----------------------------------------------------------------------===//
642642

643+
// Transpose-load builtins. Each takes a single pointer and returns a vector of
// the pointee type. The global_* variants take an address-space-1 pointer
// ("*1"); the ds_* variants take an address-space-3 pointer ("*3").
// Feature gating: the tr4/tr6 forms require "transpose-load-f4f6-insts", the
// tr8/tr16 forms require "gfx1250-insts"; all of them additionally require
// "wavefrontsize32".
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr4_b64_v2i32, "V2iV2i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
644+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr8_b64_v2i32, "V2iV2i*1", "nc", "gfx1250-insts,wavefrontsize32")
645+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr6_b96_v3i32, "V3iV3i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
646+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8i16, "V8sV8s*1", "nc", "gfx1250-insts,wavefrontsize32")
647+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8f16, "V8hV8h*1", "nc", "gfx1250-insts,wavefrontsize32")
648+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8bf16, "V8yV8y*1", "nc", "gfx1250-insts,wavefrontsize32")
649+
TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr4_b64_v2i32, "V2iV2i*3", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
650+
TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx1250-insts,wavefrontsize32")
651+
TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr6_b96_v3i32, "V3iV3i*3", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
652+
TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8i16, "V8sV8s*3", "nc", "gfx1250-insts,wavefrontsize32")
653+
TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8f16, "V8hV8h*3", "nc", "gfx1250-insts,wavefrontsize32")
654+
TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx1250-insts,wavefrontsize32")
655+
643656
TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
644657

645658
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
545545
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
546546
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
547547
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
548+
case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
549+
case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
550+
case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
551+
case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
552+
case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
553+
case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
554+
case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
555+
case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
556+
case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
557+
case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
558+
case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
559+
case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
548560
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
549561
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
550562
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
@@ -555,6 +567,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
555567
switch (BuiltinID) {
556568
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
557569
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
570+
case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
558571
IID = Intrinsic::amdgcn_global_load_tr_b64;
559572
break;
560573
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -563,8 +576,31 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
563576
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
564577
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
565578
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
579+
case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
580+
case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
581+
case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
566582
IID = Intrinsic::amdgcn_global_load_tr_b128;
567583
break;
584+
case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
585+
IID = Intrinsic::amdgcn_global_load_tr4_b64;
586+
break;
587+
case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
588+
IID = Intrinsic::amdgcn_global_load_tr6_b96;
589+
break;
590+
case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
591+
IID = Intrinsic::amdgcn_ds_load_tr4_b64;
592+
break;
593+
case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
594+
IID = Intrinsic::amdgcn_ds_load_tr6_b96;
595+
break;
596+
case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
597+
IID = Intrinsic::amdgcn_ds_load_tr8_b64;
598+
break;
599+
case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
600+
case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
601+
case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
602+
IID = Intrinsic::amdgcn_ds_load_tr16_b128;
603+
break;
568604
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
569605
IID = Intrinsic::amdgcn_ds_read_tr4_b64;
570606
break;
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
4+
5+
typedef int v2i __attribute__((ext_vector_type(2)));
6+
typedef int v3i __attribute__((ext_vector_type(3)));
7+
typedef int v4i __attribute__((ext_vector_type(4)));
8+
typedef short v8s __attribute__((ext_vector_type(8)));
9+
typedef half v8h __attribute__((ext_vector_type(8)));
10+
typedef __bf16 v8y __attribute__((ext_vector_type(8)));
11+
12+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr4_b64_v2i32(
13+
// CHECK-GFX1250-NEXT: entry:
14+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]])
15+
// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
16+
//
17+
// Calls the tr4 b64 global transpose-load builtin on a `global` v2i pointer;
// the CHECK lines above pin the expected llvm.amdgcn.global.load.tr4.b64 call.
v2i test_amdgcn_global_load_tr4_b64_v2i32(global v2i* inptr)
18+
{
19+
return __builtin_amdgcn_global_load_tr4_b64_v2i32(inptr);
20+
}
21+
22+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr8_b64_v2i32(
23+
// CHECK-GFX1250-NEXT: entry:
24+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]])
25+
// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
26+
//
27+
// Calls the tr8 b64 global transpose-load builtin on a `global` v2i pointer.
// Note: per the CHECK lines above, the expected intrinsic is
// llvm.amdgcn.global.load.tr.b64 (no "tr8" in the intrinsic name).
v2i test_amdgcn_global_load_tr8_b64_v2i32(global v2i* inptr)
28+
{
29+
return __builtin_amdgcn_global_load_tr8_b64_v2i32(inptr);
30+
}
31+
32+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr6_b96_v3i32(
33+
// CHECK-GFX1250-NEXT: entry:
34+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) [[INPTR:%.*]])
35+
// CHECK-GFX1250-NEXT: ret <3 x i32> [[TMP0]]
36+
//
37+
// Calls the tr6 b96 global transpose-load builtin on a `global` v3i pointer;
// the CHECK lines above pin the expected llvm.amdgcn.global.load.tr6.b96 call.
v3i test_amdgcn_global_load_tr6_b96_v3i32(global v3i* inptr)
38+
{
39+
return __builtin_amdgcn_global_load_tr6_b96_v3i32(inptr);
40+
}
41+
42+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8i16(
43+
// CHECK-GFX1250-NEXT: entry:
44+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) [[INPTR:%.*]])
45+
// CHECK-GFX1250-NEXT: ret <8 x i16> [[TMP0]]
46+
//
47+
// Calls the tr16 b128 global transpose-load builtin (v8i16 flavor).
// Note: per the CHECK lines above, the expected intrinsic is
// llvm.amdgcn.global.load.tr.b128 (no "tr16" in the intrinsic name).
v8s test_amdgcn_global_load_tr16_b128_v8i16(global v8s* inptr)
48+
{
49+
return __builtin_amdgcn_global_load_tr16_b128_v8i16(inptr);
50+
}
51+
52+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8f16(
53+
// CHECK-GFX1250-NEXT: entry:
54+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[INPTR:%.*]])
55+
// CHECK-GFX1250-NEXT: ret <8 x half> [[TMP0]]
56+
//
57+
// Calls the tr16 b128 global transpose-load builtin (v8f16 flavor).
// Note: per the CHECK lines above, the expected intrinsic is
// llvm.amdgcn.global.load.tr.b128 (no "tr16" in the intrinsic name).
v8h test_amdgcn_global_load_tr16_b128_v8f16(global v8h* inptr)
58+
{
59+
return __builtin_amdgcn_global_load_tr16_b128_v8f16(inptr);
60+
}
61+
62+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8bf16(
63+
// CHECK-GFX1250-NEXT: entry:
64+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) [[INPTR:%.*]])
65+
// CHECK-GFX1250-NEXT: ret <8 x bfloat> [[TMP0]]
66+
//
67+
// Calls the tr16 b128 global transpose-load builtin (v8bf16 flavor).
// Note: per the CHECK lines above, the expected intrinsic is
// llvm.amdgcn.global.load.tr.b128 (no "tr16" in the intrinsic name).
v8y test_amdgcn_global_load_tr16_b128_v8bf16(global v8y* inptr)
68+
{
69+
return __builtin_amdgcn_global_load_tr16_b128_v8bf16(inptr);
70+
}
71+
72+
// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr4_b64_v2i32(
73+
// CHECK-GFX1250-NEXT: entry:
74+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32(ptr addrspace(3) [[INPTR:%.*]])
75+
// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
76+
//
77+
// Calls the tr4 b64 DS transpose-load builtin on a `local` v2i pointer;
// the CHECK lines above pin the expected llvm.amdgcn.ds.load.tr4.b64 call.
v2i test_amdgcn_ds_load_tr4_b64_v2i32(local v2i* inptr)
78+
{
79+
return __builtin_amdgcn_ds_load_tr4_b64_v2i32(inptr);
80+
}
81+
82+
// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr8_b64_v2i32(
83+
// CHECK-GFX1250-NEXT: entry:
84+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) [[INPTR:%.*]])
85+
// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
86+
//
87+
// Calls the tr8 b64 DS transpose-load builtin on a `local` v2i pointer;
// the CHECK lines above pin the expected llvm.amdgcn.ds.load.tr8.b64 call.
v2i test_amdgcn_ds_load_tr8_b64_v2i32(local v2i* inptr)
88+
{
89+
return __builtin_amdgcn_ds_load_tr8_b64_v2i32(inptr);
90+
}
91+
92+
// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr6_b96_v3i32(
93+
// CHECK-GFX1250-NEXT: entry:
94+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32(ptr addrspace(3) [[INPTR:%.*]])
95+
// CHECK-GFX1250-NEXT: ret <3 x i32> [[TMP0]]
96+
//
97+
// Calls the tr6 b96 DS transpose-load builtin on a `local` v3i pointer;
// the CHECK lines above pin the expected llvm.amdgcn.ds.load.tr6.b96 call.
v3i test_amdgcn_ds_load_tr6_b96_v3i32(local v3i* inptr)
98+
{
99+
return __builtin_amdgcn_ds_load_tr6_b96_v3i32(inptr);
100+
}
101+
102+
// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8i16(
103+
// CHECK-GFX1250-NEXT: entry:
104+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16(ptr addrspace(3) [[INPTR:%.*]])
105+
// CHECK-GFX1250-NEXT: ret <8 x i16> [[TMP0]]
106+
//
107+
// Calls the tr16 b128 DS transpose-load builtin (v8i16 flavor) on a `local`
// pointer; the CHECK lines above pin the llvm.amdgcn.ds.load.tr16.b128 call.
v8s test_amdgcn_ds_load_tr16_b128_v8i16(local v8s* inptr)
108+
{
109+
return __builtin_amdgcn_ds_load_tr16_b128_v8i16(inptr);
110+
}
111+
112+
// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8f16(
113+
// CHECK-GFX1250-NEXT: entry:
114+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) [[INPTR:%.*]])
115+
// CHECK-GFX1250-NEXT: ret <8 x half> [[TMP0]]
116+
//
117+
// Calls the tr16 b128 DS transpose-load builtin (v8f16 flavor) on a `local`
// pointer; the CHECK lines above pin the llvm.amdgcn.ds.load.tr16.b128 call.
v8h test_amdgcn_ds_load_tr16_b128_v8f16(local v8h* inptr)
118+
{
119+
return __builtin_amdgcn_ds_load_tr16_b128_v8f16(inptr);
120+
}
121+
122+
// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8bf16(
123+
// CHECK-GFX1250-NEXT: entry:
124+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16(ptr addrspace(3) [[INPTR:%.*]])
125+
// CHECK-GFX1250-NEXT: ret <8 x bfloat> [[TMP0]]
126+
//
127+
// Calls the tr16 b128 DS transpose-load builtin (v8bf16 flavor) on a `local`
// pointer; the CHECK lines above pin the llvm.amdgcn.ds.load.tr16.b128 call.
v8y test_amdgcn_ds_load_tr16_b128_v8bf16(local v8y* inptr)
128+
{
129+
return __builtin_amdgcn_ds_load_tr16_b128_v8bf16(inptr);
130+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2996,6 +2996,12 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>:
29962996

29972997
def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
29982998
def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
2999+
// Transpose-load intrinsics: two global (global_ptr_ty) forms for tr4/tr6,
// and four DS forms (local_ptr_ty) for tr8/tr16/tr4/tr6. All share the
// AMDGPULoadIntrinsic shape parameterized only by the pointer type.
def int_amdgcn_global_load_tr4_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
3000+
def int_amdgcn_global_load_tr6_b96 : AMDGPULoadIntrinsic<global_ptr_ty>;
3001+
def int_amdgcn_ds_load_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
3002+
def int_amdgcn_ds_load_tr16_b128 : AMDGPULoadIntrinsic<local_ptr_ty>;
3003+
def int_amdgcn_ds_load_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
3004+
def int_amdgcn_ds_load_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>;
29993005
def int_amdgcn_ds_read_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
30003006
def int_amdgcn_ds_read_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>;
30013007
def int_amdgcn_ds_read_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5105,6 +5105,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
51055105
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
51065106
case Intrinsic::amdgcn_global_load_tr_b64:
51075107
case Intrinsic::amdgcn_global_load_tr_b128:
5108+
case Intrinsic::amdgcn_global_load_tr4_b64:
5109+
case Intrinsic::amdgcn_global_load_tr6_b96:
5110+
case Intrinsic::amdgcn_ds_load_tr8_b64:
5111+
case Intrinsic::amdgcn_ds_load_tr16_b128:
5112+
case Intrinsic::amdgcn_ds_load_tr4_b64:
5113+
case Intrinsic::amdgcn_ds_load_tr6_b96:
51085114
case Intrinsic::amdgcn_ds_read_tr4_b64:
51095115
case Intrinsic::amdgcn_ds_read_tr6_b96:
51105116
case Intrinsic::amdgcn_ds_read_tr8_b64:

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,12 @@ def : SourceOfDivergence<intr>;
344344

345345
def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>;
346346
def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>;
347+
// Register the global tr4/tr6 and ds_load tr8/tr16/tr4/tr6 transpose-load
// intrinsics as sources of divergence, alongside the existing
// global_load_tr_b64/b128 entries.
def : SourceOfDivergence<int_amdgcn_global_load_tr4_b64>;
348+
def : SourceOfDivergence<int_amdgcn_global_load_tr6_b96>;
349+
def : SourceOfDivergence<int_amdgcn_ds_load_tr8_b64>;
350+
def : SourceOfDivergence<int_amdgcn_ds_load_tr16_b128>;
351+
def : SourceOfDivergence<int_amdgcn_ds_load_tr4_b64>;
352+
def : SourceOfDivergence<int_amdgcn_ds_load_tr6_b96>;
347353

348354
def : SourceOfDivergence<int_amdgcn_ds_read_tr4_b64>;
349355
def : SourceOfDivergence<int_amdgcn_ds_read_tr6_b96>;

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -789,12 +789,12 @@ let WaveSizePredicate = isWave32, mayStore = 0 in {
789789
let OtherPredicates = [HasTransposeLoadF4F6Insts] in {
790790
defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VReg_64>;
791791
defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VReg_96>;
792-
} // let OtherPredicates = [HasTransposeLoadF4F6Insts]
792+
} // End OtherPredicates = [HasTransposeLoadF4F6Insts]
793793
defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VReg_64>;
794794
defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VReg_128>;
795-
} // let WaveSizePredicate = isWave32, mayStore = 0
795+
} // End WaveSizePredicate = isWave32, mayStore = 0
796796

797-
} // let SubtargetPredicate = isGFX1250Plus
797+
} // End SubtargetPredicate = isGFX1250Plus
798798

799799
let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
800800
defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>;
@@ -1276,6 +1276,17 @@ class DSLoadTrPat <DS_Pseudo inst, ValueType vt, SDPatternOperator node> : GCNPa
12761276
(inst $ptr, Offset:$offset, (i1 0))
12771277
>;
12781278

1279+
// Selection patterns mapping the ds_load_tr* intrinsics onto the DS_LOAD_TR*
// instructions. All patterns are wave32-only and GFX1250+; the tr4/tr6 forms
// are further gated on HasTransposeLoadF4F6Insts. The tr16 b128 pattern is
// instantiated once per supported 8 x 16-bit result type.
let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus in {
1280+
let OtherPredicates = [HasTransposeLoadF4F6Insts] in {
1281+
def : DSLoadTrPat <DS_LOAD_TR4_B64, v2i32, int_amdgcn_ds_load_tr4_b64>;
1282+
def : DSLoadTrPat <DS_LOAD_TR6_B96, v3i32, int_amdgcn_ds_load_tr6_b96>;
1283+
} // End OtherPredicates = [HasTransposeLoadF4F6Insts]
1284+
1285+
def : DSLoadTrPat <DS_LOAD_TR8_B64, v2i32, int_amdgcn_ds_load_tr8_b64>;
1286+
foreach vt = [v8i16, v8f16, v8bf16] in
1287+
def : DSLoadTrPat <DS_LOAD_TR16_B128, vt, int_amdgcn_ds_load_tr16_b128>;
1288+
} // End WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus
1289+
12791290
let SubtargetPredicate = HasGFX950Insts in {
12801291
def : DSLoadTrPat <DS_READ_B64_TR_B4, v2i32, int_amdgcn_ds_read_tr4_b64>;
12811292
def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>;

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1848,6 +1848,11 @@ let WaveSizePredicate = isWave64, OtherPredicates = [isGFX12PlusNot12_50] in {
18481848
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, vt>;
18491849
}
18501850

1851+
// Selection patterns mapping the global tr4/tr6 transpose-load intrinsics onto
// GLOBAL_LOAD_TR4_B64 / GLOBAL_LOAD_TR6_B96. Wave32-only and gated on
// HasTransposeLoadF4F6Insts.
let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts] in {
1852+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR4_B64, int_amdgcn_global_load_tr4_b64, v2i32>;
1853+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>;
1854+
} // End WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts]
1855+
18511856
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
18521857
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
18531858
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1444,6 +1444,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14441444
MachineMemOperand::MOVolatile;
14451445
return true;
14461446
}
1447+
case Intrinsic::amdgcn_ds_load_tr6_b96:
1448+
case Intrinsic::amdgcn_ds_load_tr4_b64:
1449+
case Intrinsic::amdgcn_ds_load_tr8_b64:
1450+
case Intrinsic::amdgcn_ds_load_tr16_b128:
1451+
case Intrinsic::amdgcn_global_load_tr6_b96:
1452+
case Intrinsic::amdgcn_global_load_tr4_b64:
14471453
case Intrinsic::amdgcn_global_load_tr_b64:
14481454
case Intrinsic::amdgcn_global_load_tr_b128:
14491455
case Intrinsic::amdgcn_ds_read_tr4_b64:
@@ -1548,6 +1554,10 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
15481554
case Intrinsic::amdgcn_atomic_cond_sub_u32:
15491555
case Intrinsic::amdgcn_ds_append:
15501556
case Intrinsic::amdgcn_ds_consume:
1557+
case Intrinsic::amdgcn_ds_load_tr8_b64:
1558+
case Intrinsic::amdgcn_ds_load_tr16_b128:
1559+
case Intrinsic::amdgcn_ds_load_tr4_b64:
1560+
case Intrinsic::amdgcn_ds_load_tr6_b96:
15511561
case Intrinsic::amdgcn_ds_read_tr4_b64:
15521562
case Intrinsic::amdgcn_ds_read_tr6_b96:
15531563
case Intrinsic::amdgcn_ds_read_tr8_b64:
@@ -1562,6 +1572,8 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
15621572
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
15631573
case Intrinsic::amdgcn_global_load_tr_b64:
15641574
case Intrinsic::amdgcn_global_load_tr_b128:
1575+
case Intrinsic::amdgcn_global_load_tr4_b64:
1576+
case Intrinsic::amdgcn_global_load_tr6_b96:
15651577
Ptr = II->getArgOperand(0);
15661578
break;
15671579
case Intrinsic::amdgcn_load_to_lds:

0 commit comments

Comments
 (0)