Skip to content

Commit 0a0f22c

Browse files
committed
review feedback: split and add tests, remove 32-bit cvta.*shared::cluster, allow isel to lower shared::cta <-> shared::cluster addrspacecasts
1 parent 59c9c8a commit 0a0f22c

File tree

8 files changed

+398
-235
lines changed

8 files changed

+398
-235
lines changed

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4773,8 +4773,6 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
47734773
NewCall =
47744774
Builder.CreateCall(NewFn, {CI->getArgOperand(0), CI->getArgOperand(1)});
47754775
Value *Res = NewCall;
4776-
Res = Builder.CreateAddrSpaceCast(
4777-
Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC));
47784776
Res = Builder.CreateAddrSpaceCast(
47794777
Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED));
47804778
NewCall->takeName(CI);
@@ -4792,10 +4790,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
47924790
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
47934791
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
47944792
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: {
4795-
4793+
// Create a new call with the correct address space.
47964794
SmallVector<Value *, 4> Args(CI->args());
4797-
Args[0] = Builder.CreateAddrSpaceCast(
4798-
Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC));
47994795
Args[0] = Builder.CreateAddrSpaceCast(
48004796
Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER));
48014797

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -983,8 +983,10 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
983983
Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
984984
break;
985985
case ADDRESS_SPACE_SHARED_CLUSTER:
986-
Opc = TM.is64Bit() ? NVPTX::cvta_shared_cluster_64
987-
: NVPTX::cvta_shared_cluster;
986+
if (!TM.is64Bit())
987+
report_fatal_error(
988+
"Shared cluster address space is only supported in 64-bit mode");
989+
Opc = NVPTX::cvta_shared_cluster_64;
988990
break;
989991
case ADDRESS_SPACE_CONST:
990992
Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
@@ -1009,8 +1011,10 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
10091011
Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
10101012
break;
10111013
case ADDRESS_SPACE_SHARED_CLUSTER:
1012-
Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_cluster_64
1013-
: NVPTX::cvta_to_shared_cluster;
1014+
if (!TM.is64Bit())
1015+
report_fatal_error(
1016+
"Shared cluster address space is only supported in 64-bit mode");
1017+
Opc = NVPTX::cvta_to_shared_cluster_64;
10141018
break;
10151019
case ADDRESS_SPACE_CONST:
10161020
Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3019,8 +3019,42 @@ SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
30193019
unsigned SrcAS = N->getSrcAddressSpace();
30203020
unsigned DestAS = N->getDestAddressSpace();
30213021
if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
3022-
DestAS != llvm::ADDRESS_SPACE_GENERIC)
3022+
DestAS != llvm::ADDRESS_SPACE_GENERIC) {
3023+
// Shared and SharedCluster can be converted to each other through generic
3024+
// space
3025+
if (SrcAS == llvm::ADDRESS_SPACE_SHARED &&
3026+
DestAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER) {
3027+
const MVT GenerictVT =
3028+
getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_GENERIC);
3029+
const MVT SharedClusterVT =
3030+
getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_SHARED_CLUSTER);
3031+
SDValue GenericConversion =
3032+
DAG.getAddrSpaceCast(SDLoc(), GenerictVT, Op.getOperand(0),
3033+
ADDRESS_SPACE_SHARED, ADDRESS_SPACE_GENERIC);
3034+
SDValue SharedClusterConversion = DAG.getAddrSpaceCast(
3035+
SDLoc(), SharedClusterVT, GenericConversion, ADDRESS_SPACE_GENERIC,
3036+
ADDRESS_SPACE_SHARED_CLUSTER);
3037+
return SharedClusterConversion;
3038+
}
3039+
3040+
if (SrcAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER &&
3041+
DestAS == llvm::ADDRESS_SPACE_SHARED) {
3042+
const MVT GenerictVT =
3043+
getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_GENERIC);
3044+
const MVT SharedVT =
3045+
getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_SHARED);
3046+
SDValue GenericConversion = DAG.getAddrSpaceCast(
3047+
SDLoc(), GenerictVT, Op.getOperand(0), ADDRESS_SPACE_SHARED_CLUSTER,
3048+
ADDRESS_SPACE_GENERIC);
3049+
SDValue SharedConversion =
3050+
DAG.getAddrSpaceCast(SDLoc(), SharedVT, GenericConversion,
3051+
ADDRESS_SPACE_GENERIC, ADDRESS_SPACE_SHARED);
3052+
return SharedConversion;
3053+
}
3054+
30233055
return DAG.getUNDEF(Op.getValueType());
3056+
}
3057+
30243058
return Op;
30253059
}
30263060

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2098,15 +2098,15 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
20982098
defvar frag_pat = (frag node:$a, node:$b);
20992099
defm _G : F_ATOMIC_2<t, "", ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
21002100
defm _S : F_ATOMIC_2<t, "", ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
2101-
defm _S_C : F_ATOMIC_2<t, "", ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasSM<80>], preds)>;
2101+
defm _S_C : F_ATOMIC_2<t, "", ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
21022102
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
21032103
}
21042104

21052105
multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> {
21062106
defvar frag_pat = (frag node:$a, node:$b, node:$c);
21072107
defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
21082108
defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
2109-
defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasSM<80>], preds)>;
2109+
defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
21102110
defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
21112111
}
21122112

@@ -2387,29 +2387,39 @@ def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>;
23872387
def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>;
23882388

23892389

2390-
multiclass NG_TO_G<string Str> {
2390+
multiclass NG_TO_G<string Str, list<Predicate> Preds = []> {
23912391
def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
2392-
"cvta." # Str # ".u32 \t$result, $src;", []>;
2392+
"cvta." # Str # ".u32 \t$result, $src;", []>, Requires<Preds>;
23932393
def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
2394-
"cvta." # Str # ".u64 \t$result, $src;", []>;
2394+
"cvta." # Str # ".u64 \t$result, $src;", []>, Requires<Preds>;
23952395
}
23962396

2397-
multiclass G_TO_NG<string Str> {
2397+
multiclass NG_TO_G_64<string Str, list<Predicate> Preds = []> {
2398+
def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
2399+
"cvta." # Str # ".u64 \t$result, $src;", []>, Requires<Preds>;
2400+
}
2401+
2402+
multiclass G_TO_NG<string Str, list<Predicate> Preds = []> {
23982403
def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
2399-
"cvta.to." # Str # ".u32 \t$result, $src;", []>;
2404+
"cvta.to." # Str # ".u32 \t$result, $src;", []>, Requires<Preds>;
2405+
def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
2406+
"cvta.to." # Str # ".u64 \t$result, $src;", []>, Requires<Preds>;
2407+
}
2408+
2409+
multiclass G_TO_NG_64<string Str, list<Predicate> Preds = []> {
24002410
def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
2401-
"cvta.to." # Str # ".u64 \t$result, $src;", []>;
2411+
"cvta.to." # Str # ".u64 \t$result, $src;", []>, Requires<Preds>;
24022412
}
24032413

24042414
defm cvta_local : NG_TO_G<"local">;
24052415
defm cvta_shared : NG_TO_G<"shared">;
2406-
defm cvta_shared_cluster : NG_TO_G<"shared::cluster">;
2416+
defm cvta_shared_cluster : NG_TO_G_64<"shared::cluster", [hasClusters]>;
24072417
defm cvta_global : NG_TO_G<"global">;
24082418
defm cvta_const : NG_TO_G<"const">;
24092419

24102420
defm cvta_to_local : G_TO_NG<"local">;
24112421
defm cvta_to_shared : G_TO_NG<"shared">;
2412-
defm cvta_to_shared_cluster : G_TO_NG<"shared::cluster">;
2422+
defm cvta_to_shared_cluster : G_TO_NG_64<"shared::cluster", [hasClusters]>;
24132423
defm cvta_to_global : G_TO_NG<"global">;
24142424
defm cvta_to_const : G_TO_NG<"const">;
24152425

llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -278,18 +278,18 @@ define void @nvvm_shared_cluster_intrinsics(ptr addrspace(3) %p0, i32 %offset) {
278278

279279
; CHECK-LABEL: @nvvm_cp_async_bulk_intrinsics
280280
define void @nvvm_cp_async_bulk_intrinsics(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, ptr addrspace(3) %src_shared, i32 %size) {
281-
; CHECK: call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 false, i1 false)
282-
; CHECK: call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr addrspace(3) %src_shared, i32 %size)
281+
; CHECK: call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 false, i1 false)
282+
; CHECK: call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr addrspace(3) %src_shared, i32 %size)
283283
call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 false, i1 false)
284284
call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src_shared, i32 %size)
285285
ret void
286286
}
287287

288288
; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_im2col
289289
define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) {
290-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 undef, i64 undef, i1 false, i1 false)
291-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 undef, i64 undef, i1 false, i1 false)
292-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %6, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 undef, i64 undef, i1 false, i1 false)
290+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 undef, i64 undef, i1 false, i1 false)
291+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 undef, i64 undef, i1 false, i1 false)
292+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 undef, i64 undef, i1 false, i1 false)
293293
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 undef, i64 undef, i1 0, i1 0)
294294
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 undef, i64 undef, i1 0, i1 0)
295295
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 undef, i64 undef, i1 0, i1 0)
@@ -298,11 +298,11 @@ define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrs
298298

299299
; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_tile
300300
define void @nvvm_cp_async_bulk_tensor_g2s_tile(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
301-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 undef, i64 undef, i1 false, i1 false)
302-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 undef, i64 undef, i1 false, i1 false)
303-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %6, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 undef, i64 undef, i1 false, i1 false)
304-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %8, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 undef, i64 undef, i1 false, i1 false)
305-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %10, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 undef, i64 undef, i1 false, i1 false)
301+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 undef, i64 undef, i1 false, i1 false)
302+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 undef, i64 undef, i1 false, i1 false)
303+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 undef, i64 undef, i1 false, i1 false)
304+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 undef, i64 undef, i1 false, i1 false)
305+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %5, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 undef, i64 undef, i1 false, i1 false)
306306
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 undef, i64 undef, i1 0, i1 0)
307307
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 undef, i64 undef, i1 0, i1 0)
308308
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 undef, i64 undef, i1 0, i1 0)
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64
2+
; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_80 --nvptx-short-ptr | FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64
3+
; RUN: %if ptxas-12.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify %}
4+
; RUN: %if ptxas-12.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_80 --nvptx-short-ptr | %ptxas-verify %}
5+
6+
; ALL-LABEL: conv_shared_cluster_to_generic
7+
define i32 @conv_shared_cluster_to_generic(ptr addrspace(7) %ptr) {
8+
; CLS32: cvta.shared::cluster.u32
9+
; PTRCONV: cvt.u64.u32
10+
; NOPTRCONV-NOT: cvt.u64.u32
11+
; CLS64: cvta.shared::cluster.u64
12+
; ALL: ld.u32
13+
%genptr = addrspacecast ptr addrspace(7) %ptr to ptr
14+
%val = load i32, ptr %genptr
15+
ret i32 %val
16+
}
17+
18+
; ALL-LABEL: conv_generic_to_shared_cluster
19+
define i32 @conv_generic_to_shared_cluster(ptr %ptr) {
20+
; CLS32: cvta.to.shared::cluster.u32
21+
; CLS64: cvta.to.shared::cluster.u64
22+
; PTRCONV: cvt.u32.u64
23+
; NOPTRCONV-NOT: cvt.u32.u64
24+
; ALL: ld.shared::cluster.u32
25+
%specptr = addrspacecast ptr %ptr to ptr addrspace(7)
26+
%val = load i32, ptr addrspace(7) %specptr
27+
ret i32 %val
28+
}
29+
30+
; ALL-LABEL: conv_shared_to_shared_cluster
31+
define i32 @conv_shared_to_shared_cluster(ptr addrspace(3) %ptr) {
32+
; CLS64: cvta.shared.u64
33+
; CLS64: cvta.to.shared::cluster.u64
34+
; ALL: ld.shared::cluster.u32
35+
%specptr = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(7)
36+
%val = load i32, ptr addrspace(7) %specptr
37+
ret i32 %val
38+
}
39+
40+
; ALL-LABEL: conv_shared_cluster_to_shared
41+
define i32 @conv_shared_cluster_to_shared(ptr addrspace(7) %ptr) {
42+
; CLS64: cvta.shared::cluster.u64
43+
; CLS64: cvta.to.shared.u64
44+
; ALL: ld.shared.u32
45+
%specptr = addrspacecast ptr addrspace(7) %ptr to ptr addrspace(3)
46+
%val = load i32, ptr addrspace(3) %specptr
47+
ret i32 %val
48+
}

0 commit comments

Comments
 (0)