Skip to content

Commit dd96135

Browse files
committed
[NVPTX] Add intrinsics for st.bulk instruction
Adds NVVM intrinsics and NVPTX codegen for the `st.bulk` instruction, which was introduced in PTX ISA 8.6 for sm_100. Tests are added in `llvm/test/CodeGen/NVPTX/st_bulk.ll` and were verified with ptxas 12.8.0. PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-bulk
1 parent e3f5269 commit dd96135

File tree

3 files changed

+61
-0
lines changed

3 files changed

+61
-0
lines changed

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5261,4 +5261,18 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
52615261
}
52625262
}
52635263

5264+
//
5265+
// Bulk store intrinsics
5266+
//
5267+
5268+
// llvm.nvvm.st.bulk: bulk store with no state-space suffix in its name.
// Returns nothing and takes three operands:
//   arg 0 - destination pointer, declared here as a global-address-space
//           pointer; marked write-only and not captured.
//   arg 1 - i64 size operand (printed as the second operand of st.bulk).
//   arg 2 - i64 operand that must be an immediate (ImmArg); printed as the
//           third operand of st.bulk. Presumably the initial fill value
//           described by the PTX spec - TODO confirm.
// The intrinsic only writes memory, and only through its pointer argument
// (IntrArgMemOnly + IntrWriteMem).
// NOTE(review): the matching instruction in NVPTXIntrinsics.td is named
// INT_NVVM_ST_BULK_GENERIC and emits "st.bulk" without a state-space
// qualifier, yet arg 0 is a global pointer here. Confirm against the PTX
// spec whether this should be a generic pointer instead.
def int_nvvm_st_bulk: Intrinsic<[],
5269+
[llvm_global_ptr_ty, llvm_i64_ty, llvm_i64_ty],
5270+
[IntrArgMemOnly, IntrWriteMem,
5271+
WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
5272+
5273+
// llvm.nvvm.st.bulk.shared.cta: bulk store whose destination is a
// shared-address-space pointer. Returns nothing and takes three operands:
//   arg 0 - destination pointer (shared address space); marked write-only
//           and not captured.
//   arg 1 - i64 size operand (printed as the second operand of st.bulk).
//   arg 2 - i64 operand that must be an immediate (ImmArg); printed as the
//           third operand of st.bulk. Presumably the initial fill value
//           described by the PTX spec - TODO confirm.
// The intrinsic only writes memory, and only through its pointer argument
// (IntrArgMemOnly + IntrWriteMem).
def int_nvvm_st_bulk_shared_cta : Intrinsic<[],
5274+
[llvm_shared_ptr_ty, llvm_i64_ty, llvm_i64_ty],
5275+
[IntrArgMemOnly, IntrWriteMem,
5276+
WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
5277+
52645278
} // let TargetPrefix = "nvvm"

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7816,3 +7816,17 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
78167816
}
78177817

78187818
} // isConvergent
7819+
7820+
// Bulk store instructions
7821+
7822+
// Selects int_nvvm_st_bulk and prints it as an unqualified "st.bulk".
// Both the destination address and the size are taken from 64-bit
// registers; the third PTX operand is the literal 0 baked into the asm
// string. Gated on hasSM<100> and hasPTX<86>.
// NOTE(review): the pattern only matches when the intrinsic's immediate
// operand is exactly (i64 0). Any other immediate presumably fails
// instruction selection - confirm this is intended and/or rejected earlier.
// NOTE(review): the address is matched as i64, so this assumes 64-bit
// pointers - confirm behavior for 32-bit NVPTX targets.
def INT_NVVM_ST_BULK_GENERIC :
7823+
NVPTXInst<(outs), (ins Int64Regs:$dest_addr, Int64Regs:$size),
7824+
"st.bulk [$dest_addr], $size, 0;",
7825+
[(int_nvvm_st_bulk i64:$dest_addr, i64:$size, (i64 0))]>,
7826+
Requires<[hasSM<100>, hasPTX<86>]>;
7827+
7828+
// Selects int_nvvm_st_bulk_shared_cta and prints it as
// "st.bulk.shared::cta". Both the destination address and the size are
// taken from 64-bit registers; the third PTX operand is the literal 0
// baked into the asm string. Gated on hasSM<100> and hasPTX<86>.
// NOTE(review): the pattern only matches when the intrinsic's immediate
// operand is exactly (i64 0). Any other immediate presumably fails
// instruction selection - confirm this is intended and/or rejected earlier.
// NOTE(review): the shared-space pointer is matched as i64 in Int64Regs;
// confirm this still selects when shared pointers are 32 bits wide.
def INT_NVVM_ST_BULK_SHARED_CTA:
7829+
NVPTXInst<(outs), (ins Int64Regs:$dest_addr, Int64Regs:$size),
7830+
"st.bulk.shared::cta [$dest_addr], $size, 0;",
7831+
[(int_nvvm_st_bulk_shared_cta i64:$dest_addr, i64:$size, (i64 0))]>,
7832+
Requires<[hasSM<100>, hasPTX<86>]>;

llvm/test/CodeGen/NVPTX/st_bulk.ll

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | FileCheck %s
3+
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %}
4+
5+
; Verifies that llvm.nvvm.st.bulk, called with an addrspace(1) destination,
; a register size and an immediate 0, lowers to an unqualified st.bulk
; whose address and size come from the two loaded parameters.
declare void @llvm.nvvm.st.bulk(ptr addrspace(1), i64, i64)
6+
define void @st_bulk(ptr addrspace(1) %dest_addr, i64 %size) {
7+
; CHECK-LABEL: st_bulk(
8+
; CHECK: {
9+
; CHECK-NEXT: .reg .b64 %rd<3>;
10+
; CHECK-EMPTY:
11+
; CHECK-NEXT: // %bb.0:
12+
; CHECK-NEXT: ld.param.u64 %rd1, [st_bulk_param_0];
13+
; CHECK-NEXT: ld.param.u64 %rd2, [st_bulk_param_1];
14+
; CHECK-NEXT: st.bulk [%rd1], %rd2, 0;
15+
; CHECK-NEXT: ret;
16+
call void @llvm.nvvm.st.bulk(ptr addrspace(1) %dest_addr, i64 %size, i64 0)
17+
ret void
18+
}
19+
20+
; Verifies that llvm.nvvm.st.bulk.shared.cta, called with an addrspace(3)
; destination, a register size and an immediate 0, lowers to
; st.bulk.shared::cta whose address and size come from the two loaded
; parameters.
declare void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3), i64, i64)
21+
define void @st_bulk_shared_cta(ptr addrspace(3) %dest_addr, i64 %size) {
22+
; CHECK-LABEL: st_bulk_shared_cta(
23+
; CHECK: {
24+
; CHECK-NEXT: .reg .b64 %rd<3>;
25+
; CHECK-EMPTY:
26+
; CHECK-NEXT: // %bb.0:
27+
; CHECK-NEXT: ld.param.u64 %rd1, [st_bulk_shared_cta_param_0];
28+
; CHECK-NEXT: ld.param.u64 %rd2, [st_bulk_shared_cta_param_1];
29+
; CHECK-NEXT: st.bulk.shared::cta [%rd1], %rd2, 0;
30+
; CHECK-NEXT: ret;
31+
call void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3) %dest_addr, i64 %size, i64 0)
32+
ret void
33+
}

0 commit comments

Comments
 (0)