Skip to content

Commit b4696d4

Browse files
committed
[NVPTX] Add intrinsics for st.bulk instruction
Adds NVVM intrinsics and NVPTX codegen for the `st.bulk` instruction introduced in ptx8.6 for sm_100. Tests added in `CodeGen/NVPTX/st_bulk.ll` and verified through ptxas 12.8.0. PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-bulk
1 parent 22f5268 commit b4696d4

File tree

3 files changed

+55
-0
lines changed

3 files changed

+55
-0
lines changed

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5186,4 +5186,12 @@ foreach cta_group = ["cg1", "cg2"] in {
51865186
}
51875187
}
51885188

5189+
//
5190+
// Bulk store intrinsics
5191+
//
5192+
5193+
def int_nvvm_st_bulk: Intrinsic<[], [llvm_global_ptr_ty, llvm_i64_ty, llvm_i64_ty], [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
5194+
5195+
def int_nvvm_st_bulk_shared_cta : Intrinsic<[], [llvm_shared_ptr_ty, llvm_i64_ty, llvm_i64_ty], [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
5196+
51895197
} // let TargetPrefix = "nvvm"

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7731,3 +7731,17 @@ def tcgen05_fence_after_thread_sync: NVPTXInst<(outs), (ins),
77317731
Requires<[hasTcgen05Instructions]>;
77327732

77337733
} // hasSideEffects
7734+
7735+
// Bulk store instructions
7736+
7737+
def INT_NVVM_ST_BULK_GENERIC :
7738+
NVPTXInst<(outs), (ins Int64Regs:$dest_addr, Int64Regs:$size),
7739+
"st.bulk [$dest_addr], $size, 0;",
7740+
[(int_nvvm_st_bulk i64:$dest_addr, i64:$size, (i64 0))]>,
7741+
Requires<[hasSM<100>, hasPTX<86>]>;
7742+
7743+
def INT_NVVM_ST_BULK_SHARED_CTA:
7744+
NVPTXInst<(outs), (ins Int64Regs:$dest_addr, Int64Regs:$size),
7745+
"st.bulk.shared::cta [$dest_addr], $size, 0;",
7746+
[(int_nvvm_st_bulk_shared_cta i64:$dest_addr, i64:$size, (i64 0))]>,
7747+
Requires<[hasSM<100>, hasPTX<86>]>;

llvm/test/CodeGen/NVPTX/st_bulk.ll

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | FileCheck %s
3+
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %}
4+
5+
declare void @llvm.nvvm.st.bulk(ptr addrspace(1), i64, i64)
6+
define void @st_bulk(ptr addrspace(1) %dest_addr, i64 %size) {
7+
; CHECK-LABEL: st_bulk(
8+
; CHECK: {
9+
; CHECK-NEXT: .reg .b64 %rd<3>;
10+
; CHECK-EMPTY:
11+
; CHECK-NEXT: // %bb.0:
12+
; CHECK-NEXT: ld.param.u64 %rd1, [st_bulk_param_0];
13+
; CHECK-NEXT: ld.param.u64 %rd2, [st_bulk_param_1];
14+
; CHECK-NEXT: st.bulk [%rd1], %rd2, 0;
15+
; CHECK-NEXT: ret;
16+
call void @llvm.nvvm.st.bulk(ptr addrspace(1) %dest_addr, i64 %size, i64 0)
17+
ret void
18+
}
19+
20+
declare void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3), i64, i64)
21+
define void @st_bulk_shared_cta(ptr addrspace(3) %dest_addr, i64 %size) {
22+
; CHECK-LABEL: st_bulk_shared_cta(
23+
; CHECK: {
24+
; CHECK-NEXT: .reg .b64 %rd<3>;
25+
; CHECK-EMPTY:
26+
; CHECK-NEXT: // %bb.0:
27+
; CHECK-NEXT: ld.param.u64 %rd1, [st_bulk_shared_cta_param_0];
28+
; CHECK-NEXT: ld.param.u64 %rd2, [st_bulk_shared_cta_param_1];
29+
; CHECK-NEXT: st.bulk.shared::cta [%rd1], %rd2, 0;
30+
; CHECK-NEXT: ret;
31+
call void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3) %dest_addr, i64 %size, i64 0)
32+
ret void
33+
}

0 commit comments

Comments
 (0)