-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[LLVM][AMDGPU] Add Intrinsic and Builtin for ds_bpermute_fi_b32 #124616
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-mc @llvm/pr-subscribers-clang Author: Acim Maravic (Acim-Maravic) ChangesFull diff: https://github.com/llvm/llvm-project/pull/124616.diff 6 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 1b29a8e359c205..39e295aced96b2 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -504,6 +504,8 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gf
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_ds_bpermute_fi_b32, "iii", "nc", "gfx12-insts")
+
//===----------------------------------------------------------------------===//
// WMMA builtins.
// Postfix w32 indicates the builtin requires wavefront size of 32.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index 5b5ae419f0a4a9..234ad4fd8cde61 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -296,3 +296,26 @@ void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int len)
__builtin_amdgcn_s_buffer_prefetch_data(rsrc, 128, len);
__builtin_amdgcn_s_buffer_prefetch_data(rsrc, 0, 31);
}
+
+// CHECK-LABEL: @test_ds_bpermute_fi_b32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32 [[TMP0]], i32 [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+// CHECK-NEXT: ret void
+//
+void test_ds_bpermute_fi_b32(global int* out, int a, int b)
+{
+ *out = __builtin_amdgcn_ds_bpermute_fi_b32(a, b);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index cc3584833202bf..f721d5267cd2a0 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2923,6 +2923,12 @@ def int_amdgcn_s_prefetch_data :
"", [SDNPMemOperand]
>;
+// llvm.amdgcn.ds.bpermute.fi.b32 <index> <src>
+def int_amdgcn_ds_bpermute_fi_b32 :
+ ClangBuiltin<"__builtin_amdgcn_ds_bpermute_fi_b32">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 224c368cff4a1f..2e5f42c3bdc405 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4675,6 +4675,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_ds_bpermute_fi_b32:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_cvt_pkrtz:
if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index bc217e10e0fbd7..bc48f3a5268fbf 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -699,7 +699,8 @@ def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32",
int_amdgcn_ds_permute>;
def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
int_amdgcn_ds_bpermute>;
-def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32">;
+def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32",
+ int_amdgcn_ds_bpermute_fi_b32>;
}
} // let SubtargetPredicate = isGFX8Plus
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.fi.b32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.fi.b32.ll
new file mode 100644
index 00000000000000..ea85055b65d127
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.fi.b32.ll
@@ -0,0 +1,154 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+
+declare i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32, i32) #0
+
+define amdgpu_kernel void @ds_bpermute_fi_b32(ptr addrspace(1) %out, i32 %index, i32 %src) nounwind {
+; GFX12-LABEL: ds_bpermute_fi_b32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: ds_bpermute_fi_b32 v0, v0, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32 %index, i32 %src) #0
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @ds_bpermute_fi_b32_imm_offset(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; GFX12-SDAG-LABEL: ds_bpermute_fi_b32_imm_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT: ds_bpermute_fi_b32 v0, v0, v1 offset:4
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: ds_bpermute_fi_b32_imm_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_add_co_i32 s2, s2, 4
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: ds_bpermute_fi_b32 v0, v0, v1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_endpgm
+ %index = add i32 %base_index, 4
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32 %index, i32 %src) #0
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @ds_bpermute_fi_b32_imm_index(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; GFX12-SDAG-LABEL: ds_bpermute_fi_b32_imm_index:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x30
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: ds_bpermute_fi_b32 v1, v0, v1 offset:64
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: ds_bpermute_fi_b32_imm_index:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x30
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 64 :: v_dual_mov_b32 v1, s0
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT: ds_bpermute_fi_b32 v0, v0, v1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_endpgm
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32 64, i32 %src) #0
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @ds_bpermute_fi_b32_add_shl(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; GFX12-SDAG-LABEL: ds_bpermute_fi_b32_add_shl:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v2
+; GFX12-SDAG-NEXT: ds_bpermute_fi_b32 v2, v2, v3 offset:4
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: ds_bpermute_fi_b32_add_shl:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_lshl_add_u32 v2, v2, 2, 4
+; GFX12-GISEL-NEXT: ds_bpermute_fi_b32 v2, v2, v3
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %index = add i32 %base_index, 1
+ %byte_index = shl i32 %index, 2
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32 %byte_index, i32 %src) #0
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @ds_bpermute_fi_b32_or_shl(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; GFX12-SDAG-LABEL: ds_bpermute_fi_b32_or_shl:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_and_b32_e32 v2, 62, v2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v2
+; GFX12-SDAG-NEXT: ds_bpermute_fi_b32 v2, v2, v3 offset:4
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: ds_bpermute_fi_b32_or_shl:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_and_b32_e32 v2, 62, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_lshl_or_b32 v2, v2, 2, 4
+; GFX12-GISEL-NEXT: ds_bpermute_fi_b32 v2, v2, v3
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %masked = and i32 %base_index, 62
+ %index = or i32 %masked, 1
+ %byte_index = shl i32 %index, 2
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32 %byte_index, i32 %src) #0
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
|
@llvm/pr-subscribers-llvm-ir Author: Acim Maravic (Acim-Maravic) ChangesFull diff: https://github.com/llvm/llvm-project/pull/124616.diff 6 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 1b29a8e359c205..39e295aced96b2 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -504,6 +504,8 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gf
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_ds_bpermute_fi_b32, "iii", "nc", "gfx12-insts")
+
//===----------------------------------------------------------------------===//
// WMMA builtins.
// Postfix w32 indicates the builtin requires wavefront size of 32.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index 5b5ae419f0a4a9..234ad4fd8cde61 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -296,3 +296,26 @@ void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int len)
__builtin_amdgcn_s_buffer_prefetch_data(rsrc, 128, len);
__builtin_amdgcn_s_buffer_prefetch_data(rsrc, 0, 31);
}
+
+// CHECK-LABEL: @test_ds_bpermute_fi_b32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32 [[TMP0]], i32 [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+// CHECK-NEXT: ret void
+//
+void test_ds_bpermute_fi_b32(global int* out, int a, int b)
+{
+ *out = __builtin_amdgcn_ds_bpermute_fi_b32(a, b);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index cc3584833202bf..f721d5267cd2a0 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2923,6 +2923,12 @@ def int_amdgcn_s_prefetch_data :
"", [SDNPMemOperand]
>;
+// llvm.amdgcn.ds.bpermute.fi.b32 <index> <src>
+def int_amdgcn_ds_bpermute_fi_b32 :
+ ClangBuiltin<"__builtin_amdgcn_ds_bpermute_fi_b32">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 224c368cff4a1f..2e5f42c3bdc405 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4675,6 +4675,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_ds_bpermute_fi_b32:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_cvt_pkrtz:
if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index bc217e10e0fbd7..bc48f3a5268fbf 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -699,7 +699,8 @@ def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32",
int_amdgcn_ds_permute>;
def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
int_amdgcn_ds_bpermute>;
-def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32">;
+def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32",
+ int_amdgcn_ds_bpermute_fi_b32>;
}
} // let SubtargetPredicate = isGFX8Plus
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.fi.b32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.fi.b32.ll
new file mode 100644
index 00000000000000..ea85055b65d127
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.fi.b32.ll
@@ -0,0 +1,154 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+
+declare i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32, i32) #0
+
+define amdgpu_kernel void @ds_bpermute_fi_b32(ptr addrspace(1) %out, i32 %index, i32 %src) nounwind {
+; GFX12-LABEL: ds_bpermute_fi_b32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: ds_bpermute_fi_b32 v0, v0, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32 %index, i32 %src) #0
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @ds_bpermute_fi_b32_imm_offset(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; GFX12-SDAG-LABEL: ds_bpermute_fi_b32_imm_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT: ds_bpermute_fi_b32 v0, v0, v1 offset:4
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: ds_bpermute_fi_b32_imm_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_add_co_i32 s2, s2, 4
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: ds_bpermute_fi_b32 v0, v0, v1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_endpgm
+ %index = add i32 %base_index, 4
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32 %index, i32 %src) #0
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @ds_bpermute_fi_b32_imm_index(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; GFX12-SDAG-LABEL: ds_bpermute_fi_b32_imm_index:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x30
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: ds_bpermute_fi_b32 v1, v0, v1 offset:64
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: ds_bpermute_fi_b32_imm_index:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x30
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 64 :: v_dual_mov_b32 v1, s0
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT: ds_bpermute_fi_b32 v0, v0, v1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_endpgm
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32 64, i32 %src) #0
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @ds_bpermute_fi_b32_add_shl(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; GFX12-SDAG-LABEL: ds_bpermute_fi_b32_add_shl:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v2
+; GFX12-SDAG-NEXT: ds_bpermute_fi_b32 v2, v2, v3 offset:4
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: ds_bpermute_fi_b32_add_shl:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_lshl_add_u32 v2, v2, 2, 4
+; GFX12-GISEL-NEXT: ds_bpermute_fi_b32 v2, v2, v3
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %index = add i32 %base_index, 1
+ %byte_index = shl i32 %index, 2
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32 %byte_index, i32 %src) #0
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @ds_bpermute_fi_b32_or_shl(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; GFX12-SDAG-LABEL: ds_bpermute_fi_b32_or_shl:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_and_b32_e32 v2, 62, v2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v2
+; GFX12-SDAG-NEXT: ds_bpermute_fi_b32 v2, v2, v3 offset:4
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: ds_bpermute_fi_b32_or_shl:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_and_b32_e32 v2, 62, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_lshl_or_b32 v2, v2, 2, 4
+; GFX12-GISEL-NEXT: ds_bpermute_fi_b32 v2, v2, v3
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %masked = and i32 %base_index, 62
+ %index = or i32 %masked, 1
+ %byte_index = shl i32 %index, 2
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32 %byte_index, i32 %src) #0
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
negative tests
@@ -504,6 +504,8 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gf | |||
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") | |||
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64") | |||
|
|||
TARGET_BUILTIN(__builtin_amdgcn_ds_bpermute_fi_b32, "iii", "nc", "gfx12-insts") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing the target rejection tests
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added.
@@ -699,7 +699,8 @@ def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32", | |||
int_amdgcn_ds_permute>; | |||
def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32", | |||
int_amdgcn_ds_bpermute>; | |||
def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32">; | |||
def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Predicate on this should be isGFX12Plus not isGFX8Plus, and you should check that trying to use llvm.amdgcn.ds.bpermute.fi.b32 pre-GFX12 fails cleanly with a "cannot select" error.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I overlooked that... Updated.
dea0bd7
to
c9c461b
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/12659 Here is the relevant piece of the build log for the reference
|
No description provided.