Skip to content

Commit 50509cd

Browse files
arsenm authored and pravinjagtap committed
AMDGPU: Add v_permlane16_swap_b32 and v_permlane32_swap_b32 for gfx950 (llvm#117260)
This was a bit annoying because these introduce a new special case encoding usage. op_sel is repurposed as a subset of dpp controls, and is eligible for VOP3->VOP1 shrinking. For some reason fi also uses an enum value, so we need to convert the raw boolean to 1 instead of -1. The 2 registers are swapped, so this has 2 defs. Ideally the builtin would return a pair, but that's difficult so return a vector instead. This would make a hypothetical builtin that supports v2f16 directly uglier.
1 parent 2feecef commit 50509cd

28 files changed

+736
-5
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,9 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_bf8_fp8, "V16fV4iV8iV16fiIiI
459459
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_fp8_bf8, "V16fV4iV8iV16fiIiIi", "nc", "gfx950-insts")
460460
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8, "V16fV4iV8iV16fiIiIi", "nc", "gfx950-insts")
461461

462+
TARGET_BUILTIN(__builtin_amdgcn_permlane16_swap, "V2UiUiUiIbIb", "nc", "permlane16-swap")
463+
TARGET_BUILTIN(__builtin_amdgcn_permlane32_swap, "V2UiUiUiIbIb", "nc", "permlane32-swap")
464+
462465
//===----------------------------------------------------------------------===//
463466
// GFX12+ only builtins.
464467
//===----------------------------------------------------------------------===//

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19288,6 +19288,32 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1928819288
CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
1928919289
return Builder.CreateCall(F, {Arg});
1929019290
}
19291+
case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
19292+
case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
19293+
// Because builtin types are limited, and the intrinsic uses a struct/pair
19294+
// output, marshal the pair-of-i32 to <2 x i32>.
19295+
Value *VDstOld = EmitScalarExpr(E->getArg(0));
19296+
Value *VSrcOld = EmitScalarExpr(E->getArg(1));
19297+
Value *FI = EmitScalarExpr(E->getArg(2));
19298+
Value *BoundCtrl = EmitScalarExpr(E->getArg(3));
19299+
Function *F =
19300+
CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
19301+
? Intrinsic::amdgcn_permlane16_swap
19302+
: Intrinsic::amdgcn_permlane32_swap);
19303+
llvm::CallInst *Call =
19304+
Builder.CreateCall(F, {VDstOld, VSrcOld, FI, BoundCtrl});
19305+
19306+
llvm::Value *Elt0 = Builder.CreateExtractValue(Call, 0);
19307+
llvm::Value *Elt1 = Builder.CreateExtractValue(Call, 1);
19308+
19309+
llvm::Type *ResultType = ConvertType(E->getType());
19310+
19311+
llvm::Value *Insert0 = Builder.CreateInsertElement(
19312+
llvm::PoisonValue::get(ResultType), Elt0, UINT64_C(0));
19313+
llvm::Value *AsVector =
19314+
Builder.CreateInsertElement(Insert0, Elt1, UINT64_C(1));
19315+
return AsVector;
19316+
}
1929119317
case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc:
1929219318
return emitBuiltinWithOneOverloadedType<4>(
1929319319
*this, E, Intrinsic::amdgcn_make_buffer_rsrc);

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@
8989
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
9090
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
9191
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
92-
// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
92+
// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
9393
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
9494
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
9595
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@
1111
// REQUIRES: amdgpu-registered-target
1212

1313
typedef unsigned int uint;
14-
void test_prng_b32(global uint* out, uint a) {
14+
typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
15+
16+
void test(global uint* out, global uint2* out_v2u32, uint a, uint b) {
1517
*out = __builtin_amdgcn_prng_b32(a); // expected-error{{'__builtin_amdgcn_prng_b32' needs target feature prng-inst}}
18+
*out_v2u32 = __builtin_amdgcn_permlane16_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane16_swap' needs target feature permlane16-swap}}
19+
*out_v2u32 = __builtin_amdgcn_permlane32_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane32_swap' needs target feature permlane32-swap}}
1620
}

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// REQUIRES: amdgpu-registered-target
44

55
typedef unsigned int uint;
6+
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
67

78
// CHECK-LABEL: @test_prng_b32(
89
// CHECK-NEXT: entry:
@@ -19,3 +20,89 @@ typedef unsigned int uint;
1920
void test_prng_b32(global uint* out, uint a) {
2021
*out = __builtin_amdgcn_prng_b32(a);
2122
}
23+
24+
// CHECK-LABEL: @test_permlane16_swap(
25+
// CHECK-NEXT: entry:
26+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
27+
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
28+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
29+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
30+
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
31+
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
32+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
33+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
34+
// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
35+
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
36+
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
37+
// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
38+
// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
39+
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
40+
// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
41+
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
42+
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
43+
// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
44+
// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
45+
// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
46+
// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
47+
// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
48+
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
49+
// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
50+
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
51+
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
52+
// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
53+
// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
54+
// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
55+
// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
56+
// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
57+
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
58+
// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
59+
// CHECK-NEXT: ret void
60+
//
61+
void test_permlane16_swap(global uint2* out, uint old, uint src) {
62+
*out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
63+
*out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
64+
*out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
65+
}
66+
67+
// CHECK-LABEL: @test_permlane32_swap(
68+
// CHECK-NEXT: entry:
69+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
70+
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
71+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
72+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
73+
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
74+
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
75+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
76+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
77+
// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
78+
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
79+
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
80+
// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
81+
// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
82+
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
83+
// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
84+
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
85+
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
86+
// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
87+
// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
88+
// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
89+
// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
90+
// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
91+
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
92+
// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
93+
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
94+
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
95+
// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
96+
// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
97+
// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
98+
// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
99+
// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
100+
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
101+
// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
102+
// CHECK-NEXT: ret void
103+
//
104+
void test_permlane32_swap(global uint2* out, uint old, uint src) {
105+
*out = __builtin_amdgcn_permlane32_swap(old, src, false, false);
106+
*out = __builtin_amdgcn_permlane32_swap(old, src, true, false);
107+
*out = __builtin_amdgcn_permlane32_swap(old, src, false, true);
108+
}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,3 +148,13 @@ void test_smfmac_f32_32x32x64_fp8_fp8(global float16* out, int4 a, int8 b, float
148148
*out = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8' must be a constant integer}}
149149
*out = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8' must be a constant integer}}
150150
}
151+
152+
void test_permlane16_swap(__global int* out, int old, int src, bool X) {
153+
*out = __builtin_amdgcn_permlane16_swap(old, src, X, false); // expected-error{{argument to '__builtin_amdgcn_permlane16_swap' must be a constant integer}}
154+
*out = __builtin_amdgcn_permlane16_swap(old, src, false, X); // expected-error{{argument to '__builtin_amdgcn_permlane16_swap' must be a constant integer}}
155+
}
156+
157+
void test_permlane32_swap(__global int* out, int old, int src, bool X) {
158+
*out = __builtin_amdgcn_permlane32_swap(old, src, X, false); // expected-error{{argument to '__builtin_amdgcn_permlane32_swap' must be a constant integer}}
159+
*out = __builtin_amdgcn_permlane32_swap(old, src, false, X); // expected-error{{argument to '__builtin_amdgcn_permlane32_swap' must be a constant integer}}
160+
}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
2727
__global float4* out12, int4 a12, int8 b12, float4 c12,
2828
__global float16* out13, int4 a13, int8 b13, float16 c13,
2929
__global float4* out14, int8 a14, int8 b14, float4 c14, int d14, int e14,
30-
__global float16* out15, int8 a15, int8 b15, float16 c15, int d15, int e15) {
30+
__global float16* out15, int8 a15, int8 b15, float16 c15, int d15, int e15,
31+
__global uint2* out16, int a16, int b16) {
3132
*out0 = __builtin_amdgcn_mfma_f32_16x16x32_f16(a0, b0, c0, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_f16' needs target feature gfx950-insts}}
3233
*out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}}
3334
*out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
@@ -50,4 +51,6 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
5051
*out13 = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8(a13, b13, c13, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8' needs target feature gfx950-insts}}
5152
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
5253
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
54+
*out16 = __builtin_amdgcn_permlane16_swap(a16, b16, false, false); // expected-error{{'__builtin_amdgcn_permlane16_swap' needs target feature permlane16-swap}}
55+
*out16 = __builtin_amdgcn_permlane32_swap(a16, b16, false, false); // expected-error{{'__builtin_amdgcn_permlane32_swap' needs target feature permlane32-swap}}
5356
}

llvm/docs/AMDGPUUsage.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1401,6 +1401,19 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
14011401

14021402
llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4 Emit `v_mfma_scale_f32_32x32x64_f8f6f4`
14031403

1404+
llvm.amdgcn.permlane16.swap Provide direct access to `v_permlane16_swap_b32` instruction on supported targets.
1405+
Swaps the values across lanes of the first 2 operands. Odd rows of the first operand are
1406+
swapped with even rows of the second operand (one row is 16 lanes).
1407+
Returns a pair for the swapped registers. The first element of the return corresponds
1408+
to the swapped element of the first argument.
1409+
1410+
1411+
llvm.amdgcn.permlane32.swap Provide direct access to `v_permlane32_swap_b32` instruction on supported targets.
1412+
Swaps the values across lanes of the first 2 operands. Rows 2 and 3 of the first operand are
1413+
swapped with rows 0 and 1 of the second operand (one row is 16 lanes).
1414+
Returns a pair for the swapped registers. The first element of the return
1415+
corresponds to the swapped element of the first argument.
1416+
14041417
============================================== ==========================================================
14051418

14061419
.. TODO::

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3195,6 +3195,20 @@ def int_amdgcn_smfmac_f32_32x32x64_fp8_bf8 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_
31953195
def int_amdgcn_smfmac_f32_32x32x64_fp8_fp8 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
31963196
}
31973197

3198+
// { vdst_new, vsrc_new } llvm.amdgcn.permlane16.swap <vdst_old> <vsrc_old> <fi> <bound_control>
3199+
def int_amdgcn_permlane16_swap :
3200+
Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
3201+
llvm_i1_ty, llvm_i1_ty],
3202+
[IntrNoMem, IntrConvergent, IntrWillReturn,
3203+
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrNoCallback, IntrNoFree]>;
3204+
3205+
// { vdst_new, vsrc_new } llvm.amdgcn.permlane32.swap <vdst_old> <vsrc_old> <fi> <bound_control>
3206+
def int_amdgcn_permlane32_swap :
3207+
Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
3208+
llvm_i1_ty, llvm_i1_ty],
3209+
[IntrNoMem, IntrConvergent, IntrWillReturn,
3210+
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrNoCallback, IntrNoFree]>;
3211+
31983212
//===----------------------------------------------------------------------===//
31993213
// Special Intrinsics for backend internal use only. No frontend
32003214
// should emit calls to these.

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -372,10 +372,23 @@ def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts",
372372
"Additional instructions for GFX940+"
373373
>;
374374

375+
def FeaturePermlane16Swap : SubtargetFeature<"permlane16-swap",
376+
"HasPermlane16Swap",
377+
"true",
378+
"Has v_permlane16_swap_b32 instructions"
379+
>;
380+
381+
def FeaturePermlane32Swap : SubtargetFeature<"permlane32-swap",
382+
"HasPermlane32Swap",
383+
"true",
384+
"Has v_permlane32_swap_b32 instructions"
385+
>;
386+
375387
def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
376388
"GFX950Insts",
377389
"true",
378-
"Additional instructions for GFX950+"
390+
"Additional instructions for GFX950+",
391+
[FeaturePermlane16Swap, FeaturePermlane32Swap]
379392
>;
380393

381394
def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
@@ -1988,6 +2001,14 @@ def HasGFX950Insts :
19882001
Predicate<"Subtarget->hasGFX950Insts()">,
19892002
AssemblerPredicate<(all_of FeatureGFX950Insts)>;
19902003

2004+
def HasPermlane16Swap :
2005+
Predicate<"Subtarget->hasPermlane16Swap()">,
2006+
AssemblerPredicate<(all_of FeaturePermlane16Swap)>;
2007+
2008+
def HasPermlane32Swap :
2009+
Predicate<"Subtarget->hasPermlane32Swap()">,
2010+
AssemblerPredicate<(all_of FeaturePermlane32Swap)>;
2011+
19912012
def isGFX8GFX9NotGFX940 :
19922013
Predicate<"!Subtarget->hasGFX940Insts() &&"
19932014
"(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"

0 commit comments

Comments
 (0)