Skip to content

Commit 2ad43fa

Browse files
authored
[AMDGPU] Fix operand types for V_DOT2_F32_BF16 (#82044)
1 parent 9563746 commit 2ad43fa

File tree

7 files changed

+139 -13 lines changed

clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,8 @@ typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
1515
// CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat>
1616
// CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat
1717
// CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat [[s3]])
18-
// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false)
19-
// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 true)
18+
// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], float %fC, i1 false)
19+
// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], float %fC, i1 true)
2020
// CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
2121
// CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
2222
// CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %A, i1 false, i32 %B, i32 %C, i1 false)

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2835,8 +2835,8 @@ def int_amdgcn_fdot2_f32_bf16 :
28352835
DefaultAttrsIntrinsic<
28362836
[llvm_float_ty], // %r
28372837
[
2838-
llvm_v2i16_ty, // %a
2839-
llvm_v2i16_ty, // %b
2838+
llvm_v2bf16_ty, // %a
2839+
llvm_v2bf16_ty, // %b
28402840
llvm_float_ty, // %c
28412841
llvm_i1_ty // %clamp
28422842
],

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2494,7 +2494,7 @@ def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>;
24942494

24952495
def VOP_F16_V2F16_V2F16_F16 : VOPProfile <[f16, v2f16, v2f16, f16]>;
24962496
def VOP_BF16_V2BF16_V2BF16_BF16: VOPProfile <[bf16, v2bf16, v2bf16, bf16]>;
2497-
def VOP_F32_V2I16_V2I16_F32 : VOPProfile <[f32, v2i16, v2i16, f32]>;
2497+
def VOP_F32_V2BF16_V2BF16_F32 : VOPProfile <[f32, v2bf16, v2bf16, f32]>;
24982498

24992499
def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>;
25002500

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -396,7 +396,7 @@ defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
396396
} // End OtherPredicates = [HasDot1Insts]
397397

398398
def DOT2_BF16_Profile
399-
: VOP3P_Profile<VOP_F32_V2I16_V2I16_F32, VOP3_REGULAR, /*HasDPP*/ 1> {
399+
: VOP3P_Profile<VOP_F32_V2BF16_V2BF16_F32, VOP3_REGULAR, /*HasDPP*/ 1> {
400400
let HasSrc1Mods = 1;
401401
}
402402

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
33
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
44

5-
declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a, <2 x i16> %b, float %c, i1 %clamp)
5+
declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 %clamp)
66

77
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
88
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
@@ -25,10 +25,10 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
2525
ptr addrspace(1) %b,
2626
ptr addrspace(1) %c) {
2727
entry:
28-
%a.val = load <2 x i16>, ptr addrspace(1) %a
29-
%b.val = load <2 x i16>, ptr addrspace(1) %b
28+
%a.val = load <2 x bfloat>, ptr addrspace(1) %a
29+
%b.val = load <2 x bfloat>, ptr addrspace(1) %b
3030
%c.val = load float, ptr addrspace(1) %c
31-
%r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a.val, <2 x i16> %b.val, float %c.val, i1 1)
31+
%r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a.val, <2 x bfloat> %b.val, float %c.val, i1 1)
3232
store float %r.val, ptr addrspace(1) %r
3333
ret void
3434
}
@@ -55,10 +55,10 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
5555
ptr addrspace(1) %b,
5656
ptr addrspace(1) %c) {
5757
entry:
58-
%a.val = load <2 x i16>, ptr addrspace(1) %a
59-
%b.val = load <2 x i16>, ptr addrspace(1) %b
58+
%a.val = load <2 x bfloat>, ptr addrspace(1) %a
59+
%b.val = load <2 x bfloat>, ptr addrspace(1) %b
6060
%c.val = load float, ptr addrspace(1) %c
61-
%r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a.val, <2 x i16> %b.val, float %c.val, i1 0)
61+
%r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a.val, <2 x bfloat> %b.val, float %c.val, i1 0)
6262
store float %r.val, ptr addrspace(1) %r
6363
ret void
6464
}

llvm/test/MC/AMDGPU/bf16_imm.s

Lines changed: 63 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,3 +48,66 @@ v_dot2_bf16_bf16 v2, v0, 0x3e22, v2
4848

4949
v_dot2_bf16_bf16 v2, v0, v2, 0.15915494
5050
// CHECK: v_dot2_bf16_bf16 v2, v0, v2, 0.15915494 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0x05,0xe2,0x03]
51+
52+
v_dot2_f32_bf16 v2, v1, 0, v2
53+
// CHECK: v_dot2_f32_bf16 v2, v1, 0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0x01,0x09,0x1c]
54+
55+
v_dot2_f32_bf16 v2, v1, 0.5, v2
56+
// CHECK: v_dot2_f32_bf16 v2, v1, 0.5, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe1,0x09,0x1c]
57+
58+
v_dot2_f32_bf16 v2, v1, -0.5, v2
59+
// CHECK: v_dot2_f32_bf16 v2, v1, -0.5, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe3,0x09,0x1c]
60+
61+
v_dot2_f32_bf16 v2, v1, 1.0, v2
62+
// CHECK: v_dot2_f32_bf16 v2, v1, 1.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe5,0x09,0x1c]
63+
64+
v_dot2_f32_bf16 v2, v1, -1.0, v2
65+
// CHECK: v_dot2_f32_bf16 v2, v1, -1.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe7,0x09,0x1c]
66+
67+
v_dot2_f32_bf16 v2, v1, 2.0, v2
68+
// CHECK: v_dot2_f32_bf16 v2, v1, 2.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe9,0x09,0x1c]
69+
70+
v_dot2_f32_bf16 v2, v1, -2.0, v2
71+
// CHECK: v_dot2_f32_bf16 v2, v1, -2.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xeb,0x09,0x1c]
72+
73+
v_dot2_f32_bf16 v2, v1, 4.0, v2
74+
// CHECK: v_dot2_f32_bf16 v2, v1, 4.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xed,0x09,0x1c]
75+
76+
v_dot2_f32_bf16 v2, v1, -4.0, v2
77+
// CHECK: v_dot2_f32_bf16 v2, v1, -4.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xef,0x09,0x1c]
78+
79+
v_dot2_f32_bf16 v2, v1, 0.15915494, v2
80+
// CHECK: v_dot2_f32_bf16 v2, v1, 0.15915494, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xf1,0x09,0x1c]
81+
82+
v_dot2_f32_bf16 v2, v1, 0x3e22, v2
83+
// CHECK: v_dot2_f32_bf16 v2, v1, 0.15915494, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xf1,0x09,0x1c]
84+
85+
v_dot2_f32_bf16 v2, 0.5, v1, v2
86+
// CHECK: v_dot2_f32_bf16 v2, 0.5, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf0,0x02,0x0a,0x1c]
87+
88+
v_dot2_f32_bf16 v2, -0.5, v1, v2
89+
// CHECK: v_dot2_f32_bf16 v2, -0.5, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf1,0x02,0x0a,0x1c]
90+
91+
v_dot2_f32_bf16 v2, 1.0, v1, v2
92+
// CHECK: v_dot2_f32_bf16 v2, 1.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf2,0x02,0x0a,0x1c]
93+
94+
v_dot2_f32_bf16 v2, -1.0, v1, v2
95+
// CHECK: v_dot2_f32_bf16 v2, -1.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf3,0x02,0x0a,0x1c]
96+
97+
v_dot2_f32_bf16 v2, 2.0, v1, v2
98+
// CHECK: v_dot2_f32_bf16 v2, 2.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf4,0x02,0x0a,0x1c]
99+
100+
v_dot2_f32_bf16 v2, -2.0, v1, v2
101+
// CHECK: v_dot2_f32_bf16 v2, -2.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf5,0x02,0x0a,0x1c]
102+
103+
v_dot2_f32_bf16 v2, 4.0, v1, v2
104+
// CHECK: v_dot2_f32_bf16 v2, 4.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf6,0x02,0x0a,0x1c]
105+
106+
v_dot2_f32_bf16 v2, -4.0, v1, v2
107+
// CHECK: v_dot2_f32_bf16 v2, -4.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf7,0x02,0x0a,0x1c]
108+
109+
v_dot2_f32_bf16 v2, 100.0, v1, v2
110+
// CHECK: v_dot2_f32_bf16 v2, 0x42c8, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xff,0x02,0x0a,0x1c,0xc8,0x42,0x00,0x00]
111+
112+
v_dot2_f32_bf16 v2, v1, 100.0, v2
113+
// CHECK: v_dot2_f32_bf16 v2, v1, 0x42c8, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xff,0x09,0x1c,0xc8,0x42,0x00,0x00]

llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt

Lines changed: 63 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -36,3 +36,66 @@
3636

3737
# CHECK: v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04]
3838
0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04
39+
40+
# CHECK: v_dot2_f32_bf16 v2, v1, 0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0x01,0x09,0x1c]
41+
0x02,0x40,0x1a,0xcc,0x01,0x01,0x09,0x1c
42+
43+
# CHECK: v_dot2_f32_bf16 v2, v1, 0.5, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe1,0x09,0x1c]
44+
0x02,0x40,0x1a,0xcc,0x01,0xe1,0x09,0x1c
45+
46+
# CHECK: v_dot2_f32_bf16 v2, v1, -0.5, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe3,0x09,0x1c]
47+
0x02,0x40,0x1a,0xcc,0x01,0xe3,0x09,0x1c
48+
49+
# CHECK: v_dot2_f32_bf16 v2, v1, 1.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe5,0x09,0x1c]
50+
0x02,0x40,0x1a,0xcc,0x01,0xe5,0x09,0x1c
51+
52+
# CHECK: v_dot2_f32_bf16 v2, v1, -1.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe7,0x09,0x1c]
53+
0x02,0x40,0x1a,0xcc,0x01,0xe7,0x09,0x1c
54+
55+
# CHECK: v_dot2_f32_bf16 v2, v1, 2.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe9,0x09,0x1c]
56+
0x02,0x40,0x1a,0xcc,0x01,0xe9,0x09,0x1c
57+
58+
# CHECK: v_dot2_f32_bf16 v2, v1, -2.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xeb,0x09,0x1c]
59+
0x02,0x40,0x1a,0xcc,0x01,0xeb,0x09,0x1c
60+
61+
# CHECK: v_dot2_f32_bf16 v2, v1, 4.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xed,0x09,0x1c]
62+
0x02,0x40,0x1a,0xcc,0x01,0xed,0x09,0x1c
63+
64+
# CHECK: v_dot2_f32_bf16 v2, v1, -4.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xef,0x09,0x1c]
65+
0x02,0x40,0x1a,0xcc,0x01,0xef,0x09,0x1c
66+
67+
# CHECK: v_dot2_f32_bf16 v2, v1, 0.15915494, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xf1,0x09,0x1c]
68+
0x02,0x40,0x1a,0xcc,0x01,0xf1,0x09,0x1c
69+
70+
# CHECK: v_dot2_f32_bf16 v2, v1, 0.15915494, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xf1,0x09,0x1c]
71+
0x02,0x40,0x1a,0xcc,0x01,0xf1,0x09,0x1c
72+
73+
# CHECK: v_dot2_f32_bf16 v2, 0.5, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf0,0x02,0x0a,0x1c]
74+
0x02,0x40,0x1a,0xcc,0xf0,0x02,0x0a,0x1c
75+
76+
# CHECK: v_dot2_f32_bf16 v2, -0.5, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf1,0x02,0x0a,0x1c]
77+
0x02,0x40,0x1a,0xcc,0xf1,0x02,0x0a,0x1c
78+
79+
# CHECK: v_dot2_f32_bf16 v2, 1.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf2,0x02,0x0a,0x1c]
80+
0x02,0x40,0x1a,0xcc,0xf2,0x02,0x0a,0x1c
81+
82+
# CHECK: v_dot2_f32_bf16 v2, -1.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf3,0x02,0x0a,0x1c]
83+
0x02,0x40,0x1a,0xcc,0xf3,0x02,0x0a,0x1c
84+
85+
# CHECK: v_dot2_f32_bf16 v2, 2.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf4,0x02,0x0a,0x1c]
86+
0x02,0x40,0x1a,0xcc,0xf4,0x02,0x0a,0x1c
87+
88+
# CHECK: v_dot2_f32_bf16 v2, -2.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf5,0x02,0x0a,0x1c]
89+
0x02,0x40,0x1a,0xcc,0xf5,0x02,0x0a,0x1c
90+
91+
# CHECK: v_dot2_f32_bf16 v2, 4.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf6,0x02,0x0a,0x1c]
92+
0x02,0x40,0x1a,0xcc,0xf6,0x02,0x0a,0x1c
93+
94+
# CHECK: v_dot2_f32_bf16 v2, -4.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf7,0x02,0x0a,0x1c]
95+
0x02,0x40,0x1a,0xcc,0xf7,0x02,0x0a,0x1c
96+
97+
# CHECK: v_dot2_f32_bf16 v2, 0x42c8, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xff,0x02,0x0a,0x1c,0xc8,0x42,0x00,0x00]
98+
0x02,0x40,0x1a,0xcc,0xff,0x02,0x0a,0x1c,0xc8,0x42,0x00,0x00
99+
100+
# CHECK: v_dot2_f32_bf16 v2, v1, 0x42c8, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xff,0x09,0x1c,0xc8,0x42,0x00,0x00]
101+
0x02,0x40,0x1a,0xcc,0x01,0xff,0x09,0x1c,0xc8,0x42,0x00,0x00

0 commit comments

Comments
 (0)