Skip to content

Commit a3a4104

Browse files
committed
Also reorder the overloaded types for swmmac intrinsics: destination type before source types
1 parent f359284 commit a3a4104

File tree

5 files changed

+64
-61
lines changed

5 files changed

+64
-61
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -18329,6 +18329,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1832918329
// the form:
1833018330
// D = A * B + C
1833118331
// We need to specify one type for matrices AB and one for matrices CD.
18332+
// Sparse matrix operations can have different types for A and B as well as
18333+
// an additional type for the sparsity index.
18334+
// The destination type should be listed before the types used for source operands.
1833218335
SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
1833318336
// On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
1833418337
// There is no need for the variable opsel argument, so always set it to
@@ -18341,14 +18344,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1834118344
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
1834218345
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
1834318346
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
18344-
ArgsForMatchingMatrixTypes = {2, 0};
18347+
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1834518348
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
1834618349
break;
1834718350
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
1834818351
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
1834918352
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
1835018353
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
18351-
ArgsForMatchingMatrixTypes = {2, 0};
18354+
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1835218355
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
1835318356
break;
1835418357
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
@@ -18357,7 +18360,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1835718360
LLVM_FALLTHROUGH;
1835818361
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
1835918362
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
18360-
ArgsForMatchingMatrixTypes = {2, 0};
18363+
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1836118364
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
1836218365
break;
1836318366
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
@@ -18366,111 +18369,111 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1836618369
LLVM_FALLTHROUGH;
1836718370
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
1836818371
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
18369-
ArgsForMatchingMatrixTypes = {2, 0};
18372+
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1837018373
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
1837118374
break;
1837218375
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
1837318376
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
18374-
ArgsForMatchingMatrixTypes = {2, 0};
18377+
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1837518378
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
1837618379
break;
1837718380
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
1837818381
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
18379-
ArgsForMatchingMatrixTypes = {2, 0};
18382+
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1838018383
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
1838118384
break;
1838218385
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
1838318386
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
1838418387
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
1838518388
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
18386-
ArgsForMatchingMatrixTypes = {4, 1};
18389+
ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1838718390
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
1838818391
break;
1838918392
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
1839018393
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
1839118394
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
1839218395
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
18393-
ArgsForMatchingMatrixTypes = {4, 1};
18396+
ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1839418397
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
1839518398
break;
1839618399
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
1839718400
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
18398-
ArgsForMatchingMatrixTypes = {2, 0};
18401+
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1839918402
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
1840018403
break;
1840118404
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
1840218405
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
18403-
ArgsForMatchingMatrixTypes = {2, 0};
18406+
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1840418407
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
1840518408
break;
1840618409
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
1840718410
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
18408-
ArgsForMatchingMatrixTypes = {2, 0};
18411+
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1840918412
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
1841018413
break;
1841118414
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
1841218415
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
18413-
ArgsForMatchingMatrixTypes = {2, 0};
18416+
ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1841418417
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
1841518418
break;
1841618419
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
1841718420
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
18418-
ArgsForMatchingMatrixTypes = {4, 1};
18421+
ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1841918422
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
1842018423
break;
1842118424
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
1842218425
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
18423-
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18426+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1842418427
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
1842518428
break;
1842618429
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
1842718430
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
18428-
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18431+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1842918432
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
1843018433
break;
1843118434
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
1843218435
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
18433-
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18436+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1843418437
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
1843518438
break;
1843618439
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
1843718440
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
18438-
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18441+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1843918442
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
1844018443
break;
1844118444
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
1844218445
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
18443-
ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
18446+
ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1844418447
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
1844518448
break;
1844618449
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
1844718450
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
18448-
ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
18451+
ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1844918452
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
1845018453
break;
1845118454
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
1845218455
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
18453-
ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
18456+
ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1845418457
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
1845518458
break;
1845618459
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
1845718460
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
18458-
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18461+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1845918462
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
1846018463
break;
1846118464
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
1846218465
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
18463-
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18466+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1846418467
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
1846518468
break;
1846618469
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
1846718470
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
18468-
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18471+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1846918472
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
1847018473
break;
1847118474
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
1847218475
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
18473-
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18476+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1847418477
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
1847518478
break;
1847618479
}

clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ typedef __bf16 v16bf __attribute__((ext_vector_type(16)));
1515

1616
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w32(
1717
// CHECK-GFX1200-NEXT: entry:
18-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
18+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
1919
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
2020
// CHECK-GFX1200-NEXT: ret void
2121
//
@@ -26,7 +26,7 @@ void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f
2626

2727
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w32(
2828
// CHECK-GFX1200-NEXT: entry:
29-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8bf16.v16bf16.v8f32.i16(<8 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
29+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8bf16.v16bf16.i16(<8 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
3030
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
3131
// CHECK-GFX1200-NEXT: ret void
3232
//
@@ -37,7 +37,7 @@ void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8bf a, v16bf b,
3737

3838
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w32(
3939
// CHECK-GFX1200-NEXT: entry:
40-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i16 [[INDEX:%.*]])
40+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i16 [[INDEX:%.*]])
4141
// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
4242
// CHECK-GFX1200-NEXT: ret void
4343
//
@@ -48,7 +48,7 @@ void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h
4848

4949
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(
5050
// CHECK-GFX1200-NEXT: entry:
51-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8bf16.v16bf16.v8bf16.i16(<8 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], <8 x bfloat> [[C:%.*]], i16 [[INDEX:%.*]])
51+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8bf16.v8bf16.v16bf16.i16(<8 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], <8 x bfloat> [[C:%.*]], i16 [[INDEX:%.*]])
5252
// CHECK-GFX1200-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
5353
// CHECK-GFX1200-NEXT: ret void
5454
//
@@ -59,7 +59,7 @@ void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8bf* out, v8bf a, v16bf b
5959

6060
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w32(
6161
// CHECK-GFX1200-NEXT: entry:
62-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v2i32.v4i32.v8i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
62+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
6363
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
6464
// CHECK-GFX1200-NEXT: ret void
6565
//
@@ -70,7 +70,7 @@ void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i
7070

7171
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w32(
7272
// CHECK-GFX1200-NEXT: entry:
73-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.i32.v2i32.v8i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
73+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
7474
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
7575
// CHECK-GFX1200-NEXT: ret void
7676
//
@@ -81,7 +81,7 @@ void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i
8181

8282
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w32(
8383
// CHECK-GFX1200-NEXT: entry:
84-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v2i32.v4i32.v8i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
84+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
8585
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
8686
// CHECK-GFX1200-NEXT: ret void
8787
//
@@ -92,7 +92,7 @@ void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i
9292

9393
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(
9494
// CHECK-GFX1200-NEXT: entry:
95-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v2i32.v4i32.v8f32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
95+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
9696
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
9797
// CHECK-GFX1200-NEXT: ret void
9898
//
@@ -103,7 +103,7 @@ void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b,
103103

104104
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(
105105
// CHECK-GFX1200-NEXT: entry:
106-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v2i32.v4i32.v8f32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
106+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
107107
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
108108
// CHECK-GFX1200-NEXT: ret void
109109
//
@@ -114,7 +114,7 @@ void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b,
114114

115115
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(
116116
// CHECK-GFX1200-NEXT: entry:
117-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v2i32.v4i32.v8f32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
117+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
118118
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
119119
// CHECK-GFX1200-NEXT: ret void
120120
//
@@ -125,7 +125,7 @@ void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b,
125125

126126
// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(
127127
// CHECK-GFX1200-NEXT: entry:
128-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v2i32.v4i32.v8f32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
128+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
129129
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
130130
// CHECK-GFX1200-NEXT: ret void
131131
//

0 commit comments

Comments
 (0)