@@ -2325,6 +2325,221 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
2325
2325
ret <4 x float > %result
2326
2326
}
2327
2327
2328
+ ; --------------------------------------------------------------------
2329
+ ; llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8
2330
+ ; --------------------------------------------------------------------
2331
+
2332
; Intrinsic under test. Operands, in order: <4 x i32>, <8 x i32>, <4 x float>,
; i32, then two i32 immargs; the result is <4 x float>.
; In the checks of this file the first immarg is printed as cbsz and the
; second as abid; both modifiers are omitted when the immediates are 0.
+ declare <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 >, <8 x i32 >, <4 x float >, i32 , i32 immarg, i32 immarg)
2333
+
2334
; Kernel variant: %a, %b and %idx arrive as kernel arguments, while the
; <4 x float> input is loaded from %arg at the index returned by
; llvm.amdgcn.workitem.id.x. The intrinsic is called with immediates 1 and 2
; (checked as cbsz:1 abid:2) and the result is stored to the base of %arg.
+ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr (ptr addrspace (1 ) %arg , <4 x i32 > %a , <8 x i32 > %b , i32 %idx ) #0 {
2335
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
2336
+ ; SDAG: ; %bb.0: ; %bb
2337
+ ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
2338
+ ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2339
+ ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2340
+ ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2341
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2342
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2343
+ ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
2344
+ ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
2345
+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
2346
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s8
2347
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s9
2348
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s10
2349
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s11
2350
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s12
2351
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s13
2352
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s14
2353
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s15
2354
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2355
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s0
2356
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s1
2357
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s2
2358
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s3
2359
+ ; SDAG-NEXT: v_mov_b32_e32 v17, s16
2360
+ ; SDAG-NEXT: s_waitcnt vmcnt(0)
2361
+ ; SDAG-NEXT: s_nop 0
2362
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2363
+ ; SDAG-NEXT: s_nop 6
2364
+ ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
2365
+ ; SDAG-NEXT: s_endpgm
2366
+ ;
2367
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
2368
+ ; GISEL: ; %bb.0: ; %bb
2369
+ ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2370
+ ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2371
+ ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2372
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2373
+ ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
2374
+ ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2375
+ ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
2376
+ ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
2377
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2378
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
2379
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
2380
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2381
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2382
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2383
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2384
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s2
2385
+ ; GISEL-NEXT: s_waitcnt vmcnt(0)
2386
+ ; GISEL-NEXT: s_nop 0
2387
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2388
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2389
+ ; GISEL-NEXT: s_nop 5
2390
+ ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
2391
+ ; GISEL-NEXT: s_endpgm
2392
+ bb:
2393
+ %id = call i32 @llvm.amdgcn.workitem.id.x ()
2394
+ %gep = getelementptr <4 x float >, ptr addrspace (1 ) %arg , i32 %id
2395
+ %in.1 = load <4 x float >, ptr addrspace (1 ) %gep
2396
; The loaded vector is the third operand; the trailing immediates are 1 and 2.
+ %mai.1 = tail call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %a , <8 x i32 > %b , <4 x float > %in.1 , i32 %idx , i32 1 , i32 2 )
2397
; The store targets %arg itself, not the per-workitem %gep address.
+ store <4 x float > %mai.1 , ptr addrspace (1 ) %arg
2398
+ ret void
2399
+ }
2400
+
2401
; Baseline variant: all four operands are plain function arguments and both
; immediates are 0, so the checked instruction carries no cbsz/abid modifiers.
; The SDAG checks route %arg2 through a[0:3] around the instruction; the GISEL
; checks operate on v[12:15] in place and then copy the result to v[0:3].
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2402
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
2403
+ ; SDAG: ; %bb.0:
2404
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2405
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2406
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2407
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2408
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2409
+ ; SDAG-NEXT: s_nop 1
2410
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16
2411
+ ; SDAG-NEXT: s_nop 6
2412
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2413
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2414
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2415
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2416
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2417
+ ;
2418
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
2419
+ ; GISEL: ; %bb.0:
2420
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2421
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
2422
+ ; GISEL-NEXT: s_nop 6
2423
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2424
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2425
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2426
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2427
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2428
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2429
+ ret <4 x float > %result
2430
+ }
2431
+
2432
; Same operand setup as the flag-less variant, but with immediates 1 and 3.
; Both SDAG and GISEL checks expect them on the instruction as cbsz:1 abid:3.
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_fp8__flags0 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2433
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
2434
+ ; SDAG: ; %bb.0:
2435
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2436
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2437
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2438
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2439
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2440
+ ; SDAG-NEXT: s_nop 1
2441
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2442
+ ; SDAG-NEXT: s_nop 6
2443
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2444
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2445
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2446
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2447
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2448
+ ;
2449
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
2450
+ ; GISEL: ; %bb.0:
2451
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2452
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2453
+ ; GISEL-NEXT: s_nop 6
2454
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2455
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2456
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2457
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2458
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2459
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 1 , i32 immarg 3 )
2460
+ ret <4 x float > %result
2461
+ }
2462
+
2463
; Counterpart of the __flags0 variant with the immediate values swapped: the
; call passes 3 and 1, and the checks expect cbsz:3 abid:1 on the instruction.
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_fp8__flags1 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2464
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
2465
+ ; SDAG: ; %bb.0:
2466
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2467
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2468
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2469
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2470
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2471
+ ; SDAG-NEXT: s_nop 1
2472
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2473
+ ; SDAG-NEXT: s_nop 6
2474
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2475
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2476
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2477
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2478
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2479
+ ;
2480
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
2481
+ ; GISEL: ; %bb.0:
2482
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2483
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2484
+ ; GISEL-NEXT: s_nop 6
2485
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2486
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2487
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2488
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2489
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2490
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 3 , i32 immarg 1 )
2491
+ ret <4 x float > %result
2492
+ }
2493
+
2494
; Variant with every argument marked inreg and both immediates 0. The checks
; expect each s-register input to be copied into v (and, for SDAG, a[0:3])
; registers before the instruction, which is printed without cbsz/abid.
; In the GISEL checks the result is produced directly in v[0:3], so no copies
; follow the instruction.
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_fp8__sgpr (<4 x i32 > inreg %arg0 , <8 x i32 > inreg %arg1 , <4 x float > inreg %arg2 , i32 inreg %arg3 ) {
2495
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
2496
+ ; SDAG: ; %bb.0:
2497
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2498
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2499
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2500
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2501
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2502
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s16
2503
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s17
2504
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s18
2505
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s19
2506
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s20
2507
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s21
2508
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s22
2509
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s23
2510
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
2511
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
2512
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
2513
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
2514
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s28
2515
+ ; SDAG-NEXT: s_nop 1
2516
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12
2517
+ ; SDAG-NEXT: s_nop 6
2518
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2519
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2520
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2521
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2522
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2523
+ ;
2524
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
2525
+ ; GISEL: ; %bb.0:
2526
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2527
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2528
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2529
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2530
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
2531
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2532
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
2533
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
2534
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
2535
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s28
2536
+ ; GISEL-NEXT: s_nop 1
2537
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16
2538
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2539
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2540
+ ret <4 x float > %result
2541
+ }
2542
+
2328
2543
attributes #0 = { "amdgpu-flat-work-group-size" ="1,256" }
2329
2544
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2330
2545
; GCN: {{.*}}
0 commit comments