@@ -1430,4 +1430,217 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
1430
1430
ret <16 x float > %result
1431
1431
}
1432
1432
1433
+ ; --------------------------------------------------------------------
1434
+ ; llvm.amdgcn.smfmac.i32.16x16x128.i8
1435
+ ; --------------------------------------------------------------------
1436
+
1437
+ declare <4 x i32 > @llvm.amdgcn.smfmac.i32.16x16x128.i8 (<4 x i32 >, <8 x i32 >, <4 x i32 >, i32 , i32 , i32 ) ; the last two i32 operands are always constants in the tests below and show up as the cbsz / abid modifiers in their checks
1438
+
1439
+ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr (ptr addrspace (1 ) %arg , <4 x i32 > %a , <8 x i32 > %b , i32 %idx ) #0 { ; kernel variant: the third intrinsic operand is loaded from %arg and the result is stored back to %arg
1440
+ ; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
1441
+ ; SDAG: ; %bb.0: ; %bb
1442
+ ; SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1443
+ ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
1444
+ ; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
1445
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
1446
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1447
+ ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
1448
+ ; SDAG-NEXT: s_load_dword s16, s[0:1], 0x64
1449
+ ; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
1450
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s4
1451
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s5
1452
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s6
1453
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s7
1454
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
1455
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
1456
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
1457
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
1458
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1459
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
1460
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
1461
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
1462
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
1463
+ ; SDAG-NEXT: v_mov_b32_e32 v17, s16
1464
+ ; SDAG-NEXT: s_waitcnt vmcnt(0)
1465
+ ; SDAG-NEXT: s_nop 0
1466
+ ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
1467
+ ; SDAG-NEXT: s_nop 6
1468
+ ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3]
1469
+ ; SDAG-NEXT: s_endpgm
1470
+ ;
1471
+ ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
1472
+ ; GISEL: ; %bb.0: ; %bb
1473
+ ; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1474
+ ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
1475
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1476
+ ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
1477
+ ; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
1478
+ ; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
1479
+ ; GISEL-NEXT: s_load_dword s16, s[0:1], 0x64
1480
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1481
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[6:7]
1482
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[4:5]
1483
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
1484
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
1485
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
1486
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
1487
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
1488
+ ; GISEL-NEXT: s_waitcnt vmcnt(0)
1489
+ ; GISEL-NEXT: s_nop 0
1490
+ ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
1491
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
1492
+ ; GISEL-NEXT: s_nop 5
1493
+ ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[2:3]
1494
+ ; GISEL-NEXT: s_endpgm
1495
+ bb:
1496
+ %id = call i32 @llvm.amdgcn.workitem.id.x ()
1497
+ %gep = getelementptr <4 x i32 >, ptr addrspace (1 ) %arg , i32 %id
1498
+ %in.1 = load <4 x i32 >, ptr addrspace (1 ) %gep ; per-workitem <4 x i32> element of %arg, used as the third intrinsic operand
1499
+ %mai.1 = tail call <4 x i32 > @llvm.amdgcn.smfmac.i32.16x16x128.i8 (<4 x i32 > %a , <8 x i32 > %b , <4 x i32 > %in.1 , i32 %idx , i32 1 , i32 2 ) ; constants 1 and 2 are expected as cbsz:1 abid:2 in the checks above
1500
+ store <4 x i32 > %mai.1 , ptr addrspace (1 ) %arg ; note: stored through the base pointer %arg, not through %gep
1501
+ ret void
1502
+ }
1503
+
1504
+ define <4 x i32 > @test_smfmac_i32_16x16x128_i8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x i32 > %arg2 , i32 %arg3 ) { ; baseline callable-function variant: plain (non-inreg) arguments, both trailing constants are 0
1505
+ ; SDAG-LABEL: test_smfmac_i32_16x16x128_i8:
1506
+ ; SDAG: ; %bb.0:
1507
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1508
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1509
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1510
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1511
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1512
+ ; SDAG-NEXT: s_nop 1
1513
+ ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16
1514
+ ; SDAG-NEXT: s_nop 6
1515
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1516
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1517
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1518
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1519
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
1520
+ ;
1521
+ ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8:
1522
+ ; GISEL: ; %bb.0:
1523
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1524
+ ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
1525
+ ; GISEL-NEXT: s_nop 6
1526
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
1527
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
1528
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
1529
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
1530
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
1531
+ %result = call <4 x i32 > @llvm.amdgcn.smfmac.i32.16x16x128.i8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x i32 > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 ) ; constants 0, 0: the expected instruction above carries no cbsz/abid modifier
1532
+ ret <4 x i32 > %result
1533
+ }
1534
+
1535
+ define <4 x i32 > @test_smfmac_i32_16x16x128_i8__flags0 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x i32 > %arg2 , i32 %arg3 ) { ; same shape as the baseline function test, but with non-zero trailing constants (1, 3)
1536
+ ; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
1537
+ ; SDAG: ; %bb.0:
1538
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1539
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1540
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1541
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1542
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1543
+ ; SDAG-NEXT: s_nop 1
1544
+ ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
1545
+ ; SDAG-NEXT: s_nop 6
1546
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1547
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1548
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1549
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1550
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
1551
+ ;
1552
+ ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
1553
+ ; GISEL: ; %bb.0:
1554
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1555
+ ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
1556
+ ; GISEL-NEXT: s_nop 6
1557
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
1558
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
1559
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
1560
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
1561
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
1562
+ %result = call <4 x i32 > @llvm.amdgcn.smfmac.i32.16x16x128.i8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x i32 > %arg2 , i32 %arg3 , i32 immarg 1 , i32 immarg 3 ) ; constants 1, 3 are expected as cbsz:1 abid:3 in the checks above
1563
+ ret <4 x i32 > %result
1564
+ }
1565
+
1566
+ define <4 x i32 > @test_smfmac_i32_16x16x128_i8__flags1 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x i32 > %arg2 , i32 %arg3 ) { ; same shape as the baseline function test, with the trailing constants swapped relative to __flags0 (3, 1)
1567
+ ; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
1568
+ ; SDAG: ; %bb.0:
1569
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1570
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1571
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1572
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1573
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1574
+ ; SDAG-NEXT: s_nop 1
1575
+ ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
1576
+ ; SDAG-NEXT: s_nop 6
1577
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1578
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1579
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1580
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1581
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
1582
+ ;
1583
+ ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
1584
+ ; GISEL: ; %bb.0:
1585
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1586
+ ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
1587
+ ; GISEL-NEXT: s_nop 6
1588
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
1589
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
1590
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
1591
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
1592
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
1593
+ %result = call <4 x i32 > @llvm.amdgcn.smfmac.i32.16x16x128.i8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x i32 > %arg2 , i32 %arg3 , i32 immarg 3 , i32 immarg 1 ) ; constants 3, 1 are expected as cbsz:3 abid:1 in the checks above
1594
+ ret <4 x i32 > %result
1595
+ }
1596
+
1597
+ define <4 x i32 > @test_smfmac_i32_16x16x128_i8__sgpr (<4 x i32 > inreg %arg0 , <8 x i32 > inreg %arg1 , <4 x i32 > inreg %arg2 , i32 inreg %arg3 ) { ; every argument is inreg; the checks expect the values to be copied out of s0-s16 before the instruction
1598
+ ; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
1599
+ ; SDAG: ; %bb.0:
1600
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1601
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s0
1602
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s1
1603
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s2
1604
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s3
1605
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s4
1606
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s5
1607
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s6
1608
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s7
1609
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s8
1610
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s9
1611
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s10
1612
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s11
1613
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s12
1614
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s13
1615
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s14
1616
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s15
1617
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s16
1618
+ ; SDAG-NEXT: s_nop 1
1619
+ ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12
1620
+ ; SDAG-NEXT: s_nop 6
1621
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1622
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1623
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1624
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1625
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
1626
+ ;
1627
+ ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
1628
+ ; GISEL: ; %bb.0:
1629
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1630
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
1631
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
1632
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
1633
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
1634
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
1635
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
1636
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
1637
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
1638
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
1639
+ ; GISEL-NEXT: s_nop 1
1640
+ ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16
1641
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
1642
+ %result = call <4 x i32 > @llvm.amdgcn.smfmac.i32.16x16x128.i8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x i32 > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 ) ; constants 0, 0: the expected instruction above carries no cbsz/abid modifier
1643
+ ret <4 x i32 > %result
1644
+ }
1645
+
1433
1646
attributes #0 = { "amdgpu-flat-work-group-size" ="1,256" }
0 commit comments