@@ -1510,4 +1510,243 @@ bb:
1510
1510
ret void
1511
1511
}
1512
1512
1513
; Loads one i32 from %sgpr_base + 65512 bytes (65512 == 0xffe8) in addrspace(5)
; and stores it to %out. %sgpr_base is marked inreg, and the expected output
; below uses a scalar register as the base of the scratch load.
; Expected output: on GFX9, GFX10, GFX940 and GFX11 the constant is added to the
; scalar base with s_add_u32 and the load carries no immediate offset; on GFX12
; the constant is folded into the load as immediate offset 65512.
+ define amdgpu_gs void @sgpr_base_large_offset (ptr addrspace (1 ) %out , ptr addrspace (5 ) inreg %sgpr_base ) {
1514
+ ; GFX9-LABEL: sgpr_base_large_offset:
1515
+ ; GFX9: ; %bb.0: ; %entry
1516
+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1517
+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1518
+ ; GFX9-NEXT: s_add_u32 s0, s2, 0xffe8
1519
+ ; GFX9-NEXT: scratch_load_dword v2, off, s0
1520
+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1521
+ ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1522
+ ; GFX9-NEXT: s_endpgm
1523
+ ;
1524
+ ; GFX10-LABEL: sgpr_base_large_offset:
1525
+ ; GFX10: ; %bb.0: ; %entry
1526
+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1527
+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1528
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1529
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1530
+ ; GFX10-NEXT: s_add_u32 s0, s2, 0xffe8
1531
+ ; GFX10-NEXT: scratch_load_dword v2, off, s0
1532
+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
1533
+ ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1534
+ ; GFX10-NEXT: s_endpgm
1535
+ ;
1536
+ ; GFX940-LABEL: sgpr_base_large_offset:
1537
+ ; GFX940: ; %bb.0: ; %entry
1538
+ ; GFX940-NEXT: s_add_u32 s0, s0, 0xffe8
1539
+ ; GFX940-NEXT: scratch_load_dword v2, off, s0
1540
+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1541
+ ; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1542
+ ; GFX940-NEXT: s_endpgm
1543
+ ;
1544
+ ; GFX11-LABEL: sgpr_base_large_offset:
1545
+ ; GFX11: ; %bb.0: ; %entry
1546
+ ; GFX11-NEXT: s_add_u32 s0, s0, 0xffe8
1547
+ ; GFX11-NEXT: scratch_load_b32 v2, off, s0
1548
+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
1549
+ ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1550
+ ; GFX11-NEXT: s_nop 0
1551
+ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1552
+ ; GFX11-NEXT: s_endpgm
1553
+ ;
1554
+ ; GFX12-LABEL: sgpr_base_large_offset:
1555
+ ; GFX12: ; %bb.0: ; %entry
1556
+ ; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512
1557
+ ; GFX12-NEXT: s_wait_loadcnt 0x0
1558
+ ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1559
+ ; GFX12-NEXT: s_nop 0
1560
+ ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1561
+ ; GFX12-NEXT: s_endpgm
1562
+ entry:
1563
+ %large_offset = getelementptr i8 , ptr addrspace (5 ) %sgpr_base , i32 65512
1564
+ %load = load i32 , ptr addrspace (5 ) %large_offset , align 4
1565
+ store i32 %load , ptr addrspace (1 ) %out
1566
+ ret void
1567
+ }
1568
+
1569
; Clears the low two bits of %sgpr_base (and with 4294967292, which the expected
; output shows as an s_and_b32 with -4), then performs a volatile i32 load at
; byte offset 16842728 (0x100ffe8) from that aligned base and stores the value
; to %out.
; In the expected output every listed target adds the whole 0x100ffe8 constant
; to the scalar base and issues the scratch load without an immediate offset.
+ define amdgpu_gs void @sgpr_base_large_offset_split (ptr addrspace (1 ) %out , ptr addrspace (5 ) inreg %sgpr_base ) {
1570
+ ; GFX9-LABEL: sgpr_base_large_offset_split:
1571
+ ; GFX9: ; %bb.0: ; %entry
1572
+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1573
+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1574
+ ; GFX9-NEXT: s_and_b32 s0, s2, -4
1575
+ ; GFX9-NEXT: s_add_u32 s0, s0, 0x100ffe8
1576
+ ; GFX9-NEXT: scratch_load_dword v2, off, s0 glc
1577
+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1578
+ ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1579
+ ; GFX9-NEXT: s_endpgm
1580
+ ;
1581
+ ; GFX10-LABEL: sgpr_base_large_offset_split:
1582
+ ; GFX10: ; %bb.0: ; %entry
1583
+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1584
+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1585
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1586
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1587
+ ; GFX10-NEXT: s_and_b32 s0, s2, -4
1588
+ ; GFX10-NEXT: s_add_u32 s0, s0, 0x100ffe8
1589
+ ; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
1590
+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
1591
+ ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1592
+ ; GFX10-NEXT: s_endpgm
1593
+ ;
1594
+ ; GFX940-LABEL: sgpr_base_large_offset_split:
1595
+ ; GFX940: ; %bb.0: ; %entry
1596
+ ; GFX940-NEXT: s_and_b32 s0, s0, -4
1597
+ ; GFX940-NEXT: s_add_u32 s0, s0, 0x100ffe8
1598
+ ; GFX940-NEXT: scratch_load_dword v2, off, s0 sc0 sc1
1599
+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1600
+ ; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1601
+ ; GFX940-NEXT: s_endpgm
1602
+ ;
1603
+ ; GFX11-LABEL: sgpr_base_large_offset_split:
1604
+ ; GFX11: ; %bb.0: ; %entry
1605
+ ; GFX11-NEXT: s_and_b32 s0, s0, -4
1606
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1607
+ ; GFX11-NEXT: s_add_u32 s0, s0, 0x100ffe8
1608
+ ; GFX11-NEXT: scratch_load_b32 v2, off, s0 glc dlc
1609
+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
1610
+ ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1611
+ ; GFX11-NEXT: s_nop 0
1612
+ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1613
+ ; GFX11-NEXT: s_endpgm
1614
+ ;
1615
+ ; GFX12-LABEL: sgpr_base_large_offset_split:
1616
+ ; GFX12: ; %bb.0: ; %entry
1617
+ ; GFX12-NEXT: s_and_b32 s0, s0, -4
1618
+ ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1619
+ ; GFX12-NEXT: s_add_co_u32 s0, s0, 0x100ffe8
1620
+ ; GFX12-NEXT: scratch_load_b32 v2, off, s0 scope:SCOPE_SYS
1621
+ ; GFX12-NEXT: s_wait_loadcnt 0x0
1622
+ ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1623
+ ; GFX12-NEXT: s_nop 0
1624
+ ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1625
+ ; GFX12-NEXT: s_endpgm
1626
+ entry:
1627
+ ; Disabled leftover, not executed: %alignedBase = alloca [33554432 x i8], align 4, addrspace(5)
1628
+ %sgpr_base_i32 = ptrtoint ptr addrspace (5 ) %sgpr_base to i32
1629
+ %sgpr_base_i32_align4 = and i32 %sgpr_base_i32 , 4294967292
1630
+ %sgpr_base_align4 = inttoptr i32 %sgpr_base_i32_align4 to ptr addrspace (5 )
1631
+ %split_offset = getelementptr inbounds [33554432 x i8 ], ptr addrspace (5 ) %sgpr_base_align4 , i32 0 , i32 16842728
1632
+ %load = load volatile i32 , ptr addrspace (5 ) %split_offset , align 4
1633
+ store i32 %load , ptr addrspace (1 ) %out
1634
+ ret void
1635
+ }
1636
+
1637
; Volatile-stores the i32 constant 15 into addrspace(5) at %sgpr_base indexed by
; (%sidx + %vidx + 65512) bytes. %sgpr_base and %sidx are marked inreg; %vidx is
; not. The function takes no output pointer and returns void.
; Expected output: GFX9 and GFX940 first move 0xffe8 into a VGPR and then use
; v_add3_u32; GFX10 and GFX11 pass 0xffe8 to v_add3_u32 as a literal operand;
; GFX12 keeps the constant as the store's immediate offset 65512.
+ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset (ptr addrspace (5 ) inreg %sgpr_base , i32 inreg %sidx , i32 %vidx ) {
1638
+ ; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1639
+ ; GFX9: ; %bb.0: ; %bb
1640
+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1641
+ ; GFX9-NEXT: v_add_u32_e32 v0, s3, v0
1642
+ ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffe8
1643
+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1644
+ ; GFX9-NEXT: v_add3_u32 v0, s2, v0, v1
1645
+ ; GFX9-NEXT: v_mov_b32_e32 v1, 15
1646
+ ; GFX9-NEXT: scratch_store_dword v0, v1, off
1647
+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1648
+ ; GFX9-NEXT: s_endpgm
1649
+ ;
1650
+ ; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1651
+ ; GFX10: ; %bb.0: ; %bb
1652
+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1653
+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1654
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1655
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1656
+ ; GFX10-NEXT: v_add_nc_u32_e32 v0, s3, v0
1657
+ ; GFX10-NEXT: v_mov_b32_e32 v1, 15
1658
+ ; GFX10-NEXT: v_add3_u32 v0, s2, v0, 0xffe8
1659
+ ; GFX10-NEXT: scratch_store_dword v0, v1, off
1660
+ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1661
+ ; GFX10-NEXT: s_endpgm
1662
+ ;
1663
+ ; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1664
+ ; GFX940: ; %bb.0: ; %bb
1665
+ ; GFX940-NEXT: v_add_u32_e32 v0, s1, v0
1666
+ ; GFX940-NEXT: v_mov_b32_e32 v1, 0xffe8
1667
+ ; GFX940-NEXT: v_add3_u32 v0, s0, v0, v1
1668
+ ; GFX940-NEXT: v_mov_b32_e32 v1, 15
1669
+ ; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
1670
+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1671
+ ; GFX940-NEXT: s_endpgm
1672
+ ;
1673
+ ; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1674
+ ; GFX11: ; %bb.0: ; %bb
1675
+ ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
1676
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1677
+ ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 0xffe8
1678
+ ; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc
1679
+ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1680
+ ; GFX11-NEXT: s_endpgm
1681
+ ;
1682
+ ; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1683
+ ; GFX12: ; %bb.0: ; %bb
1684
+ ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
1685
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1686
+ ; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
1687
+ ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
1688
+ ; GFX12-NEXT: s_wait_storecnt 0x0
1689
+ ; GFX12-NEXT: s_endpgm
1690
+ bb:
1691
+ %add1 = add nsw i32 %sidx , %vidx
1692
+ %add2 = add nsw i32 %add1 , 65512
1693
+ %gep = getelementptr inbounds [33554432 x i8 ], ptr addrspace (5 ) %sgpr_base , i32 0 , i32 %add2
1694
+ store volatile i32 15 , ptr addrspace (5 ) %gep , align 4
1695
+ ret void
1696
+ }
1697
+
1698
; Loads one i32 from %scevgep - 24 bytes in addrspace(5) and stores it to %out.
; %scevgep is marked inreg, and the expected output uses a scalar register as
; the base of the scratch load.
; Expected output: GFX9 and GFX940 add 0xffffffe8 (-24 as a 32-bit value) to the
; scalar base and load with no immediate offset; GFX10, GFX11 and GFX12 encode
; the displacement directly on the load as immediate offset -24.
+ define amdgpu_gs void @sgpr_base_negative_offset (ptr addrspace (1 ) %out , ptr addrspace (5 ) inreg %scevgep ) {
1699
+ ; GFX9-LABEL: sgpr_base_negative_offset:
1700
+ ; GFX9: ; %bb.0: ; %entry
1701
+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1702
+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1703
+ ; GFX9-NEXT: s_add_u32 s0, s2, 0xffffffe8
1704
+ ; GFX9-NEXT: scratch_load_dword v2, off, s0
1705
+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1706
+ ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1707
+ ; GFX9-NEXT: s_endpgm
1708
+ ;
1709
+ ; GFX10-LABEL: sgpr_base_negative_offset:
1710
+ ; GFX10: ; %bb.0: ; %entry
1711
+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1712
+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1713
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1714
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1715
+ ; GFX10-NEXT: scratch_load_dword v2, off, s2 offset:-24
1716
+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
1717
+ ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1718
+ ; GFX10-NEXT: s_endpgm
1719
+ ;
1720
+ ; GFX940-LABEL: sgpr_base_negative_offset:
1721
+ ; GFX940: ; %bb.0: ; %entry
1722
+ ; GFX940-NEXT: s_add_u32 s0, s0, 0xffffffe8
1723
+ ; GFX940-NEXT: scratch_load_dword v2, off, s0
1724
+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1725
+ ; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1726
+ ; GFX940-NEXT: s_endpgm
1727
+ ;
1728
+ ; GFX11-LABEL: sgpr_base_negative_offset:
1729
+ ; GFX11: ; %bb.0: ; %entry
1730
+ ; GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24
1731
+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
1732
+ ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1733
+ ; GFX11-NEXT: s_nop 0
1734
+ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1735
+ ; GFX11-NEXT: s_endpgm
1736
+ ;
1737
+ ; GFX12-LABEL: sgpr_base_negative_offset:
1738
+ ; GFX12: ; %bb.0: ; %entry
1739
+ ; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24
1740
+ ; GFX12-NEXT: s_wait_loadcnt 0x0
1741
+ ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1742
+ ; GFX12-NEXT: s_nop 0
1743
+ ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1744
+ ; GFX12-NEXT: s_endpgm
1745
+ entry:
1746
+ %scevgep28 = getelementptr i8 , ptr addrspace (5 ) %scevgep , i32 -24
1747
+ %0 = load i32 , ptr addrspace (5 ) %scevgep28 , align 4
1748
+ store i32 %0 , ptr addrspace (1 ) %out
1749
+ ret void
1750
+ }
1751
+
1513
1752
; Declaration of an intrinsic taking no arguments and returning i32; none of the
; functions visible in this hunk call it, so its users lie elsewhere in the file.
declare i32 @llvm.amdgcn.workitem.id.x ()
0 commit comments