@@ -1513,4 +1513,243 @@ bb:
1513
1513
ret void
1514
1514
}
1515
1515
1516
+ ; Loads an i32 from scratch at %sgpr_base + 65512 bytes (0xffe8) and stores it
+ ; to the global pointer %out. %sgpr_base is passed inreg. Per the checks below,
+ ; GFX9/GFX10/GFX940/GFX11 add 0xffe8 to the SGPR base with s_add_u32, while
+ ; GFX12 encodes the constant in the load's immediate field (offset:65512).
+ define amdgpu_gs void @sgpr_base_large_offset (ptr addrspace (1 ) %out , ptr addrspace (5 ) inreg %sgpr_base ) {
1517
+ ; GFX9-LABEL: sgpr_base_large_offset:
1518
+ ; GFX9: ; %bb.0: ; %entry
1519
+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1520
+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1521
+ ; GFX9-NEXT: s_add_u32 s0, s2, 0xffe8
1522
+ ; GFX9-NEXT: scratch_load_dword v2, off, s0
1523
+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1524
+ ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1525
+ ; GFX9-NEXT: s_endpgm
1526
+ ;
1527
+ ; GFX10-LABEL: sgpr_base_large_offset:
1528
+ ; GFX10: ; %bb.0: ; %entry
1529
+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1530
+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1531
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1532
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1533
+ ; GFX10-NEXT: s_add_u32 s0, s2, 0xffe8
1534
+ ; GFX10-NEXT: scratch_load_dword v2, off, s0
1535
+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
1536
+ ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1537
+ ; GFX10-NEXT: s_endpgm
1538
+ ;
1539
+ ; GFX940-LABEL: sgpr_base_large_offset:
1540
+ ; GFX940: ; %bb.0: ; %entry
1541
+ ; GFX940-NEXT: s_add_u32 s0, s0, 0xffe8
1542
+ ; GFX940-NEXT: scratch_load_dword v2, off, s0
1543
+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1544
+ ; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1545
+ ; GFX940-NEXT: s_endpgm
1546
+ ;
1547
+ ; GFX11-LABEL: sgpr_base_large_offset:
1548
+ ; GFX11: ; %bb.0: ; %entry
1549
+ ; GFX11-NEXT: s_add_u32 s0, s0, 0xffe8
1550
+ ; GFX11-NEXT: scratch_load_b32 v2, off, s0
1551
+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
1552
+ ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1553
+ ; GFX11-NEXT: s_nop 0
1554
+ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1555
+ ; GFX11-NEXT: s_endpgm
1556
+ ;
1557
+ ; GFX12-LABEL: sgpr_base_large_offset:
1558
+ ; GFX12: ; %bb.0: ; %entry
1559
+ ; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512
1560
+ ; GFX12-NEXT: s_wait_loadcnt 0x0
1561
+ ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1562
+ ; GFX12-NEXT: s_nop 0
1563
+ ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1564
+ ; GFX12-NEXT: s_endpgm
1565
+ entry:
1566
+ %large_offset = getelementptr i8 , ptr addrspace (5 ) %sgpr_base , i32 65512
1567
+ %load = load i32 , ptr addrspace (5 ) %large_offset , align 4
1568
+ store i32 %load , ptr addrspace (1 ) %out
1569
+ ret void
1570
+ }
1571
+
1572
+ ; Clears the low two bits of the inreg pointer %sgpr_base, then performs a
+ ; volatile i32 load at byte offset 16842728 (0x100ffe8) from that aligned base
+ ; and stores the result to the global pointer %out. In every check prefix
+ ; below the whole constant is added to the SGPR (s_add_u32 / s_add_co_u32) and
+ ; the scratch load itself carries no immediate offset.
+ ; The expected scratch loads also carry glc (GFX9), glc dlc (GFX10/GFX11),
+ ; sc0 sc1 (GFX940) or scope:SCOPE_SYS (GFX12) -- presumably because the load
+ ; is volatile; confirm against the backend if this matters.
+ define amdgpu_gs void @sgpr_base_large_offset_split (ptr addrspace (1 ) %out , ptr addrspace (5 ) inreg %sgpr_base ) {
1573
+ ; GFX9-LABEL: sgpr_base_large_offset_split:
1574
+ ; GFX9: ; %bb.0: ; %entry
1575
+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1576
+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1577
+ ; GFX9-NEXT: s_and_b32 s0, s2, -4
1578
+ ; GFX9-NEXT: s_add_u32 s0, s0, 0x100ffe8
1579
+ ; GFX9-NEXT: scratch_load_dword v2, off, s0 glc
1580
+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1581
+ ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1582
+ ; GFX9-NEXT: s_endpgm
1583
+ ;
1584
+ ; GFX10-LABEL: sgpr_base_large_offset_split:
1585
+ ; GFX10: ; %bb.0: ; %entry
1586
+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1587
+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1588
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1589
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1590
+ ; GFX10-NEXT: s_and_b32 s0, s2, -4
1591
+ ; GFX10-NEXT: s_add_u32 s0, s0, 0x100ffe8
1592
+ ; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
1593
+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
1594
+ ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1595
+ ; GFX10-NEXT: s_endpgm
1596
+ ;
1597
+ ; GFX940-LABEL: sgpr_base_large_offset_split:
1598
+ ; GFX940: ; %bb.0: ; %entry
1599
+ ; GFX940-NEXT: s_and_b32 s0, s0, -4
1600
+ ; GFX940-NEXT: s_add_u32 s0, s0, 0x100ffe8
1601
+ ; GFX940-NEXT: scratch_load_dword v2, off, s0 sc0 sc1
1602
+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1603
+ ; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1604
+ ; GFX940-NEXT: s_endpgm
1605
+ ;
1606
+ ; GFX11-LABEL: sgpr_base_large_offset_split:
1607
+ ; GFX11: ; %bb.0: ; %entry
1608
+ ; GFX11-NEXT: s_and_b32 s0, s0, -4
1609
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1610
+ ; GFX11-NEXT: s_add_u32 s0, s0, 0x100ffe8
1611
+ ; GFX11-NEXT: scratch_load_b32 v2, off, s0 glc dlc
1612
+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
1613
+ ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1614
+ ; GFX11-NEXT: s_nop 0
1615
+ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1616
+ ; GFX11-NEXT: s_endpgm
1617
+ ;
1618
+ ; GFX12-LABEL: sgpr_base_large_offset_split:
1619
+ ; GFX12: ; %bb.0: ; %entry
1620
+ ; GFX12-NEXT: s_and_b32 s0, s0, -4
1621
+ ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1622
+ ; GFX12-NEXT: s_add_co_u32 s0, s0, 0x100ffe8
1623
+ ; GFX12-NEXT: scratch_load_b32 v2, off, s0 scope:SCOPE_SYS
1624
+ ; GFX12-NEXT: s_wait_loadcnt 0x0
1625
+ ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1626
+ ; GFX12-NEXT: s_nop 0
1627
+ ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1628
+ ; GFX12-NEXT: s_endpgm
1629
+ entry:
1630
+ ; Disabled (commented out): %alignedBase = alloca [33554432 x i8], align 4, addrspace(5)
1631
+ %sgpr_base_i32 = ptrtoint ptr addrspace (5 ) %sgpr_base to i32
1632
+ ; 4294967292 is 0xfffffffc, the same 32-bit mask as the -4 in the s_and_b32 checks.
+ %sgpr_base_i32_align4 = and i32 %sgpr_base_i32 , 4294967292
1633
+ %sgpr_base_align4 = inttoptr i32 %sgpr_base_i32_align4 to ptr addrspace (5 )
1634
+ ; 16842728 = 0x1000000 + 0xffe8, matching the 0x100ffe8 literal in the checks.
+ %split_offset = getelementptr inbounds [33554432 x i8 ], ptr addrspace (5 ) %sgpr_base_align4 , i32 0 , i32 16842728
1635
+ %load = load volatile i32 , ptr addrspace (5 ) %split_offset , align 4
1636
+ store i32 %load , ptr addrspace (1 ) %out
1637
+ ret void
1638
+ }
1639
+
1640
+ ; Volatile-stores the constant 15 to scratch at byte offset
+ ; (%sidx + %vidx + 65512) from %sgpr_base. %sgpr_base and %sidx are inreg;
+ ; %vidx is not. Per the checks below, GFX9/GFX940 first move 0xffe8 into a
+ ; VGPR for v_add3_u32, GFX10/GFX11 pass 0xffe8 to v_add3_u32 as a literal, and
+ ; GFX12 instead places the constant in the store's immediate (offset:65512).
+ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset (ptr addrspace (5 ) inreg %sgpr_base , i32 inreg %sidx , i32 %vidx ) {
1641
+ ; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1642
+ ; GFX9: ; %bb.0: ; %bb
1643
+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1644
+ ; GFX9-NEXT: v_add_u32_e32 v0, s3, v0
1645
+ ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffe8
1646
+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1647
+ ; GFX9-NEXT: v_add3_u32 v0, s2, v0, v1
1648
+ ; GFX9-NEXT: v_mov_b32_e32 v1, 15
1649
+ ; GFX9-NEXT: scratch_store_dword v0, v1, off
1650
+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1651
+ ; GFX9-NEXT: s_endpgm
1652
+ ;
1653
+ ; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1654
+ ; GFX10: ; %bb.0: ; %bb
1655
+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1656
+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1657
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1658
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1659
+ ; GFX10-NEXT: v_add_nc_u32_e32 v0, s3, v0
1660
+ ; GFX10-NEXT: v_mov_b32_e32 v1, 15
1661
+ ; GFX10-NEXT: v_add3_u32 v0, s2, v0, 0xffe8
1662
+ ; GFX10-NEXT: scratch_store_dword v0, v1, off
1663
+ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1664
+ ; GFX10-NEXT: s_endpgm
1665
+ ;
1666
+ ; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1667
+ ; GFX940: ; %bb.0: ; %bb
1668
+ ; GFX940-NEXT: v_add_u32_e32 v0, s1, v0
1669
+ ; GFX940-NEXT: v_mov_b32_e32 v1, 0xffe8
1670
+ ; GFX940-NEXT: v_add3_u32 v0, s0, v0, v1
1671
+ ; GFX940-NEXT: v_mov_b32_e32 v1, 15
1672
+ ; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
1673
+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1674
+ ; GFX940-NEXT: s_endpgm
1675
+ ;
1676
+ ; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1677
+ ; GFX11: ; %bb.0: ; %bb
1678
+ ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
1679
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1680
+ ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 0xffe8
1681
+ ; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc
1682
+ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1683
+ ; GFX11-NEXT: s_endpgm
1684
+ ;
1685
+ ; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1686
+ ; GFX12: ; %bb.0: ; %bb
1687
+ ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
1688
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1689
+ ; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
1690
+ ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
1691
+ ; GFX12-NEXT: s_wait_storecnt 0x0
1692
+ ; GFX12-NEXT: s_endpgm
1693
+ bb:
1694
+ %add1 = add nsw i32 %sidx , %vidx
1695
+ %add2 = add nsw i32 %add1 , 65512
1696
+ ; Element type is i8, so %add2 is used directly as a byte offset from %sgpr_base.
+ %gep = getelementptr inbounds [33554432 x i8 ], ptr addrspace (5 ) %sgpr_base , i32 0 , i32 %add2
1697
+ store volatile i32 15 , ptr addrspace (5 ) %gep , align 4
1698
+ ret void
1699
+ }
1700
+
1701
+ ; Loads an i32 from scratch at %scevgep - 24 bytes and stores it to the global
+ ; pointer %out. %scevgep is passed inreg. Per the checks below, GFX9 and GFX940
+ ; add 0xffffffe8 (-24 as a 32-bit value) to the SGPR base with s_add_u32,
+ ; whereas GFX10/GFX11/GFX12 encode the constant as the immediate offset:-24.
+ define amdgpu_gs void @sgpr_base_negative_offset (ptr addrspace (1 ) %out , ptr addrspace (5 ) inreg %scevgep ) {
1702
+ ; GFX9-LABEL: sgpr_base_negative_offset:
1703
+ ; GFX9: ; %bb.0: ; %entry
1704
+ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1705
+ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1706
+ ; GFX9-NEXT: s_add_u32 s0, s2, 0xffffffe8
1707
+ ; GFX9-NEXT: scratch_load_dword v2, off, s0
1708
+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
1709
+ ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1710
+ ; GFX9-NEXT: s_endpgm
1711
+ ;
1712
+ ; GFX10-LABEL: sgpr_base_negative_offset:
1713
+ ; GFX10: ; %bb.0: ; %entry
1714
+ ; GFX10-NEXT: s_add_u32 s0, s0, s5
1715
+ ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1716
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1717
+ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1718
+ ; GFX10-NEXT: scratch_load_dword v2, off, s2 offset:-24
1719
+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
1720
+ ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1721
+ ; GFX10-NEXT: s_endpgm
1722
+ ;
1723
+ ; GFX940-LABEL: sgpr_base_negative_offset:
1724
+ ; GFX940: ; %bb.0: ; %entry
1725
+ ; GFX940-NEXT: s_add_u32 s0, s0, 0xffffffe8
1726
+ ; GFX940-NEXT: scratch_load_dword v2, off, s0
1727
+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
1728
+ ; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1729
+ ; GFX940-NEXT: s_endpgm
1730
+ ;
1731
+ ; GFX11-LABEL: sgpr_base_negative_offset:
1732
+ ; GFX11: ; %bb.0: ; %entry
1733
+ ; GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24
1734
+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
1735
+ ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1736
+ ; GFX11-NEXT: s_nop 0
1737
+ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1738
+ ; GFX11-NEXT: s_endpgm
1739
+ ;
1740
+ ; GFX12-LABEL: sgpr_base_negative_offset:
1741
+ ; GFX12: ; %bb.0: ; %entry
1742
+ ; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24
1743
+ ; GFX12-NEXT: s_wait_loadcnt 0x0
1744
+ ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1745
+ ; GFX12-NEXT: s_nop 0
1746
+ ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1747
+ ; GFX12-NEXT: s_endpgm
1748
+ entry:
1749
+ %scevgep28 = getelementptr i8 , ptr addrspace (5 ) %scevgep , i32 -24
1750
+ %0 = load i32 , ptr addrspace (5 ) %scevgep28 , align 4
1751
+ store i32 %0 , ptr addrspace (1 ) %out
1752
+ ret void
1753
+ }
1754
+
1516
1755
declare i32 @llvm.amdgcn.workitem.id.x ()
0 commit comments