Skip to content

Commit 03d1337

Browse files
petar-avramovic and tru
authored and committed
AMDGPU: Add test for 16 bit unsigned scratch offsets (llvm#110255)
A large scratch offset with its highest bit set was being selected as a negative offset, because a negative offset has the same 16-bit binary representation as a large unsigned offset. (cherry picked from commit e9d12a6)
1 parent 53010fc commit 03d1337

File tree

2 files changed

+683
-0
lines changed

2 files changed

+683
-0
lines changed

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1513,4 +1513,243 @@ bb:
15131513
ret void
15141514
}
15151515

1516+
; Test: load an i32 from scratch (addrspace(5)) at %sgpr_base + 65512 bytes and
; store it to the addrspace(1) pointer %out. %sgpr_base is passed inreg.
; 65512 is 0xffe8, which shares its low 16 bits with -24 (0xffffffe8). The
; checks below expect it to remain a positive value: added to the base as
; 0xffe8 on GFX9/GFX10/GFX940/GFX11, and encoded as offset:65512 on GFX12.
define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) {
1517+
; GFX9-LABEL: sgpr_base_large_offset:
1518+
; GFX9: ; %bb.0: ; %entry
1519+
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1520+
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1521+
; GFX9-NEXT: s_add_u32 s0, s2, 0xffe8
1522+
; GFX9-NEXT: scratch_load_dword v2, off, s0
1523+
; GFX9-NEXT: s_waitcnt vmcnt(0)
1524+
; GFX9-NEXT: global_store_dword v[0:1], v2, off
1525+
; GFX9-NEXT: s_endpgm
1526+
;
1527+
; GFX10-LABEL: sgpr_base_large_offset:
1528+
; GFX10: ; %bb.0: ; %entry
1529+
; GFX10-NEXT: s_add_u32 s0, s0, s5
1530+
; GFX10-NEXT: s_addc_u32 s1, s1, 0
1531+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1532+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1533+
; GFX10-NEXT: s_add_u32 s0, s2, 0xffe8
1534+
; GFX10-NEXT: scratch_load_dword v2, off, s0
1535+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1536+
; GFX10-NEXT: global_store_dword v[0:1], v2, off
1537+
; GFX10-NEXT: s_endpgm
1538+
;
1539+
; GFX940-LABEL: sgpr_base_large_offset:
1540+
; GFX940: ; %bb.0: ; %entry
1541+
; GFX940-NEXT: s_add_u32 s0, s0, 0xffe8
1542+
; GFX940-NEXT: scratch_load_dword v2, off, s0
1543+
; GFX940-NEXT: s_waitcnt vmcnt(0)
1544+
; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1545+
; GFX940-NEXT: s_endpgm
1546+
;
1547+
; GFX11-LABEL: sgpr_base_large_offset:
1548+
; GFX11: ; %bb.0: ; %entry
1549+
; GFX11-NEXT: s_add_u32 s0, s0, 0xffe8
1550+
; GFX11-NEXT: scratch_load_b32 v2, off, s0
1551+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1552+
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1553+
; GFX11-NEXT: s_nop 0
1554+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1555+
; GFX11-NEXT: s_endpgm
1556+
;
1557+
; GFX12-LABEL: sgpr_base_large_offset:
1558+
; GFX12: ; %bb.0: ; %entry
1559+
; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512
1560+
; GFX12-NEXT: s_wait_loadcnt 0x0
1561+
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1562+
; GFX12-NEXT: s_nop 0
1563+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1564+
; GFX12-NEXT: s_endpgm
1565+
entry:
1566+
; Byte offset 65512 (0xffe8) from the incoming base pointer.
%large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512
1567+
%load = load i32, ptr addrspace(5) %large_offset, align 4
1568+
store i32 %load, ptr addrspace(1) %out
1569+
ret void
1570+
}
1571+
1572+
; Test: clear the two low bits of %sgpr_base (and with 4294967292, shown as -4
; in the checks), then do a volatile i32 load at byte offset 16842728
; (0x100ffe8) from the masked pointer and store the result to %out.
; Every target's checks expect an s_and_b32 with -4, then a scalar add of the
; full 0x100ffe8 constant, and no immediate offset on the scratch load.
define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) {
1573+
; GFX9-LABEL: sgpr_base_large_offset_split:
1574+
; GFX9: ; %bb.0: ; %entry
1575+
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1576+
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1577+
; GFX9-NEXT: s_and_b32 s0, s2, -4
1578+
; GFX9-NEXT: s_add_u32 s0, s0, 0x100ffe8
1579+
; GFX9-NEXT: scratch_load_dword v2, off, s0 glc
1580+
; GFX9-NEXT: s_waitcnt vmcnt(0)
1581+
; GFX9-NEXT: global_store_dword v[0:1], v2, off
1582+
; GFX9-NEXT: s_endpgm
1583+
;
1584+
; GFX10-LABEL: sgpr_base_large_offset_split:
1585+
; GFX10: ; %bb.0: ; %entry
1586+
; GFX10-NEXT: s_add_u32 s0, s0, s5
1587+
; GFX10-NEXT: s_addc_u32 s1, s1, 0
1588+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1589+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1590+
; GFX10-NEXT: s_and_b32 s0, s2, -4
1591+
; GFX10-NEXT: s_add_u32 s0, s0, 0x100ffe8
1592+
; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
1593+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1594+
; GFX10-NEXT: global_store_dword v[0:1], v2, off
1595+
; GFX10-NEXT: s_endpgm
1596+
;
1597+
; GFX940-LABEL: sgpr_base_large_offset_split:
1598+
; GFX940: ; %bb.0: ; %entry
1599+
; GFX940-NEXT: s_and_b32 s0, s0, -4
1600+
; GFX940-NEXT: s_add_u32 s0, s0, 0x100ffe8
1601+
; GFX940-NEXT: scratch_load_dword v2, off, s0 sc0 sc1
1602+
; GFX940-NEXT: s_waitcnt vmcnt(0)
1603+
; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1604+
; GFX940-NEXT: s_endpgm
1605+
;
1606+
; GFX11-LABEL: sgpr_base_large_offset_split:
1607+
; GFX11: ; %bb.0: ; %entry
1608+
; GFX11-NEXT: s_and_b32 s0, s0, -4
1609+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1610+
; GFX11-NEXT: s_add_u32 s0, s0, 0x100ffe8
1611+
; GFX11-NEXT: scratch_load_b32 v2, off, s0 glc dlc
1612+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1613+
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1614+
; GFX11-NEXT: s_nop 0
1615+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1616+
; GFX11-NEXT: s_endpgm
1617+
;
1618+
; GFX12-LABEL: sgpr_base_large_offset_split:
1619+
; GFX12: ; %bb.0: ; %entry
1620+
; GFX12-NEXT: s_and_b32 s0, s0, -4
1621+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1622+
; GFX12-NEXT: s_add_co_u32 s0, s0, 0x100ffe8
1623+
; GFX12-NEXT: scratch_load_b32 v2, off, s0 scope:SCOPE_SYS
1624+
; GFX12-NEXT: s_wait_loadcnt 0x0
1625+
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1626+
; GFX12-NEXT: s_nop 0
1627+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1628+
; GFX12-NEXT: s_endpgm
1629+
entry:
1630+
; Disabled (not part of the test): %alignedBase = alloca [33554432 x i8], align 4, addrspace(5)
1631+
%sgpr_base_i32 = ptrtoint ptr addrspace(5) %sgpr_base to i32
1632+
; 4294967292 is 0xfffffffc: masks off the two low bits of the base address.
%sgpr_base_i32_align4 = and i32 %sgpr_base_i32, 4294967292
1633+
%sgpr_base_align4 = inttoptr i32 %sgpr_base_i32_align4 to ptr addrspace(5)
1634+
; Byte index 16842728 is 0x100ffe8, i.e. 0x1000000 + 0xffe8.
%split_offset = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base_align4, i32 0, i32 16842728
1635+
%load = load volatile i32, ptr addrspace(5) %split_offset, align 4
1636+
store i32 %load, ptr addrspace(1) %out
1637+
ret void
1638+
}
1639+
1640+
; Test: volatile store of i32 15 to scratch (addrspace(5)) at byte index
; (%sidx + %vidx + 65512) from %sgpr_base. %sgpr_base and %sidx are passed
; inreg; %vidx is not.
; The checks expect 0xffe8 to be added through v_add3_u32 on
; GFX9/GFX10/GFX940/GFX11 (via a v_mov_b32 into v1 on GFX9 and GFX940), while
; the GFX12 checks expect it encoded as offset:65512 on the scratch store.
define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
1641+
; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1642+
; GFX9: ; %bb.0: ; %bb
1643+
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1644+
; GFX9-NEXT: v_add_u32_e32 v0, s3, v0
1645+
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffe8
1646+
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1647+
; GFX9-NEXT: v_add3_u32 v0, s2, v0, v1
1648+
; GFX9-NEXT: v_mov_b32_e32 v1, 15
1649+
; GFX9-NEXT: scratch_store_dword v0, v1, off
1650+
; GFX9-NEXT: s_waitcnt vmcnt(0)
1651+
; GFX9-NEXT: s_endpgm
1652+
;
1653+
; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1654+
; GFX10: ; %bb.0: ; %bb
1655+
; GFX10-NEXT: s_add_u32 s0, s0, s5
1656+
; GFX10-NEXT: s_addc_u32 s1, s1, 0
1657+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1658+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1659+
; GFX10-NEXT: v_add_nc_u32_e32 v0, s3, v0
1660+
; GFX10-NEXT: v_mov_b32_e32 v1, 15
1661+
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 0xffe8
1662+
; GFX10-NEXT: scratch_store_dword v0, v1, off
1663+
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1664+
; GFX10-NEXT: s_endpgm
1665+
;
1666+
; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1667+
; GFX940: ; %bb.0: ; %bb
1668+
; GFX940-NEXT: v_add_u32_e32 v0, s1, v0
1669+
; GFX940-NEXT: v_mov_b32_e32 v1, 0xffe8
1670+
; GFX940-NEXT: v_add3_u32 v0, s0, v0, v1
1671+
; GFX940-NEXT: v_mov_b32_e32 v1, 15
1672+
; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
1673+
; GFX940-NEXT: s_waitcnt vmcnt(0)
1674+
; GFX940-NEXT: s_endpgm
1675+
;
1676+
; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1677+
; GFX11: ; %bb.0: ; %bb
1678+
; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
1679+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1680+
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 0xffe8
1681+
; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc
1682+
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1683+
; GFX11-NEXT: s_endpgm
1684+
;
1685+
; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1686+
; GFX12: ; %bb.0: ; %bb
1687+
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
1688+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1689+
; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
1690+
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
1691+
; GFX12-NEXT: s_wait_storecnt 0x0
1692+
; GFX12-NEXT: s_endpgm
1693+
bb:
1694+
%add1 = add nsw i32 %sidx, %vidx
1695+
; 65512 is 0xffe8, the constant that appears in the checks above.
%add2 = add nsw i32 %add1, 65512
1696+
%gep = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
1697+
store volatile i32 15, ptr addrspace(5) %gep, align 4
1698+
ret void
1699+
}
1700+
1701+
; Test: load an i32 from scratch (addrspace(5)) at %scevgep - 24 bytes and
; store it to the addrspace(1) pointer %out. %scevgep is passed inreg.
; -24 is 0xffffffe8 as a 32-bit value. The checks expect GFX10/GFX11/GFX12 to
; encode it as offset:-24 on the load, and GFX9/GFX940 to first add 0xffffffe8
; to the base SGPR and then load with no immediate offset.
define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
1702+
; GFX9-LABEL: sgpr_base_negative_offset:
1703+
; GFX9: ; %bb.0: ; %entry
1704+
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1705+
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1706+
; GFX9-NEXT: s_add_u32 s0, s2, 0xffffffe8
1707+
; GFX9-NEXT: scratch_load_dword v2, off, s0
1708+
; GFX9-NEXT: s_waitcnt vmcnt(0)
1709+
; GFX9-NEXT: global_store_dword v[0:1], v2, off
1710+
; GFX9-NEXT: s_endpgm
1711+
;
1712+
; GFX10-LABEL: sgpr_base_negative_offset:
1713+
; GFX10: ; %bb.0: ; %entry
1714+
; GFX10-NEXT: s_add_u32 s0, s0, s5
1715+
; GFX10-NEXT: s_addc_u32 s1, s1, 0
1716+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1717+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1718+
; GFX10-NEXT: scratch_load_dword v2, off, s2 offset:-24
1719+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1720+
; GFX10-NEXT: global_store_dword v[0:1], v2, off
1721+
; GFX10-NEXT: s_endpgm
1722+
;
1723+
; GFX940-LABEL: sgpr_base_negative_offset:
1724+
; GFX940: ; %bb.0: ; %entry
1725+
; GFX940-NEXT: s_add_u32 s0, s0, 0xffffffe8
1726+
; GFX940-NEXT: scratch_load_dword v2, off, s0
1727+
; GFX940-NEXT: s_waitcnt vmcnt(0)
1728+
; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1729+
; GFX940-NEXT: s_endpgm
1730+
;
1731+
; GFX11-LABEL: sgpr_base_negative_offset:
1732+
; GFX11: ; %bb.0: ; %entry
1733+
; GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24
1734+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1735+
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1736+
; GFX11-NEXT: s_nop 0
1737+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1738+
; GFX11-NEXT: s_endpgm
1739+
;
1740+
; GFX12-LABEL: sgpr_base_negative_offset:
1741+
; GFX12: ; %bb.0: ; %entry
1742+
; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24
1743+
; GFX12-NEXT: s_wait_loadcnt 0x0
1744+
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1745+
; GFX12-NEXT: s_nop 0
1746+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1747+
; GFX12-NEXT: s_endpgm
1748+
entry:
1749+
; -24 truncated to 16 bits is 0xffe8, the same bit pattern as unsigned 65512.
%scevgep28 = getelementptr i8, ptr addrspace(5) %scevgep, i32 -24
1750+
%0 = load i32, ptr addrspace(5) %scevgep28, align 4
1751+
store i32 %0, ptr addrspace(1) %out
1752+
ret void
1753+
}
1754+
15161755
declare i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)