Skip to content

Commit e9d12a6

Browse files
AMDGPU: Add test for 16 bit unsigned scratch offsets (#110255)
A large scratch offset with its highest bit set is selected as a negative offset; a negative offset has the same binary representation in 16 bits as a large unsigned offset.
1 parent f627c45 commit e9d12a6

File tree

2 files changed

+683
-0
lines changed

2 files changed

+683
-0
lines changed

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1510,4 +1510,243 @@ bb:
15101510
ret void
15111511
}
15121512

1513+
; Test: i32 load from scratch (addrspace 5) at %sgpr_base + 65512 bytes, with
; the loaded value stored to %out (addrspace 1).
; 65512 is 0xffe8, i.e. it fits in 16 bits as an unsigned value but has bit 15
; set.
; Expected selection, per the checks below: GFX9, GFX10, GFX940 and GFX11 add
; 0xffe8 to the base SGPR with s_add_u32 and use no immediate offset on the
; load, while GFX12 keeps the base SGPR and encodes the constant on the load
; as an immediate offset of 65512.
; NOTE(review): the check lines look tool-generated; regenerate them rather
; than hand-editing - confirm against the header of the test file.
define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) {
1514+
; GFX9-LABEL: sgpr_base_large_offset:
1515+
; GFX9: ; %bb.0: ; %entry
1516+
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1517+
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1518+
; GFX9-NEXT: s_add_u32 s0, s2, 0xffe8
1519+
; GFX9-NEXT: scratch_load_dword v2, off, s0
1520+
; GFX9-NEXT: s_waitcnt vmcnt(0)
1521+
; GFX9-NEXT: global_store_dword v[0:1], v2, off
1522+
; GFX9-NEXT: s_endpgm
1523+
;
1524+
; GFX10-LABEL: sgpr_base_large_offset:
1525+
; GFX10: ; %bb.0: ; %entry
1526+
; GFX10-NEXT: s_add_u32 s0, s0, s5
1527+
; GFX10-NEXT: s_addc_u32 s1, s1, 0
1528+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1529+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1530+
; GFX10-NEXT: s_add_u32 s0, s2, 0xffe8
1531+
; GFX10-NEXT: scratch_load_dword v2, off, s0
1532+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1533+
; GFX10-NEXT: global_store_dword v[0:1], v2, off
1534+
; GFX10-NEXT: s_endpgm
1535+
;
1536+
; GFX940-LABEL: sgpr_base_large_offset:
1537+
; GFX940: ; %bb.0: ; %entry
1538+
; GFX940-NEXT: s_add_u32 s0, s0, 0xffe8
1539+
; GFX940-NEXT: scratch_load_dword v2, off, s0
1540+
; GFX940-NEXT: s_waitcnt vmcnt(0)
1541+
; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1542+
; GFX940-NEXT: s_endpgm
1543+
;
1544+
; GFX11-LABEL: sgpr_base_large_offset:
1545+
; GFX11: ; %bb.0: ; %entry
1546+
; GFX11-NEXT: s_add_u32 s0, s0, 0xffe8
1547+
; GFX11-NEXT: scratch_load_b32 v2, off, s0
1548+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1549+
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1550+
; GFX11-NEXT: s_nop 0
1551+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1552+
; GFX11-NEXT: s_endpgm
1553+
;
1554+
; GFX12-LABEL: sgpr_base_large_offset:
1555+
; GFX12: ; %bb.0: ; %entry
1556+
; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512
1557+
; GFX12-NEXT: s_wait_loadcnt 0x0
1558+
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1559+
; GFX12-NEXT: s_nop 0
1560+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1561+
; GFX12-NEXT: s_endpgm
1562+
entry:
1563+
; i8 element type, so the index 65512 (0xffe8) is a plain byte offset.
%large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512
1564+
%load = load i32, ptr addrspace(5) %large_offset, align 4
1565+
; Storing the loaded value to %out gives the load a use.
store i32 %load, ptr addrspace(1) %out
1566+
ret void
1567+
}
1568+
1569+
; Test: volatile i32 load from scratch (addrspace 5) at
; (%sgpr_base with its two low bits cleared) + 16842728 bytes, with the loaded
; value stored to %out (addrspace 1).
; 16842728 is 0x100ffe8, i.e. 0x1000000 + 0xffe8.
; Expected selection, per the checks below: every target masks the base with
; s_and_b32 ..., -4 and then adds the whole constant 0x100ffe8 with a scalar
; add (s_add_u32, or s_add_co_u32 on GFX12); no immediate offset is used on
; the load.
; NOTE(review): the name suggests the offset is meant to be split between the
; register add and the immediate offset field; the checks currently show the
; full constant in the scalar add - confirm this is the intended baseline.
define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) {
1570+
; GFX9-LABEL: sgpr_base_large_offset_split:
1571+
; GFX9: ; %bb.0: ; %entry
1572+
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1573+
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1574+
; GFX9-NEXT: s_and_b32 s0, s2, -4
1575+
; GFX9-NEXT: s_add_u32 s0, s0, 0x100ffe8
1576+
; GFX9-NEXT: scratch_load_dword v2, off, s0 glc
1577+
; GFX9-NEXT: s_waitcnt vmcnt(0)
1578+
; GFX9-NEXT: global_store_dword v[0:1], v2, off
1579+
; GFX9-NEXT: s_endpgm
1580+
;
1581+
; GFX10-LABEL: sgpr_base_large_offset_split:
1582+
; GFX10: ; %bb.0: ; %entry
1583+
; GFX10-NEXT: s_add_u32 s0, s0, s5
1584+
; GFX10-NEXT: s_addc_u32 s1, s1, 0
1585+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1586+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1587+
; GFX10-NEXT: s_and_b32 s0, s2, -4
1588+
; GFX10-NEXT: s_add_u32 s0, s0, 0x100ffe8
1589+
; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
1590+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1591+
; GFX10-NEXT: global_store_dword v[0:1], v2, off
1592+
; GFX10-NEXT: s_endpgm
1593+
;
1594+
; GFX940-LABEL: sgpr_base_large_offset_split:
1595+
; GFX940: ; %bb.0: ; %entry
1596+
; GFX940-NEXT: s_and_b32 s0, s0, -4
1597+
; GFX940-NEXT: s_add_u32 s0, s0, 0x100ffe8
1598+
; GFX940-NEXT: scratch_load_dword v2, off, s0 sc0 sc1
1599+
; GFX940-NEXT: s_waitcnt vmcnt(0)
1600+
; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1601+
; GFX940-NEXT: s_endpgm
1602+
;
1603+
; GFX11-LABEL: sgpr_base_large_offset_split:
1604+
; GFX11: ; %bb.0: ; %entry
1605+
; GFX11-NEXT: s_and_b32 s0, s0, -4
1606+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1607+
; GFX11-NEXT: s_add_u32 s0, s0, 0x100ffe8
1608+
; GFX11-NEXT: scratch_load_b32 v2, off, s0 glc dlc
1609+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1610+
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1611+
; GFX11-NEXT: s_nop 0
1612+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1613+
; GFX11-NEXT: s_endpgm
1614+
;
1615+
; GFX12-LABEL: sgpr_base_large_offset_split:
1616+
; GFX12: ; %bb.0: ; %entry
1617+
; GFX12-NEXT: s_and_b32 s0, s0, -4
1618+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1619+
; GFX12-NEXT: s_add_co_u32 s0, s0, 0x100ffe8
1620+
; GFX12-NEXT: scratch_load_b32 v2, off, s0 scope:SCOPE_SYS
1621+
; GFX12-NEXT: s_wait_loadcnt 0x0
1622+
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1623+
; GFX12-NEXT: s_nop 0
1624+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1625+
; GFX12-NEXT: s_endpgm
1626+
entry:
1627+
; NOTE(review): the alloca below is commented out and unused; the base pointer
; comes from the %sgpr_base argument instead. Consider deleting the line.
;%alignedBase = alloca [33554432 x i8], align 4, addrspace(5)
1628+
; Round the incoming base down to a multiple of 4 via ptrtoint / and /
; inttoptr. 4294967292 is 0xfffffffc (printed as -4 in the checks above).
%sgpr_base_i32 = ptrtoint ptr addrspace(5) %sgpr_base to i32
1629+
%sgpr_base_i32_align4 = and i32 %sgpr_base_i32, 4294967292
1630+
%sgpr_base_align4 = inttoptr i32 %sgpr_base_i32_align4 to ptr addrspace(5)
1631+
; Index 16842728 (0x100ffe8) into a [33554432 x i8] array type, i.e. a byte
; offset of 0x100ffe8 from the aligned base.
%split_offset = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base_align4, i32 0, i32 16842728
1632+
; The checks expect cache-policy modifiers on this load (glc, glc dlc,
; sc0 sc1, scope:SCOPE_SYS depending on target) - presumably because it is
; volatile.
%load = load volatile i32, ptr addrspace(5) %split_offset, align 4
1633+
store i32 %load, ptr addrspace(1) %out
1634+
ret void
1635+
}
1636+
1637+
; Test: volatile store of the i32 constant 15 to scratch (addrspace 5) at
; %sgpr_base + (%sidx + %vidx + 65512) bytes, where %sgpr_base and %sidx are
; marked inreg and %vidx is not.
; 65512 is 0xffe8, i.e. it fits in 16 bits as an unsigned value but has bit 15
; set.
; Expected selection, per the checks below: GFX9 and GFX940 move 0xffe8 into a
; VGPR and fold it into the address with v_add3_u32; GFX10 and GFX11 pass
; 0xffe8 to v_add3_u32 as a literal operand; GFX12 adds only the register
; parts and encodes the constant on the store as an immediate offset of 65512.
define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
1638+
; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1639+
; GFX9: ; %bb.0: ; %bb
1640+
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1641+
; GFX9-NEXT: v_add_u32_e32 v0, s3, v0
1642+
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffe8
1643+
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1644+
; GFX9-NEXT: v_add3_u32 v0, s2, v0, v1
1645+
; GFX9-NEXT: v_mov_b32_e32 v1, 15
1646+
; GFX9-NEXT: scratch_store_dword v0, v1, off
1647+
; GFX9-NEXT: s_waitcnt vmcnt(0)
1648+
; GFX9-NEXT: s_endpgm
1649+
;
1650+
; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1651+
; GFX10: ; %bb.0: ; %bb
1652+
; GFX10-NEXT: s_add_u32 s0, s0, s5
1653+
; GFX10-NEXT: s_addc_u32 s1, s1, 0
1654+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1655+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1656+
; GFX10-NEXT: v_add_nc_u32_e32 v0, s3, v0
1657+
; GFX10-NEXT: v_mov_b32_e32 v1, 15
1658+
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 0xffe8
1659+
; GFX10-NEXT: scratch_store_dword v0, v1, off
1660+
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1661+
; GFX10-NEXT: s_endpgm
1662+
;
1663+
; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1664+
; GFX940: ; %bb.0: ; %bb
1665+
; GFX940-NEXT: v_add_u32_e32 v0, s1, v0
1666+
; GFX940-NEXT: v_mov_b32_e32 v1, 0xffe8
1667+
; GFX940-NEXT: v_add3_u32 v0, s0, v0, v1
1668+
; GFX940-NEXT: v_mov_b32_e32 v1, 15
1669+
; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
1670+
; GFX940-NEXT: s_waitcnt vmcnt(0)
1671+
; GFX940-NEXT: s_endpgm
1672+
;
1673+
; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1674+
; GFX11: ; %bb.0: ; %bb
1675+
; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
1676+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1677+
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 0xffe8
1678+
; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc
1679+
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1680+
; GFX11-NEXT: s_endpgm
1681+
;
1682+
; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1683+
; GFX12: ; %bb.0: ; %bb
1684+
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
1685+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1686+
; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
1687+
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
1688+
; GFX12-NEXT: s_wait_storecnt 0x0
1689+
; GFX12-NEXT: s_endpgm
1690+
bb:
1691+
; Byte index = %sidx + %vidx + 65512 (0xffe8); both adds carry nsw.
%add1 = add nsw i32 %sidx, %vidx
1692+
%add2 = add nsw i32 %add1, 65512
1693+
; Array element type is i8, so %add2 is a byte offset from %sgpr_base.
%gep = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
1694+
store volatile i32 15, ptr addrspace(5) %gep, align 4
1695+
ret void
1696+
}
1697+
1698+
; Test: i32 load from scratch (addrspace 5) at %scevgep - 24 bytes, with the
; loaded value stored to %out (addrspace 1).
; -24 is 0xffffffe8 as a 32-bit value; its low 16 bits are 0xffe8, the same
; bit pattern as the unsigned value 65512.
; Expected selection, per the checks below: GFX9 and GFX940 add 0xffffffe8 to
; the base SGPR with s_add_u32 and use no immediate offset on the load, while
; GFX10, GFX11 and GFX12 keep the base SGPR and encode the constant on the
; load as an immediate offset of -24.
define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
1699+
; GFX9-LABEL: sgpr_base_negative_offset:
1700+
; GFX9: ; %bb.0: ; %entry
1701+
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1702+
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1703+
; GFX9-NEXT: s_add_u32 s0, s2, 0xffffffe8
1704+
; GFX9-NEXT: scratch_load_dword v2, off, s0
1705+
; GFX9-NEXT: s_waitcnt vmcnt(0)
1706+
; GFX9-NEXT: global_store_dword v[0:1], v2, off
1707+
; GFX9-NEXT: s_endpgm
1708+
;
1709+
; GFX10-LABEL: sgpr_base_negative_offset:
1710+
; GFX10: ; %bb.0: ; %entry
1711+
; GFX10-NEXT: s_add_u32 s0, s0, s5
1712+
; GFX10-NEXT: s_addc_u32 s1, s1, 0
1713+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1714+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1715+
; GFX10-NEXT: scratch_load_dword v2, off, s2 offset:-24
1716+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1717+
; GFX10-NEXT: global_store_dword v[0:1], v2, off
1718+
; GFX10-NEXT: s_endpgm
1719+
;
1720+
; GFX940-LABEL: sgpr_base_negative_offset:
1721+
; GFX940: ; %bb.0: ; %entry
1722+
; GFX940-NEXT: s_add_u32 s0, s0, 0xffffffe8
1723+
; GFX940-NEXT: scratch_load_dword v2, off, s0
1724+
; GFX940-NEXT: s_waitcnt vmcnt(0)
1725+
; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1726+
; GFX940-NEXT: s_endpgm
1727+
;
1728+
; GFX11-LABEL: sgpr_base_negative_offset:
1729+
; GFX11: ; %bb.0: ; %entry
1730+
; GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24
1731+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1732+
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1733+
; GFX11-NEXT: s_nop 0
1734+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1735+
; GFX11-NEXT: s_endpgm
1736+
;
1737+
; GFX12-LABEL: sgpr_base_negative_offset:
1738+
; GFX12: ; %bb.0: ; %entry
1739+
; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24
1740+
; GFX12-NEXT: s_wait_loadcnt 0x0
1741+
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1742+
; GFX12-NEXT: s_nop 0
1743+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1744+
; GFX12-NEXT: s_endpgm
1745+
entry:
1746+
; i8 element type, so the index -24 is a plain (negative) byte offset.
%scevgep28 = getelementptr i8, ptr addrspace(5) %scevgep, i32 -24
1747+
%0 = load i32, ptr addrspace(5) %scevgep28, align 4
1748+
; Storing the loaded value to %out gives the load a use.
store i32 %0, ptr addrspace(1) %out
1749+
ret void
1750+
}
1751+
15131752
declare i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)