Skip to content

Commit 43076c2

Browse files
AMDGPU: Add test for 16 bit unsigned scratch offsets
A large scratch offset with its highest bit set is selected as a negative offset, because in 16 bits a negative offset has the same binary representation as a large unsigned offset.
1 parent f6e771c commit 43076c2

File tree

2 files changed

+683
-0
lines changed

2 files changed

+683
-0
lines changed

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1510,4 +1510,243 @@ bb:
15101510
ret void
15111511
}
15121512

1513+
; Loads an i32 from %sgpr_base + 65512 bytes (0xffe8) in addrspace(5) and
; stores it to %out. 65512 has bit 15 set, so its low 16 bits match those of
; -24. Per the checks below, GFX12 keeps the value as an immediate
; (offset:65512), while GFX9, GFX10, GFX940 and GFX11 add 0xffe8 to the base
; SGPR with s_add_u32 before the load.
define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) {
1514+
; GFX9-LABEL: sgpr_base_large_offset:
1515+
; GFX9: ; %bb.0: ; %entry
1516+
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1517+
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1518+
; GFX9-NEXT: s_add_u32 s0, s2, 0xffe8
1519+
; GFX9-NEXT: scratch_load_dword v2, off, s0
1520+
; GFX9-NEXT: s_waitcnt vmcnt(0)
1521+
; GFX9-NEXT: global_store_dword v[0:1], v2, off
1522+
; GFX9-NEXT: s_endpgm
1523+
;
1524+
; GFX10-LABEL: sgpr_base_large_offset:
1525+
; GFX10: ; %bb.0: ; %entry
1526+
; GFX10-NEXT: s_add_u32 s0, s0, s5
1527+
; GFX10-NEXT: s_addc_u32 s1, s1, 0
1528+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1529+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1530+
; GFX10-NEXT: s_add_u32 s0, s2, 0xffe8
1531+
; GFX10-NEXT: scratch_load_dword v2, off, s0
1532+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1533+
; GFX10-NEXT: global_store_dword v[0:1], v2, off
1534+
; GFX10-NEXT: s_endpgm
1535+
;
1536+
; GFX940-LABEL: sgpr_base_large_offset:
1537+
; GFX940: ; %bb.0: ; %entry
1538+
; GFX940-NEXT: s_add_u32 s0, s0, 0xffe8
1539+
; GFX940-NEXT: scratch_load_dword v2, off, s0
1540+
; GFX940-NEXT: s_waitcnt vmcnt(0)
1541+
; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1542+
; GFX940-NEXT: s_endpgm
1543+
;
1544+
; GFX11-LABEL: sgpr_base_large_offset:
1545+
; GFX11: ; %bb.0: ; %entry
1546+
; GFX11-NEXT: s_add_u32 s0, s0, 0xffe8
1547+
; GFX11-NEXT: scratch_load_b32 v2, off, s0
1548+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1549+
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1550+
; GFX11-NEXT: s_nop 0
1551+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1552+
; GFX11-NEXT: s_endpgm
1553+
;
1554+
; GFX12-LABEL: sgpr_base_large_offset:
1555+
; GFX12: ; %bb.0: ; %entry
1556+
; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512
1557+
; GFX12-NEXT: s_wait_loadcnt 0x0
1558+
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1559+
; GFX12-NEXT: s_nop 0
1560+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1561+
; GFX12-NEXT: s_endpgm
1562+
entry:
1563+
%large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512
1564+
%load = load i32, ptr addrspace(5) %large_offset, align 4
1565+
store i32 %load, ptr addrspace(1) %out
1566+
ret void
1567+
}
1568+
1569+
; Aligns %sgpr_base down to a multiple of 4 (mask 4294967292 = 0xfffffffc,
; printed as -4 in the checks), then does a volatile i32 load at byte offset
; 16842728 (0x100ffe8) from the aligned base and stores the result to %out.
; Per the checks below, every checked target adds the full 0x100ffe8 constant
; to the masked SGPR with a scalar add; none of them uses an immediate offset
; on the scratch load.
define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) {
1570+
; GFX9-LABEL: sgpr_base_large_offset_split:
1571+
; GFX9: ; %bb.0: ; %entry
1572+
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1573+
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1574+
; GFX9-NEXT: s_and_b32 s0, s2, -4
1575+
; GFX9-NEXT: s_add_u32 s0, s0, 0x100ffe8
1576+
; GFX9-NEXT: scratch_load_dword v2, off, s0 glc
1577+
; GFX9-NEXT: s_waitcnt vmcnt(0)
1578+
; GFX9-NEXT: global_store_dword v[0:1], v2, off
1579+
; GFX9-NEXT: s_endpgm
1580+
;
1581+
; GFX10-LABEL: sgpr_base_large_offset_split:
1582+
; GFX10: ; %bb.0: ; %entry
1583+
; GFX10-NEXT: s_add_u32 s0, s0, s5
1584+
; GFX10-NEXT: s_addc_u32 s1, s1, 0
1585+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1586+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1587+
; GFX10-NEXT: s_and_b32 s0, s2, -4
1588+
; GFX10-NEXT: s_add_u32 s0, s0, 0x100ffe8
1589+
; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
1590+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1591+
; GFX10-NEXT: global_store_dword v[0:1], v2, off
1592+
; GFX10-NEXT: s_endpgm
1593+
;
1594+
; GFX940-LABEL: sgpr_base_large_offset_split:
1595+
; GFX940: ; %bb.0: ; %entry
1596+
; GFX940-NEXT: s_and_b32 s0, s0, -4
1597+
; GFX940-NEXT: s_add_u32 s0, s0, 0x100ffe8
1598+
; GFX940-NEXT: scratch_load_dword v2, off, s0 sc0 sc1
1599+
; GFX940-NEXT: s_waitcnt vmcnt(0)
1600+
; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1601+
; GFX940-NEXT: s_endpgm
1602+
;
1603+
; GFX11-LABEL: sgpr_base_large_offset_split:
1604+
; GFX11: ; %bb.0: ; %entry
1605+
; GFX11-NEXT: s_and_b32 s0, s0, -4
1606+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1607+
; GFX11-NEXT: s_add_u32 s0, s0, 0x100ffe8
1608+
; GFX11-NEXT: scratch_load_b32 v2, off, s0 glc dlc
1609+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1610+
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1611+
; GFX11-NEXT: s_nop 0
1612+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1613+
; GFX11-NEXT: s_endpgm
1614+
;
1615+
; GFX12-LABEL: sgpr_base_large_offset_split:
1616+
; GFX12: ; %bb.0: ; %entry
1617+
; GFX12-NEXT: s_and_b32 s0, s0, -4
1618+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1619+
; GFX12-NEXT: s_add_co_u32 s0, s0, 0x100ffe8
1620+
; GFX12-NEXT: scratch_load_b32 v2, off, s0 scope:SCOPE_SYS
1621+
; GFX12-NEXT: s_wait_loadcnt 0x0
1622+
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1623+
; GFX12-NEXT: s_nop 0
1624+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1625+
; GFX12-NEXT: s_endpgm
1626+
entry:
1627+
; Disabled (not part of the test): %alignedBase = alloca [33554432 x i8], align 4, addrspace(5)
1628+
%sgpr_base_i32 = ptrtoint ptr addrspace(5) %sgpr_base to i32
1629+
; 4294967292 is 0xfffffffc: clears the two low bits of the base address.
%sgpr_base_i32_align4 = and i32 %sgpr_base_i32, 4294967292
1630+
%sgpr_base_align4 = inttoptr i32 %sgpr_base_i32_align4 to ptr addrspace(5)
1631+
%split_offset = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base_align4, i32 0, i32 16842728
1632+
%load = load volatile i32, ptr addrspace(5) %split_offset, align 4
1633+
store i32 %load, ptr addrspace(1) %out
1634+
ret void
1635+
}
1636+
1637+
; Volatile store of the constant 15 to %sgpr_base indexed by
; (%sidx + %vidx + 65512), i.e. an SGPR base plus an SGPR index, a VGPR index
; and the large immediate 65512 (0xffe8). Per the checks below, GFX12 keeps
; the immediate on the store (offset:65512). GFX9 and GFX940 first move 0xffe8
; into a VGPR and fold it in with v_add3_u32, while GFX10 and GFX11 pass
; 0xffe8 to v_add3_u32 as a literal operand.
define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
1638+
; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1639+
; GFX9: ; %bb.0: ; %bb
1640+
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1641+
; GFX9-NEXT: v_add_u32_e32 v0, s3, v0
1642+
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffe8
1643+
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1644+
; GFX9-NEXT: v_add3_u32 v0, s2, v0, v1
1645+
; GFX9-NEXT: v_mov_b32_e32 v1, 15
1646+
; GFX9-NEXT: scratch_store_dword v0, v1, off
1647+
; GFX9-NEXT: s_waitcnt vmcnt(0)
1648+
; GFX9-NEXT: s_endpgm
1649+
;
1650+
; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1651+
; GFX10: ; %bb.0: ; %bb
1652+
; GFX10-NEXT: s_add_u32 s0, s0, s5
1653+
; GFX10-NEXT: s_addc_u32 s1, s1, 0
1654+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1655+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1656+
; GFX10-NEXT: v_add_nc_u32_e32 v0, s3, v0
1657+
; GFX10-NEXT: v_mov_b32_e32 v1, 15
1658+
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 0xffe8
1659+
; GFX10-NEXT: scratch_store_dword v0, v1, off
1660+
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1661+
; GFX10-NEXT: s_endpgm
1662+
;
1663+
; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1664+
; GFX940: ; %bb.0: ; %bb
1665+
; GFX940-NEXT: v_add_u32_e32 v0, s1, v0
1666+
; GFX940-NEXT: v_mov_b32_e32 v1, 0xffe8
1667+
; GFX940-NEXT: v_add3_u32 v0, s0, v0, v1
1668+
; GFX940-NEXT: v_mov_b32_e32 v1, 15
1669+
; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
1670+
; GFX940-NEXT: s_waitcnt vmcnt(0)
1671+
; GFX940-NEXT: s_endpgm
1672+
;
1673+
; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1674+
; GFX11: ; %bb.0: ; %bb
1675+
; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
1676+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1677+
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 0xffe8
1678+
; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc
1679+
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1680+
; GFX11-NEXT: s_endpgm
1681+
;
1682+
; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
1683+
; GFX12: ; %bb.0: ; %bb
1684+
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
1685+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1686+
; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
1687+
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
1688+
; GFX12-NEXT: s_wait_storecnt 0x0
1689+
; GFX12-NEXT: s_endpgm
1690+
bb:
1691+
%add1 = add nsw i32 %sidx, %vidx
1692+
%add2 = add nsw i32 %add1, 65512
1693+
%gep = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
1694+
store volatile i32 15, ptr addrspace(5) %gep, align 4
1695+
ret void
1696+
}
1697+
1698+
; Loads an i32 from %scevgep - 24 bytes in addrspace(5) and stores it to %out.
; -24 is 0xffffffe8 as a 32-bit value, so its low 16 bits (0xffe8) equal
; those of the unsigned offset 65512. Per the checks below, GFX10, GFX11 and
; GFX12 keep the value as a signed immediate (offset:-24), while GFX9 and
; GFX940 add 0xffffffe8 to the base SGPR with s_add_u32 before the load.
define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
1699+
; GFX9-LABEL: sgpr_base_negative_offset:
1700+
; GFX9: ; %bb.0: ; %entry
1701+
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
1702+
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1703+
; GFX9-NEXT: s_add_u32 s0, s2, 0xffffffe8
1704+
; GFX9-NEXT: scratch_load_dword v2, off, s0
1705+
; GFX9-NEXT: s_waitcnt vmcnt(0)
1706+
; GFX9-NEXT: global_store_dword v[0:1], v2, off
1707+
; GFX9-NEXT: s_endpgm
1708+
;
1709+
; GFX10-LABEL: sgpr_base_negative_offset:
1710+
; GFX10: ; %bb.0: ; %entry
1711+
; GFX10-NEXT: s_add_u32 s0, s0, s5
1712+
; GFX10-NEXT: s_addc_u32 s1, s1, 0
1713+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1714+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1715+
; GFX10-NEXT: scratch_load_dword v2, off, s2 offset:-24
1716+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1717+
; GFX10-NEXT: global_store_dword v[0:1], v2, off
1718+
; GFX10-NEXT: s_endpgm
1719+
;
1720+
; GFX940-LABEL: sgpr_base_negative_offset:
1721+
; GFX940: ; %bb.0: ; %entry
1722+
; GFX940-NEXT: s_add_u32 s0, s0, 0xffffffe8
1723+
; GFX940-NEXT: scratch_load_dword v2, off, s0
1724+
; GFX940-NEXT: s_waitcnt vmcnt(0)
1725+
; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
1726+
; GFX940-NEXT: s_endpgm
1727+
;
1728+
; GFX11-LABEL: sgpr_base_negative_offset:
1729+
; GFX11: ; %bb.0: ; %entry
1730+
; GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24
1731+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1732+
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1733+
; GFX11-NEXT: s_nop 0
1734+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1735+
; GFX11-NEXT: s_endpgm
1736+
;
1737+
; GFX12-LABEL: sgpr_base_negative_offset:
1738+
; GFX12: ; %bb.0: ; %entry
1739+
; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24
1740+
; GFX12-NEXT: s_wait_loadcnt 0x0
1741+
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
1742+
; GFX12-NEXT: s_nop 0
1743+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1744+
; GFX12-NEXT: s_endpgm
1745+
entry:
1746+
%scevgep28 = getelementptr i8, ptr addrspace(5) %scevgep, i32 -24
1747+
%0 = load i32, ptr addrspace(5) %scevgep28, align 4
1748+
store i32 %0, ptr addrspace(1) %out
1749+
ret void
1750+
}
1751+
15131752
declare i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)