Skip to content

Commit 69abfd3

Browse files
authored
[AMDGPU] Allow casts between the Global and Constant Addr Spaces in isValidAddrSpaceCast (llvm#112493)
So far, isValidAddrSpaceCast only allows casts to the flat address space and between the constant and 32-bit constant address spaces. It does not allow casting between the global and constant address spaces, even though they alias. This affects, e.g., the lowering of memmoves from the constant to the global address space in LowerMemIntrinsics, which requires aliasing address spaces to be castable. This patch relaxes isValidAddrSpaceCast to allow such casts. It also includes a memmove test that would crash with the previous implementation because the memmove IR lowering would not be applicable to a move from the constant AS to the global AS.
1 parent b735c66 commit 69abfd3

File tree

3 files changed

+2513
-28
lines changed

3 files changed

+2513
-28
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 17 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919

2020
#include "AMDGPU.h"
2121
#include "llvm/CodeGen/BasicTTIImpl.h"
22+
#include "llvm/Support/AMDGPUAddrSpace.h"
2223
#include <optional>
2324

2425
namespace llvm {
@@ -174,24 +175,23 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
174175
bool isAlwaysUniform(const Value *V) const;
175176

176177
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
177-
if (ToAS == AMDGPUAS::FLAT_ADDRESS) {
178-
switch (FromAS) {
179-
case AMDGPUAS::GLOBAL_ADDRESS:
180-
case AMDGPUAS::CONSTANT_ADDRESS:
181-
case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
182-
case AMDGPUAS::LOCAL_ADDRESS:
183-
case AMDGPUAS::PRIVATE_ADDRESS:
184-
return true;
185-
default:
186-
break;
187-
}
178+
// Address space casts must cast between different address spaces.
179+
if (FromAS == ToAS)
188180
return false;
189-
}
190-
if ((FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
191-
ToAS == AMDGPUAS::CONSTANT_ADDRESS) ||
192-
(FromAS == AMDGPUAS::CONSTANT_ADDRESS &&
193-
ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT))
194-
return true;
181+
182+
if (FromAS == AMDGPUAS::FLAT_ADDRESS)
183+
return AMDGPU::isExtendedGlobalAddrSpace(ToAS) ||
184+
ToAS == AMDGPUAS::LOCAL_ADDRESS ||
185+
ToAS == AMDGPUAS::PRIVATE_ADDRESS;
186+
187+
if (AMDGPU::isExtendedGlobalAddrSpace(FromAS))
188+
return AMDGPU::isFlatGlobalAddrSpace(ToAS) ||
189+
ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
190+
191+
if (FromAS == AMDGPUAS::LOCAL_ADDRESS ||
192+
FromAS == AMDGPUAS::PRIVATE_ADDRESS)
193+
return ToAS == AMDGPUAS::FLAT_ADDRESS;
194+
195195
return false;
196196
}
197197

llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll

Lines changed: 167 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1336,8 +1336,8 @@ define amdgpu_kernel void @memmove_flat_align1_global_align1(ptr %dst, ptr addrs
13361336
; MAX1024-NEXT: ret void
13371337
;
13381338
; ALL-LABEL: @memmove_flat_align1_global_align1(
1339-
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[SRC:%.*]] to ptr
1340-
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
1339+
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(1)
1340+
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[TMP1]]
13411341
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
13421342
; ALL: memmove_bwd_loop:
13431343
; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
@@ -1404,8 +1404,8 @@ define amdgpu_kernel void @memmove_flat_align1_private_align1(ptr %dst, ptr addr
14041404
; MAX1024-NEXT: ret void
14051405
;
14061406
; ALL-LABEL: @memmove_flat_align1_private_align1(
1407-
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[SRC:%.*]] to ptr
1408-
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
1407+
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(5)
1408+
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[TMP1]]
14091409
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
14101410
; ALL: memmove_bwd_loop:
14111411
; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
@@ -1514,7 +1514,59 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1)
15141514

15151515
define amdgpu_kernel void @memmove_global_align1_p999_align1(ptr addrspace(1) %dst, ptr addrspace(999) %src, i64 %size) {
15161516
; OPT-LABEL: @memmove_global_align1_p999_align1(
1517-
; OPT-NEXT: call void @llvm.memmove.p1.p999.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(999) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false)
1517+
; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[SIZE:%.*]], 4
1518+
; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE]], 15
1519+
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]]
1520+
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0
1521+
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP1]], 0
1522+
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[DST:%.*]] to ptr addrspace(999)
1523+
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(999) [[SRC:%.*]], [[TMP4]]
1524+
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
1525+
; OPT: memmove_copy_backwards:
1526+
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
1527+
; OPT: memmove_bwd_residual_loop:
1528+
; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
1529+
; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1
1530+
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]]
1531+
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(999) [[TMP6]], align 1
1532+
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_RESIDUAL_INDEX]]
1533+
; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(1) [[TMP7]], align 1
1534+
; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
1535+
; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
1536+
; OPT: memmove_bwd_middle:
1537+
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
1538+
; OPT: memmove_bwd_main_loop:
1539+
; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
1540+
; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 1
1541+
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[SRC]], i64 [[BWD_MAIN_INDEX]]
1542+
; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(999) [[TMP10]], align 1
1543+
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_MAIN_INDEX]]
1544+
; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP11]], align 1
1545+
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0
1546+
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
1547+
; OPT: memmove_copy_forward:
1548+
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
1549+
; OPT: memmove_fwd_main_loop:
1550+
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
1551+
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[SRC]], i64 [[FWD_MAIN_INDEX]]
1552+
; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(999) [[TMP13]], align 1
1553+
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_MAIN_INDEX]]
1554+
; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(1) [[TMP14]], align 1
1555+
; OPT-NEXT: [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 1
1556+
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP1]]
1557+
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
1558+
; OPT: memmove_fwd_middle:
1559+
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
1560+
; OPT: memmove_fwd_residual_loop:
1561+
; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
1562+
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]]
1563+
; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(999) [[TMP17]], align 1
1564+
; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_RESIDUAL_INDEX]]
1565+
; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(1) [[TMP18]], align 1
1566+
; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1
1567+
; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]]
1568+
; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
1569+
; OPT: memmove_done:
15181570
; OPT-NEXT: ret void
15191571
;
15201572
call void @llvm.memmove.p1.p999.i64(ptr addrspace(1) %dst, ptr addrspace(999) %src, i64 %size, i1 false)
@@ -1523,7 +1575,59 @@ define amdgpu_kernel void @memmove_global_align1_p999_align1(ptr addrspace(1) %d
15231575

15241576
define amdgpu_kernel void @memmove_p999_align1_p1_align1(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size) {
15251577
; OPT-LABEL: @memmove_p999_align1_p1_align1(
1526-
; OPT-NEXT: call void @llvm.memmove.p999.p1.i64(ptr addrspace(999) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false)
1578+
; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[SIZE:%.*]], 4
1579+
; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE]], 15
1580+
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]]
1581+
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0
1582+
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP1]], 0
1583+
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(999) [[DST:%.*]] to ptr addrspace(1)
1584+
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[TMP4]]
1585+
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
1586+
; OPT: memmove_copy_backwards:
1587+
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
1588+
; OPT: memmove_bwd_residual_loop:
1589+
; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
1590+
; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1
1591+
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]]
1592+
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 1
1593+
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_RESIDUAL_INDEX]]
1594+
; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(999) [[TMP7]], align 1
1595+
; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
1596+
; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
1597+
; OPT: memmove_bwd_middle:
1598+
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
1599+
; OPT: memmove_bwd_main_loop:
1600+
; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
1601+
; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 1
1602+
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_MAIN_INDEX]]
1603+
; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP10]], align 1
1604+
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[BWD_MAIN_INDEX]]
1605+
; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(999) [[TMP11]], align 1
1606+
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0
1607+
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
1608+
; OPT: memmove_copy_forward:
1609+
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
1610+
; OPT: memmove_fwd_main_loop:
1611+
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
1612+
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_MAIN_INDEX]]
1613+
; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP13]], align 1
1614+
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[FWD_MAIN_INDEX]]
1615+
; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(999) [[TMP14]], align 1
1616+
; OPT-NEXT: [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 1
1617+
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP1]]
1618+
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
1619+
; OPT: memmove_fwd_middle:
1620+
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
1621+
; OPT: memmove_fwd_residual_loop:
1622+
; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
1623+
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]]
1624+
; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(1) [[TMP17]], align 1
1625+
; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_RESIDUAL_INDEX]]
1626+
; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(999) [[TMP18]], align 1
1627+
; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1
1628+
; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]]
1629+
; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
1630+
; OPT: memmove_done:
15271631
; OPT-NEXT: ret void
15281632
;
15291633
call void @llvm.memmove.p999.p1.i64(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size, i1 false)
@@ -1532,7 +1636,59 @@ define amdgpu_kernel void @memmove_p999_align1_p1_align1(ptr addrspace(999) %dst
15321636

15331637
define amdgpu_kernel void @memmove_p999_align1_p998_align1(ptr addrspace(999) %dst, ptr addrspace(998) %src, i64 %size) {
15341638
; OPT-LABEL: @memmove_p999_align1_p998_align1(
1535-
; OPT-NEXT: call void @llvm.memmove.p999.p998.i64(ptr addrspace(999) [[DST:%.*]], ptr addrspace(998) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false)
1639+
; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[SIZE:%.*]], 4
1640+
; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE]], 15
1641+
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]]
1642+
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0
1643+
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP1]], 0
1644+
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(999) [[DST:%.*]] to ptr addrspace(998)
1645+
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(998) [[SRC:%.*]], [[TMP4]]
1646+
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
1647+
; OPT: memmove_copy_backwards:
1648+
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
1649+
; OPT: memmove_bwd_residual_loop:
1650+
; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
1651+
; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1
1652+
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]]
1653+
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(998) [[TMP6]], align 1
1654+
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_RESIDUAL_INDEX]]
1655+
; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(999) [[TMP7]], align 1
1656+
; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
1657+
; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
1658+
; OPT: memmove_bwd_middle:
1659+
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
1660+
; OPT: memmove_bwd_main_loop:
1661+
; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
1662+
; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 1
1663+
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(998) [[SRC]], i64 [[BWD_MAIN_INDEX]]
1664+
; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(998) [[TMP10]], align 1
1665+
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[BWD_MAIN_INDEX]]
1666+
; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(999) [[TMP11]], align 1
1667+
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0
1668+
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
1669+
; OPT: memmove_copy_forward:
1670+
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
1671+
; OPT: memmove_fwd_main_loop:
1672+
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
1673+
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(998) [[SRC]], i64 [[FWD_MAIN_INDEX]]
1674+
; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(998) [[TMP13]], align 1
1675+
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[FWD_MAIN_INDEX]]
1676+
; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(999) [[TMP14]], align 1
1677+
; OPT-NEXT: [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 1
1678+
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP1]]
1679+
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
1680+
; OPT: memmove_fwd_middle:
1681+
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
1682+
; OPT: memmove_fwd_residual_loop:
1683+
; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
1684+
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]]
1685+
; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(998) [[TMP17]], align 1
1686+
; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_RESIDUAL_INDEX]]
1687+
; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(999) [[TMP18]], align 1
1688+
; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1
1689+
; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]]
1690+
; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
1691+
; OPT: memmove_done:
15361692
; OPT-NEXT: ret void
15371693
;
15381694
call void @llvm.memmove.p999.p998.i64(ptr addrspace(999) %dst, ptr addrspace(998) %src, i64 %size, i1 false)
@@ -1726,8 +1882,8 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds
17261882
; MAX1024-NEXT: ret void
17271883
;
17281884
; ALL-LABEL: @memmove_flat_align1_local_align1(
1729-
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(3) [[SRC:%.*]] to ptr
1730-
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
1885+
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(3)
1886+
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[TMP1]]
17311887
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
17321888
; ALL: memmove_bwd_loop:
17331889
; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
@@ -1761,8 +1917,8 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add
17611917
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
17621918
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
17631919
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
1764-
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(3) [[SRC:%.*]] to ptr
1765-
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP4]], [[DST:%.*]]
1920+
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(3)
1921+
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[TMP4]]
17661922
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
17671923
; OPT: memmove_copy_backwards:
17681924
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]

0 commit comments

Comments
 (0)