Skip to content

Commit 30f4ac1

Browse files
committed
[SepGEP] Reorder trivial GEP chains to separate constants
Change-Id: I813c3c402093fc73bed70a50cdfa24d396e1b771
1 parent 8ea7f1d commit 30f4ac1

File tree

7 files changed

+628
-139
lines changed

7 files changed

+628
-139
lines changed

llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,11 @@ class SeparateConstOffsetFromGEP {
391391
/// and returns true if the splitting succeeds.
392392
bool splitGEP(GetElementPtrInst *GEP);
393393

394+
/// Tries to reorder the given GEP with the GEP that produces the base if
395+
/// doing so results in producing a constant offset as the outermost
396+
/// index.
397+
bool reorderGEP(GetElementPtrInst *GEP, TargetTransformInfo &TTI);
398+
394399
/// Lower a GEP with multiple indices into multiple GEPs with a single index.
395400
/// Function splitGEP already split the original GEP into a variadic part and
396401
/// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
@@ -964,6 +969,51 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
964969
Variadic->eraseFromParent();
965970
}
966971

972+
/// Swap a trivial GEP chain `PtrGEP -> GEP` so that the GEP carrying a
/// constant offset becomes the outermost one, enabling the backend to fold
/// that constant into the addressing mode (e.g. as an immediate offset).
///
/// Only handles "trivial" chains: both GEPs have a single, non-aggregate
/// index over the same element type, and neither is inbounds (swapping the
/// order of inbounds GEPs would not be poison-preserving in general).
///
/// \param GEP the outer GEP whose pointer operand is itself a GEP.
/// \param TTI used to check that the constant offset is legal in an
///        addressing mode for the target/address space.
/// \returns true if the chain was reordered (the original \p GEP is erased).
bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
                                            TargetTransformInfo &TTI) {
  Type *GEPType = GEP->getResultElementType();
  // TODO: support reordering for non-trivial GEP chains.
  if (GEPType->isAggregateType() || GEP->getNumIndices() != 1 ||
      GEP->isInBounds())
    return false;

  auto *PtrGEP = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand());
  if (!PtrGEP || PtrGEP->isInBounds())
    return false;
  Type *PtrGEPType = PtrGEP->getResultElementType();
  // TODO: support reordering for non-trivial GEP chains.
  if (PtrGEPType->isAggregateType() || PtrGEP->getNumIndices() != 1)
    return false;

  // TODO: support reordering for non-trivial GEP chains.
  if (PtrGEPType != GEPType ||
      PtrGEP->getSourceElementType() != GEP->getSourceElementType())
    return false;

  // Only reorder when the inner GEP actually contributes a constant offset;
  // otherwise there is nothing to hoist outward.
  bool NestedNeedsExtraction;
  int64_t NestedByteOffset =
      accumulateByteOffset(PtrGEP, NestedNeedsExtraction);
  if (!NestedNeedsExtraction)
    return false;

  // The swap only pays off if the target can absorb the constant offset into
  // an addressing mode; bail out when it cannot.
  unsigned AddrSpace = PtrGEP->getPointerAddressSpace();
  if (!TTI.isLegalAddressingMode(GEPType,
                                 /*BaseGV=*/nullptr, NestedByteOffset,
                                 /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
    return false;

  IRBuilder<> Builder(GEP);
  Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
  // For trivial GEP chains, we can swap the indices: apply the outer
  // (variable) index first, then the inner (constant-bearing) index.
  auto *NewSrc = Builder.CreateGEP(PtrGEPType, PtrGEP->getPointerOperand(),
                                   SmallVector<Value *, 4>(GEP->indices()));
  auto *NewGEP = Builder.CreateGEP(GEPType, NewSrc,
                                   SmallVector<Value *, 4>(PtrGEP->indices()));
  GEP->replaceAllUsesWith(NewGEP);
  // After RAUW the original GEP is dead; deleting it recursively also cleans
  // up PtrGEP when this use was its last one.
  RecursivelyDeleteTriviallyDeadInstructions(GEP);
  return true;
}
1016+
9671017
bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
9681018
// Skip vector GEPs.
9691019
if (GEP->getType()->isVectorTy())
@@ -979,11 +1029,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
9791029
bool NeedsExtraction;
9801030
int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
9811031

982-
if (!NeedsExtraction)
983-
return Changed;
984-
9851032
TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
9861033

1034+
if (!NeedsExtraction) {
1035+
Changed |= reorderGEP(GEP, TTI);
1036+
return Changed;
1037+
}
1038+
9871039
// If LowerGEP is disabled, before really splitting the GEP, check whether the
9881040
// backend supports the addressing mode we are about to produce. If no, this
9891041
// splitting probably won't be beneficial.

llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll

Lines changed: 116 additions & 135 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/PowerPC/licm-remat.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ define linkonce_odr void @ZN6snappyDecompressor_(ptr %this, ptr %writer) {
2121
; CHECK-LABEL: ZN6snappyDecompressor_:
2222
; CHECK: # %bb.0: # %entry
2323
; CHECK: addis 4, 2, .L__ModuleStringPool@toc@ha
24-
; CHECK: addi 25, 4, .L__ModuleStringPool@toc@l
24+
; CHECK: addi 26, 4, .L__ModuleStringPool@toc@l
2525
; CHECK: .LBB0_2: # %for.cond
2626
; CHECK-NOT: addis {{[0-9]+}}, 2, .L__ModuleStringPool@toc@ha
2727
; CHECK: bctrl
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
2+
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -S -passes=separate-const-offset-from-gep < %s | FileCheck %s
3+
4+
define void @inbounds(ptr %in.ptr, i64 %in.idx0, i64 %in.idx1, i64 %in.idx2) {
5+
; CHECK-LABEL: define void @inbounds(
6+
; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i64 [[IN_IDX1:%.*]], i64 [[IN_IDX2:%.*]]) #[[ATTR0:[0-9]+]] {
7+
; CHECK-NEXT: entry:
8+
; CHECK-NEXT: [[CONST1:%.*]] = getelementptr inbounds <16 x i16>, ptr [[IN_PTR]], i64 2
9+
; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds <16 x i16>, ptr [[CONST1]], i64 [[IN_IDX1]]
10+
; CHECK-NEXT: [[CONST2:%.*]] = getelementptr <16 x i16>, ptr [[IN_PTR]], i64 24
11+
; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds <16 x i16>, ptr [[CONST2]], i64 [[IN_IDX1]]
12+
; CHECK-NEXT: [[CONST3:%.*]] = getelementptr inbounds <16 x i16>, ptr [[IN_PTR]], i64 6
13+
; CHECK-NEXT: [[IDX3:%.*]] = getelementptr <16 x i16>, ptr [[CONST3]], i64 [[IN_IDX2]]
14+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr <16 x i16>, ptr [[IN_PTR]], i64 [[IN_IDX2]]
15+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <16 x i16>, ptr [[TMP0]], i64 28
16+
; CHECK-NEXT: ret void
17+
;
18+
entry:
19+
%const1 = getelementptr inbounds <16 x i16>, ptr %in.ptr, i64 2
20+
%idx1 = getelementptr inbounds <16 x i16>, ptr %const1, i64 %in.idx1
21+
%const2 = getelementptr <16 x i16>, ptr %in.ptr, i64 24
22+
%idx2 = getelementptr inbounds <16 x i16>, ptr %const2, i64 %in.idx1
23+
%const3 = getelementptr inbounds <16 x i16>, ptr %in.ptr, i64 6
24+
%idx3 = getelementptr <16 x i16>, ptr %const3, i64 %in.idx2
25+
%const4 = getelementptr <16 x i16>, ptr %in.ptr, i64 28
26+
%idx4 = getelementptr <16 x i16>, ptr %const4, i64 %in.idx2
27+
ret void
28+
}
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --start-before=separate-const-offset-from-gep < %s | FileCheck %s
3+
4+
define protected amdgpu_kernel void @sink_addr(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
5+
; CHECK-LABEL: sink_addr:
6+
; CHECK: ; %bb.0: ; %entry
7+
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
8+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
9+
; CHECK-NEXT: s_lshl_b32 s3, s1, 1
10+
; CHECK-NEXT: s_add_i32 s0, s0, s3
11+
; CHECK-NEXT: s_lshl_b32 s2, s2, 1
12+
; CHECK-NEXT: s_add_i32 s0, s0, s2
13+
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
14+
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
15+
; CHECK-NEXT: ; %bb.1: ; %bb.1
16+
; CHECK-NEXT: v_mov_b32_e32 v12, s0
17+
; CHECK-NEXT: ds_read_b128 v[0:3], v12
18+
; CHECK-NEXT: ds_read_b128 v[4:7], v12 offset:512
19+
; CHECK-NEXT: ds_read_b128 v[8:11], v12 offset:1024
20+
; CHECK-NEXT: ds_read_b128 v[12:15], v12 offset:1536
21+
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
22+
; CHECK-NEXT: ;;#ASMSTART
23+
; CHECK-NEXT: ; use v[0:3]
24+
; CHECK-NEXT: ;;#ASMEND
25+
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
26+
; CHECK-NEXT: ;;#ASMSTART
27+
; CHECK-NEXT: ; use v[4:7]
28+
; CHECK-NEXT: ;;#ASMEND
29+
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
30+
; CHECK-NEXT: ;;#ASMSTART
31+
; CHECK-NEXT: ; use v[8:11]
32+
; CHECK-NEXT: ;;#ASMEND
33+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
34+
; CHECK-NEXT: ;;#ASMSTART
35+
; CHECK-NEXT: ; use v[12:15]
36+
; CHECK-NEXT: ;;#ASMEND
37+
; CHECK-NEXT: .LBB0_2: ; %end
38+
; CHECK-NEXT: s_add_i32 s1, s0, 0x200
39+
; CHECK-NEXT: v_mov_b32_e32 v0, s0
40+
; CHECK-NEXT: s_add_i32 s2, s0, 0x400
41+
; CHECK-NEXT: ;;#ASMSTART
42+
; CHECK-NEXT: ; use v0
43+
; CHECK-NEXT: ;;#ASMEND
44+
; CHECK-NEXT: v_mov_b32_e32 v0, s1
45+
; CHECK-NEXT: s_add_i32 s3, s0, 0x600
46+
; CHECK-NEXT: ;;#ASMSTART
47+
; CHECK-NEXT: ; use v0
48+
; CHECK-NEXT: ;;#ASMEND
49+
; CHECK-NEXT: v_mov_b32_e32 v0, s2
50+
; CHECK-NEXT: ;;#ASMSTART
51+
; CHECK-NEXT: ; use v0
52+
; CHECK-NEXT: ;;#ASMEND
53+
; CHECK-NEXT: v_mov_b32_e32 v0, s3
54+
; CHECK-NEXT: ;;#ASMSTART
55+
; CHECK-NEXT: ; use v0
56+
; CHECK-NEXT: ;;#ASMEND
57+
; CHECK-NEXT: s_endpgm
58+
entry:
59+
%base = getelementptr half, ptr addrspace(3) %in.ptr, i32 %in.idx0
60+
%idx0 = getelementptr half, ptr addrspace(3) %base, i32 %in.idx1
61+
%const1 = getelementptr half, ptr addrspace(3) %base, i32 256
62+
%idx1 = getelementptr half, ptr addrspace(3) %const1, i32 %in.idx1
63+
%const2 = getelementptr half, ptr addrspace(3) %base, i32 512
64+
%idx2 = getelementptr half, ptr addrspace(3) %const2, i32 %in.idx1
65+
%const3 = getelementptr half, ptr addrspace(3) %base, i32 768
66+
%idx3 = getelementptr half, ptr addrspace(3) %const3, i32 %in.idx1
67+
%cmp0 = icmp eq i32 %in.idx0, 0
68+
br i1 %cmp0, label %bb.1, label %end
69+
70+
bb.1:
71+
%val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
72+
%val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
73+
%val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
74+
%val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
75+
call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
76+
call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
77+
call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
78+
call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
79+
br label %end
80+
81+
end:
82+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
83+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
84+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
85+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
86+
ret void
87+
}
88+
89+
define protected amdgpu_kernel void @illegal_addr_mode(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
90+
; CHECK-LABEL: illegal_addr_mode:
91+
; CHECK: ; %bb.0: ; %entry
92+
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
93+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
94+
; CHECK-NEXT: s_lshl_b32 s0, s5, 1
95+
; CHECK-NEXT: s_lshl_b32 s1, s6, 1
96+
; CHECK-NEXT: s_add_i32 s3, s4, s0
97+
; CHECK-NEXT: s_add_i32 s3, s3, s1
98+
; CHECK-NEXT: s_add_i32 s2, s3, 0x12a60
99+
; CHECK-NEXT: s_add_i32 s1, s3, 0x12c60
100+
; CHECK-NEXT: s_add_i32 s0, s3, 0x12ed8
101+
; CHECK-NEXT: s_cmp_lg_u32 s5, 0
102+
; CHECK-NEXT: s_cbranch_scc1 .LBB1_2
103+
; CHECK-NEXT: ; %bb.1: ; %bb.1
104+
; CHECK-NEXT: v_mov_b32_e32 v0, s3
105+
; CHECK-NEXT: v_mov_b32_e32 v4, s2
106+
; CHECK-NEXT: v_mov_b32_e32 v8, s1
107+
; CHECK-NEXT: v_mov_b32_e32 v12, s0
108+
; CHECK-NEXT: ds_read_b128 v[0:3], v0
109+
; CHECK-NEXT: ds_read_b128 v[4:7], v4
110+
; CHECK-NEXT: ds_read_b128 v[8:11], v8
111+
; CHECK-NEXT: ds_read_b128 v[12:15], v12
112+
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
113+
; CHECK-NEXT: ;;#ASMSTART
114+
; CHECK-NEXT: ; use v[0:3]
115+
; CHECK-NEXT: ;;#ASMEND
116+
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
117+
; CHECK-NEXT: ;;#ASMSTART
118+
; CHECK-NEXT: ; use v[4:7]
119+
; CHECK-NEXT: ;;#ASMEND
120+
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
121+
; CHECK-NEXT: ;;#ASMSTART
122+
; CHECK-NEXT: ; use v[8:11]
123+
; CHECK-NEXT: ;;#ASMEND
124+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
125+
; CHECK-NEXT: ;;#ASMSTART
126+
; CHECK-NEXT: ; use v[12:15]
127+
; CHECK-NEXT: ;;#ASMEND
128+
; CHECK-NEXT: .LBB1_2: ; %end
129+
; CHECK-NEXT: v_mov_b32_e32 v0, s3
130+
; CHECK-NEXT: ;;#ASMSTART
131+
; CHECK-NEXT: ; use v0
132+
; CHECK-NEXT: ;;#ASMEND
133+
; CHECK-NEXT: v_mov_b32_e32 v0, s2
134+
; CHECK-NEXT: ;;#ASMSTART
135+
; CHECK-NEXT: ; use v0
136+
; CHECK-NEXT: ;;#ASMEND
137+
; CHECK-NEXT: v_mov_b32_e32 v0, s1
138+
; CHECK-NEXT: ;;#ASMSTART
139+
; CHECK-NEXT: ; use v0
140+
; CHECK-NEXT: ;;#ASMEND
141+
; CHECK-NEXT: v_mov_b32_e32 v0, s0
142+
; CHECK-NEXT: ;;#ASMSTART
143+
; CHECK-NEXT: ; use v0
144+
; CHECK-NEXT: ;;#ASMEND
145+
; CHECK-NEXT: s_endpgm
146+
entry:
147+
%base = getelementptr half, ptr addrspace(3) %in.ptr, i32 %in.idx0
148+
%idx0 = getelementptr half, ptr addrspace(3) %base, i32 %in.idx1
149+
%const1 = getelementptr half, ptr addrspace(3) %base, i32 38192
150+
%idx1 = getelementptr half, ptr addrspace(3) %const1, i32 %in.idx1
151+
%const2 = getelementptr half, ptr addrspace(3) %base, i32 38448
152+
%idx2 = getelementptr half, ptr addrspace(3) %const2, i32 %in.idx1
153+
%const3 = getelementptr half, ptr addrspace(3) %base, i32 38764
154+
%idx3 = getelementptr half, ptr addrspace(3) %const3, i32 %in.idx1
155+
%cmp0 = icmp eq i32 %in.idx0, 0
156+
br i1 %cmp0, label %bb.1, label %end
157+
158+
bb.1:
159+
%val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
160+
%val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
161+
%val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
162+
%val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
163+
call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
164+
call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
165+
call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
166+
call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
167+
br label %end
168+
169+
end:
170+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
171+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
172+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
173+
call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
174+
ret void
175+
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
2+
; RUN: opt -mtriple=nvptx64-nvidia-cuda -S -passes=separate-const-offset-from-gep < %s | FileCheck %s
3+
4+
define protected amdgpu_kernel void @sink_addr(ptr %in.ptr, i64 %in.idx0, i64 %in.idx1) {
5+
; CHECK-LABEL: define protected amdgpu_kernel void @sink_addr(
6+
; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i64 [[IN_IDX1:%.*]]) {
7+
; CHECK-NEXT: entry:
8+
; CHECK-NEXT: [[IDX0:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 [[IN_IDX1]]
9+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
10+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[TMP0]], i64 [[IN_IDX1]]
11+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i64 256
12+
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
13+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i64 [[IN_IDX1]]
14+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i64 512
15+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
16+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 [[IN_IDX1]]
17+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i64 768
18+
; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 [[IN_IDX0]], 0
19+
; CHECK-NEXT: br i1 [[CMP0]], label [[BB_1:%.*]], label [[END:%.*]]
20+
; CHECK: bb.1:
21+
; CHECK-NEXT: [[VAL0:%.*]] = load <8 x i64>, ptr [[IDX0]], align 16
22+
; CHECK-NEXT: [[VAL1:%.*]] = load <8 x i64>, ptr [[TMP2]], align 16
23+
; CHECK-NEXT: [[VAL2:%.*]] = load <8 x i64>, ptr [[TMP5]], align 16
24+
; CHECK-NEXT: [[VAL3:%.*]] = load <8 x i64>, ptr [[TMP8]], align 16
25+
; CHECK-NEXT: call void asm sideeffect "
26+
; CHECK-NEXT: call void asm sideeffect "
27+
; CHECK-NEXT: call void asm sideeffect "
28+
; CHECK-NEXT: call void asm sideeffect "
29+
; CHECK-NEXT: br label [[END]]
30+
; CHECK: end:
31+
; CHECK-NEXT: call void asm sideeffect "
32+
; CHECK-NEXT: call void asm sideeffect "
33+
; CHECK-NEXT: call void asm sideeffect "
34+
; CHECK-NEXT: call void asm sideeffect "
35+
; CHECK-NEXT: ret void
36+
;
37+
entry:
38+
%idx0 = getelementptr [8192 x i64], ptr %in.ptr, i64 %in.idx0, i64 %in.idx1
39+
%const1 = getelementptr [8192 x i64], ptr %in.ptr, i64 %in.idx0, i64 256
40+
%idx1 = getelementptr i64, ptr %const1, i64 %in.idx1
41+
%const2 = getelementptr [8192 x i64], ptr %in.ptr, i64 %in.idx0, i64 512
42+
%idx2 = getelementptr i64, ptr %const2, i64 %in.idx1
43+
%const3 = getelementptr [8192 x i64], ptr %in.ptr, i64 %in.idx0, i64 768
44+
%idx3 = getelementptr i64, ptr %const3, i64 %in.idx1
45+
%cmp0 = icmp eq i64 %in.idx0, 0
46+
br i1 %cmp0, label %bb.1, label %end
47+
48+
bb.1:
49+
%val0 = load <8 x i64>, ptr %idx0, align 16
50+
%val1 = load <8 x i64>, ptr %idx1, align 16
51+
%val2 = load <8 x i64>, ptr %idx2, align 16
52+
%val3 = load <8 x i64>, ptr %idx3, align 16
53+
call void asm sideeffect "; use $0", "v"(<8 x i64> %val0)
54+
call void asm sideeffect "; use $0", "v"(<8 x i64> %val1)
55+
call void asm sideeffect "; use $0", "v"(<8 x i64> %val2)
56+
call void asm sideeffect "; use $0", "v"(<8 x i64> %val3)
57+
br label %end
58+
59+
end:
60+
call void asm sideeffect "; use $0", "v"(ptr %idx0)
61+
call void asm sideeffect "; use $0", "v"(ptr %idx1)
62+
call void asm sideeffect "; use $0", "v"(ptr %idx2)
63+
call void asm sideeffect "; use $0", "v"(ptr %idx3)
64+
ret void
65+
}

0 commit comments

Comments
 (0)