Skip to content

Commit 50c8466

Browse files
committed
[LoopVectorize] Add support for reverse loops in isDereferenceableAndAlignedInLoop
Currently when we encounter a negative step in the induction variable isDereferenceableAndAlignedInLoop bails out because the element size is signed greater than the step. This patch adds support for negative steps in cases where we detect the start address for the load is of the form base + offset. In this case the address decrements in each iteration so we need to calculate the access size differently. The motivation for this patch comes from PR #88385 where a reviewer requested reusing isDereferenceableAndAlignedInLoop, but that PR itself does not support reverse loops.
1 parent 3f616cf commit 50c8466

File tree

2 files changed

+51
-34
lines changed

2 files changed

+51
-34
lines changed

llvm/lib/Analysis/Loads.cpp

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -293,21 +293,24 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
293293

294294
// TODO: Handle overlapping accesses.
295295
// We should be computing AccessSize as (TC - 1) * Step + EltSize.
296-
if (EltSize.sgt(Step->getAPInt()))
296+
bool StepIsNegative = Step->getAPInt().isNegative();
297+
APInt AbsStep = Step->getAPInt().abs();
298+
if (EltSize.ugt(AbsStep))
297299
return false;
298300

299301
// Compute the total access size for access patterns with unit stride and
300302
// patterns with gaps. For patterns with unit stride, Step and EltSize are the
301303
// same.
302304
// For patterns with gaps (i.e. non unit stride), we are
303305
// accessing EltSize bytes at every Step.
304-
APInt AccessSize = TC * Step->getAPInt();
306+
APInt AccessSize = TC * AbsStep;
305307

306308
assert(SE.isLoopInvariant(AddRec->getStart(), L) &&
307309
"implied by addrec definition");
308310
Value *Base = nullptr;
309311
if (auto *StartS = dyn_cast<SCEVUnknown>(AddRec->getStart())) {
310-
Base = StartS->getValue();
312+
if (!StepIsNegative)
313+
Base = StartS->getValue();
311314
} else if (auto *StartS = dyn_cast<SCEVAddExpr>(AddRec->getStart())) {
312315
// Handle (NewBase + offset) as start value.
313316
const auto *Offset = dyn_cast<SCEVConstant>(StartS->getOperand(0));
@@ -318,11 +321,24 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
318321
// TODO: generalize if a case found which warrants
319322
if (Offset->getAPInt().urem(Alignment.value()) != 0)
320323
return false;
324+
if (StepIsNegative) {
325+
// In the last iteration of the loop the address we access will be
326+
// lower than the first by (TC - 1) * Step. So we need to make sure
327+
// that there is enough room in Offset to accommodate this.
328+
APInt SubOffset = (TC - 1) * AbsStep;
329+
if (Offset->getAPInt().ult(SubOffset))
330+
return false;
331+
// We can safely use the new base because the decrementing pointer is
332+
// always guaranteed to be >= new base. The total access size needs to
333+
// take into account the start offset and the loaded element size.
334+
AccessSize = Offset->getAPInt() + EltSize;
335+
} else {
336+
bool Overflow = false;
337+
AccessSize = AccessSize.uadd_ov(Offset->getAPInt(), Overflow);
338+
if (Overflow)
339+
return false;
340+
}
321341
Base = NewBase->getValue();
322-
bool Overflow = false;
323-
AccessSize = AccessSize.uadd_ov(Offset->getAPInt(), Overflow);
324-
if (Overflow)
325-
return false;
326342
}
327343
}
328344

llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll

Lines changed: 28 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
213213
; CHECK: vector.ph:
214214
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
215215
; CHECK: vector.body:
216-
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
216+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
217217
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
218218
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
219219
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP0]]
@@ -223,32 +223,33 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
223223
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
224224
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], <i32 3, i32 3>
225225
; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
226-
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
227-
; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
226+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]]
227+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
228+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 -1
229+
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4
230+
; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD1]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
231+
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
232+
; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
228233
; CHECK: pred.store.if:
229-
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]]
230-
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
231-
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
232-
; CHECK-NEXT: [[TMP10:%.*]] = shl nsw i32 [[TMP8]], 2
233-
; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4
234+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
235+
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 0
236+
; CHECK-NEXT: [[TMP12:%.*]] = shl nsw i32 [[TMP11]], 2
237+
; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP10]], align 4
234238
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
235239
; CHECK: pred.store.continue:
236-
; CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_STORE_IF]] ]
237-
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
238-
; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
239-
; CHECK: pred.store.if1:
240-
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], -1
241-
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP13]]
242-
; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4
243-
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP13]]
244-
; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i32 [[TMP15]], 2
245-
; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP16]], align 4
246-
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
247-
; CHECK: pred.store.continue2:
248-
; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP15]], [[PRED_STORE_IF1]] ]
240+
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
241+
; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
242+
; CHECK: pred.store.if3:
243+
; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1
244+
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]]
245+
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 1
246+
; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i32 [[TMP16]], 2
247+
; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4
248+
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
249+
; CHECK: pred.store.continue4:
249250
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
250-
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
251-
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
251+
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
252+
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
252253
; CHECK: middle.block:
253254
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
254255
; CHECK: scalar.ph:
@@ -257,13 +258,13 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
257258
; CHECK: for.body:
258259
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
259260
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[INDVARS_IV]]
260-
; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
261-
; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP20]], 3
261+
; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
262+
; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP19]], 3
262263
; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
263264
; CHECK: if.then:
264265
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[INDVARS_IV]]
265-
; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
266-
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP21]], 2
266+
; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
267+
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP20]], 2
267268
; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[INDVARS_IV]]
268269
; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4
269270
; CHECK-NEXT: br label [[FOR_INC]]

0 commit comments

Comments
 (0)