Skip to content

Commit ec62dee

Browse files
authored
[VPlan] Handle FirstActiveLane when unrolling. (#145394)
FirstActiveLane is currently not handled correctly during unrolling, which causes miscompiles when vectorizing early-exit loops with interleaving forced. This patch updates the handling of FirstActiveLane to be analogous to computing final reduction results: during unrolling, the copies created for its original operand are added as additional operands, and FirstActiveLane always produces the index of the first active lane across all unrolled iterations. Note that some of the generated code is still incorrect, because ExtractElement with FirstActiveLane operands also needs to be handled. I will share patches for those cases soon as well. PR: #145394
1 parent 045b827 commit ec62dee

File tree

6 files changed

+280
-132
lines changed

6 files changed

+280
-132
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -966,7 +966,9 @@ class VPInstruction : public VPRecipeWithIRFlags,
966966
// all unrolled iterations. Unrolling will add all copies of its original
967967
// operand as additional operands.
968968
AnyOf,
969-
// Calculates the first active lane index of the vector predicate operand.
969+
// Calculates the first active lane index of the vector predicate operands.
970+
// It produces the lane index across all unrolled iterations. Unrolling will
971+
// add all copies of its original operand as additional operands.
970972
FirstActiveLane,
971973

972974
// The opcodes below are used for VPInstructionWithType.

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -856,9 +856,32 @@ Value *VPInstruction::generate(VPTransformState &State) {
856856
return Builder.CreateOrReduce(Res);
857857
}
858858
case VPInstruction::FirstActiveLane: {
859-
Value *Mask = State.get(getOperand(0));
860-
return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
861-
true, Name);
859+
if (getNumOperands() == 1) {
860+
Value *Mask = State.get(getOperand(0));
861+
return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
862+
true, Name);
863+
}
864+
// If there are multiple operands, create a chain of selects to pick the
865+
// first operand with an active lane and add the number of lanes of the
866+
// preceding operands.
867+
Value *RuntimeVF =
868+
getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF);
869+
unsigned LastOpIdx = getNumOperands() - 1;
870+
Value *Res = nullptr;
871+
for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
872+
Value *TrailingZeros = Builder.CreateCountTrailingZeroElems(
873+
Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
874+
Value *Current = Builder.CreateAdd(
875+
Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
876+
if (Res) {
877+
Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
878+
Res = Builder.CreateSelect(Cmp, Current, Res);
879+
} else {
880+
Res = Current;
881+
}
882+
}
883+
884+
return Res;
862885
}
863886
default:
864887
llvm_unreachable("Unsupported opcode for instruction");

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -345,10 +345,12 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
345345
if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
346346
continue;
347347

348-
// Add all VPValues for all parts to AnyOf and Compute*Result which combine
349-
// all parts to compute the final value.
348+
// Add all VPValues for all parts to AnyOf, FirstActiveLane and
349+
// Compute*Result which combine all parts to compute the final value.
350350
VPValue *Op1;
351351
if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||
352+
match(&R, m_VPInstruction<VPInstruction::FirstActiveLane>(
353+
m_VPValue(Op1))) ||
352354
match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(
353355
m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
354356
match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(

llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,27 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
7777
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 510, [[N_VEC]]
7878
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
7979
; CHECK: vector.early.exit:
80+
; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64()
81+
; CHECK-NEXT: [[TMP42:%.*]] = mul nuw i64 [[TMP63]], 16
82+
; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
83+
; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP42]], 3
84+
; CHECK-NEXT: [[TMP45:%.*]] = add i64 [[TMP62]], [[TMP44]]
85+
; CHECK-NEXT: [[TMP46:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP31]], i1 true)
86+
; CHECK-NEXT: [[TMP58:%.*]] = mul i64 [[TMP42]], 2
87+
; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[TMP58]], [[TMP46]]
88+
; CHECK-NEXT: [[TMP47:%.*]] = icmp ne i64 [[TMP46]], [[TMP42]]
89+
; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP47]], i64 [[TMP50]], i64 [[TMP45]]
90+
; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP30]], i1 true)
91+
; CHECK-NEXT: [[TMP64:%.*]] = mul i64 [[TMP42]], 1
92+
; CHECK-NEXT: [[TMP56:%.*]] = add i64 [[TMP64]], [[TMP52]]
93+
; CHECK-NEXT: [[TMP53:%.*]] = icmp ne i64 [[TMP52]], [[TMP42]]
94+
; CHECK-NEXT: [[TMP57:%.*]] = select i1 [[TMP53]], i64 [[TMP56]], i64 [[TMP51]]
8095
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP11]], i1 true)
81-
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP15]]
96+
; CHECK-NEXT: [[TMP65:%.*]] = mul i64 [[TMP42]], 0
97+
; CHECK-NEXT: [[TMP60:%.*]] = add i64 [[TMP65]], [[TMP15]]
98+
; CHECK-NEXT: [[TMP59:%.*]] = icmp ne i64 [[TMP15]], [[TMP42]]
99+
; CHECK-NEXT: [[TMP61:%.*]] = select i1 [[TMP59]], i64 [[TMP60]], i64 [[TMP57]]
100+
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP61]]
82101
; CHECK-NEXT: [[TMP17:%.*]] = add i64 3, [[TMP16]]
83102
; CHECK-NEXT: br label [[LOOP_END]]
84103
; CHECK: scalar.ph:

0 commit comments

Comments
 (0)