
[VPlan] Support VPWidenCastRecipe in isSingleScalar. #143552


Open
wants to merge 2 commits into base: main

Conversation

Mel-Chen (Contributor)

Pre-commit test #143498

llvmbot (Member) commented Jun 10, 2025

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-vectorizers

Author: Mel Chen (Mel-Chen)

Changes

Pre-commit test #143498


Full diff: https://github.com/llvm/llvm-project/pull/143552.diff

4 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+10-3)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanUtils.h (+5)
  • (modified) llvm/test/Transforms/LoopVectorize/X86/cost-model.ll (+1-1)
  • (added) llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll (+70)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index dc3c7bfe5cd1a..c91ca9b90e90b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1180,6 +1180,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
   if (Plan.hasScalarVFOnly())
     return;
 
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
   // Try to narrow wide and replicating recipes to single scalar recipes,
   // based on VPlan analysis. Only process blocks in the loop region for now,
   // without traversing into nested regions, as recipes in replicate regions
@@ -1194,13 +1195,19 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
         continue;
 
       auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
+      Instruction *UI = RepOrWidenR->getUnderlyingInstr();
+      if (!UI)
+        continue;
+
       // Skip recipes that aren't single scalars or don't have only their
       // scalar results used. In the latter case, we would introduce extra
       // broadcasts.
       if (!vputils::isSingleScalar(RepOrWidenR) ||
-          any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) {
-            return !U->usesScalars(RepOrWidenR);
-          }))
+          any_of(RepOrWidenR->users(),
+                 [RepOrWidenR](VPUser *U) {
+                   return !U->usesScalars(RepOrWidenR);
+                 }) ||
+          UI->getType() != TypeInfo.inferScalarType(RepOrWidenR))
         continue;
 
       auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 8a4e6fda89c12..7fc801a466dce 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -75,6 +75,11 @@ inline bool isSingleScalar(const VPValue *VPV) {
     return PreservesUniformity(WidenR->getOpcode()) &&
            all_of(WidenR->operands(), isSingleScalar);
   }
+  if (auto *CastR = dyn_cast<VPWidenCastRecipe>(VPV)) {
+    return PreservesUniformity(CastR->getOpcode()) &&
+           all_of(CastR->operands(), isSingleScalar);
+  }
+
   if (auto *VPI = dyn_cast<VPInstruction>(VPV))
     return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
            (PreservesUniformity(VPI->getOpcode()) &&
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 2c6fe4f5c808e..641ed90b6de34 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -379,7 +379,7 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 {
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT10]], zeroinitializer
 ; CHECK-NEXT:    [[TMP17:%.*]] = and <2 x i1> [[TMP16]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <2 x i1> [[TMP17]] to <2 x i8>
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i8> [[TMP18]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i8> [[TMP18]], i32 0
 ; CHECK-NEXT:    store i8 [[TMP19]], ptr [[DST]], align 1, !alias.scope [[META10:![0-9]+]], !noalias [[META12:![0-9]+]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
new file mode 100644
index 0000000000000..1ac4fe70921e6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+define void @minbw_cast(ptr %dst, i64 %n, i1 %bool1, i1 %bool2) {
+; CHECK-LABEL: define void @minbw_cast(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i1 [[BOOL1:%.*]], i1 [[BOOL2:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[BOOL1_EXT:%.*]] = zext i1 [[BOOL1]] to i32
+; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[BOOL2]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[BOOL1_EXT]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT2]] to <4 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i8> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
+; CHECK-NEXT:    store i8 [[TMP3]], ptr [[DST]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[BOOL2_EXT:%.*]] = zext i1 [[BOOL2]] to i32
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[BOOL1_EXT]], [[BOOL2_EXT]]
+; CHECK-NEXT:    [[XOR_TRUNC:%.*]] = trunc i32 [[XOR]] to i8
+; CHECK-NEXT:    store i8 [[XOR_TRUNC]], ptr [[DST]], align 1
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %bool1.ext = zext i1 %bool1 to i32
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %bool2.ext = zext i1 %bool2 to i32
+  %xor = xor i32 %bool1.ext, %bool2.ext
+  %xor.trunc = trunc i32 %xor to i8
+  store i8 %xor.trunc, ptr %dst, align 1
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ult i64 %iv.next, %n
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.


github-actions bot commented Jun 10, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Comment on lines 1202 to 1204
// Skip recipes that aren't single scalars or don't have only their
// scalar results used. In the latter case, we would introduce extra
// broadcasts.

Member: Update comment?

fhahn (Contributor) left a comment:

Could you add a brief description of the problematic issue? IIUC the issue is that a VPWidenCastRecipe could have been narrowed/extended, and then we cannot convert it to a VPReplicateRecipe, which clones the underlying instruction.

IIRC this should be solved by using VPInstructionWithType for single-scalar casts #140623

Mel-Chen (Contributor, Author):

> Could you add a brief description of the problematic issue? IIUC the issue is that a VPWidenCastRecipe could have been narrowed/extended, and then we cannot convert it to a VPReplicateRecipe, which clones the underlying instruction.
>
> IIRC this should be solved by using VPInstructionWithType for single-scalar casts #140623

Not only that: the issue lies in truncateToMinimalBitwidths, which affects the type of VPWidenRecipe as well. I'm trying to emit a VPInstruction instead of a VPReplicateRecipe when replacing the VPWidenRecipe, but I'm currently facing some issues and am working on resolving them.
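
For concreteness, the new single-scalar-cast-minbw.ll test above is exactly this situation: the scalar code computes the xor in i32 and truncates to i8, but after truncateToMinimalBitwidths the widened recipes operate on i8 while the underlying IR instructions still have i32 types. The guard added in narrowToSingleScalarRecipes (condensed here from the diff above) bails out when those types diverge:

// Condensed from the VPlanTransforms.cpp hunk in this PR: skip narrowing when
// the underlying instruction's type no longer matches the scalar type inferred
// for the recipe (e.g. i32 vs. i8 after minimal-bitwidth narrowing), because
// VPReplicateRecipe clones the underlying instruction and would reintroduce
// the wider type.
Instruction *UI = RepOrWidenR->getUnderlyingInstr();
if (!UI || UI->getType() != TypeInfo.inferScalarType(RepOrWidenR))
  continue;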

fhahn (Contributor) left a comment:

> > Could you add a brief description of the problematic issue? IIUC the issue is that a VPWidenCastRecipe could have been narrowed/extended, and then we cannot convert it to a VPReplicateRecipe, which clones the underlying instruction.
> > IIRC this should be solved by using VPInstructionWithType for single-scalar casts #140623
>
> Not only that: the issue lies in truncateToMinimalBitwidths, which affects the type of VPWidenRecipe as well. I'm trying to emit a VPInstruction instead of a VPReplicateRecipe when replacing the VPWidenRecipe, but I'm currently facing some issues and am working on resolving them.

To use VPInstructions instead of VPReplicateRecipe, I think #140623 and its follow-ups will be needed.

But I am missing the issue with VPWidenRecipe; the type of its result is defined by the types of its operands, so it should gracefully handle operands that have been extended or truncated?

Mel-Chen (Contributor, Author):

> > > Could you add a brief description of the problematic issue? IIUC the issue is that a VPWidenCastRecipe could have been narrowed/extended, and then we cannot convert it to a VPReplicateRecipe, which clones the underlying instruction.
> > > IIRC this should be solved by using VPInstructionWithType for single-scalar casts #140623
> >
> > Not only that: the issue lies in truncateToMinimalBitwidths, which affects the type of VPWidenRecipe as well. I'm trying to emit a VPInstruction instead of a VPReplicateRecipe when replacing the VPWidenRecipe, but I'm currently facing some issues and am working on resolving them.
>
> To use VPInstructions instead of VPReplicateRecipe, I think #140623 and its follow-ups will be needed.

Yes, #140623 is necessary if we want to solve the issue by emitting a VPInstruction, as I’m currently trying to do.

> But I am missing the issue with VPWidenRecipe; the type of its result is defined by the types of its operands, so it should gracefully handle operands that have been extended or truncated?

When executing the recipe, the result type of a VPWidenRecipe isn't derived from the underlying value; it seems to default to a vector type. However, what we need to produce here is actually a scalar.
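
Roughly speaking (a simplified paraphrase from memory of VPWidenRecipe::execute for a binary op, not the verbatim LLVM source; the real code handles many opcodes, IR flags, and metadata), the widened recipe fetches its operands as per-lane vector values and emits a vector operation, which matches the observation above that the result defaults to a vector:

// Simplified paraphrase of the body of VPWidenRecipe::execute for a binary op.
SmallVector<Value *, 2> Ops;
for (VPValue *Op : operands())
  Ops.push_back(State.get(Op));                     // vector values, one lane per element
Value *V = Builder.CreateNAryOp(getOpcode(), Ops);  // result is a vector
State.set(this, V);                                 // recorded as the recipe's vector result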
