[SDAG] Adding scalarization of `llvm.vector.insert` #71614

rsuderman · 2023-11-08T01:05:44Z

Needed handling the case of scalarizing operands of subvector insertion.

llvmbot · 2023-11-08T01:06:15Z

@llvm/pr-subscribers-llvm-selectiondag

@llvm/pr-subscribers-backend-aarch64

Author: Rob Suderman (rsuderman)

Changes

Needed handling the case of scalarizing operands of subvector insertion.

Full diff: https://github.com/llvm/llvm-project/pull/71614.diff

4 Files Affected:

(modified) llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h (+1)
(modified) llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (+26-2)
(modified) llvm/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll (+9)
(modified) utils/bazel/llvm-project-overlay/mlir/BUILD.bazel (+72)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index f85c1296cdce856..5651c6e9b218447 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -809,6 +809,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   // Vector Operand Scalarization: <1 x ty> -> ty.
   bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
   SDValue ScalarizeVecOp_BITCAST(SDNode *N);
+  SDValue ScalarizeVecOp_INSERT_SUBVECTOR(SDNode *N, unsigned OpNo);
   SDValue ScalarizeVecOp_UnaryOp(SDNode *N);
   SDValue ScalarizeVecOp_UnaryOp_StrictFP(SDNode *N);
   SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index a1a9f0f0615cbc7..9f59ae333403d2c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -675,6 +675,9 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::BITCAST:
     Res = ScalarizeVecOp_BITCAST(N);
     break;
+  case ISD::INSERT_SUBVECTOR:
+    Res = ScalarizeVecOp_INSERT_SUBVECTOR(N, OpNo);
+    break;
   case ISD::ANY_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::SIGN_EXTEND:
@@ -766,6 +769,24 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) {
                      N->getValueType(0), Elt);
 }
 
+/// If the value to subvector is a vector that needs to be scalarized, it must
+/// be <1 x ty>. Return the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_INSERT_SUBVECTOR(SDNode *N,
+                                                          unsigned OpNo) {
+  // If the destination vector is unary, we can just return the source vector
+  auto src = GetScalarizedVector(N->getOperand(1));
+  if (OpNo == 0) {
+    return src;
+  }
+
+  auto dest = N->getOperand(0);
+  auto idx = N->getOperand(2);
+  return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), N->getValueType(0), dest,
+                     src, idx);
+
+  return GetScalarizedVector(src);
+}
+
 /// If the input is a vector that needs to be scalarized, it must be <1 x ty>.
 /// Do the operation on the element instead.
 SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
@@ -5891,8 +5912,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
     InOp1 = GetWidenedVector(InOp1);
     InOp2 = GetWidenedVector(InOp2);
   } else {
-    InOp1 = DAG.WidenVector(InOp1, SDLoc(N));
-    InOp2 = DAG.WidenVector(InOp2, SDLoc(N));
+    do {
+      InOp1 = DAG.WidenVector(InOp1, SDLoc(N));
+      InOp2 = DAG.WidenVector(InOp2, SDLoc(N));
+    } while (ElementCount::isKnownLT(
+        InOp1.getValueType().getVectorElementCount(), WidenEC));
   }
 
   // Assume that the input and output will be widen appropriately.  If not,
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll
index c932253049e239f..91762cb898897c4 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll
@@ -67,3 +67,12 @@ if.then:
 if.end:
   ret i32 1;
 }
+
+define dso_local <1 x half> @cmp_select(<1 x half> %i105, <1 x half> %in) {
+; CHECK-LABEL: @cmp_select
+; CHECL: fcmge
+newFuncRoot:
+  %i179 = fcmp uno <1 x half> %i105, zeroinitializer
+  %i180 = select <1 x i1> %i179, <1 x half> %in, <1 x half> %i105
+  ret <1 x half> %i180
+}
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 2cadd4e0d2911a6..0fbdddeb12950f2 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3121,6 +3121,38 @@ gentbl_cc_library(
     ],
 )
 
+gentbl_cc_library(
+    name = "MeshShardingInterfaceIncGen",
+    tbl_outs = [
+        (
+            ["-gen-op-interface-decls"],
+            "include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h.inc",
+        ),
+        (
+            ["-gen-op-interface-defs"],
+            "include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.cpp.inc",
+        ),
+    ],
+    tblgen = ":mlir-tblgen",
+    td_file = "include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.td",
+    deps = [":OpBaseTdFiles"],
+)
+
+cc_library(
+    name = "MeshShardingInterface",
+    srcs = ["lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp"],
+    hdrs = ["include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h"],
+    includes = ["include"],
+    deps = [
+        ":DialectUtils",
+        ":IR",
+        ":MeshDialect",
+        ":MeshShardingInterfaceIncGen",
+        ":Support",
+        "//llvm:Support",
+    ],
+)
+
 cc_library(
     name = "MeshDialect",
     srcs = ["lib/Dialect/Mesh/IR/MeshOps.cpp"],
@@ -3136,6 +3168,40 @@ cc_library(
     ],
 )
 
+gentbl_cc_library(
+    name = "MeshTransformsPassIncGen",
+    tbl_outs = [
+        (
+            [
+                "-gen-pass-decls",
+                "-name=Mesh",
+            ],
+            "include/mlir/Dialect/Mesh/Transforms/Passes.h.inc",
+        ),
+    ],
+    tblgen = ":mlir-tblgen",
+    td_file = "include/mlir/Dialect/Mesh/Transforms/Passes.td",
+    deps = [":PassBaseTdFiles"],
+)
+
+cc_library(
+    name = "MeshTransforms",
+    srcs = glob([
+        "lib/Dialect/Mesh/Transforms/*.cpp",
+        "lib/Dialect/Mesh/Transforms/*.h",
+    ]),
+    hdrs = glob(["include/mlir/Dialect/Mesh/Transforms/*.h"]),
+    includes = ["include"],
+    deps = [
+        ":FuncDialect",
+        ":MeshDialect",
+        ":MeshShardingInterface",
+        ":MeshTransformsPassIncGen",
+        ":Pass",
+        "//llvm:Support",
+    ],
+)
+
 ##---------------------------------------------------------------------------##
 # NVGPU dialect.
 ##---------------------------------------------------------------------------##
@@ -5182,6 +5248,7 @@ cc_library(
         ":ROCDLTarget",
         ":ROCDLToLLVMIRTranslation",
         ":SCFDialect",
+        ":SPIRVDialect",
         ":SerializeToCubin_stub",
         ":SideEffectInterfaces",
         ":Support",
@@ -5618,6 +5685,7 @@ cc_library(
     deps = [
         ":ArithToSPIRV",
         ":ConversionPassIncGen",
+        ":FuncDialect",
         ":FuncToSPIRV",
         ":GPUDialect",
         ":IR",
@@ -6437,6 +6505,7 @@ cc_library(
         ":CommonFolders",
         ":ControlFlowInterfaces",
         ":FunctionInterfaces",
+        ":GPUDialect",
         ":IR",
         ":InferTypeOpInterface",
         ":Parser",
@@ -8632,6 +8701,7 @@ cc_library(
         ":MemRefTransformOps",
         ":MemRefTransforms",
         ":MeshDialect",
+        ":MeshTransforms",
         ":NVGPUDialect",
         ":NVGPUPassIncGen",
         ":NVGPUToNVVM",
@@ -11051,6 +11121,8 @@ cc_library(
         ":IR",
         ":InferTypeOpInterface",
         ":LoopLikeInterface",
+        ":MeshDialect",
+        ":MeshShardingInterface",
         ":Pass",
         ":QuantOps",
         ":Support",

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

Needed handling the case of scalarizing operands of subvector insertion.

topperc · 2023-11-08T01:21:03Z

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

+                                                          unsigned OpNo) {
+  // If the destination vector is unary, we can just return the source vector
+  auto Src = GetScalarizedVector(N->getOperand(1));
+  if (OpNo == 0) {


Can this ever be called with OpNo==0? Wouldn't ScalarVecRes_INSERT_SUBVECTOR be called first? The first operand the result must have the same type.

I am not certain, it would depend where the type propagation occurs. My assumption is that this could iteratively come from a previously operation now scalarized without the result type necessarily being updated (which is only an asumption). Do you have a suggestion for how to verify this?

Result types are always legalized first and need to update the first operand along with them. You could try just asserting OpNo == 1

arsenm · 2023-11-08T02:18:56Z

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

@@ -5891,8 +5910,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
    InOp1 = GetWidenedVector(InOp1);
    InOp2 = GetWidenedVector(InOp2);
  } else {
-    InOp1 = DAG.WidenVector(InOp1, SDLoc(N));
-    InOp2 = DAG.WidenVector(InOp2, SDLoc(N));
+    do {


Type legalization doesn't operate incrementally in individual steps. Newly legalized operations are supposed to be re-legalized later if necessary

@arsenm

For having looked at this with Rob, the problem we are facing here is:

The legalization strategy looks like this:

v1i16 is marked as widen by AArch64 backend and the target type is v4i16

v1f16 is marked as scalarize and the target type is f16
So the legalization strategies don't match and we end up in this block.
Now, the problem is a single call to WidenVector will push v1f16 to v2f16 and not the expected v4f16 and the assert line 5900 (of the old code) would break.

There may be a better way to fix that but the newly legalized operations won't get a chance to be re-legalized.

Open to suggestions, admittedly I went with a quick fix here.

FWIW, GISel doesn't have this issue.

The widening is only really intended to apply to integer types. A v1i1 setcc v1f16 having been turned into a v1i16 setcc v1f16 would be better to scalarize from there if we can tell it that.

I think the correct way to handle this is by inserting an explicit conversion instruction from the widened v2f16 to match v4f16. i.e. create a new INSERT_SUBVECTOR v4f16:undef, (v2f16 GetWidenedVector), 0

The widening is only really intended to apply to integer types. A v1i1 setcc v1f16 having been turned into a v1i16 setcc v1f16 would be better to scalarize from there if we can tell it that.

Yes, but that's not what the AArch64 lowering wants (see https://github.com/llvm/llvm-project/blob/f0cdf4b468f6ee48b0d0d51ce78145455e2f07a6/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp#L24550).
I believe the rational is v1i16 is not natively supported so we widen to the larger supported vector size.
Changing that would fix this particular issue, but I believe it would create worse code in a bunch of cases.

Let me re-phrase, we could change the code to explicitly create an insert_subvector here, but I don't see how this is better :).

Yes, but that's not what the AArch64 lowering wants (see https://github.com/llvm/llvm-project/blob/f0cdf4b468f6ee48b0d0d51ce78145455e2f07a6/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp#L24550).
I believe the rational is v1i16 is not natively supported so we widen to the larger supported vector size.
Changing that would fix this particular issue, but I believe it would create worse code in a bunch of cases.

Yeah, The point of that code is to act upon integer types, to keep them in vector registers as opposed to scalarizing them to gprs. A fcmp uno <1 x half> is really a floating point operation though, even if it's return type becomes a integer when it gets transformed to a v1i16 setcc v1f16. As an operation it still makes sense to scalarize it, not widen.

I agree that we shouldn't be changing the way that integer types are handled in general, but if there was a way to specify that a setcc with fp operands should really be treated like fp types, that would avoid a lot of the awkwardness here where we attempt to widen it only to scalarize again.

As an operation it still makes sense to scalarize it, not widen.

I agree, but could you refresh my memory: is it something we have a control on?

IIRC, the selection of a strategy for type legalization is agnostic of the particular instruction being legalized and happens before the custom lowering, but it's been a while since I touched SDISel.

Thats always the question, and Im not sure there is a way to specify the types for operations separately. Looking through the example though, it appears that the v1i16 setcc v1f16 is being created in the AArch64 backend inside performVSelectCombine.

I've put up #72048 as a possible fix. It just tries to prevent the generation of v1i16 setcc v1f16, which so long as they dont get generated from anywhere else seems to work OK, even if the code generated it not necessarily optimal. Let me know what you think.

Works for me.

davemgreen · 2023-11-09T11:14:04Z

llvm/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll

+
+define dso_local <1 x half> @cmp_select(<1 x half> %i105, <1 x half> %in) {
+; CHECK-LABEL: @cmp_select
+; CHECL: fcmge


CHECL->CHECK

PR llvm#71614 identified an issue in the lowering of v1f16 vector compares, where the `v1i1 setcc` is expanded to `v1i16 setcc`, and the `v1i16 setcc` tries to be expanded to a `v2i16 setcc` which fails. For floating point types we can let them scalarize instead though, generating a `setcc f16` that can be lowered using normal fp16 lowering. 07a8ff4 added a special case combine for v1 vselect to expand the predicate type to the same size as the fcmp operands. This turns that off for float types, allowing them to scalarize naturally, which hopefully fixes the issue by preventing the v1i16 setcc, meaning it wont try to widen to larger vectors. The codegen might not be optimal, but as far as I can tell everything generated successfully, providing that no `v1i16 setcc v1f16` instructions get generated.

PR #71614 identified an issue in the lowering of v1f16 vector compares, where the `v1i1 setcc` is expanded to `v1i16 setcc`, and the `v1i16 setcc` tries to be expanded to a `v2i16 setcc` which fails. For floating point types we can let them scalarize instead though, generating a `setcc f16` that can be lowered using normal fp16 lowering. 07a8ff4 added a special case combine for v1 vselect to expand the predicate type to the same size as the fcmp operands. This turns that off for float types, allowing them to scalarize naturally, which hopefully fixes the issue by preventing the v1i16 setcc, meaning it wont try to widen to larger vectors. The codegen might not be optimal, but as far as I can tell everything generated successfully, providing that no `v1i16 setcc v1f16` instructions get generated.

qcolombet · 2023-11-13T15:21:18Z

Thanks to #72048, I think we can abandon this one.

Technically the code that this PR touches is still broken without this PR IMHO, but we can't reach it anymore.

PR llvm#71614 identified an issue in the lowering of v1f16 vector compares, where the `v1i1 setcc` is expanded to `v1i16 setcc`, and the `v1i16 setcc` tries to be expanded to a `v2i16 setcc` which fails. For floating point types we can let them scalarize instead though, generating a `setcc f16` that can be lowered using normal fp16 lowering. 07a8ff4 added a special case combine for v1 vselect to expand the predicate type to the same size as the fcmp operands. This turns that off for float types, allowing them to scalarize naturally, which hopefully fixes the issue by preventing the v1i16 setcc, meaning it wont try to widen to larger vectors. The codegen might not be optimal, but as far as I can tell everything generated successfully, providing that no `v1i16 setcc v1f16` instructions get generated.

rsuderman requested a review from qcolombet November 8, 2023 01:05

llvmbot added backend:AArch64 llvm:SelectionDAG SelectionDAGISel as well labels Nov 8, 2023

topperc reviewed Nov 8, 2023

View reviewed changes

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp Outdated Show resolved Hide resolved

topperc reviewed Nov 8, 2023

View reviewed changes

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp Outdated Show resolved Hide resolved

topperc reviewed Nov 8, 2023

View reviewed changes

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp Outdated Show resolved Hide resolved

rsuderman added 2 commits November 7, 2023 17:19

[llvm] Adding scalarization of llvm.vector.insert

185a4f5

Needed handling the case of scalarizing operands of subvector insertion.

s/src/Src and deleted extra return

fb46c3d

rsuderman force-pushed the ScalarizeInsertSubvector branch from 7712d45 to fb46c3d Compare November 8, 2023 01:19

missing capitalizations

7e669c5

topperc reviewed Nov 8, 2023

View reviewed changes

arsenm reviewed Nov 8, 2023

View reviewed changes

davemgreen reviewed Nov 9, 2023

View reviewed changes

davemgreen mentioned this pull request Nov 12, 2023

[AArch64] Prevent v1f16 vselect/setcc type expansion. #72048

Merged

nikic changed the title ~~[llvm] Adding scalarization of llvm.vector.insert~~ [SDAG] Adding scalarization of llvm.vector.insert Nov 12, 2023

qcolombet mentioned this pull request Nov 13, 2023

[CPU][ARM] Failed to compile FP16 model with DT+UK for Android iree-org/iree#15505

Closed

rsuderman closed this Nov 13, 2023

[SDAG] Adding scalarization of llvm.vector.insert #71614

[SDAG] Adding scalarization of llvm.vector.insert #71614

Uh oh!

Conversation

rsuderman commented Nov 8, 2023

Uh oh!

llvmbot commented Nov 8, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

davemgreen Nov 12, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

qcolombet commented Nov 13, 2023

Uh oh!

Uh oh!

[SDAG] Adding scalarization of `llvm.vector.insert` #71614

[SDAG] Adding scalarization of `llvm.vector.insert` #71614

llvmbot commented Nov 8, 2023 •

edited

Loading

davemgreen Nov 12, 2023 •

edited

Loading