-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[SDAG] Adding scalarization of llvm.vector.insert
#71614
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-aarch64 Author: Rob Suderman (rsuderman) ChangesNeeded handling the case of scalarizing operands of subvector insertion. Full diff: https://github.com/llvm/llvm-project/pull/71614.diff 4 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index f85c1296cdce856..5651c6e9b218447 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -809,6 +809,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
// Vector Operand Scalarization: <1 x ty> -> ty.
bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
SDValue ScalarizeVecOp_BITCAST(SDNode *N);
+ SDValue ScalarizeVecOp_INSERT_SUBVECTOR(SDNode *N, unsigned OpNo);
SDValue ScalarizeVecOp_UnaryOp(SDNode *N);
SDValue ScalarizeVecOp_UnaryOp_StrictFP(SDNode *N);
SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index a1a9f0f0615cbc7..9f59ae333403d2c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -675,6 +675,9 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::BITCAST:
Res = ScalarizeVecOp_BITCAST(N);
break;
+ case ISD::INSERT_SUBVECTOR:
+ Res = ScalarizeVecOp_INSERT_SUBVECTOR(N, OpNo);
+ break;
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
@@ -766,6 +769,24 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) {
N->getValueType(0), Elt);
}
+/// If the value to subvector is a vector that needs to be scalarized, it must
+/// be <1 x ty>. Return the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_INSERT_SUBVECTOR(SDNode *N,
+ unsigned OpNo) {
+ // If the destination vector is unary, we can just return the source vector
+ auto src = GetScalarizedVector(N->getOperand(1));
+ if (OpNo == 0) {
+ return src;
+ }
+
+ auto dest = N->getOperand(0);
+ auto idx = N->getOperand(2);
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), N->getValueType(0), dest,
+ src, idx);
+
+ return GetScalarizedVector(src);
+}
+
/// If the input is a vector that needs to be scalarized, it must be <1 x ty>.
/// Do the operation on the element instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
@@ -5891,8 +5912,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
InOp1 = GetWidenedVector(InOp1);
InOp2 = GetWidenedVector(InOp2);
} else {
- InOp1 = DAG.WidenVector(InOp1, SDLoc(N));
- InOp2 = DAG.WidenVector(InOp2, SDLoc(N));
+ do {
+ InOp1 = DAG.WidenVector(InOp1, SDLoc(N));
+ InOp2 = DAG.WidenVector(InOp2, SDLoc(N));
+ } while (ElementCount::isKnownLT(
+ InOp1.getValueType().getVectorElementCount(), WidenEC));
}
// Assume that the input and output will be widen appropriately. If not,
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll
index c932253049e239f..91762cb898897c4 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll
@@ -67,3 +67,12 @@ if.then:
if.end:
ret i32 1;
}
+
+define dso_local <1 x half> @cmp_select(<1 x half> %i105, <1 x half> %in) {
+; CHECK-LABEL: @cmp_select
+; CHECL: fcmge
+newFuncRoot:
+ %i179 = fcmp uno <1 x half> %i105, zeroinitializer
+ %i180 = select <1 x i1> %i179, <1 x half> %in, <1 x half> %i105
+ ret <1 x half> %i180
+}
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 2cadd4e0d2911a6..0fbdddeb12950f2 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3121,6 +3121,38 @@ gentbl_cc_library(
],
)
+gentbl_cc_library(
+ name = "MeshShardingInterfaceIncGen",
+ tbl_outs = [
+ (
+ ["-gen-op-interface-decls"],
+ "include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h.inc",
+ ),
+ (
+ ["-gen-op-interface-defs"],
+ "include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.cpp.inc",
+ ),
+ ],
+ tblgen = ":mlir-tblgen",
+ td_file = "include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.td",
+ deps = [":OpBaseTdFiles"],
+)
+
+cc_library(
+ name = "MeshShardingInterface",
+ srcs = ["lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp"],
+ hdrs = ["include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h"],
+ includes = ["include"],
+ deps = [
+ ":DialectUtils",
+ ":IR",
+ ":MeshDialect",
+ ":MeshShardingInterfaceIncGen",
+ ":Support",
+ "//llvm:Support",
+ ],
+)
+
cc_library(
name = "MeshDialect",
srcs = ["lib/Dialect/Mesh/IR/MeshOps.cpp"],
@@ -3136,6 +3168,40 @@ cc_library(
],
)
+gentbl_cc_library(
+ name = "MeshTransformsPassIncGen",
+ tbl_outs = [
+ (
+ [
+ "-gen-pass-decls",
+ "-name=Mesh",
+ ],
+ "include/mlir/Dialect/Mesh/Transforms/Passes.h.inc",
+ ),
+ ],
+ tblgen = ":mlir-tblgen",
+ td_file = "include/mlir/Dialect/Mesh/Transforms/Passes.td",
+ deps = [":PassBaseTdFiles"],
+)
+
+cc_library(
+ name = "MeshTransforms",
+ srcs = glob([
+ "lib/Dialect/Mesh/Transforms/*.cpp",
+ "lib/Dialect/Mesh/Transforms/*.h",
+ ]),
+ hdrs = glob(["include/mlir/Dialect/Mesh/Transforms/*.h"]),
+ includes = ["include"],
+ deps = [
+ ":FuncDialect",
+ ":MeshDialect",
+ ":MeshShardingInterface",
+ ":MeshTransformsPassIncGen",
+ ":Pass",
+ "//llvm:Support",
+ ],
+)
+
##---------------------------------------------------------------------------##
# NVGPU dialect.
##---------------------------------------------------------------------------##
@@ -5182,6 +5248,7 @@ cc_library(
":ROCDLTarget",
":ROCDLToLLVMIRTranslation",
":SCFDialect",
+ ":SPIRVDialect",
":SerializeToCubin_stub",
":SideEffectInterfaces",
":Support",
@@ -5618,6 +5685,7 @@ cc_library(
deps = [
":ArithToSPIRV",
":ConversionPassIncGen",
+ ":FuncDialect",
":FuncToSPIRV",
":GPUDialect",
":IR",
@@ -6437,6 +6505,7 @@ cc_library(
":CommonFolders",
":ControlFlowInterfaces",
":FunctionInterfaces",
+ ":GPUDialect",
":IR",
":InferTypeOpInterface",
":Parser",
@@ -8632,6 +8701,7 @@ cc_library(
":MemRefTransformOps",
":MemRefTransforms",
":MeshDialect",
+ ":MeshTransforms",
":NVGPUDialect",
":NVGPUPassIncGen",
":NVGPUToNVVM",
@@ -11051,6 +11121,8 @@ cc_library(
":IR",
":InferTypeOpInterface",
":LoopLikeInterface",
+ ":MeshDialect",
+ ":MeshShardingInterface",
":Pass",
":QuantOps",
":Support",
|
Needed handling the case of scalarizing operands of subvector insertion.
7712d45
to
fb46c3d
Compare
unsigned OpNo) { | ||
// If the destination vector is unary, we can just return the source vector | ||
auto Src = GetScalarizedVector(N->getOperand(1)); | ||
if (OpNo == 0) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this ever be called with OpNo==0? Wouldn't ScalarVecRes_INSERT_SUBVECTOR be called first? The first operand the result must have the same type.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not certain, it would depend where the type propagation occurs. My assumption is that this could iteratively come from a previously operation now scalarized without the result type necessarily being updated (which is only an asumption). Do you have a suggestion for how to verify this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Result types are always legalized first and need to update the first operand along with them. You could try just asserting OpNo == 1
@@ -5891,8 +5910,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) { | |||
InOp1 = GetWidenedVector(InOp1); | |||
InOp2 = GetWidenedVector(InOp2); | |||
} else { | |||
InOp1 = DAG.WidenVector(InOp1, SDLoc(N)); | |||
InOp2 = DAG.WidenVector(InOp2, SDLoc(N)); | |||
do { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Type legalization doesn't operate incrementally in individual steps. Newly legalized operations are supposed to be re-legalized later if necessary
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For having looked at this with Rob, the problem we are facing here is:
- The legalization strategy looks like this:
v1i16
is marked aswiden
by AArch64 backend and the target type isv4i16
v1f16
is marked asscalarize
and the target type isf16
So the legalization strategies don't match and we end up in this block.
Now, the problem is a single call toWidenVector
will pushv1f16
tov2f16
and not the expectedv4f16
and the assert line 5900 (of the old code) would break.
There may be a better way to fix that but the newly legalized operations won't get a chance to be re-legalized.
Open to suggestions, admittedly I went with a quick fix here.
FWIW, GISel doesn't have this issue.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The widening is only really intended to apply to integer types. A v1i1 setcc v1f16
having been turned into a v1i16 setcc v1f16
would be better to scalarize from there if we can tell it that.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the correct way to handle this is by inserting an explicit conversion instruction from the widened v2f16 to match v4f16. i.e. create a new INSERT_SUBVECTOR v4f16:undef, (v2f16 GetWidenedVector), 0
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The widening is only really intended to apply to integer types. A
v1i1 setcc v1f16
having been turned into av1i16 setcc v1f16
would be better to scalarize from there if we can tell it that.
Yes, but that's not what the AArch64 lowering wants (see https://github.com/llvm/llvm-project/blob/f0cdf4b468f6ee48b0d0d51ce78145455e2f07a6/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp#L24550).
I believe the rational is v1i16
is not natively supported so we widen to the larger supported vector size.
Changing that would fix this particular issue, but I believe it would create worse code in a bunch of cases.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let me re-phrase, we could change the code to explicitly create an insert_subvector here, but I don't see how this is better :).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, but that's not what the AArch64 lowering wants (see https://github.com/llvm/llvm-project/blob/f0cdf4b468f6ee48b0d0d51ce78145455e2f07a6/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp#L24550).
I believe the rational is v1i16 is not natively supported so we widen to the larger supported vector size.
Changing that would fix this particular issue, but I believe it would create worse code in a bunch of cases.
Yeah, The point of that code is to act upon integer types, to keep them in vector registers as opposed to scalarizing them to gprs. A fcmp uno <1 x half>
is really a floating point operation though, even if it's return type becomes a integer when it gets transformed to a v1i16 setcc v1f16
. As an operation it still makes sense to scalarize it, not widen.
I agree that we shouldn't be changing the way that integer types are handled in general, but if there was a way to specify that a setcc with fp operands should really be treated like fp types, that would avoid a lot of the awkwardness here where we attempt to widen it only to scalarize again.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As an operation it still makes sense to scalarize it, not widen.
I agree, but could you refresh my memory: is it something we have a control on?
IIRC, the selection of a strategy for type legalization is agnostic of the particular instruction being legalized and happens before the custom lowering, but it's been a while since I touched SDISel.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thats always the question, and Im not sure there is a way to specify the types for operations separately. Looking through the example though, it appears that the v1i16 setcc v1f16
is being created in the AArch64 backend inside performVSelectCombine.
I've put up #72048 as a possible fix. It just tries to prevent the generation of v1i16 setcc v1f16
, which so long as they dont get generated from anywhere else seems to work OK, even if the code generated it not necessarily optimal. Let me know what you think.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Works for me.
|
||
define dso_local <1 x half> @cmp_select(<1 x half> %i105, <1 x half> %in) { | ||
; CHECK-LABEL: @cmp_select | ||
; CHECL: fcmge |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CHECL->CHECK
PR llvm#71614 identified an issue in the lowering of v1f16 vector compares, where the `v1i1 setcc` is expanded to `v1i16 setcc`, and the `v1i16 setcc` tries to be expanded to a `v2i16 setcc` which fails. For floating point types we can let them scalarize instead though, generating a `setcc f16` that can be lowered using normal fp16 lowering. 07a8ff4 added a special case combine for v1 vselect to expand the predicate type to the same size as the fcmp operands. This turns that off for float types, allowing them to scalarize naturally, which hopefully fixes the issue by preventing the v1i16 setcc, meaning it wont try to widen to larger vectors. The codegen might not be optimal, but as far as I can tell everything generated successfully, providing that no `v1i16 setcc v1f16` instructions get generated.
llvm.vector.insert
llvm.vector.insert
PR #71614 identified an issue in the lowering of v1f16 vector compares, where the `v1i1 setcc` is expanded to `v1i16 setcc`, and the `v1i16 setcc` tries to be expanded to a `v2i16 setcc` which fails. For floating point types we can let them scalarize instead though, generating a `setcc f16` that can be lowered using normal fp16 lowering. 07a8ff4 added a special case combine for v1 vselect to expand the predicate type to the same size as the fcmp operands. This turns that off for float types, allowing them to scalarize naturally, which hopefully fixes the issue by preventing the v1i16 setcc, meaning it wont try to widen to larger vectors. The codegen might not be optimal, but as far as I can tell everything generated successfully, providing that no `v1i16 setcc v1f16` instructions get generated.
Thanks to #72048, I think we can abandon this one. Technically the code that this PR touches is still broken without this PR IMHO, but we can't reach it anymore. |
PR llvm#71614 identified an issue in the lowering of v1f16 vector compares, where the `v1i1 setcc` is expanded to `v1i16 setcc`, and the `v1i16 setcc` tries to be expanded to a `v2i16 setcc` which fails. For floating point types we can let them scalarize instead though, generating a `setcc f16` that can be lowered using normal fp16 lowering. 07a8ff4 added a special case combine for v1 vselect to expand the predicate type to the same size as the fcmp operands. This turns that off for float types, allowing them to scalarize naturally, which hopefully fixes the issue by preventing the v1i16 setcc, meaning it wont try to widen to larger vectors. The codegen might not be optimal, but as far as I can tell everything generated successfully, providing that no `v1i16 setcc v1f16` instructions get generated.
Needed handling the case of scalarizing operands of subvector insertion.