-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[LegalizeTypes] Handle non byte-sized elt types when splitting INSERT/EXTRACT_VECTOR_ELT #93357
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-selectiondag Author: Björn Pettersson (bjope) ChangesDAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT and DAGTypeLegalizer::SplitVecRes_EXTRACT_VECTOR_ELT did not handle non byte-sized elements properly. In fact, it only dealt with elements smaller than 8 bits (as well as byte-sized elements). This patch generalizes the support for non byte-sized element by always extending the vector elements to match the store size for the element type when legalizing via a stack temporary. This should make sure that we can access a single element via a simple byte-addressed scalar load/store. Full diff: https://github.com/llvm/llvm-project/pull/93357.diff 3 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.h b/llvm/include/llvm/CodeGen/ValueTypes.h
index dab6c421bf6e6..9ad582c8ab7a9 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.h
+++ b/llvm/include/llvm/CodeGen/ValueTypes.h
@@ -432,6 +432,15 @@ namespace llvm {
return EVT::getVectorVT(Context, EltVT, getVectorElementCount());
}
+ /// Return a VT for an integer vector type with the size of the elements
+ /// extended to the store size. The typed returned may be an extended
+ /// type.
+ EVT getStoreSizedIntegerVectorElementType(LLVMContext &Context) const {
+ EVT EltVT = getVectorElementType();
+ EltVT = EVT::getIntegerVT(Context, EltVT.getStoreSizeInBits());
+ return EVT::getVectorVT(Context, EltVT, getVectorElementCount());
+ }
+
// Return a VT for a vector type with the same element type but
// half the number of elements. The type returned may be an
// extended type.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 40e621f0db220..28ecd229bf398 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1832,10 +1832,9 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
// Make the vector elements byte-addressable if they aren't already.
EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();
- if (VecVT.getScalarSizeInBits() < 8) {
- EltVT = MVT::i8;
- VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
- VecVT.getVectorElementCount());
+ if (!EltVT.isByteSized()) {
+ VecVT = VecVT.getStoreSizedIntegerVectorElementType(*DAG.getContext());
+ EltVT = VecVT.getVectorElementType();
Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
// Extend the element type to match if needed.
if (EltVT.bitsGT(Elt.getValueType()))
@@ -3443,10 +3442,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
// Make the vector elements byte-addressable if they aren't already.
SDLoc dl(N);
EVT EltVT = VecVT.getVectorElementType();
- if (VecVT.getScalarSizeInBits() < 8) {
- EltVT = MVT::i8;
- VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
- VecVT.getVectorElementCount());
+ if (!EltVT.isByteSized()) {
+ VecVT = VecVT.getStoreSizedIntegerVectorElementType(*DAG.getContext());
+ EltVT = VecVT.getVectorElementType();
Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
}
@@ -3465,8 +3463,6 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
// Load back the required element.
StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
- // FIXME: This is to handle i1 vectors with elements promoted to i8.
- // i1 vector handling needs general improvement.
if (N->getValueType(0).bitsLT(EltVT)) {
SDValue Load = DAG.getLoad(EltVT, dl, Store, StackPtr,
MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
diff --git a/llvm/test/CodeGen/X86/legalize-ins-ext-vec-elt.ll b/llvm/test/CodeGen/X86/legalize-ins-ext-vec-elt.ll
new file mode 100644
index 0000000000000..7b517c2ca574f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/legalize-ins-ext-vec-elt.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64-- -o - %s| FileCheck %s
+
+; Verify that we support non byte-sized elements, together with variable index.
+
+define void @Legalize_SplitVectorResult_insert_i28(i28 %elt, i16 %idx, ptr %p1, ptr %p2) nounwind {
+; CHECK-LABEL: Legalize_SplitVectorResult_insert_i28:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: andl $7, %esi
+; CHECK-NEXT: movl %edi, -40(%rsp,%rsi,4)
+; CHECK-NEXT: movaps {{.*#+}} xmm0 = [268435455,268435455,268435455,268435455]
+; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
+; CHECK-NEXT: andps %xmm0, %xmm1
+; CHECK-NEXT: andps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: movaps %xmm0, 16(%rcx)
+; CHECK-NEXT: movaps %xmm1, (%rcx)
+; CHECK-NEXT: retq
+ %vec1 = insertelement <8 x i28> zeroinitializer, i28 %elt, i16 %idx
+ %vec2 = zext <8 x i28> %vec1 to <8 x i32>
+ store <8 x i32> %vec2, ptr %p2
+ ret void
+}
+
+define void @Legalize_SplitVectorResult_extract_i12(i16 %idx, ptr %p1, ptr %p2) nounwind {
+; CHECK-LABEL: Legalize_SplitVectorResult_extract_i12:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: movaps (%rsi), %xmm0
+; CHECK-NEXT: movaps 16(%rsi), %xmm1
+; CHECK-NEXT: movaps 32(%rsi), %xmm2
+; CHECK-NEXT: movaps 48(%rsi), %xmm3
+; CHECK-NEXT: movaps 64(%rsi), %xmm4
+; CHECK-NEXT: movaps 80(%rsi), %xmm5
+; CHECK-NEXT: movaps 96(%rsi), %xmm6
+; CHECK-NEXT: movaps 112(%rsi), %xmm7
+; CHECK-NEXT: movaps %xmm7, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm6, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm5, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: andl $63, %edi
+; CHECK-NEXT: movzwl -128(%rsp,%rdi,2), %eax
+; CHECK-NEXT: andl $4095, %eax # imm = 0xFFF
+; CHECK-NEXT: movw %ax, (%rdx)
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %vec = load <64 x i16>, ptr %p1
+ %trunc = trunc <64 x i16> %vec to <64 x i12>
+ %elt = extractelement <64 x i12> %trunc, i16 %idx
+ store i12 %elt, ptr %p2
+ ret void
+}
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
; RUN: llc -mtriple=x86_64-- -o - %s| FileCheck %s | ||
|
||
; Verify that we support non byte-sized elements, together with variable index. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Both test case added here triggered assertion failure earlier. So can't really pre-commit anything interesting.
Is the nearest byte really what we want here? We can widen to any type we want, and on most targets, storing an Instead of expanding immediately, should we just construct an EXTRACT_VECTOR_ELT with the promoted vector type, and legalize that? That might allow custom legalization to kick in in certain cases. |
It is a bit hard to understand what is best for every target (unless perhaps scanning for some legal type, if such a type exists).
To summarize. How to optimize might be target specific.
I don't really see the point in giving the target a second chance for custom legalization. A target could just promote the input vector elements itself if that is feasible when doing CustomLowerNode the first time. Otherwise we need to speculate about what element size that would be feasible for the current target, to match whatever types the custom lowering handles. And it will be hard to pick a type that is best for all targets (unless using a target hook just to please custom lowering...). |
Using the next legal element type would be a reasonable default. Maybe you want next legal store memory type
As in i24 is a legal type, or truncating store i32->i24 is legal? |
As in i24 being a legal type. And by saying that I realize that it would be an uncommon thing for in-tree targets as such an MVT doens't exist in-tree. So it seems to make more sense in-tree (given by what ValueTypes.td currently looks like) to make use of getRoundIntegerType here. Or alternatively we should do something slightly more complicated by trying to find a legal type for ISD::STORE. But I don't know if there already exist some helper for that or if we would need to invent it. I'll try to update this PR to use getRoundIntegerType instead of the getStoreSizedIntegerVectorElementType. And then we will deal with it downstream, if we want special handling for types like i24. (Maybe getRoundIntegerType should be described as rounding up to next simple value type, rather than rounding to nearest power of two. Then we could let it round to other non-power-of-two sizes depending on what ValueTypes.td looks like. But that is ofcourse a totally different story.) |
getTypeToTransformTo? |
That one could perhaps be used in some way. But I don't think it really guarantees to return a type that isn't using a packed layout when used as a vector element type etc. I think it would complicate the code a bit if using that one instead of getRoundIntegerType. |
For non-MVT types, I don't think there's any way for a target to request custom lowering. At least, not using setOperationAction. |
Right. CustomLowerNode would just bail out early for such types. I did not consider that, so thanks for enlightening me. |
MVT::Other is supposed to work for this |
Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec); | ||
SDValue NewExtract = | ||
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vec, Idx); | ||
return DAG.getZExtOrTrunc(NewExtract, dl, N->getValueType(0)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why isn't this any ext or trunc?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice catch. I've updated it to use getAnyExtOrTrunc now.
Is there some reason we want to create a new EXTRACT_VECTOR_ELT, but not a new INSERT_VECTOR_ELT? |
I think that some earlier step should have choosen to widen instead of split the vector if such legalization is preferred for the INSERT_VECTOR_ELT case. I think optimizing that situation is out-of-scope for this patch anyway. This patch is just supposed to support some scenatios for which we either hit assertion failure of miscompiled in the past. But if suddenly the legalization here is considered as performance critical, then we can follow up with optimizations later. We would need some test cases that show the benefit of doing it some other way, and I don't know about any such test cases. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
…/EXTRACT_VECTOR_ELT (llvm#93357) DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT and DAGTypeLegalizer::SplitVecRes_EXTRACT_VECTOR_ELT did not handle non byte-sized elements properly. In fact, it only dealt with elements smaller than 8 bits (as well as byte-sized elements). This patch generalizes the support for non byte-sized element by always widening the the vector elements to next "round integer type" (a power of 2 bit size). This should make sure that we can access a single element via a simple byte-addressed scalar load/store. Also removing a suspicious CustomLowerNode call from SplitVecRes_INSERT_VECTOR_ELT. Considering that it did not reset the Lo/Hi out arguments before the return I think that DAGTypeLegalizer::SplitVectorResult could be fooled into registering the input vector as being the result. This should however not have caused any problems since DAGTypeLegalizer::SplitVectorResult is doing the same CustomLowerNode call, making the code removed by this patch redundant.
…/EXTRACT_VECTOR_ELT (llvm#93357) DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT and DAGTypeLegalizer::SplitVecRes_EXTRACT_VECTOR_ELT did not handle non byte-sized elements properly. In fact, it only dealt with elements smaller than 8 bits (as well as byte-sized elements). This patch generalizes the support for non byte-sized element by always widening the the vector elements to next "round integer type" (a power of 2 bit size). This should make sure that we can access a single element via a simple byte-addressed scalar load/store. Also removing a suspicious CustomLowerNode call from SplitVecRes_INSERT_VECTOR_ELT. Considering that it did not reset the Lo/Hi out arguments before the return I think that DAGTypeLegalizer::SplitVectorResult could be fooled into registering the input vector as being the result. This should however not have caused any problems since DAGTypeLegalizer::SplitVectorResult is doing the same CustomLowerNode call, making the code removed by this patch redundant.
DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT and
DAGTypeLegalizer::SplitVecRes_EXTRACT_VECTOR_ELT did not handle
non byte-sized elements properly. In fact, it only dealt with
elements smaller than 8 bits (as well as byte-sized elements).
This patch generalizes the support for non byte-sized element by
always widening the the vector elements to next "round integer type"
(a power of 2 bit size). This should make sure that we can access a
single element via a simple byte-addressed scalar load/store.
Also removing a suspicious CustomLowerNode call from
SplitVecRes_INSERT_VECTOR_ELT. Considering that it did not reset
the Lo/Hi out arguments before the return I think that
DAGTypeLegalizer::SplitVectorResult could be fooled into registering
the input vector as being the result. This should however not have
caused any problems since DAGTypeLegalizer::SplitVectorResult is
doing the same CustomLowerNode call, making the code removed by
this patch redundant.