[DAG] Remove extract_vector_elt(freeze(x)), idx -> freeze(extract_vector_elt(x), idx) fold #87480
✅ With the latest revision this PR passed the C/C++ code formatter.
Reverse the fold, with handling inside canCreateUndefOrPoison for cases where we know that the extract index is in bounds. This required some initial freeze handling of SCALAR_TO_VECTOR, which will require us to start improving DemandedElts support to handle its undef upper elements.
There is still one outstanding regression: how do we want to handle folds of frozen loads?
Fixes llvm#86968
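To illustrate the direction being reversed (a hypothetical sketch, not taken from this PR's tests): with a constant in-bounds index, the freeze can be pushed through the extract, since an in-bounds extractelement introduces no new poison; with a variable index the extract may be out of bounds, and therefore poison, so the freeze has to stay on the extracted scalar.

; Hypothetical IR sketch. Constant in-bounds index: freeze(extractelement(x, 6))
; and extractelement(freeze(x), 6) are equivalent, so the freeze can be pushed
; through the extract and kept on the vector.
define i8 @freeze_extract_inbounds(<16 x i8> %v) {
  %e = extractelement <16 x i8> %v, i64 6
  %f = freeze i8 %e
  ret i8 %f
}

; Variable index: the extract itself may be poison (out of bounds), and an
; out-of-bounds extract of freeze(%v) is still poison, so rewriting this as
; extractelement(freeze(%v), %idx) would be unsound.
define i8 @freeze_extract_variable(<16 x i8> %v, i64 %idx) {
  %e = extractelement <16 x i8> %v, i64 %idx
  %f = freeze i8 %e
  ret i8 %f
}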
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-llvm-selectiondag
Author: Simon Pilgrim (RKSimon)
Changes
Reverse the fold, with handling inside canCreateUndefOrPoison for cases where we know that the extract index is in bounds. This exposed a number of regressions, and required some initial freeze handling of SCALAR_TO_VECTOR, which will require us to properly improve DemandedElts support to handle its undef upper elements.
There is still one outstanding regression: how do we want to handle folds involving frozen loads?
Fixes #86968
Full diff: https://github.com/llvm/llvm-project/pull/87480.diff
6 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 28fe0697357dcd..0a473180538a5d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22265,12 +22265,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
IndexC->getAPIntValue().uge(VecVT.getVectorNumElements()))
return DAG.getUNDEF(ScalarVT);
- // extract_vector_elt(freeze(x)), idx -> freeze(extract_vector_elt(x)), idx
- if (VecOp.hasOneUse() && VecOp.getOpcode() == ISD::FREEZE) {
- return DAG.getFreeze(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT,
- VecOp.getOperand(0), Index));
- }
-
// extract_vector_elt (build_vector x, y), 1 -> y
if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 25b51d5c1e0226..1dd0fa49a460f8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5149,6 +5149,17 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::OR:
return ConsiderFlags && Op->getFlags().hasDisjoint();
+ case ISD::SCALAR_TO_VECTOR:
+ // Check if we demand any upper (undef) elements.
+ return !PoisonOnly && DemandedElts.ugt(1);
+
+ case ISD::EXTRACT_VECTOR_ELT: {
+ // Ensure that the element index is in bounds.
+ EVT VecVT = Op.getOperand(0).getValueType();
+ KnownBits KnownIdx = computeKnownBits(Op.getOperand(1), Depth + 1);
+ return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
+ }
+
case ISD::INSERT_VECTOR_ELT:{
// Ensure that the element index is in bounds.
EVT VecVT = Op.getOperand(0).getValueType();
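The new SCALAR_TO_VECTOR case above encodes that the node defines only element 0 and leaves the remaining lanes undef. In IR terms it behaves like an insert into a poison vector, so demanding any upper lane of it must be treated as possibly undef (a hypothetical sketch):

; SCALAR_TO_VECTOR(%x) defines lane 0 only; lanes 1..3 of %v below are poison.
; Hence canCreateUndefOrPoison must report true whenever an upper lane is
; demanded and PoisonOnly is not set.
define <4 x i32> @scalar_to_vector_like(i32 %x) {
  %v = insertelement <4 x i32> poison, i32 %x, i64 0
  ret <4 x i32> %v
}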
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 8bb9541bfe1027..5e053f97675d7f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -742,6 +742,13 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
break;
}
+ case ISD::FREEZE: {
+ SDValue N0 = Op.getOperand(0);
+ if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
+ /*PoisonOnly=*/false))
+ return N0;
+ break;
+ }
case ISD::AND: {
LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -3184,6 +3191,20 @@ bool TargetLowering::SimplifyDemandedVectorElts(
}
break;
}
+ case ISD::FREEZE: {
+ SDValue N0 = Op.getOperand(0);
+ if (TLO.DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
+ /*PoisonOnly=*/false))
+ return TLO.CombineTo(Op, N0);
+
+ // TODO: Replace this with the general fold from DAGCombiner::visitFREEZE
+ // freeze(op(x, ...)) -> op(freeze(x), ...).
+ if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR && DemandedElts == 1)
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
+ TLO.DAG.getFreeze(N0.getOperand(0))));
+ break;
+ }
case ISD::BUILD_VECTOR: {
// Check all elements and simplify any unused elements with UNDEF.
if (!DemandedElts.isAllOnes()) {
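The FREEZE handling added above lets freeze(SCALAR_TO_VECTOR(x)) be rewritten when only lane 0 is demanded: the undef upper lanes are never observed, so freezing just the scalar is equivalent. A hypothetical IR analogue:

; If only lane 0 of the frozen vector is read, moving the freeze onto the
; scalar operand is equivalent: the poison upper lanes are never observed.
define i32 @freeze_lane0(i32 %x) {
  %v = insertelement <4 x i32> poison, i32 %x, i64 0
  %f = freeze <4 x i32> %v
  %e = extractelement <4 x i32> %f, i64 0
  ret i32 %e
}
; ...simplifies as if it had been written:
define i32 @freeze_scalar(i32 %x) {
  %f = freeze i32 %x
  ret i32 %f
}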
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a9751e170fe63d..6f65344215c020 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42725,6 +42725,8 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
switch (Op.getOpcode()) {
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
+ case X86ISD::UNPCKH:
+ case X86ISD::UNPCKL:
return false;
}
return TargetLowering::canCreateUndefOrPoisonForTargetNode(
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index d9ee5f0d3e49c5..ee7f4aea02c00f 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -173,16 +173,14 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: vmovdqa (%edx), %xmm0
; X86-NEXT: vpand (%ecx), %xmm0, %xmm0
-; X86-NEXT: vpextrb $6, %xmm0, %ecx
-; X86-NEXT: movb %cl, (%eax)
+; X86-NEXT: vpextrb $6, %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: freeze_extractelement:
; X64: # %bb.0:
; X64-NEXT: vmovdqa (%rdi), %xmm0
; X64-NEXT: vpand (%rsi), %xmm0, %xmm0
-; X64-NEXT: vpextrb $6, %xmm0, %eax
-; X64-NEXT: movb %al, (%rdx)
+; X64-NEXT: vpextrb $6, %xmm0, (%rdx)
; X64-NEXT: retq
%i0 = load <16 x i8>, ptr %origin0
%i1 = load <16 x i8>, ptr %origin1
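For context, the test extracts a frozen element of a loaded-and-masked vector; with the freeze now kept on the vector rather than the scalar, instruction selection can use vpextrb's direct store-to-memory form. A simplified sketch of the pattern (hypothetical function name; the full body is in freeze-vector.ll):

; Simplified sketch of the pattern exercised by freeze_extractelement.
define void @freeze_extractelement_sketch(ptr %origin0, ptr %origin1, ptr %dst) {
  %v0 = load <16 x i8>, ptr %origin0
  %v1 = load <16 x i8>, ptr %origin1
  %and = and <16 x i8> %v0, %v1
  %fr = freeze <16 x i8> %and
  %elt = extractelement <16 x i8> %fr, i64 6
  store i8 %elt, ptr %dst
  ret void
}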
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 691ca40191d4ba..f7a27a5b914466 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -65,6 +65,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-NO-BMI2-NEXT: movzwl %ax, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -74,6 +75,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-BMI2-NEXT: movzwl %ax, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movb %al, (%rdx)
@@ -81,14 +83,15 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X86-NO-BMI2: # %bb.0:
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NEXT: movzwl (%eax), %eax
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
+; X86-NO-BMI2-NEXT: movzwl %dx, %edx
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NEXT: movb %al, (%edx)
+; X86-NO-BMI2-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NEXT: movb %dl, (%eax)
; X86-NO-BMI2-NEXT: retl
;
; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
@@ -97,6 +100,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzwl (%edx), %edx
+; X86-BMI2-NEXT: movzwl %dx, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movb %cl, (%eax)
@@ -119,6 +123,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-NO-BMI2-NEXT: movzwl %ax, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -128,6 +133,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-BMI2-NEXT: movzwl %ax, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movw %ax, (%rdx)
@@ -139,6 +145,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
+; X86-NO-BMI2-NEXT: movzwl %dx, %edx
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NEXT: shrl %cl, %edx
@@ -151,6 +158,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzwl (%edx), %edx
+; X86-BMI2-NEXT: movzwl %dx, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movw %cx, (%eax)
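The extra movzwl %ax, %eax / movzwl %dx, %edx instructions above relate to the outstanding frozen-load question from the description: known bits do not propagate through a freeze of a load (the load is not guaranteed undef/poison-free), so the zero upper bits already established by the loading movzwl are re-established. A hypothetical, simplified sketch of the shape involved:

; Hypothetical sketch: the freeze hides the zext's known-zero upper bits,
; because the underlying load is not guaranteed to be undef/poison-free,
; so a second zero-extension may be emitted.
define i32 @frozen_zext_load(ptr %p) {
  %w = load i16, ptr %p
  %z = zext i16 %w to i32
  %f = freeze i32 %z
  ret i32 %f
}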