Skip to content

Commit 2d00874

Browse files
authored
[DAG] Remove extract_vector_elt(freeze(x), idx) -> freeze(extract_vector_elt(x, idx)) fold (#87480)
Reverse the fold with handling inside canCreateUndefOrPoison for cases where we know that the extract index is in bounds. This exposed a number of regressions, and required some initial freeze handling of SCALAR_TO_VECTOR, which will require us to properly improve DemandedElts support to handle its undef upper elements. There is still one outstanding regression to be addressed in the future - how do we want to handle folds involving frozen loads? Fixes #86968
1 parent cbdc86e commit 2d00874

File tree

6 files changed

+49
-15
lines changed

6 files changed

+49
-15
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22265,12 +22265,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
2226522265
IndexC->getAPIntValue().uge(VecVT.getVectorNumElements()))
2226622266
return DAG.getUNDEF(ScalarVT);
2226722267

22268-
// extract_vector_elt(freeze(x)), idx -> freeze(extract_vector_elt(x)), idx
22269-
if (VecOp.hasOneUse() && VecOp.getOpcode() == ISD::FREEZE) {
22270-
return DAG.getFreeze(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT,
22271-
VecOp.getOperand(0), Index));
22272-
}
22273-
2227422268
// extract_vector_elt (build_vector x, y), 1 -> y
2227522269
if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
2227622270
VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5149,6 +5149,17 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
51495149
case ISD::OR:
51505150
return ConsiderFlags && Op->getFlags().hasDisjoint();
51515151

5152+
case ISD::SCALAR_TO_VECTOR:
5153+
// Check if we demand any upper (undef) elements.
5154+
return !PoisonOnly && DemandedElts.ugt(1);
5155+
5156+
case ISD::EXTRACT_VECTOR_ELT: {
5157+
// Ensure that the element index is in bounds.
5158+
EVT VecVT = Op.getOperand(0).getValueType();
5159+
KnownBits KnownIdx = computeKnownBits(Op.getOperand(1), Depth + 1);
5160+
return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
5161+
}
5162+
51525163
case ISD::INSERT_VECTOR_ELT:{
51535164
// Ensure that the element index is in bounds.
51545165
EVT VecVT = Op.getOperand(0).getValueType();

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,13 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
742742

743743
break;
744744
}
745+
case ISD::FREEZE: {
746+
SDValue N0 = Op.getOperand(0);
747+
if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
748+
/*PoisonOnly=*/false))
749+
return N0;
750+
break;
751+
}
745752
case ISD::AND: {
746753
LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
747754
RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -3184,6 +3191,20 @@ bool TargetLowering::SimplifyDemandedVectorElts(
31843191
}
31853192
break;
31863193
}
3194+
case ISD::FREEZE: {
3195+
SDValue N0 = Op.getOperand(0);
3196+
if (TLO.DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
3197+
/*PoisonOnly=*/false))
3198+
return TLO.CombineTo(Op, N0);
3199+
3200+
// TODO: Replace this with the general fold from DAGCombiner::visitFREEZE
3201+
// freeze(op(x, ...)) -> op(freeze(x), ...).
3202+
if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR && DemandedElts == 1)
3203+
return TLO.CombineTo(
3204+
Op, TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
3205+
TLO.DAG.getFreeze(N0.getOperand(0))));
3206+
break;
3207+
}
31873208
case ISD::BUILD_VECTOR: {
31883209
// Check all elements and simplify any unused elements with UNDEF.
31893210
if (!DemandedElts.isAllOnes()) {

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42725,6 +42725,8 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
4272542725
switch (Op.getOpcode()) {
4272642726
case X86ISD::PSHUFD:
4272742727
case X86ISD::VPERMILPI:
42728+
case X86ISD::UNPCKH:
42729+
case X86ISD::UNPCKL:
4272842730
return false;
4272942731
}
4273042732
return TargetLowering::canCreateUndefOrPoisonForTargetNode(

llvm/test/CodeGen/X86/freeze-vector.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -173,16 +173,14 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin
173173
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
174174
; X86-NEXT: vmovdqa (%edx), %xmm0
175175
; X86-NEXT: vpand (%ecx), %xmm0, %xmm0
176-
; X86-NEXT: vpextrb $6, %xmm0, %ecx
177-
; X86-NEXT: movb %cl, (%eax)
176+
; X86-NEXT: vpextrb $6, %xmm0, (%eax)
178177
; X86-NEXT: retl
179178
;
180179
; X64-LABEL: freeze_extractelement:
181180
; X64: # %bb.0:
182181
; X64-NEXT: vmovdqa (%rdi), %xmm0
183182
; X64-NEXT: vpand (%rsi), %xmm0, %xmm0
184-
; X64-NEXT: vpextrb $6, %xmm0, %eax
185-
; X64-NEXT: movb %al, (%rdx)
183+
; X64-NEXT: vpextrb $6, %xmm0, (%rdx)
186184
; X64-NEXT: retq
187185
%i0 = load <16 x i8>, ptr %origin0
188186
%i1 = load <16 x i8>, ptr %origin1

llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
6565
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
6666
; X64-NO-BMI2: # %bb.0:
6767
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
68+
; X64-NO-BMI2-NEXT: movzwl %ax, %eax
6869
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
6970
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
7071
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -74,21 +75,23 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
7475
; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
7576
; X64-BMI2: # %bb.0:
7677
; X64-BMI2-NEXT: movzwl (%rdi), %eax
78+
; X64-BMI2-NEXT: movzwl %ax, %eax
7779
; X64-BMI2-NEXT: shll $3, %esi
7880
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
7981
; X64-BMI2-NEXT: movb %al, (%rdx)
8082
; X64-BMI2-NEXT: retq
8183
;
8284
; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
8385
; X86-NO-BMI2: # %bb.0:
84-
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
85-
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
8686
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
87-
; X86-NO-BMI2-NEXT: movzwl (%eax), %eax
87+
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
88+
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
89+
; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
90+
; X86-NO-BMI2-NEXT: movzwl %dx, %edx
8891
; X86-NO-BMI2-NEXT: shll $3, %ecx
8992
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
90-
; X86-NO-BMI2-NEXT: shrl %cl, %eax
91-
; X86-NO-BMI2-NEXT: movb %al, (%edx)
93+
; X86-NO-BMI2-NEXT: shrl %cl, %edx
94+
; X86-NO-BMI2-NEXT: movb %dl, (%eax)
9295
; X86-NO-BMI2-NEXT: retl
9396
;
9497
; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
@@ -97,6 +100,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
97100
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
98101
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
99102
; X86-BMI2-NEXT: movzwl (%edx), %edx
103+
; X86-BMI2-NEXT: movzwl %dx, %edx
100104
; X86-BMI2-NEXT: shll $3, %ecx
101105
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
102106
; X86-BMI2-NEXT: movb %cl, (%eax)
@@ -119,6 +123,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
119123
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
120124
; X64-NO-BMI2: # %bb.0:
121125
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
126+
; X64-NO-BMI2-NEXT: movzwl %ax, %eax
122127
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
123128
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
124129
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -128,6 +133,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
128133
; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
129134
; X64-BMI2: # %bb.0:
130135
; X64-BMI2-NEXT: movzwl (%rdi), %eax
136+
; X64-BMI2-NEXT: movzwl %ax, %eax
131137
; X64-BMI2-NEXT: shll $3, %esi
132138
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
133139
; X64-BMI2-NEXT: movw %ax, (%rdx)
@@ -139,6 +145,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
139145
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
140146
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
141147
; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
148+
; X86-NO-BMI2-NEXT: movzwl %dx, %edx
142149
; X86-NO-BMI2-NEXT: shll $3, %ecx
143150
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
144151
; X86-NO-BMI2-NEXT: shrl %cl, %edx
@@ -151,6 +158,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
151158
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
152159
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
153160
; X86-BMI2-NEXT: movzwl (%edx), %edx
161+
; X86-BMI2-NEXT: movzwl %dx, %edx
154162
; X86-BMI2-NEXT: shll $3, %ecx
155163
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
156164
; X86-BMI2-NEXT: movw %cx, (%eax)

0 commit comments

Comments
 (0)