Skip to content

Commit 31b7d43

Browse files
authored
[DAG] Extend extract_element(bitcast(scalar_to_vector(X))) -> trunc(srl(X,C)) (#117900)
When extracting a smaller integer from a scalar_to_vector source, we were limited to only folding/truncating the lowest bits of the scalar source. This patch extends the fold to handle extraction of any other element, by right shifting the source before truncation. Fixes a regression from #117884
1 parent ef50d79 commit 31b7d43

File tree

5 files changed: +49 −63 lines

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 14 additions & 3 deletions
@@ -23055,18 +23055,29 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
   // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt -->
   // trunc i64 X to i32
   SDValue X = BCSrc.getOperand(0);
-  assert(X.getValueType().isScalarInteger() && ScalarVT.isScalarInteger() &&
+  EVT XVT = X.getValueType();
+  assert(XVT.isScalarInteger() && ScalarVT.isScalarInteger() &&
          "Extract element and scalar to vector can't change element type "
          "from FP to integer.");
   unsigned XBitWidth = X.getValueSizeInBits();
-  BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;
+  unsigned Scale = XBitWidth / VecEltBitWidth;
+  BCTruncElt = IsLE ? 0 : Scale - 1;

   // An extract element return value type can be wider than its vector
   // operand element type. In that case, the high bits are undefined, so
   // it's possible that we may need to extend rather than truncate.
-  if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) {
+  if (ExtractIndex < Scale && XBitWidth > VecEltBitWidth) {
     assert(XBitWidth % VecEltBitWidth == 0 &&
            "Scalar bitwidth must be a multiple of vector element bitwidth");
+
+    if (ExtractIndex != BCTruncElt) {
+      unsigned ShiftIndex = IsLE ? ExtractIndex : (Scale - 1) - ExtractIndex;
+      X = DAG.getNode(
+          ISD::SRL, DL, XVT, X,
+          DAG.getShiftAmountConstant(ShiftIndex * VecEltBitWidth, XVT, DL));
+    }
+
     return DAG.getAnyExtOrTrunc(X, DL, ScalarVT);
   }
 }

llvm/test/CodeGen/AArch64/extract-insert.ll

Lines changed: 12 additions & 17 deletions
@@ -5,9 +5,8 @@
 define i32 @trunc_i64_to_i32_le(i64 %x) {
 ; BE-LABEL: trunc_i64_to_i32_le:
 ; BE:       // %bb.0:
-; BE-NEXT:    fmov d0, x0
-; BE-NEXT:    rev64 v0.4s, v0.4s
-; BE-NEXT:    fmov w0, s0
+; BE-NEXT:    lsr x0, x0, #32
+; BE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; BE-NEXT:    ret
 ;
 ; LE-LABEL: trunc_i64_to_i32_le:
@@ -28,8 +27,8 @@ define i32 @trunc_i64_to_i32_be(i64 %x) {
 ;
 ; LE-LABEL: trunc_i64_to_i32_be:
 ; LE:       // %bb.0:
-; LE-NEXT:    fmov d0, x0
-; LE-NEXT:    mov w0, v0.s[1]
+; LE-NEXT:    lsr x0, x0, #32
+; LE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; LE-NEXT:    ret
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bc = bitcast <2 x i64> %ins to <4 x i32>
@@ -40,9 +39,8 @@ define i32 @trunc_i64_to_i32_be(i64 %x) {
 define i16 @trunc_i64_to_i16_le(i64 %x) {
 ; BE-LABEL: trunc_i64_to_i16_le:
 ; BE:       // %bb.0:
-; BE-NEXT:    fmov d0, x0
-; BE-NEXT:    rev64 v0.8h, v0.8h
-; BE-NEXT:    umov w0, v0.h[0]
+; BE-NEXT:    lsr x0, x0, #48
+; BE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; BE-NEXT:    ret
 ;
 ; LE-LABEL: trunc_i64_to_i16_le:
@@ -63,8 +61,8 @@ define i16 @trunc_i64_to_i16_be(i64 %x) {
 ;
 ; LE-LABEL: trunc_i64_to_i16_be:
 ; LE:       // %bb.0:
-; LE-NEXT:    fmov d0, x0
-; LE-NEXT:    umov w0, v0.h[3]
+; LE-NEXT:    lsr x0, x0, #48
+; LE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; LE-NEXT:    ret
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bc = bitcast <2 x i64> %ins to <8 x i16>
@@ -75,9 +73,7 @@ define i16 @trunc_i64_to_i16_be(i64 %x) {
 define i8 @trunc_i32_to_i8_le(i32 %x) {
 ; BE-LABEL: trunc_i32_to_i8_le:
 ; BE:       // %bb.0:
-; BE-NEXT:    fmov s0, w0
-; BE-NEXT:    rev32 v0.16b, v0.16b
-; BE-NEXT:    umov w0, v0.b[0]
+; BE-NEXT:    lsr w0, w0, #24
 ; BE-NEXT:    ret
 ;
 ; LE-LABEL: trunc_i32_to_i8_le:
@@ -96,8 +92,7 @@ define i8 @trunc_i32_to_i8_be(i32 %x) {
 ;
 ; LE-LABEL: trunc_i32_to_i8_be:
 ; LE:       // %bb.0:
-; LE-NEXT:    fmov s0, w0
-; LE-NEXT:    umov w0, v0.b[3]
+; LE-NEXT:    lsr w0, w0, #24
 ; LE-NEXT:    ret
   %ins = insertelement <4 x i32> undef, i32 %x, i32 0
   %bc = bitcast <4 x i32> %ins to <16 x i8>
@@ -115,8 +110,8 @@ define i8 @trunc_i64_to_i8_be(i64 %x) {
 ;
 ; LE-LABEL: trunc_i64_to_i8_be:
 ; LE:       // %bb.0:
-; LE-NEXT:    fmov d0, x0
-; LE-NEXT:    umov w0, v0.b[7]
+; LE-NEXT:    lsr x0, x0, #56
+; LE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; LE-NEXT:    ret
   %ins = insertelement <3 x i64> undef, i64 %x, i32 0
   %bc = bitcast <3 x i64> %ins to <24 x i8>

llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll

Lines changed: 2 additions & 3 deletions
@@ -358,11 +358,10 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    add x9, x0, #4
 ; CHECK-NEXT:    ld1r.4h { v0 }, [x9]
-; CHECK-NEXT:    fmov s1, w8
+; CHECK-NEXT:    lsr w9, w8, #16
 ; CHECK-NEXT:    strb w8, [x1]
-; CHECK-NEXT:    add x8, x1, #1
-; CHECK-NEXT:    st1.b { v1 }[2], [x8]
 ; CHECK-NEXT:    add x8, x1, #2
+; CHECK-NEXT:    strb w9, [x1, #1]
 ; CHECK-NEXT:    st1.b { v0 }[4], [x8]
 ; CHECK-NEXT:    ret
 ;

llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll

Lines changed: 3 additions & 8 deletions
@@ -16,10 +16,8 @@ define i8 @scalar_to_vector_half(ptr nocapture readonly %ad) {
 ;
 ; P9BE-LABEL: scalar_to_vector_half:
 ; P9BE:       # %bb.0: # %entry
-; P9BE-NEXT:    lxsihzx v2, 0, r3
-; P9BE-NEXT:    li r3, 0
-; P9BE-NEXT:    vsplth v2, v2, 3
-; P9BE-NEXT:    vextublx r3, r3, v2
+; P9BE-NEXT:    lhz r3, 0(r3)
+; P9BE-NEXT:    srwi r3, r3, 24
 ; P9BE-NEXT:    blr
 ;
 ; P8LE-LABEL: scalar_to_vector_half:
@@ -30,10 +28,7 @@ define i8 @scalar_to_vector_half(ptr nocapture readonly %ad) {
 ; P8BE-LABEL: scalar_to_vector_half:
 ; P8BE:       # %bb.0: # %entry
 ; P8BE-NEXT:    lhz r3, 0(r3)
-; P8BE-NEXT:    sldi r3, r3, 48
-; P8BE-NEXT:    mtfprd f0, r3
-; P8BE-NEXT:    mffprd r3, f0
-; P8BE-NEXT:    rldicl r3, r3, 8, 56
+; P8BE-NEXT:    srwi r3, r3, 24
 ; P8BE-NEXT:    blr
 entry:
   %0 = load <2 x i8>, ptr %ad, align 1

llvm/test/CodeGen/X86/load-partial.ll

Lines changed: 18 additions & 32 deletions
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX

 ;
 ; Partial Vector Loads - PR16739
@@ -382,38 +382,24 @@ define dso_local i32 @load_partial_illegal_type() {
 define dso_local void @PR43227(ptr %explicit_0, ptr %explicit_1) {
 ; SSE-LABEL: PR43227:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    psrlq $32, %xmm0
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    movdqa %xmm1, 672(%rsi)
-; SSE-NEXT:    movdqa %xmm0, 688(%rsi)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    movaps %xmm0, 672(%rsi)
+; SSE-NEXT:    movaps %xmm1, 688(%rsi)
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: PR43227:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovaps %ymm0, 672(%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: PR43227:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    vmovdqa %ymm0, 672(%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: PR43227:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT:    vmovaps %ymm0, 672(%rsi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %1 = getelementptr i32, ptr %explicit_0, i64 63
   %2 = load <3 x i32>, ptr %1, align 1
   %3 = shufflevector <3 x i32> %2, <3 x i32> undef, <2 x i32> <i32 1, i32 2>

0 commit comments

Comments
 (0)