Skip to content

Commit 9d09d20

Browse files
committed
Reapply "[X86] Limit X86InterleavedAccessGroup to handle the same type case only"
The current implementation assumes that the destination type of the shuffle is the same as the type of the decomposed vectors. Add a check to avoid a crash when this condition is not satisfied. This fixes PR37616. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D102751
1 parent 707fc2e commit 9d09d20

File tree

2 files changed

+34
-11
lines changed

2 files changed

+34
-11
lines changed

llvm/lib/Target/X86/X86InterleavedAccess.cpp

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -724,30 +724,34 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
724724
auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());
725725

726726
if (isa<LoadInst>(Inst)) {
727-
// Try to generate target-sized register(/instruction).
728-
decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
729-
730727
auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
731728
unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
732-
// Perform matrix-transposition in order to compute interleaved
733-
// results by generating some sort of (optimized) target-specific
734-
// instructions.
735-
736729
switch (NumSubVecElems) {
737730
default:
738731
return false;
739732
case 4:
740-
transpose_4x4(DecomposedVectors, TransposedVectors);
741-
break;
742733
case 8:
743734
case 16:
744735
case 32:
745736
case 64:
746-
deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
747-
NumSubVecElems);
737+
if (ShuffleTy->getNumElements() != NumSubVecElems)
738+
return false;
748739
break;
749740
}
750741

742+
// Try to generate target-sized register(/instruction).
743+
decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
744+
745+
// Perform matrix-transposition in order to compute interleaved
746+
// results by generating some sort of (optimized) target-specific
747+
// instructions.
748+
749+
if (NumSubVecElems == 4)
750+
transpose_4x4(DecomposedVectors, TransposedVectors);
751+
else
752+
deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
753+
NumSubVecElems);
754+
751755
// Now replace the unoptimized-interleaved-vectors with the
752756
// transposed-interleaved vectors.
753757
for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)

llvm/test/CodeGen/X86/x86-interleaved-access.ll

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1930,3 +1930,22 @@ define void @splat4_v4i64_load_store(<4 x i64>* %s, <16 x i64>* %d) {
19301930
store <16 x i64> %r, <16 x i64>* %d, align 8
19311931
ret void
19321932
}
1933+
1934+
define <2 x i64> @PR37616(<16 x i64>* %a0) {
1935+
; AVX1-LABEL: PR37616:
1936+
; AVX1: # %bb.0:
1937+
; AVX1-NEXT: vmovaps 16(%rdi), %xmm0
1938+
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
1939+
; AVX1-NEXT: retq
1940+
;
1941+
; AVX2OR512-LABEL: PR37616:
1942+
; AVX2OR512: # %bb.0:
1943+
; AVX2OR512-NEXT: vmovaps (%rdi), %ymm0
1944+
; AVX2OR512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
1945+
; AVX2OR512-NEXT: vextractf128 $1, %ymm0, %xmm0
1946+
; AVX2OR512-NEXT: vzeroupper
1947+
; AVX2OR512-NEXT: retq
1948+
%load = load <16 x i64>, <16 x i64>* %a0, align 128
1949+
%shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6>
1950+
ret <2 x i64> %shuffle
1951+
}

0 commit comments

Comments
 (0)