Skip to content

Commit d9d28b3

Browse files
committed
[X86][AVX] getFauxShuffleMask - fix sub vector size check in INSERT_SUBVECTOR(X,SHUFFLE(Y,Z))
We were bailing out when subvector shuffle inputs were smaller than the subvector type, when we should only bail out when they are larger than it. Fixes PR46178.
1 parent 6780be4 commit d9d28b3

File tree

2 files changed

+56
-2
lines changed

2 files changed

+56
-2
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7439,8 +7439,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
74397439
return false;
74407440

74417441
// Subvector shuffle inputs must not be larger than the subvector.
7442-
if (llvm::any_of(SubInputs, [SubVT](SDValue Op) {
7443-
return SubVT.getSizeInBits() > Op.getValueSizeInBits();
7442+
if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
7443+
return SubVT.getSizeInBits() < SubInput.getValueSizeInBits();
74447444
}))
74457445
return false;
74467446

llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,57 @@ define <16 x i8> @combine_shuffle_vrotli_v4i32(<4 x i32> %a0) {
9999
ret <16 x i8> %3
100100
}
101101
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
102+
103+
define void @PR46178(i16* %0) {
104+
; X86-LABEL: PR46178:
105+
; X86: # %bb.0:
106+
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
107+
; X86-NEXT: vmovdqu 0, %ymm0
108+
; X86-NEXT: vmovdqu (%eax), %ymm1
109+
; X86-NEXT: vpmovqw %ymm0, %xmm0
110+
; X86-NEXT: vpmovqw %ymm1, %xmm1
111+
; X86-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
112+
; X86-NEXT: vpsllw $8, %ymm0, %ymm0
113+
; X86-NEXT: vpsraw $8, %ymm0, %ymm0
114+
; X86-NEXT: vmovapd {{.*#+}} ymm1 = [0,0,2,0,4,0,4,0]
115+
; X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2
116+
; X86-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
117+
; X86-NEXT: vmovupd %ymm1, (%eax)
118+
; X86-NEXT: vzeroupper
119+
; X86-NEXT: retl
120+
;
121+
; X64-LABEL: PR46178:
122+
; X64: # %bb.0:
123+
; X64-NEXT: vmovdqu 0, %ymm0
124+
; X64-NEXT: vmovdqu (%rax), %ymm1
125+
; X64-NEXT: vpmovqw %ymm0, %xmm0
126+
; X64-NEXT: vpmovqw %ymm1, %xmm1
127+
; X64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
128+
; X64-NEXT: vpsllw $8, %ymm0, %ymm0
129+
; X64-NEXT: vpsraw $8, %ymm0, %ymm0
130+
; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
131+
; X64-NEXT: vmovdqa %xmm0, %xmm0
132+
; X64-NEXT: vmovdqu %ymm0, (%rdi)
133+
; X64-NEXT: vzeroupper
134+
; X64-NEXT: retq
135+
%2 = load <4 x i64>, <4 x i64>* null, align 8
136+
%3 = load <4 x i64>, <4 x i64>* undef, align 8
137+
%4 = trunc <4 x i64> %2 to <4 x i16>
138+
%5 = trunc <4 x i64> %3 to <4 x i16>
139+
%6 = shl <4 x i16> %4, <i16 8, i16 8, i16 8, i16 8>
140+
%7 = shl <4 x i16> %5, <i16 8, i16 8, i16 8, i16 8>
141+
%8 = ashr exact <4 x i16> %6, <i16 8, i16 8, i16 8, i16 8>
142+
%9 = ashr exact <4 x i16> %7, <i16 8, i16 8, i16 8, i16 8>
143+
%10 = bitcast i16* %0 to <4 x i16>*
144+
%11 = getelementptr inbounds i16, i16* %0, i64 4
145+
%12 = bitcast i16* %11 to <4 x i16>*
146+
%13 = getelementptr inbounds i16, i16* %0, i64 8
147+
%14 = bitcast i16* %13 to <4 x i16>*
148+
%15 = getelementptr inbounds i16, i16* %0, i64 12
149+
%16 = bitcast i16* %15 to <4 x i16>*
150+
store <4 x i16> %8, <4 x i16>* %10, align 2
151+
store <4 x i16> %9, <4 x i16>* %12, align 2
152+
store <4 x i16> zeroinitializer, <4 x i16>* %14, align 2
153+
store <4 x i16> zeroinitializer, <4 x i16>* %16, align 2
154+
ret void
155+
}

0 commit comments

Comments (0)