Commit cc7d966

Add coverage for missing (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1)) transform

1 parent a565509 commit cc7d966
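For reference, the transform named in the commit title rewrites an unsigned remainder by a shifted power of two into a mask: (pow2 >> y) is itself a power of two (or zero, in which case the urem is poison anyway), so x urem (pow2 >> y) is equal to x & ((pow2 >> y) - 1). Below is a minimal IR sketch of the pattern and the folded form it should combine to; the function names are illustrative only and do not appear in the test file.

; Pattern the new tests exercise (splat power-of-two divisor).
define <4 x i32> @urem_by_lshr_pow2_sketch(<4 x i32> %x, <4 x i32> %y) {
  %d = lshr <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
  %r = urem <4 x i32> %x, %d
  ret <4 x i32> %r
}

; Expected result of the (currently missing) fold: mask with (divisor - 1)
; instead of dividing. This is an assumed target form, not current output.
define <4 x i32> @urem_by_lshr_pow2_folded_sketch(<4 x i32> %x, <4 x i32> %y) {
  %d = lshr <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
  %m = add <4 x i32> %d, <i32 -1, i32 -1, i32 -1, i32 -1>
  %r = and <4 x i32> %x, %m
  ret <4 x i32> %r
}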


llvm/test/CodeGen/X86/combine-urem.ll

Lines changed: 213 additions & 0 deletions
@@ -357,6 +357,219 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i32> %2
 }
 
+; fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1))
+define <4 x i32> @combine_vec_urem_by_lshr_pow2a(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: combine_vec_urem_by_lshr_pow2a:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [4,4,4,4]
+; SSE-NEXT: movdqa %xmm5, %xmm2
+; SSE-NEXT: psrld %xmm4, %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE-NEXT: movdqa %xmm5, %xmm4
+; SSE-NEXT: psrld %xmm3, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
+; SSE-NEXT: movdqa %xmm5, %xmm6
+; SSE-NEXT: psrld %xmm3, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,3,3,3,4,5,6,7]
+; SSE-NEXT: psrld %xmm1, %xmm5
+; SSE-NEXT: pextrd $1, %xmm5, %ecx
+; SSE-NEXT: pextrd $1, %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %ecx
+; SSE-NEXT: movl %edx, %ecx
+; SSE-NEXT: movd %xmm6, %esi
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %esi
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: pinsrd $1, %ecx, %xmm1
+; SSE-NEXT: pextrd $2, %xmm4, %ecx
+; SSE-NEXT: pextrd $2, %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %ecx
+; SSE-NEXT: pinsrd $2, %edx, %xmm1
+; SSE-NEXT: pextrd $3, %xmm2, %ecx
+; SSE-NEXT: pextrd $3, %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %ecx
+; SSE-NEXT: pinsrd $3, %edx, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: combine_vec_urem_by_lshr_pow2a:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,4,4,4]
+; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrld %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %ecx
+; AVX1-NEXT: movl %edx, %ecx
+; AVX1-NEXT: vmovd %xmm5, %esi
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %esi
+; AVX1-NEXT: vmovd %edx, %xmm1
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: vpextrd $2, %xmm4, %ecx
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %ecx
+; AVX1-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
+; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %ecx
+; AVX1-NEXT: vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_vec_urem_by_lshr_pow2a:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
+; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX2-NEXT: vpextrd $1, %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %ecx
+; AVX2-NEXT: movl %edx, %ecx
+; AVX2-NEXT: vmovd %xmm1, %esi
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %esi
+; AVX2-NEXT: vmovd %edx, %xmm2
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT: vpextrd $2, %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %ecx
+; AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
+; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
+; AVX2-NEXT: vpextrd $3, %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %ecx
+; AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
+; AVX2-NEXT: retq
+  %1 = lshr <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
+  %2 = urem <4 x i32> %x, %1
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_vec_urem_by_lshr_pow2b(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: combine_vec_urem_by_lshr_pow2b:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1,4,8,16]
+; SSE-NEXT: movdqa %xmm5, %xmm2
+; SSE-NEXT: psrld %xmm4, %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE-NEXT: movdqa %xmm5, %xmm4
+; SSE-NEXT: psrld %xmm3, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
+; SSE-NEXT: movdqa %xmm5, %xmm6
+; SSE-NEXT: psrld %xmm3, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,3,3,3,4,5,6,7]
+; SSE-NEXT: psrld %xmm1, %xmm5
+; SSE-NEXT: pextrd $1, %xmm5, %ecx
+; SSE-NEXT: pextrd $1, %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %ecx
+; SSE-NEXT: movl %edx, %ecx
+; SSE-NEXT: movd %xmm6, %esi
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %esi
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: pinsrd $1, %ecx, %xmm1
+; SSE-NEXT: pextrd $2, %xmm4, %ecx
+; SSE-NEXT: pextrd $2, %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %ecx
+; SSE-NEXT: pinsrd $2, %edx, %xmm1
+; SSE-NEXT: pextrd $3, %xmm2, %ecx
+; SSE-NEXT: pextrd $3, %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %ecx
+; SSE-NEXT: pinsrd $3, %edx, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: combine_vec_urem_by_lshr_pow2b:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,8,16]
+; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrld %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %ecx
+; AVX1-NEXT: movl %edx, %ecx
+; AVX1-NEXT: vmovd %xmm5, %esi
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %esi
+; AVX1-NEXT: vmovd %edx, %xmm1
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: vpextrd $2, %xmm4, %ecx
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %ecx
+; AVX1-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
+; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %ecx
+; AVX1-NEXT: vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_vec_urem_by_lshr_pow2b:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,8,16]
+; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX2-NEXT: vpextrd $1, %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %ecx
+; AVX2-NEXT: movl %edx, %ecx
+; AVX2-NEXT: vmovd %xmm1, %esi
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %esi
+; AVX2-NEXT: vmovd %edx, %xmm2
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT: vpextrd $2, %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %ecx
+; AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
+; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
+; AVX2-NEXT: vpextrd $3, %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %ecx
+; AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
+; AVX2-NEXT: retq
+  %1 = lshr <4 x i32> <i32 1, i32 4, i32 8, i32 16>, %y
+  %2 = urem <4 x i32> %x, %1
+  ret <4 x i32> %2
+}
+
 ; FIXME: PR55271 - urem(undef, 3) != undef
 ; Use PSLLI intrinsic to postpone the undef creation until after urem-by-constant expansion
 define <4 x i32> @combine_vec_urem_undef_by_3(<4 x i32> %in) {
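Note that the CHECK lines above capture the current, unfolded lowering: each lane is extracted and sent through a scalar divl, which is exactly the missing-transform baseline this commit adds coverage for. If the fold were later implemented, the non-uniform divisor test would be expected to reduce to the same shift/add/and shape; a sketch under that assumption, with an illustrative function name that is not part of the test file:

define <4 x i32> @urem_by_lshr_pow2b_folded_sketch(<4 x i32> %x, <4 x i32> %y) {
  %d = lshr <4 x i32> <i32 1, i32 4, i32 8, i32 16>, %y
  %m = add <4 x i32> %d, <i32 -1, i32 -1, i32 -1, i32 -1>
  %r = and <4 x i32> %x, %m
  ret <4 x i32> %r
}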
