
Commit fd67992

[DAGCombine] fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1))
We have the same fold in InstCombine, though implemented there via the OrZero flag on isKnownToBePowerOfTwo. The reasoning here is that either a) the result of the lshr is a power of two, or b) we have a divide-by-zero triggering UB, which we can ignore.

Differential Revision: https://reviews.llvm.org/D129606
1 parent d5fa11c · commit fd67992
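To make the fold concrete, here is a minimal scalar sketch in LLVM IR (an illustrative example written for this summary, not a test from the patch; the function and value names are made up):

define i32 @urem_by_lshr_pow2(i32 %x, i32 %y) {
  %d = lshr i32 4, %y   ; %d is 4, 2 or 1 (a power of two), or 0 (urem by zero is UB)
  %r = urem i32 %x, %d
  ret i32 %r
}

With the new combine the remainder is computed by masking instead of dividing, i.e. the urem becomes:

  %m = add i32 %d, -1   ; pow2 - 1 is an all-low-bits mask (or -1 in the UB case)
  %r = and i32 %x, %m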

2 files changed: 67 additions & 162 deletions

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 5 additions & 2 deletions
@@ -4593,9 +4593,12 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
       AddToWorklist(Add.getNode());
       return DAG.getNode(ISD::AND, DL, VT, N0, Add);
     }
-    if (N1.getOpcode() == ISD::SHL &&
+    // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
+    // fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1))
+    // TODO: We should sink the following into isKnownToBePowerOfTwo
+    // using a OrZero parameter analogous to our handling in ValueTracking.
+    if ((N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) &&
         DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
-      // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
       SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
       SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
       AddToWorklist(Add.getNode());
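As a quick worked check of the node sequence this block now builds for the SRL case (the concrete values below are illustrative, not taken from the test suite):

  x = 13, y = 1:  divisor = srl(4, 1) = 2;  urem gives 13 % 2 = 1, and the folded form gives 13 & (2 - 1) = 1.
  x = 13, y = 3:  divisor = srl(4, 3) = 0;  the original urem divides by zero, so the folded result (13 & 0xFFFFFFFF) is as acceptable as any other.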

llvm/test/CodeGen/X86/combine-urem.ll

Lines changed: 62 additions & 160 deletions
@@ -361,103 +361,54 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
 define <4 x i32> @combine_vec_urem_by_lshr_pow2a(<4 x i32> %x, <4 x i32> %y) {
 ; SSE-LABEL: combine_vec_urem_by_lshr_pow2a:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [4,4,4,4]
-; SSE-NEXT: movdqa %xmm5, %xmm2
-; SSE-NEXT: psrld %xmm4, %xmm2
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: psrld %xmm3, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: psrld %xmm3, %xmm6
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,3,3,3,4,5,6,7]
-; SSE-NEXT: psrld %xmm1, %xmm5
-; SSE-NEXT: pextrd $1, %xmm5, %ecx
-; SSE-NEXT: pextrd $1, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %ecx
-; SSE-NEXT: movl %edx, %ecx
-; SSE-NEXT: movd %xmm6, %esi
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %esi
-; SSE-NEXT: movd %edx, %xmm1
-; SSE-NEXT: pinsrd $1, %ecx, %xmm1
-; SSE-NEXT: pextrd $2, %xmm4, %ecx
-; SSE-NEXT: pextrd $2, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %ecx
-; SSE-NEXT: pinsrd $2, %edx, %xmm1
-; SSE-NEXT: pextrd $3, %xmm2, %ecx
-; SSE-NEXT: pextrd $3, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %ecx
-; SSE-NEXT: pinsrd $3, %edx, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,4,4,4]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrld %xmm2, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm6
+; SSE-NEXT: psrld %xmm5, %xmm6
+; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrld %xmm1, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE-NEXT: psrld %xmm1, %xmm3
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: pand %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: combine_vec_urem_by_lshr_pow2a:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,4,4,4]
 ; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrld %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT: vpsrld %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %ecx
-; AVX1-NEXT: movl %edx, %ecx
-; AVX1-NEXT: vmovd %xmm5, %esi
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %esi
-; AVX1-NEXT: vmovd %edx, %xmm1
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $2, %xmm4, %ecx
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %ecx
-; AVX1-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %ecx
-; AVX1-NEXT: vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: combine_vec_urem_by_lshr_pow2a:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX2-NEXT: vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %ecx
-; AVX2-NEXT: movl %edx, %ecx
-; AVX2-NEXT: vmovd %xmm1, %esi
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %esi
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT: vpextrd $2, %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %ecx
-; AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT: vpextrd $3, %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %ecx
-; AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
   %1 = lshr <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
   %2 = urem <4 x i32> %x, %1
@@ -467,103 +418,54 @@ define <4 x i32> @combine_vec_urem_by_lshr_pow2a(<4 x i32> %x, <4 x i32> %y) {
 define <4 x i32> @combine_vec_urem_by_lshr_pow2b(<4 x i32> %x, <4 x i32> %y) {
 ; SSE-LABEL: combine_vec_urem_by_lshr_pow2b:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1,4,8,16]
-; SSE-NEXT: movdqa %xmm5, %xmm2
-; SSE-NEXT: psrld %xmm4, %xmm2
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: psrld %xmm3, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: psrld %xmm3, %xmm6
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,3,3,3,4,5,6,7]
-; SSE-NEXT: psrld %xmm1, %xmm5
-; SSE-NEXT: pextrd $1, %xmm5, %ecx
-; SSE-NEXT: pextrd $1, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %ecx
-; SSE-NEXT: movl %edx, %ecx
-; SSE-NEXT: movd %xmm6, %esi
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %esi
-; SSE-NEXT: movd %edx, %xmm1
-; SSE-NEXT: pinsrd $1, %ecx, %xmm1
-; SSE-NEXT: pextrd $2, %xmm4, %ecx
-; SSE-NEXT: pextrd $2, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %ecx
-; SSE-NEXT: pinsrd $2, %edx, %xmm1
-; SSE-NEXT: pextrd $3, %xmm2, %ecx
-; SSE-NEXT: pextrd $3, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %ecx
-; SSE-NEXT: pinsrd $3, %edx, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,4,8,16]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrld %xmm2, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm6
+; SSE-NEXT: psrld %xmm5, %xmm6
+; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrld %xmm1, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE-NEXT: psrld %xmm1, %xmm3
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: pand %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: combine_vec_urem_by_lshr_pow2b:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,8,16]
 ; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrld %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT: vpsrld %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %ecx
-; AVX1-NEXT: movl %edx, %ecx
-; AVX1-NEXT: vmovd %xmm5, %esi
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %esi
-; AVX1-NEXT: vmovd %edx, %xmm1
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $2, %xmm4, %ecx
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %ecx
-; AVX1-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %ecx
-; AVX1-NEXT: vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: combine_vec_urem_by_lshr_pow2b:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,8,16]
 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX2-NEXT: vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %ecx
-; AVX2-NEXT: movl %edx, %ecx
-; AVX2-NEXT: vmovd %xmm1, %esi
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %esi
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT: vpextrd $2, %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %ecx
-; AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT: vpextrd $3, %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %ecx
-; AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
   %1 = lshr <4 x i32> <i32 1, i32 4, i32 8, i32 16>, %y
   %2 = urem <4 x i32> %x, %1
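Note that this second test uses the non-uniform divisor vector <1, 4, 8, 16>: the lane holding 1 shifts down to 0 for any non-zero %y (for example, lshr 1, 1 = 0), which is exactly the "power of two or zero" case the commit message argues can be ignored, since the corresponding urem lane would be UB.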
