Skip to content

Commit 1eec357

Browse files
committed
[VP] IR expansion for maxnum/minnum
Add basic handling for VP ops that can expand to non-predicated ops.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D159494
1 parent 2bdf5aa commit 1eec357

File tree

2 files changed

+173
-0
lines changed

2 files changed

+173
-0
lines changed

llvm/lib/CodeGen/ExpandVectorPredication.cpp

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -318,6 +318,16 @@ Value *CachingVPExpander::expandPredicationToFPCall(
318318
replaceOperation(*NewOp, VPI);
319319
return NewOp;
320320
}
321+
case Intrinsic::maxnum:
322+
case Intrinsic::minnum: {
323+
Value *Op0 = VPI.getOperand(0);
324+
Value *Op1 = VPI.getOperand(1);
325+
Function *Fn = Intrinsic::getDeclaration(
326+
VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()});
327+
Value *NewOp = Builder.CreateCall(Fn, {Op0, Op1}, VPI.getName());
328+
replaceOperation(*NewOp, VPI);
329+
return NewOp;
330+
}
321331
case Intrinsic::experimental_constrained_fma:
322332
case Intrinsic::experimental_constrained_fmuladd: {
323333
Value *Op0 = VPI.getOperand(0);
@@ -708,6 +718,10 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
708718
return expandPredicationToFPCall(Builder, VPI, Intrinsic::fabs);
709719
case Intrinsic::vp_sqrt:
710720
return expandPredicationToFPCall(Builder, VPI, Intrinsic::sqrt);
721+
case Intrinsic::vp_maxnum:
722+
return expandPredicationToFPCall(Builder, VPI, Intrinsic::maxnum);
723+
case Intrinsic::vp_minnum:
724+
return expandPredicationToFPCall(Builder, VPI, Intrinsic::minnum);
711725
case Intrinsic::vp_load:
712726
case Intrinsic::vp_store:
713727
case Intrinsic::vp_gather:

llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll

Lines changed: 159 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -404,3 +404,162 @@ define void @vp_fmuladd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5
404404
}
405405
declare <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32)
406406

407+
declare <4 x float> @llvm.vp.maxnum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)
408+
define <4 x float> @vfmax_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) {
409+
; SSE-LABEL: vfmax_vv_v4f32:
410+
; SSE: # %bb.0:
411+
; SSE-NEXT: movaps %xmm1, %xmm2
412+
; SSE-NEXT: maxps %xmm0, %xmm2
413+
; SSE-NEXT: cmpunordps %xmm0, %xmm0
414+
; SSE-NEXT: andps %xmm0, %xmm1
415+
; SSE-NEXT: andnps %xmm2, %xmm0
416+
; SSE-NEXT: orps %xmm1, %xmm0
417+
; SSE-NEXT: retq
418+
;
419+
; AVX1-LABEL: vfmax_vv_v4f32:
420+
; AVX1: # %bb.0:
421+
; AVX1-NEXT: vmaxps %xmm0, %xmm1, %xmm2
422+
; AVX1-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
423+
; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
424+
; AVX1-NEXT: retq
425+
;
426+
; AVX2-LABEL: vfmax_vv_v4f32:
427+
; AVX2: # %bb.0:
428+
; AVX2-NEXT: vmaxps %xmm0, %xmm1, %xmm2
429+
; AVX2-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
430+
; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
431+
; AVX2-NEXT: retq
432+
;
433+
; AVX512-LABEL: vfmax_vv_v4f32:
434+
; AVX512: # %bb.0:
435+
; AVX512-NEXT: vmaxps %xmm0, %xmm1, %xmm2
436+
; AVX512-NEXT: vcmpunordps %xmm0, %xmm0, %k1
437+
; AVX512-NEXT: vmovaps %xmm1, %xmm2 {%k1}
438+
; AVX512-NEXT: vmovaps %xmm2, %xmm0
439+
; AVX512-NEXT: retq
440+
%v = call <4 x float> @llvm.vp.maxnum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl)
441+
ret <4 x float> %v
442+
}
443+
444+
declare <8 x float> @llvm.vp.maxnum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32)
445+
define <8 x float> @vfmax_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) {
446+
; SSE-LABEL: vfmax_vv_v8f32:
447+
; SSE: # %bb.0:
448+
; SSE-NEXT: movaps %xmm2, %xmm4
449+
; SSE-NEXT: maxps %xmm0, %xmm4
450+
; SSE-NEXT: cmpunordps %xmm0, %xmm0
451+
; SSE-NEXT: andps %xmm0, %xmm2
452+
; SSE-NEXT: andnps %xmm4, %xmm0
453+
; SSE-NEXT: orps %xmm2, %xmm0
454+
; SSE-NEXT: movaps %xmm3, %xmm2
455+
; SSE-NEXT: maxps %xmm1, %xmm2
456+
; SSE-NEXT: cmpunordps %xmm1, %xmm1
457+
; SSE-NEXT: andps %xmm1, %xmm3
458+
; SSE-NEXT: andnps %xmm2, %xmm1
459+
; SSE-NEXT: orps %xmm3, %xmm1
460+
; SSE-NEXT: retq
461+
;
462+
; AVX1-LABEL: vfmax_vv_v8f32:
463+
; AVX1: # %bb.0:
464+
; AVX1-NEXT: vmaxps %ymm0, %ymm1, %ymm2
465+
; AVX1-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0
466+
; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
467+
; AVX1-NEXT: retq
468+
;
469+
; AVX2-LABEL: vfmax_vv_v8f32:
470+
; AVX2: # %bb.0:
471+
; AVX2-NEXT: vmaxps %ymm0, %ymm1, %ymm2
472+
; AVX2-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0
473+
; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
474+
; AVX2-NEXT: retq
475+
;
476+
; AVX512-LABEL: vfmax_vv_v8f32:
477+
; AVX512: # %bb.0:
478+
; AVX512-NEXT: vmaxps %ymm0, %ymm1, %ymm2
479+
; AVX512-NEXT: vcmpunordps %ymm0, %ymm0, %k1
480+
; AVX512-NEXT: vmovaps %ymm1, %ymm2 {%k1}
481+
; AVX512-NEXT: vmovaps %ymm2, %ymm0
482+
; AVX512-NEXT: retq
483+
%v = call <8 x float> @llvm.vp.maxnum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl)
484+
ret <8 x float> %v
485+
}
486+
487+
declare <4 x float> @llvm.vp.minnum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)
488+
define <4 x float> @vfmin_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) {
489+
; SSE-LABEL: vfmin_vv_v4f32:
490+
; SSE: # %bb.0:
491+
; SSE-NEXT: movaps %xmm1, %xmm2
492+
; SSE-NEXT: minps %xmm0, %xmm2
493+
; SSE-NEXT: cmpunordps %xmm0, %xmm0
494+
; SSE-NEXT: andps %xmm0, %xmm1
495+
; SSE-NEXT: andnps %xmm2, %xmm0
496+
; SSE-NEXT: orps %xmm1, %xmm0
497+
; SSE-NEXT: retq
498+
;
499+
; AVX1-LABEL: vfmin_vv_v4f32:
500+
; AVX1: # %bb.0:
501+
; AVX1-NEXT: vminps %xmm0, %xmm1, %xmm2
502+
; AVX1-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
503+
; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
504+
; AVX1-NEXT: retq
505+
;
506+
; AVX2-LABEL: vfmin_vv_v4f32:
507+
; AVX2: # %bb.0:
508+
; AVX2-NEXT: vminps %xmm0, %xmm1, %xmm2
509+
; AVX2-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
510+
; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
511+
; AVX2-NEXT: retq
512+
;
513+
; AVX512-LABEL: vfmin_vv_v4f32:
514+
; AVX512: # %bb.0:
515+
; AVX512-NEXT: vminps %xmm0, %xmm1, %xmm2
516+
; AVX512-NEXT: vcmpunordps %xmm0, %xmm0, %k1
517+
; AVX512-NEXT: vmovaps %xmm1, %xmm2 {%k1}
518+
; AVX512-NEXT: vmovaps %xmm2, %xmm0
519+
; AVX512-NEXT: retq
520+
%v = call <4 x float> @llvm.vp.minnum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl)
521+
ret <4 x float> %v
522+
}
523+
524+
declare <8 x float> @llvm.vp.minnum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32)
525+
define <8 x float> @vfmin_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) {
526+
; SSE-LABEL: vfmin_vv_v8f32:
527+
; SSE: # %bb.0:
528+
; SSE-NEXT: movaps %xmm2, %xmm4
529+
; SSE-NEXT: minps %xmm0, %xmm4
530+
; SSE-NEXT: cmpunordps %xmm0, %xmm0
531+
; SSE-NEXT: andps %xmm0, %xmm2
532+
; SSE-NEXT: andnps %xmm4, %xmm0
533+
; SSE-NEXT: orps %xmm2, %xmm0
534+
; SSE-NEXT: movaps %xmm3, %xmm2
535+
; SSE-NEXT: minps %xmm1, %xmm2
536+
; SSE-NEXT: cmpunordps %xmm1, %xmm1
537+
; SSE-NEXT: andps %xmm1, %xmm3
538+
; SSE-NEXT: andnps %xmm2, %xmm1
539+
; SSE-NEXT: orps %xmm3, %xmm1
540+
; SSE-NEXT: retq
541+
;
542+
; AVX1-LABEL: vfmin_vv_v8f32:
543+
; AVX1: # %bb.0:
544+
; AVX1-NEXT: vminps %ymm0, %ymm1, %ymm2
545+
; AVX1-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0
546+
; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
547+
; AVX1-NEXT: retq
548+
;
549+
; AVX2-LABEL: vfmin_vv_v8f32:
550+
; AVX2: # %bb.0:
551+
; AVX2-NEXT: vminps %ymm0, %ymm1, %ymm2
552+
; AVX2-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0
553+
; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
554+
; AVX2-NEXT: retq
555+
;
556+
; AVX512-LABEL: vfmin_vv_v8f32:
557+
; AVX512: # %bb.0:
558+
; AVX512-NEXT: vminps %ymm0, %ymm1, %ymm2
559+
; AVX512-NEXT: vcmpunordps %ymm0, %ymm0, %k1
560+
; AVX512-NEXT: vmovaps %ymm1, %ymm2 {%k1}
561+
; AVX512-NEXT: vmovaps %ymm2, %ymm0
562+
; AVX512-NEXT: retq
563+
%v = call <8 x float> @llvm.vp.minnum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl)
564+
ret <8 x float> %v
565+
}

0 commit comments

Comments
 (0)