Skip to content

Commit 3d4d033

Browse files
committed
[AArch64][Arm] Add nested double reduction tests. NFC
1 parent 2f48765 commit 3d4d033

File tree

2 files changed: +660 additions, -0 deletions

llvm/test/CodeGen/AArch64/double_reduct.ll

Lines changed: 365 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,371 @@ define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
284284
ret i32 %r
285285
}
286286

287+
288+
; Two fast fadd reductions, each added to a scalar (%c, %d), then combined with
; a final fast fadd. CHECK lines are the current (pre-optimization) codegen.
define float @nested_fadd_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fadd_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
; CHECK-NEXT: faddp s1, v1.2s
; CHECK-NEXT: faddp s0, v0.2s
; CHECK-NEXT: fadd s1, s1, s3
; CHECK-NEXT: fadd s0, s0, s2
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %a1 = fadd fast float %r1, %c
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %a2 = fadd fast float %r2, %d
  %r = fadd fast float %a1, %a2
  ret float %r
}
306+
307+
; Same shape as nested_fadd_f32 but without fast-math flags, so the ordered
; (sequential) fadd reduction lowering is used.
define float @nested_fadd_f32_slow(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fadd_f32_slow:
; CHECK: // %bb.0:
; CHECK-NEXT: mov s4, v1.s[2]
; CHECK-NEXT: mov s5, v0.s[2]
; CHECK-NEXT: faddp s6, v0.2s
; CHECK-NEXT: faddp s7, v1.2s
; CHECK-NEXT: mov s1, v1.s[3]
; CHECK-NEXT: mov s0, v0.s[3]
; CHECK-NEXT: fadd s5, s6, s5
; CHECK-NEXT: fadd s4, s7, s4
; CHECK-NEXT: fadd s0, s5, s0
; CHECK-NEXT: fadd s1, s4, s1
; CHECK-NEXT: fadd s0, s0, s2
; CHECK-NEXT: fadd s1, s1, s3
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %a1 = fadd float %r1, %c
  %r2 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %a2 = fadd float %r2, %d
  %r = fadd float %a1, %a2
  ret float %r
}
331+
332+
; Nested fast fmul reductions combined through scalar fmuls.
define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_mul_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: ext v5.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: fmul v1.2s, v1.2s, v4.2s
; CHECK-NEXT: fmul v0.2s, v0.2s, v5.2s
; CHECK-NEXT: fmul s1, s1, v1.s[1]
; CHECK-NEXT: fmul s0, s0, v0.s[1]
; CHECK-NEXT: fmul s1, s1, s3
; CHECK-NEXT: fmul s0, s0, s2
; CHECK-NEXT: fmul s0, s0, s1
; CHECK-NEXT: ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %a1 = fmul fast float %r1, %c
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %a2 = fmul fast float %r2, %d
  %r = fmul fast float %a1, %a2
  ret float %r
}
352+
353+
; Nested integer add reductions; each addv result is added to a scalar operand
; before the final combine.
define i32 @nested_add_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_add_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: addv s1, v1.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: add w9, w9, w0
; CHECK-NEXT: add w8, w8, w1
; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %r2, %d
  %r = add i32 %a1, %a2
  ret i32 %r
}
371+
372+
; Commuted variant of nested_add_i32: the first scalar add is %c + %r1
; (reduction on the RHS) rather than %r1 + %c.
define i32 @nested_add_c1_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_add_c1_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: addv s1, v1.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: add w9, w0, w9
; CHECK-NEXT: add w8, w8, w1
; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %c, %r1
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %r2, %d
  %r = add i32 %a1, %a2
  ret i32 %r
}
390+
391+
; Commuted variant of nested_add_i32: the second scalar add is %d + %r2
; (reduction on the RHS) rather than %r2 + %d.
define i32 @nested_add_c2_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_add_c2_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: addv s1, v1.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: add w9, w9, w0
; CHECK-NEXT: add w8, w1, w8
; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %d, %r2
  %r = add i32 %a1, %a2
  ret i32 %r
}
409+
410+
; Four add reductions combined pairwise; here codegen already merges them into
; vector adds followed by a single addv.
define i32 @nested_add_manyreduct_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; CHECK-LABEL: nested_add_manyreduct_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %r3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %c)
  %a1 = add i32 %r1, %r3
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %r4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %d)
  %a2 = add i32 %r2, %r4
  %r = add i32 %a1, %a2
  ret i32 %r
}
428+
429+
; Nested integer mul reductions combined through scalar muls.
define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_mul_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: mul v0.2s, v0.2s, v3.2s
; CHECK-NEXT: mul v1.2s, v1.2s, v2.2s
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: fmov w10, s0
; CHECK-NEXT: mov w9, v1.s[1]
; CHECK-NEXT: mul w8, w10, w8
; CHECK-NEXT: fmov w10, s1
; CHECK-NEXT: mul w9, w10, w9
; CHECK-NEXT: mul w8, w8, w0
; CHECK-NEXT: mul w9, w9, w1
; CHECK-NEXT: mul w0, w8, w9
; CHECK-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
  %a1 = mul i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %b)
  %a2 = mul i32 %r2, %d
  %r = mul i32 %a1, %a2
  ret i32 %r
}
453+
454+
; Nested bitwise-and reductions combined through scalar ands.
define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_and_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-NEXT: and v0.8b, v0.8b, v3.8b
; CHECK-NEXT: fmov x8, d1
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: lsr x10, x9, #32
; CHECK-NEXT: lsr x11, x8, #32
; CHECK-NEXT: and w9, w9, w0
; CHECK-NEXT: and w8, w8, w1
; CHECK-NEXT: and w9, w9, w10
; CHECK-NEXT: and w8, w8, w11
; CHECK-NEXT: and w0, w9, w8
; CHECK-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
  %a1 = and i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %b)
  %a2 = and i32 %r2, %d
  %r = and i32 %a1, %a2
  ret i32 %r
}
478+
479+
; Nested bitwise-or reductions combined through scalar orrs.
define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_or_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: orr v1.8b, v1.8b, v2.8b
; CHECK-NEXT: orr v0.8b, v0.8b, v3.8b
; CHECK-NEXT: fmov x8, d1
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: lsr x10, x9, #32
; CHECK-NEXT: lsr x11, x8, #32
; CHECK-NEXT: orr w9, w9, w0
; CHECK-NEXT: orr w8, w8, w1
; CHECK-NEXT: orr w9, w9, w10
; CHECK-NEXT: orr w8, w8, w11
; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
  %a1 = or i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %b)
  %a2 = or i32 %r2, %d
  %r = or i32 %a1, %a2
  ret i32 %r
}
503+
504+
; Nested bitwise-xor reductions combined through scalar eors.
define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_xor_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: eor v1.8b, v1.8b, v2.8b
; CHECK-NEXT: eor v0.8b, v0.8b, v3.8b
; CHECK-NEXT: fmov x8, d1
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: lsr x10, x9, #32
; CHECK-NEXT: lsr x11, x8, #32
; CHECK-NEXT: eor w9, w9, w0
; CHECK-NEXT: eor w8, w8, w1
; CHECK-NEXT: eor w9, w9, w10
; CHECK-NEXT: eor w8, w8, w11
; CHECK-NEXT: eor w0, w9, w8
; CHECK-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
  %a1 = xor i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %b)
  %a2 = xor i32 %r2, %d
  %r = xor i32 %a1, %a2
  ret i32 %r
}
528+
529+
; Nested signed-min reductions combined via llvm.smin scalar intrinsics.
define i32 @nested_smin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_smin_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sminv s0, v0.4s
; CHECK-NEXT: sminv s1, v1.4s
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: cmp w9, w0
; CHECK-NEXT: csel w9, w9, w0, lt
; CHECK-NEXT: cmp w8, w1
; CHECK-NEXT: csel w8, w8, w1, lt
; CHECK-NEXT: cmp w9, w8
; CHECK-NEXT: csel w0, w9, w8, lt
; CHECK-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.smin.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.smin.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.smin.i32(i32 %a1, i32 %a2)
  ret i32 %r
}
550+
551+
; Nested signed-max reductions combined via llvm.smax scalar intrinsics.
define i32 @nested_smax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_smax_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: smaxv s0, v0.4s
; CHECK-NEXT: smaxv s1, v1.4s
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: cmp w9, w0
; CHECK-NEXT: csel w9, w9, w0, gt
; CHECK-NEXT: cmp w8, w1
; CHECK-NEXT: csel w8, w8, w1, gt
; CHECK-NEXT: cmp w9, w8
; CHECK-NEXT: csel w0, w9, w8, gt
; CHECK-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.smax.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.smax.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.smax.i32(i32 %a1, i32 %a2)
  ret i32 %r
}
572+
573+
; Nested unsigned-min reductions combined via llvm.umin scalar intrinsics.
define i32 @nested_umin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_umin_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: uminv s0, v0.4s
; CHECK-NEXT: uminv s1, v1.4s
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: cmp w9, w0
; CHECK-NEXT: csel w9, w9, w0, lo
; CHECK-NEXT: cmp w8, w1
; CHECK-NEXT: csel w8, w8, w1, lo
; CHECK-NEXT: cmp w9, w8
; CHECK-NEXT: csel w0, w9, w8, lo
; CHECK-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.umin.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.umin.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.umin.i32(i32 %a1, i32 %a2)
  ret i32 %r
}
594+
595+
; Nested unsigned-max reductions combined via llvm.umax scalar intrinsics.
define i32 @nested_umax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_umax_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: umaxv s0, v0.4s
; CHECK-NEXT: umaxv s1, v1.4s
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: cmp w9, w0
; CHECK-NEXT: csel w9, w9, w0, hi
; CHECK-NEXT: cmp w8, w1
; CHECK-NEXT: csel w8, w8, w1, hi
; CHECK-NEXT: cmp w9, w8
; CHECK-NEXT: csel w0, w9, w8, hi
; CHECK-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.umax.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.umax.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.umax.i32(i32 %a1, i32 %a2)
  ret i32 %r
}
616+
617+
; Nested fmin reductions combined via llvm.minnum scalar intrinsics,
; lowering to fminnmv/fminnm.
define float @nested_fmin_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fmin_float:
; CHECK: // %bb.0:
; CHECK-NEXT: fminnmv s1, v1.4s
; CHECK-NEXT: fminnmv s0, v0.4s
; CHECK-NEXT: fminnm s1, s1, s3
; CHECK-NEXT: fminnm s0, s0, s2
; CHECK-NEXT: fminnm s0, s0, s1
; CHECK-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  %a1 = call float @llvm.minnum.f32(float %r1, float %c)
  %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %a2 = call float @llvm.minnum.f32(float %r2, float %d)
  %r = call float @llvm.minnum.f32(float %a1, float %a2)
  ret float %r
}
633+
634+
; Nested fmax reductions combined via llvm.maxnum scalar intrinsics,
; lowering to fmaxnmv/fmaxnm.
define float @nested_fmax_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fmax_float:
; CHECK: // %bb.0:
; CHECK-NEXT: fmaxnmv s1, v1.4s
; CHECK-NEXT: fmaxnmv s0, v0.4s
; CHECK-NEXT: fmaxnm s1, s1, s3
; CHECK-NEXT: fmaxnm s0, s0, s2
; CHECK-NEXT: fmaxnm s0, s0, s1
; CHECK-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  %a1 = call float @llvm.maxnum.f32(float %r1, float %c)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %a2 = call float @llvm.maxnum.f32(float %r2, float %d)
  %r = call float @llvm.maxnum.f32(float %a1, float %a2)
  ret float %r
}
650+
651+
287652
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
288653
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
289654
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)

0 commit comments

Comments
 (0)