; REQUIRES: asserts
; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
;
; The Cortex-A53 machine model will cause the MADD instruction to be scheduled
; much higher than the ADD instructions in order to hide latency. When not
; specifying a subtarget, the MADD will remain near the end of the block.
;
; CHECK: ********** MI Scheduling **********
; CHECK: main
; CHECK: *** Final schedule for BB#2 ***
; CHECK: SU(13)
; CHECK: MADDWrrr
; CHECK: SU(4)
; CHECK: ADDWri
; CHECK: ********** INTERVALS **********
; Constant initializers memcpy'd into the stack arrays %x and %y in @main.
@main.x = private unnamed_addr constant [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 4
@main.y = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2], align 4

; Function Attrs: nounwind
; Simple counted loop whose body chains dependent ADDs and one MUL; when
; compiled for cortex-a53 the multiply-accumulate (MADDWrrr) should be
; hoisted by the scheduler ahead of the ADDWri chain (see CHECK lines above).
; NOTE: the IR below is intentionally left in its original (old typed-pointer)
; form — the CHECK patterns depend on the exact code it generates.
define i32 @main() #0 {
entry:
  %retval = alloca i32, align 4
  %x = alloca [8 x i32], align 4
  %y = alloca [8 x i32], align 4
  %i = alloca i32, align 4
  %xx = alloca i32, align 4
  %yy = alloca i32, align 4
  store i32 0, i32* %retval
  %0 = bitcast [8 x i32]* %x to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([8 x i32]* @main.x to i8*), i64 32, i32 4, i1 false)
  %1 = bitcast [8 x i32]* %y to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ([8 x i32]* @main.y to i8*), i64 32, i32 4, i1 false)
  store i32 0, i32* %xx, align 4
  store i32 0, i32* %yy, align 4
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  %2 = load i32* %i, align 4
  %cmp = icmp slt i32 %2, 8
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %3 = load i32* %i, align 4
  %idxprom = sext i32 %3 to i64
  %arrayidx = getelementptr inbounds [8 x i32]* %x, i32 0, i64 %idxprom
  %4 = load i32* %arrayidx, align 4
  %add = add nsw i32 %4, 1
  store i32 %add, i32* %xx, align 4
  %5 = load i32* %xx, align 4
  %add1 = add nsw i32 %5, 12
  store i32 %add1, i32* %xx, align 4
  %6 = load i32* %xx, align 4
  %add2 = add nsw i32 %6, 23
  store i32 %add2, i32* %xx, align 4
  %7 = load i32* %xx, align 4
  %add3 = add nsw i32 %7, 34
  store i32 %add3, i32* %xx, align 4
  %8 = load i32* %i, align 4
  %idxprom4 = sext i32 %8 to i64
  %arrayidx5 = getelementptr inbounds [8 x i32]* %y, i32 0, i64 %idxprom4
  %9 = load i32* %arrayidx5, align 4
  %10 = load i32* %yy, align 4
  %mul = mul nsw i32 %10, %9
  store i32 %mul, i32* %yy, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %11 = load i32* %i, align 4
  %inc = add nsw i32 %11, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  %12 = load i32* %xx, align 4
  %13 = load i32* %yy, align 4
  %add6 = add nsw i32 %12, %13
  ret i32 %add6
}

; The Cortex-A53 machine model will cause the FDIVvvv_42 to be raised to
; hide latency. Whereas normally there would only be a single FADDvvv_4s
; after it, this test checks to make sure there are more than one.
;
; CHECK: ********** MI Scheduling **********
; CHECK: neon4xfloat:BB#0
; CHECK: *** Final schedule for BB#0 ***
; CHECK: FDIVv4f32
; CHECK: FADDv4f32
; CHECK: FADDv4f32
; CHECK: ********** INTERVALS **********
; Dependent chain of 8 vector FADDs followed by an independent FDIV; the
; Cortex-A53 model should raise the FDIV above at least two of the FADDs
; to hide its latency (see the FDIVv4f32 / FADDv4f32 CHECK lines above).
; The stray trailing ';' markers from the original (empty LLVM comments,
; no semantic effect) have been dropped.
define <4 x float> @neon4xfloat(<4 x float> %A, <4 x float> %B) {
  %tmp1 = fadd <4 x float> %A, %B
  %tmp2 = fadd <4 x float> %A, %tmp1
  %tmp3 = fadd <4 x float> %A, %tmp2
  %tmp4 = fadd <4 x float> %A, %tmp3
  %tmp5 = fadd <4 x float> %A, %tmp4
  %tmp6 = fadd <4 x float> %A, %tmp5
  %tmp7 = fadd <4 x float> %A, %tmp6
  %tmp8 = fadd <4 x float> %A, %tmp7
  %tmp9 = fdiv <4 x float> %A, %B
  %tmp10 = fadd <4 x float> %tmp8, %tmp9

  ret <4 x float> %tmp10
}

; Function Attrs: nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1

attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }