
Commit 06246b2

[RISCV] Add shrinkwrap test cases showing gaps in current impl
This covers multiple interactions reduced from larger workloads:
1) Rematerializing "addi s0, x0, <imm>" to avoid the need to spill a CSR, with the common user being a branch (i.e. branch-on-immediate idioms).
2) Rematerializing "addi s0, a0, <imm>" to avoid the need to spill a CSR, with the common user being a vector load or store (i.e. because we don't have reg+imm addressing for vector memory operations).
3) Independent of the previous, we could still shrink wrap these cases by locally using a non-CSR and deferring the move into the CSR to the non-shrink-wrapped path.
4) Oddly, MachineCSE produces a different result when an edge is manually split, even though the edge split should be irrelevant to the CSE.
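For orientation, a minimal IR shape for case (1) might look like the following (a hypothetical example written for this summary, not taken from the commit; @branch_on_imm and @bar are illustrative names). The constant 57 has to be materialized into a register for each branch, and keeping it live across the call can lead to it being placed in a callee-saved register (and thus a CSR spill) instead of being rematerialized after the call:

define void @branch_on_imm(i32 zeroext %a) {
  %cmp0 = icmp eq i32 %a, 57
  br i1 %cmp0, label %exit, label %do_call

do_call:
  %b = call i32 @bar()
  %cmp1 = icmp eq i32 %b, 57
  br i1 %cmp1, label %exit, label %do_call

exit:
  ret void
}

declare zeroext i32 @bar()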
1 parent 82b4379 commit 06246b2

File tree

2 files changed: +629, -0 lines changed

Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+v < %s | FileCheck --check-prefix=RV32 %s
; RUN: llc -mtriple=riscv64 -mattr=+m,+v < %s | FileCheck --check-prefix=RV64 %s

; FIXME: We can rematerialize "addi s0, a1, 32" (ideally along the edge
; %do_call -> %exit), and shrink wrap this routine
define void @vecaddr_straightline(i32 zeroext %a, ptr %p) {
; RV32-LABEL: vecaddr_straightline:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: .cfi_offset s0, -8
; RV32-NEXT: addi s0, a1, 32
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vle32.v v8, (s0)
; RV32-NEXT: vadd.vi v8, v8, 1
; RV32-NEXT: li a1, 57
; RV32-NEXT: vse32.v v8, (s0)
; RV32-NEXT: beq a0, a1, .LBB0_2
; RV32-NEXT: # %bb.1: # %do_call
; RV32-NEXT: call foo
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: .LBB0_2: # %exit
; RV32-NEXT: vle32.v v8, (s0)
; RV32-NEXT: vadd.vi v8, v8, 1
; RV32-NEXT: vse32.v v8, (s0)
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vecaddr_straightline:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
; RV64-NEXT: addi s0, a1, 32
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT: vle32.v v8, (s0)
; RV64-NEXT: vadd.vi v8, v8, 1
; RV64-NEXT: li a1, 57
; RV64-NEXT: vse32.v v8, (s0)
; RV64-NEXT: beq a0, a1, .LBB0_2
; RV64-NEXT: # %bb.1: # %do_call
; RV64-NEXT: call foo
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT: .LBB0_2: # %exit
; RV64-NEXT: vle32.v v8, (s0)
; RV64-NEXT: vadd.vi v8, v8, 1
; RV64-NEXT: vse32.v v8, (s0)
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore ra
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%gep = getelementptr i8, ptr %p, i32 32
%v1 = load <4 x i32>, ptr %gep
%v2 = add <4 x i32> %v1, splat (i32 1)
store <4 x i32> %v2, ptr %gep
%cmp0 = icmp eq i32 %a, 57
br i1 %cmp0, label %exit, label %do_call
do_call:
call i32 @foo()
br label %exit
exit:
%v3 = load <4 x i32>, ptr %gep
%v4 = add <4 x i32> %v3, splat (i32 1)
store <4 x i32> %v4, ptr %gep
ret void
}
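
; For reference, one possible shrink-wrapped shape for the function above
; (hypothetical sketch only, RV64 flavor, CFI directives omitted; a2/a3 are
; arbitrary scratch registers, and this is not current or asserted output):
; the address lives in a caller-saved register on the fast path, and the
; frame setup plus the copy into s0 are confined to %do_call, as item (3)
; of the commit message suggests.
;
; vecaddr_straightline:
;         addi    a2, a1, 32              # address in a scratch register
;         vsetivli zero, 4, e32, m1, ta, ma
;         vle32.v v8, (a2)
;         vadd.vi v8, v8, 1
;         li      a3, 57
;         vse32.v v8, (a2)
;         beq     a0, a3, .LBB0_2
; # %bb.1: %do_call - only this path pays for the frame and the CSR
;         addi    sp, sp, -16
;         sd      ra, 8(sp)
;         sd      s0, 0(sp)
;         mv      s0, a2                  # keep the address live across the call
;         call    foo
;         mv      a2, s0                  # back in the register %exit expects
;         ld      ra, 8(sp)
;         ld      s0, 0(sp)
;         addi    sp, sp, 16
;         vsetivli zero, 4, e32, m1, ta, ma
; .LBB0_2: # %exit
;         vle32.v v8, (a2)
;         vadd.vi v8, v8, 1
;         vse32.v v8, (a2)
;         ret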

; In this case, the second use is in a loop, so using a callee-saved
; register to avoid a remat is the profitable choice.
; FIXME: We can shrink wrap the frame setup around the loop
; and avoid it along the %bb.0 -> %exit edge
define void @vecaddr_loop(i32 zeroext %a, ptr %p) {
; RV32-LABEL: vecaddr_loop:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: .cfi_offset s0, -8
; RV32-NEXT: addi s0, a1, 32
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vle32.v v8, (s0)
; RV32-NEXT: vadd.vi v8, v8, 1
; RV32-NEXT: li a1, 57
; RV32-NEXT: vse32.v v8, (s0)
; RV32-NEXT: beq a0, a1, .LBB1_2
; RV32-NEXT: .LBB1_1: # %do_call
; RV32-NEXT: # =>This Inner Loop Header: Depth=1
; RV32-NEXT: call foo
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vle32.v v8, (s0)
; RV32-NEXT: vadd.vi v8, v8, 1
; RV32-NEXT: vse32.v v8, (s0)
; RV32-NEXT: bnez a0, .LBB1_1
; RV32-NEXT: .LBB1_2: # %exit
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vecaddr_loop:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
; RV64-NEXT: addi s0, a1, 32
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT: vle32.v v8, (s0)
; RV64-NEXT: vadd.vi v8, v8, 1
; RV64-NEXT: li a1, 57
; RV64-NEXT: vse32.v v8, (s0)
; RV64-NEXT: beq a0, a1, .LBB1_2
; RV64-NEXT: .LBB1_1: # %do_call
; RV64-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-NEXT: call foo
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT: vle32.v v8, (s0)
; RV64-NEXT: vadd.vi v8, v8, 1
; RV64-NEXT: vse32.v v8, (s0)
; RV64-NEXT: bnez a0, .LBB1_1
; RV64-NEXT: .LBB1_2: # %exit
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore ra
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%gep = getelementptr i8, ptr %p, i32 32
%v1 = load <4 x i32>, ptr %gep
%v2 = add <4 x i32> %v1, splat (i32 1)
store <4 x i32> %v2, ptr %gep
%cmp0 = icmp eq i32 %a, 57
br i1 %cmp0, label %exit, label %do_call
do_call:
%b = call i32 @foo()
%v3 = load <4 x i32>, ptr %gep
%v4 = add <4 x i32> %v3, splat (i32 1)
store <4 x i32> %v4, ptr %gep

%cmp1 = icmp eq i32 %b, 0
br i1 %cmp1, label %exit, label %do_call
exit:
ret void
}
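
; For reference, one possible shrink-wrapped shape for the loop case above
; (hypothetical sketch only, RV64 flavor, CFI directives omitted; labels and
; the a2/a3 scratch registers are illustrative, and this is not current or
; asserted output): the %bb.0 -> %exit edge stays frame-free, while the
; prologue, the copy into s0, and the epilogue wrap only the loop.
;
; vecaddr_loop:
;         addi    a2, a1, 32
;         vsetivli zero, 4, e32, m1, ta, ma
;         vle32.v v8, (a2)
;         vadd.vi v8, v8, 1
;         li      a3, 57
;         vse32.v v8, (a2)
;         beq     a0, a3, .LBB1_3
; # %bb.1: loop preheader - frame setup only when the loop will run
;         addi    sp, sp, -16
;         sd      ra, 8(sp)
;         sd      s0, 0(sp)
;         mv      s0, a2                  # the CSR is still worth using inside the loop
; .LBB1_2: # %do_call
;         call    foo
;         vsetivli zero, 4, e32, m1, ta, ma
;         vle32.v v8, (s0)
;         vadd.vi v8, v8, 1
;         vse32.v v8, (s0)
;         bnez    a0, .LBB1_2
; # %bb.3: loop exit - epilogue before rejoining the frame-free path
;         ld      ra, 8(sp)
;         ld      s0, 0(sp)
;         addi    sp, sp, 16
; .LBB1_3: # %exit
;         ret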

declare zeroext i32 @foo()
