Skip to content

[AArch64] Override isLSRCostLess, take number of instructions into account #84189

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ static cl::opt<unsigned> InlineCallPenaltyChangeSM(
static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
cl::init(true), cl::Hidden);

// Enables the AArch64-specific comparison in isLSRCostLess(); when disabled,
// LSR falls back to the target-independent cost heuristic.
static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                      cl::init(true), cl::Hidden);

namespace {
class TailFoldingOption {
// These bitfields will only ever be set to something non-zero in operator=,
Expand Down Expand Up @@ -4216,3 +4219,19 @@ bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
return true;
return BaseT::shouldTreatInstructionLikeSelect(I);
}

/// Compare two LSR solution costs, returning true when \p C1 is cheaper.
///
/// The AArch64 tuning differs from the generic heuristic in two ways: the
/// instruction count participates in the comparison (though not as the very
/// first criterion, as some targets rank it), and base additions are weighted
/// earlier. Controlled by -enable-aarch64-lsr-cost-opt.
/// TODO: Maybe a more nuanced tradeoff between instruction count and number
/// of registers? To be investigated at a later date.
bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                   const TargetTransformInfo::LSRCost &C2) {
  // Fall back to the target-independent comparison when the option is off.
  if (!EnableLSRCostOpt)
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);

  // Lexicographic comparison over the cost components, most significant
  // first. std::tie builds a tuple of references, so no copies are made.
  auto Key = [](const TargetTransformInfo::LSRCost &C) {
    return std::tie(C.NumRegs, C.Insns, C.NumBaseAdds, C.AddRecCost,
                    C.NumIVMuls, C.ScaleCost, C.ImmCost, C.SetupCost);
  };
  return Key(C1) < Key(C2);
}
3 changes: 3 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
}

std::optional<unsigned> getMinPageSize() const { return 4096; }

  /// AArch64 override of the LSR cost comparison: unlike the generic
  /// heuristic, it factors the instruction count into the lexicographic
  /// comparison and changes the priority of base additions.
  bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                     const TargetTransformInfo::LSRCost &C2);
};

} // end namespace llvm
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ entry:

for.body:
; CHECK: for.body
; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}]
; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}]
; CHECK: add x[[REG:[0-9]+]],
; CHECK: x[[REG]], #1, lsl #12
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
Original file line number Diff line number Diff line change
Expand Up @@ -176,13 +176,13 @@ exit:
; CHECK: ********** MI Scheduling **********
; CHECK: LDURDi_LDRDui:%bb.1 vector_body
;
; CHECK: Cluster ld/st SU(2) - SU(6)
; CHECK: Cluster ld/st SU(3) - SU(7)
; CHECK: Cluster ld/st SU(0) - SU(4)
; CHECK: Cluster ld/st SU(1) - SU(5)
;
; CHECK: SU(2): %{{[0-9]+}}:fpr64 = LDURDi
; CHECK: SU(3): %{{[0-9]+}}:fpr64 = LDURDi
; CHECK: SU(6): %{{[0-9]+}}:fpr64 = LDRDui
; CHECK: SU(7): %{{[0-9]+}}:fpr64 = LDRDui
; CHECK: SU(0): %{{[0-9]+}}:fpr64 = LDURDi
; CHECK: SU(1): %{{[0-9]+}}:fpr64 = LDURDi
; CHECK: SU(4): %{{[0-9]+}}:fpr64 = LDRDui
; CHECK: SU(5): %{{[0-9]+}}:fpr64 = LDRDui
;
define void @LDURDi_LDRDui(ptr nocapture readonly %arg) {
entry:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,36 +15,34 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_v2f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z1.d, #0 // =0x0
; CHECK-NEXT: mov w9, #100 // =0x64
; CHECK-NEXT: cntd x10
; CHECK-NEXT: whilelo p1.d, xzr, x9
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: rdvl x11, #2
; CHECK-NEXT: mov w8, #100 // =0x64
; CHECK-NEXT: cntd x9
; CHECK-NEXT: whilelo p1.d, xzr, x8
; CHECK-NEXT: rdvl x10, #2
; CHECK-NEXT: mov x11, x9
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x12, x10
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
; CHECK-NEXT: add x13, x0, x8
; CHECK-NEXT: add x14, x1, x8
; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: mov z7.d, z0.d
; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl]
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl]
; CHECK-NEXT: add x8, x8, x11
; CHECK-NEXT: ld1d { z3.d }, p1/z, [x13]
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x14]
; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
; CHECK-NEXT: add x1, x1, x10
; CHECK-NEXT: add x0, x0, x10
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT: mov z0.d, p2/m, z7.d
; CHECK-NEXT: mov z1.d, p1/m, z6.d
; CHECK-NEXT: whilelo p1.d, x12, x9
; CHECK-NEXT: add x12, x12, x10
; CHECK-NEXT: whilelo p1.d, x11, x8
; CHECK-NEXT: add x11, x11, x9
; CHECK-NEXT: b.mi .LBB0_1
; CHECK-NEXT: // %bb.2: // %exit.block
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
Expand Down Expand Up @@ -114,39 +112,37 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr %
; CHECK-LABEL: complex_mul_predicated_v2f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z1.d, #0 // =0x0
; CHECK-NEXT: cntd x10
; CHECK-NEXT: mov w12, #100 // =0x64
; CHECK-NEXT: neg x11, x10
; CHECK-NEXT: cntd x9
; CHECK-NEXT: mov w11, #100 // =0x64
; CHECK-NEXT: neg x10, x9
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: mov x9, xzr
; CHECK-NEXT: and x11, x11, x12
; CHECK-NEXT: rdvl x12, #2
; CHECK-NEXT: and x10, x10, x11
; CHECK-NEXT: rdvl x11, #2
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1w { z2.d }, p0/z, [x2, x9, lsl #2]
; CHECK-NEXT: add x13, x0, x8
; CHECK-NEXT: add x14, x1, x8
; CHECK-NEXT: ld1w { z2.d }, p0/z, [x2, x8, lsl #2]
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: mov z7.d, z0.d
; CHECK-NEXT: add x9, x9, x10
; CHECK-NEXT: add x8, x8, x12
; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0
; CHECK-NEXT: cmp x11, x9
; CHECK-NEXT: zip2 p1.d, p2.d, p2.d
; CHECK-NEXT: zip1 p2.d, p2.d, p2.d
; CHECK-NEXT: ld1d { z2.d }, p1/z, [x13, #1, mul vl]
; CHECK-NEXT: ld1d { z4.d }, p1/z, [x14, #1, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13]
; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14]
; CHECK-NEXT: add x8, x8, x9
; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, #0
; CHECK-NEXT: cmp x10, x8
; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
; CHECK-NEXT: add x1, x1, x11
; CHECK-NEXT: add x0, x0, x11
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT: mov z0.d, p1/m, z7.d
; CHECK-NEXT: mov z1.d, p2/m, z6.d
; CHECK-NEXT: mov z0.d, p2/m, z7.d
; CHECK-NEXT: mov z1.d, p1/m, z6.d
; CHECK-NEXT: b.ne .LBB1_1
; CHECK-NEXT: // %bb.2: // %exit.block
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
Expand Down Expand Up @@ -218,38 +214,38 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
; CHECK-LABEL: complex_mul_predicated_x2_v2f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z1.d, #0 // =0x0
; CHECK-NEXT: mov w10, #100 // =0x64
; CHECK-NEXT: mov w8, #100 // =0x64
; CHECK-NEXT: cntd x9
; CHECK-NEXT: whilelo p1.d, xzr, x8
; CHECK-NEXT: rdvl x10, #2
; CHECK-NEXT: cnth x11
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: whilelo p1.d, xzr, x10
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: mov x9, xzr
; CHECK-NEXT: cntd x11
; CHECK-NEXT: rdvl x12, #2
; CHECK-NEXT: mov x12, x9
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2, x9, lsl #2]
; CHECK-NEXT: add x13, x0, x8
; CHECK-NEXT: add x14, x1, x8
; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2]
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: mov z7.d, z0.d
; CHECK-NEXT: add x9, x9, x11
; CHECK-NEXT: add x8, x8, x12
; CHECK-NEXT: cmpne p2.d, p1/z, z2.d, #0
; CHECK-NEXT: zip2 p1.d, p2.d, p2.d
; CHECK-NEXT: zip1 p2.d, p2.d, p2.d
; CHECK-NEXT: ld1d { z2.d }, p1/z, [x13, #1, mul vl]
; CHECK-NEXT: ld1d { z4.d }, p1/z, [x14, #1, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13]
; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14]
; CHECK-NEXT: add x2, x2, x11
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
; CHECK-NEXT: add x1, x1, x10
; CHECK-NEXT: add x0, x0, x10
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT: mov z0.d, p1/m, z7.d
; CHECK-NEXT: whilelo p1.d, x9, x10
; CHECK-NEXT: mov z1.d, p2/m, z6.d
; CHECK-NEXT: mov z0.d, p2/m, z7.d
; CHECK-NEXT: mov z1.d, p1/m, z6.d
; CHECK-NEXT: whilelo p1.d, x12, x8
; CHECK-NEXT: add x12, x12, x9
; CHECK-NEXT: b.mi .LBB2_1
; CHECK-NEXT: // %bb.2: // %exit.block
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
Expand Down
Loading
Loading