-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AArch64][MachineCombiner] Reassociate long chains of accumulation instructions into a tree to increase ILP #126060
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64][MachineCombiner] Reassociate long chains of accumulation instructions into a tree to increase ILP #126060
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: Jonathan Cohen (jcohen-apple) ChangesThis pattern shows up often in media codes. The optimization should only kick in for O3. Currently only supports a single accumulation instruction, but can easily be expanded to support additional instructions in the future. Full diff: https://github.com/llvm/llvm-project/pull/126060.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 0f2b969fba35c7..af8d6a8031c33d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,6 +20,7 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -78,6 +79,18 @@ static cl::opt<unsigned>
BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
cl::desc("Restrict range of B instructions (DEBUG)"));
+static cl::opt<bool>
+ EnableAccReassociation("aarch64-acc-reassoc", cl::Hidden, cl::init(true),
+ cl::desc("Enable reassociation of accumulation chains"));
+
+static cl::opt<unsigned int>
+ MinAccumulatorDepth("aarch64-acc-min-depth", cl::Hidden, cl::init(8),
+ cl::desc("Minimum length of accumulator chains required for the optimization to kick in"));
+
+static cl::opt<unsigned int>
+ MaxAccumulatorWidth("aarch64-acc-max-width", cl::Hidden, cl::init(3), cl::desc("Maximum number of branches in the accumulator tree"));
+
+
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
: AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
AArch64::CATCHRET),
@@ -6674,6 +6687,118 @@ static bool getMaddPatterns(MachineInstr &Root,
}
return Found;
}
+
+static bool isAccumulationOpcode(unsigned Opcode) {
+ switch(Opcode) {
+ default:
+ break;
+ case AArch64::UABALB_ZZZ_D:
+ case AArch64::UABALB_ZZZ_H:
+ case AArch64::UABALB_ZZZ_S:
+ case AArch64::UABALT_ZZZ_D:
+ case AArch64::UABALT_ZZZ_H:
+ case AArch64::UABALT_ZZZ_S:
+ case AArch64::UABALv16i8_v8i16:
+ case AArch64::UABALv2i32_v2i64:
+ case AArch64::UABALv4i16_v4i32:
+ case AArch64::UABALv4i32_v2i64:
+ case AArch64::UABALv8i16_v4i32:
+ case AArch64::UABALv8i8_v8i16:
+ return true;
+ }
+
+ return false;
+}
+
+static unsigned getAccumulationStartOpCode(unsigned AccumulationOpcode) {
+ switch(AccumulationOpcode) {
+ default:
+ llvm_unreachable("Unknown accumulator opcode");
+ case AArch64::UABALB_ZZZ_D:
+ return AArch64::UABDLB_ZZZ_D;
+ case AArch64::UABALB_ZZZ_H:
+ return AArch64::UABDLB_ZZZ_H;
+ case AArch64::UABALB_ZZZ_S:
+ return AArch64::UABDLB_ZZZ_S;
+ case AArch64::UABALT_ZZZ_D:
+ return AArch64::UABDLT_ZZZ_D;
+ case AArch64::UABALT_ZZZ_H:
+ return AArch64::UABDLT_ZZZ_H;
+ case AArch64::UABALT_ZZZ_S:
+ return AArch64::UABDLT_ZZZ_S;
+ case AArch64::UABALv16i8_v8i16:
+ return AArch64::UABDLv16i8_v8i16;
+ case AArch64::UABALv2i32_v2i64:
+ return AArch64::UABDLv2i32_v2i64;
+ case AArch64::UABALv4i16_v4i32:
+ return AArch64::UABDLv4i16_v4i32;
+ case AArch64::UABALv4i32_v2i64:
+ return AArch64::UABDLv4i32_v2i64;
+ case AArch64::UABALv8i16_v4i32:
+ return AArch64::UABDLv8i16_v4i32;
+ case AArch64::UABALv8i8_v8i16:
+ return AArch64::UABDLv8i8_v8i16;
+ }
+}
+
+static void getAccumulatorChain(MachineInstr *CurrentInstr, MachineBasicBlock &MBB, MachineRegisterInfo &MRI, SmallVectorImpl<Register> &Chain) {
+ // Walk up the chain of accumulation instructions and collect them in the vector.
+ unsigned AccumulatorOpcode = CurrentInstr->getOpcode();
+ unsigned ChainStartOpCode = getAccumulationStartOpCode(AccumulatorOpcode);
+ while(CurrentInstr && (canCombine(MBB, CurrentInstr->getOperand(1), AccumulatorOpcode) || canCombine(MBB, CurrentInstr->getOperand(1), ChainStartOpCode))) {
+ Chain.push_back(CurrentInstr->getOperand(0).getReg());
+ CurrentInstr = MRI.getUniqueVRegDef(CurrentInstr->getOperand(1).getReg());
+ }
+
+ // Add the instruction at the top of the chain.
+ if (CurrentInstr->getOpcode() == ChainStartOpCode)
+ Chain.push_back(CurrentInstr->getOperand(0).getReg());
+}
+
+/// Find chains of accumulations, likely linearized by reassocation pass,
+/// that can be rewritten as a tree for increased ILP.
+static bool getAccumulatorReassociationPatterns(MachineInstr &Root,
+ SmallVectorImpl<unsigned> &Patterns) {
+ // find a chain of depth 4, which would make it profitable to rewrite
+ // as a tree. This pattern should be applied recursively in case we
+ // have a longer chain.
+ if (!EnableAccReassociation)
+ return false;
+
+ unsigned Opc = Root.getOpcode();
+ if (!isAccumulationOpcode(Opc))
+ return false;
+
+ // Verify that this is the end of the chain.
+ MachineBasicBlock &MBB = *Root.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ if (!MRI.hasOneNonDBGUser(Root.getOperand(0).getReg()))
+ return false;
+
+ auto User = MRI.use_instr_begin(Root.getOperand(0).getReg());
+ if (User->getOpcode() == Opc)
+ return false;
+
+ // Walk up the use chain and collect the reduction chain.
+ SmallVector<Register, 32> Chain;
+ getAccumulatorChain(&Root, MBB, MRI, Chain);
+
+ // Reject chains which are too short to be worth modifying.
+ if (Chain.size() < MinAccumulatorDepth)
+ return false;
+
+ // Check if the MBB this instruction is a part of contains any other chains. If so, don't apply it.
+ SmallSet<Register, 32> ReductionChain(Chain.begin(), Chain.end());
+ for (const auto &I : MBB) {
+ if (I.getOpcode() == Opc && !ReductionChain.contains(I.getOperand(0).getReg()))
+ return false;
+ }
+
+ typedef AArch64MachineCombinerPattern MCP;
+ Patterns.push_back(MCP::ACC_CHAIN);
+ return true;
+}
+
/// Floating-Point Support
/// Find instructions that can be turned into madd.
@@ -7061,6 +7186,7 @@ AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
switch (Pattern) {
case AArch64MachineCombinerPattern::SUBADD_OP1:
case AArch64MachineCombinerPattern::SUBADD_OP2:
+ case AArch64MachineCombinerPattern::ACC_CHAIN:
return CombinerObjective::MustReduceDepth;
default:
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7078,6 +7204,8 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
// Integer patterns
if (getMaddPatterns(Root, Patterns))
return true;
+ if (getAccumulatorReassociationPatterns(Root, Patterns))
+ return true;
// Floating point patterns
if (getFMULPatterns(Root, Patterns))
return true;
@@ -7436,6 +7564,72 @@ genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
DelInstrs.push_back(&Root);
}
+static unsigned int getReduceOpCodeForAccumulator(unsigned int AccumulatorOpCode) {
+ switch (AccumulatorOpCode) {
+ case AArch64::UABALB_ZZZ_D:
+ return AArch64::ADD_ZZZ_D;
+ case AArch64::UABALB_ZZZ_H:
+ return AArch64::ADD_ZZZ_H;
+ case AArch64::UABALB_ZZZ_S:
+ return AArch64::ADD_ZZZ_S;
+ case AArch64::UABALT_ZZZ_D:
+ return AArch64::ADD_ZZZ_D;
+ case AArch64::UABALT_ZZZ_H:
+ return AArch64::ADD_ZZZ_H;
+ case AArch64::UABALT_ZZZ_S:
+ return AArch64::ADD_ZZZ_S;
+ case AArch64::UABALv16i8_v8i16:
+ return AArch64::ADDv8i16;
+ case AArch64::UABALv2i32_v2i64:
+ return AArch64::ADDv2i64;
+ case AArch64::UABALv4i16_v4i32:
+ return AArch64::ADDv4i32;
+ case AArch64::UABALv4i32_v2i64:
+ return AArch64::ADDv2i64;
+ case AArch64::UABALv8i16_v4i32:
+ return AArch64::ADDv4i32;
+ case AArch64::UABALv8i8_v8i16:
+ return AArch64::ADDv8i16;
+ default:
+ llvm_unreachable("Unknown accumulator opcode");
+ }
+}
+
+// Reduce branches of the accumulator tree by adding them together.
+static void reduceAccumulatorTree(SmallVectorImpl<Register> &RegistersToReduce, SmallVectorImpl<MachineInstr *> &InsInstrs,
+ MachineFunction &MF, MachineInstr &Root, MachineRegisterInfo &MRI,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, Register ResultReg) {
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ SmallVector<Register, 8> NewRegs;
+ for (unsigned int i = 1; i <= (RegistersToReduce.size() / 2); i+=2) {
+ auto RHS = RegistersToReduce[i - 1];
+ auto LHS = RegistersToReduce[i];
+ Register Dest;
+ // If we are reducing 2 registers, reuse the original result register.
+ if (RegistersToReduce.size() == 2)
+ Dest = ResultReg;
+ // Otherwise, create a new virtual register to hold the partial sum.
+ else {
+ auto NewVR = MRI.createVirtualRegister(MRI.getRegClass(Root.getOperand(0).getReg()));
+ Dest = NewVR;
+ NewRegs.push_back(Dest);
+ InstrIdxForVirtReg.insert(std::make_pair(Dest, InsInstrs.size()));
+ }
+
+ // Create the new add instruction.
+ MachineInstrBuilder MIB = BuildMI(MF, MIMetadata(Root), TII->get(getReduceOpCodeForAccumulator(Root.getOpcode())), Dest).addReg(RHS, getKillRegState(true)).addReg(LHS, getKillRegState(true));
+ // Copy any flags needed from the original instruction.
+ MIB->setFlags(Root.getFlags());
+ InsInstrs.push_back(MIB);
+ }
+
+ // If the number of registers to reduce is odd, add the reminaing register to the vector of registers to reduce.
+ if (RegistersToReduce.size() % 2 != 0)
+ NewRegs.push_back(RegistersToReduce[RegistersToReduce.size() - 1]);
+
+ RegistersToReduce = NewRegs;
+}
+
/// When getMachineCombinerPatterns() finds potential patterns,
/// this function generates the instructions that could replace the
/// original code sequence
@@ -7671,7 +7865,53 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
break;
}
+ case AArch64MachineCombinerPattern::ACC_CHAIN: {
+ SmallVector<Register, 32> ChainRegs;
+ getAccumulatorChain(&Root, MBB, MRI, ChainRegs);
+
+ unsigned int Depth = ChainRegs.size();
+ assert(MaxAccumulatorWidth > 1 && "Max accumulator width set to illegal value");
+ unsigned int MaxWidth = Log2_32(Depth) < MaxAccumulatorWidth ? Log2_32(Depth) : MaxAccumulatorWidth;
+
+ // Walk down the chain and rewrite it as a tree.
+ for (auto IndexedReg : llvm::enumerate(llvm::reverse(ChainRegs))) {
+ // No need to rewrite the first node, it is already perfect as it is.
+ if (IndexedReg.index() == 0)
+ continue;
+
+ MachineInstr *Instr = MRI.getUniqueVRegDef(IndexedReg.value());
+ MachineInstrBuilder MIB;
+ Register AccReg;
+ if (IndexedReg.index() < MaxWidth) {
+ // Now we need to create new instructions for the first row.
+ AccReg = Instr->getOperand(0).getReg();
+ MIB = BuildMI(MF, MIMetadata(*Instr), TII->get(MRI.getUniqueVRegDef(ChainRegs.back())->getOpcode()), AccReg).addReg(Instr->getOperand(2).getReg(), getKillRegState(Instr->getOperand(2).isKill())).addReg(Instr->getOperand(3).getReg(), getKillRegState(Instr->getOperand(3).isKill()));
+ } else {
+ // For the remaining cases, we need ot use an output register of one of the newly inserted instuctions as operand 1
+ AccReg = Instr->getOperand(0).getReg() == Root.getOperand(0).getReg() ? MRI.createVirtualRegister(MRI.getRegClass(Root.getOperand(0).getReg())) : Instr->getOperand(0).getReg();
+ assert(IndexedReg.index() - MaxWidth >= 0);
+ auto AccumulatorInput = ChainRegs[Depth - (IndexedReg.index() - MaxWidth) - 1];
+ MIB = BuildMI(MF, MIMetadata(*Instr), TII->get(Instr->getOpcode()), AccReg).addReg(AccumulatorInput, getKillRegState(true)).addReg(Instr->getOperand(2).getReg(), getKillRegState(Instr->getOperand(2).isKill())).addReg(Instr->getOperand(3).getReg(), getKillRegState(Instr->getOperand(3).isKill()));
+ }
+
+ MIB->setFlags(Instr->getFlags());
+ InstrIdxForVirtReg.insert(std::make_pair(AccReg, InsInstrs.size()));
+ InsInstrs.push_back(MIB);
+ DelInstrs.push_back(Instr);
+ }
+
+ SmallVector<Register, 8> RegistersToReduce;
+ for (int i = (InsInstrs.size() - MaxWidth); i < InsInstrs.size(); ++i) {
+ auto Reg = InsInstrs[i]->getOperand(0).getReg();
+ RegistersToReduce.push_back(Reg);
+ }
+ while (RegistersToReduce.size() > 1)
+ reduceAccumulatorTree(RegistersToReduce, InsInstrs, MF, Root, MRI, InstrIdxForVirtReg, Root.getOperand(0).getReg());
+
+ // We don't want to break, we handle setting flags and adding Root to DelInstrs from here.
+ return;
+ }
case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
Opc = AArch64::MLAv8i8;
RC = &AArch64::FPR64RegClass;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 9a0034223ab9ba..76acdc22d1c589 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,6 +172,8 @@ enum AArch64MachineCombinerPattern : unsigned {
FMULv8i16_indexed_OP2,
FNMADD,
+
+ ACC_CHAIN
};
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir b/llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir
new file mode 100644
index 00000000000000..a5cc1c5af13eb2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir
@@ -0,0 +1,105 @@
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown %s -o - | FileCheck %s
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown -aarch64-acc-max-width=2 %s -o - | FileCheck %s --check-prefix=NARROW-TREE
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown -aarch64-acc-min-depth=100 %s -o - | FileCheck %s --check-prefix=NO-TREE
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown -aarch64-acc-reassoc=false %s -o - | FileCheck %s --check-prefix=NO-TREE
+
+# A chain of UABAL instructions that can be reassociated for better ILP.
+# Before the optimization, we accumulate in a single long chain.
+# CHECK-LABEL: uabal_accumulation
+# CHECK: [[START1:%.*]]:fpr128 = UABDLv4i16_v4i32
+# CHECK: [[START2:%.*]]:fpr128 = UABDLv4i16_v4i32
+# CHECK: [[START3:%.*]]:fpr128 = UABDLv4i16_v4i32
+# CHECK: [[A1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START1]]
+# CHECK: [[B1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START2]]
+# CHECK: [[C1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START3]]
+# CHECK: [[A2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A1]]
+# CHECK: [[B2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[B1]]
+# CHECK: [[C2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[C1]]
+# CHECK: [[PARTIAL_SUM:%.*]]:fpr128 = ADDv4i32 killed [[A2]], killed [[B2]]
+# CHECK: [[TOTAL_SUM:%.*]]:fpr128 = ADDv4i32 killed [[PARTIAL_SUM]], killed [[C2]]
+# CHECK: [[END:%.*]]:fpr32 = ADDVv4i32v killed [[TOTAL_SUM]]
+
+# NARROW-TREE: [[START1:%.*]]:fpr128 = UABDLv4i16_v4i32
+# NARROW-TREE: [[START2:%.*]]:fpr128 = UABDLv4i16_v4i32
+# NARROW-TREE: [[A1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START1]]
+# NARROW-TREE: [[B1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START2]]
+# NARROW-TREE: [[A2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A1]]
+# NARROW-TREE: [[B2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[B1]]
+# NARROW-TREE: [[A3:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A2]]
+# NARROW-TREE: [[B3:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[B2]]
+# NARROW-TREE: [[A4:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A3]]
+# NARROW-TREE: [[PARTIAL_SUM:%.*]]:fpr128 = ADDv4i32 killed [[B3]], killed [[A4]]
+# NARROW-TREE: [[END:%.*]]:fpr32 = ADDVv4i32v killed [[PARTIAL_SUM]]
+
+# NO-TREE: [[START1:%.*]]:fpr128 = UABDLv4i16_v4i32
+# NO-TREE: [[A1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START1]]
+# NO-TREE: [[A2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A1]]
+# NO-TREE: [[A3:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A2]]
+# NO-TREE: [[A4:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A3]]
+# NO-TREE: [[A5:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A4]]
+# NO-TREE: [[A6:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A5]]
+# NO-TREE: [[A7:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A6]]
+# NO-TREE: [[A8:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A7]]
+# NO-TREE: [[END:%.*]]:fpr32 = ADDVv4i32v killed [[A8]]
+
+---
+name: uabal_accumulation
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2, $x3
+
+ %3:gpr64 = COPY $x3
+ %2:gpr64common = COPY $x2
+ %1:gpr64 = COPY $x1
+ %0:gpr64common = COPY $x0
+ %4:fpr64 = LDRDui %0, 0 :: (load (s64))
+ %5:fpr64 = LDRDui %2, 0 :: (load (s64))
+ %6:gpr64common = ADDXrr %0, %1
+ %7:gpr64common = ADDXrr %2, %3
+ %8:fpr64 = LDRDui %6, 0 :: (load (s64))
+ %9:fpr64 = LDRDui %7, 0 :: (load (s64))
+ %10:fpr128 = UABDLv4i16_v4i32 killed %8, killed %9
+ %11:fpr128 = UABALv4i16_v4i32 killed %10, killed %4, killed %5
+ %12:gpr64common = ADDXrr %6, %1
+ %13:gpr64common = ADDXrr %7, %3
+ %14:fpr64 = LDRDui %12, 0 :: (load (s64))
+ %15:fpr64 = LDRDui %13, 0 :: (load (s64))
+ %16:fpr128 = UABALv4i16_v4i32 killed %11, killed %14, killed %15
+ %17:gpr64common = ADDXrr %12, %1
+ %18:gpr64common = ADDXrr %13, %3
+ %19:fpr64 = LDRDui %17, 0 :: (load (s64))
+ %20:fpr64 = LDRDui %18, 0 :: (load (s64))
+ %21:fpr128 = UABALv4i16_v4i32 killed %16, killed %19, killed %20
+ %22:gpr64common = ADDXrr %17, %1
+ %23:gpr64common = ADDXrr %18, %3
+ %24:fpr64 = LDRDui %22, 0 :: (load (s64))
+ %25:fpr64 = LDRDui %23, 0 :: (load (s64))
+ %26:fpr128 = UABALv4i16_v4i32 killed %21, killed %24, killed %25
+ %27:gpr64common = ADDXrr %22, %1
+ %28:gpr64common = ADDXrr %23, %3
+ %29:fpr64 = LDRDui %27, 0 :: (load (s64))
+ %30:fpr64 = LDRDui %28, 0 :: (load (s64))
+ %31:fpr128 = UABALv4i16_v4i32 killed %26, killed %29, killed %30
+ %32:gpr64common = ADDXrr %27, %1
+ %33:gpr64common = ADDXrr %28, %3
+ %34:fpr64 = LDRDui %32, 0 :: (load (s64))
+ %35:fpr64 = LDRDui %33, 0 :: (load (s64))
+ %36:fpr128 = UABALv4i16_v4i32 killed %31, killed %34, killed %35
+ %37:gpr64common = ADDXrr %32, %1
+ %38:gpr64common = ADDXrr %33, %3
+ %39:fpr64 = LDRDui %37, 0 :: (load (s64))
+ %40:fpr64 = LDRDui %38, 0 :: (load (s64))
+ %41:fpr128 = UABALv4i16_v4i32 killed %36, killed %39, killed %40
+ %42:gpr64common = ADDXrr %37, %1
+ %43:gpr64common = ADDXrr %38, %3
+ %44:fpr64 = LDRDui %42, 0 :: (load (s64))
+ %45:fpr64 = LDRDui %43, 0 :: (load (s64))
+ %46:fpr128 = UABALv4i16_v4i32 killed %41, killed %44, killed %45
+ %47:fpr32 = ADDVv4i32v killed %46
+ %48:fpr128 = IMPLICIT_DEF
+ %49:fpr128 = INSERT_SUBREG %48, killed %47, %subreg.ssub
+ %50:gpr32all = COPY %49.ssub
+ $w0 = COPY %50
+ RET_ReallyLR implicit $w0
+
+...
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
830c373
to
23eb60b
Compare
I'm not familiar with AArch64 target so I added some reviewers and changed the title to increase visibility. |
Thanks, github suggested you and I wasn't sure who to add. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
UABAL is a more unusual choice to pick as the first instruction. Did you try other operations like MLA, that are likely more common and I believe still perform the same accumulation? The idea sounds like a nice one, but adding extra instructions does sometimes make transforms like this less reliable than if we were purely reassociating.
Could the main part of the logic be added to base MachineCombiner, like the existing code that uses isAssociativeAndCommutative?
It is probably worth making sure there are tests for all the types, preferably with tests at the .ll level so we can see the whole output. Things like different chain lengths too, maybe, and signed/unsigned, loops and the "2" variants of the instructions.
MLA is indeed more common, but less likely to get linearized by the compiler. See this godbolt example where the compiler linearizes multiple accumulators due to reassociate pass running after inlining a function - https://godbolt.org/z/1aPbb3YY1. This same bug should appear for other intrinsics whose ISEL pattern also contains |
I was mostly worried about UABAL not being generated very often, and this turning out to not be profitable because we hadn't tested it on a larger number of cases. With how many ops are needed to make it profitable (8?) it seems like it should be OK. It is probably worth adding SABAL too though, to keep them symmetric. I wasn't sure how the end of the chain was calculated, I thought it might be worth looking at a looping chain too as in https://godbolt.org/z/8conT4xaa. It might need a longer chain to be profitable though, longer than might be realistic if it loses the ability to forward the uabal (if that is how it works https://godbolt.org/z/YWhY98dxj). Perhaps it needs a higher-level transform that could accumulate multiple chains after the loop. And it might be more awkward, but is it worth trying to support uabal and uabal2 operations in the same chain? I imagine the interleaving of the high instructions to be quite efficient in well written vector code that can use them. |
23eb60b
to
db7eb68
Compare
✅ With the latest revision this PR passed the undef deprecator. |
Updated the PR with the following changes:
I still need to:
I don't think this case should kick in for loops that are not fully unrolled. If I'm not mistaken, when LLVM unrolls the loops it will create multiple accumulators, but I need to check that. |
db7eb68
to
0214b16
Compare
ab1c4a2
to
db9f112
Compare
@davemgreen Made most of the changes you suggested. For this pattern to apply on MLA instructions I had to modify machine-combiner to run recursively while there are still changes to apply to the basic block. Let me know what you think of this approach before I expand the pattern to additional data types. |
7944de0
to
9644e2a
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For this pattern to apply on MLA instructions I had to modify machine-combiner to run recursively while there are still changes to apply to the basic block. Let me know what you think of this approach before I expand the pattern to additional data types.
I gave it a test (with udot too) and I think it looked OK. Thanks for the updates.
The recursion is because there are multiple MLA patterns? Often routines like instcombine have a worklist that they can re-add to if needed. Perhaps so long as we know this it's causing problems lets get the UABA and SABA in first then, and we can add MLA with a later patch if needed where we can sort out re-visiting nodes.
b17a7e8
to
dbb0eb6
Compare
llvm/lib/CodeGen/TargetInstrInfo.cpp
Outdated
std::optional<unsigned> ReduceOpCode = | ||
getReduceOpcodeForAccumulator(Root.getOpcode()); | ||
|
||
if (!ReduceOpCode.value()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For example, I believe if we get here we have already made alterations to the code, so it might be bad if we decide to bail out.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I assumed that the code alteration will only be applied if the code pattern is applied successfully, but I agree it makes more sense to bail if this is used incorrectly. I changed them back to use only llvm_unreachable
700eec4
to
1388d51
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks - LGTM
…nstructions into a tree This pass is designed to increase ILP by performing accumulation into multiple registers. It currently supports only the S/UABAL accumulation instruction, but can be extended to support additional instructions.
1388d51
to
7b8c1b4
Compare
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/157/builds/23310 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/28/builds/7756 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/153/builds/26650 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/108/builds/10765 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/7/builds/12581 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/54/builds/7595 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/195/builds/6597 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/97/builds/5598 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/77/builds/10402 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/75/builds/5704 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/118/builds/5351 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/197/builds/3060 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/95/builds/11120 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/13/builds/6123 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/168/builds/10064 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/45/builds/10412 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/81/builds/5555 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/107/builds/9189 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/90/builds/5110 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/23/builds/8723 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/186/builds/7576 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/72/builds/9471 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/141/builds/7268 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/13630 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/92/builds/15926 Here is the relevant piece of the build log for the reference
|
…nstructions into a tree (#132728) This pass is designed to increase ILP by performing accumulation into multiple registers. It currently supports only the S/UABAL accumulation instruction, but can be extended to support additional instructions. Reland of #126060 which was reverted due to a conflict with #131272.
…nstructions into a tree (llvm#132728) This pass is designed to increase ILP by performing accumulation into multiple registers. It currently supports only the S/UABAL accumulation instruction, but can be extended to support additional instructions. Reland of llvm#126060 which was reverted due to a conflict with llvm#131272.
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/2/builds/20004 Here is the relevant piece of the build log for the reference
|
This pattern shows up often in media codes. The optimization should only kick in for O3. Currently only supports a single accumulation instruction, but can easily be expanded to support additional instructions in the future.