@@ -4880,6 +4880,55 @@ static bool getFMAPatterns(MachineInstr &Root,
4880
4880
return Found;
4881
4881
}
4882
4882
4883
+ static bool getFMULPatterns (MachineInstr &Root,
4884
+ SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4885
+ MachineBasicBlock &MBB = *Root.getParent ();
4886
+ bool Found = false ;
4887
+
4888
+ auto Match = [&](unsigned Opcode, int Operand,
4889
+ MachineCombinerPattern Pattern) -> bool {
4890
+ MachineRegisterInfo &MRI = MBB.getParent ()->getRegInfo ();
4891
+ MachineOperand &MO = Root.getOperand (Operand);
4892
+ MachineInstr *MI = nullptr ;
4893
+ if (MO.isReg () && Register::isVirtualRegister (MO.getReg ()))
4894
+ MI = MRI.getUniqueVRegDef (MO.getReg ());
4895
+ if (MI && MI->getOpcode () == Opcode) {
4896
+ Patterns.push_back (Pattern);
4897
+ return true ;
4898
+ }
4899
+ return false ;
4900
+ };
4901
+
4902
+ typedef MachineCombinerPattern MCP;
4903
+
4904
+ switch (Root.getOpcode ()) {
4905
+ default :
4906
+ return false ;
4907
+ case AArch64::FMULv2f32:
4908
+ Found = Match (AArch64::DUPv2i32lane, 1 , MCP::FMULv2i32_indexed_OP1);
4909
+ Found |= Match (AArch64::DUPv2i32lane, 2 , MCP::FMULv2i32_indexed_OP2);
4910
+ break ;
4911
+ case AArch64::FMULv2f64:
4912
+ Found = Match (AArch64::DUPv2i64lane, 1 , MCP::FMULv2i64_indexed_OP1);
4913
+ Found |= Match (AArch64::DUPv2i64lane, 2 , MCP::FMULv2i64_indexed_OP2);
4914
+ break ;
4915
+ case AArch64::FMULv4f16:
4916
+ Found = Match (AArch64::DUPv4i16lane, 1 , MCP::FMULv4i16_indexed_OP1);
4917
+ Found |= Match (AArch64::DUPv4i16lane, 2 , MCP::FMULv4i16_indexed_OP2);
4918
+ break ;
4919
+ case AArch64::FMULv4f32:
4920
+ Found = Match (AArch64::DUPv4i32lane, 1 , MCP::FMULv4i32_indexed_OP1);
4921
+ Found |= Match (AArch64::DUPv4i32lane, 2 , MCP::FMULv4i32_indexed_OP2);
4922
+ break ;
4923
+ case AArch64::FMULv8f16:
4924
+ Found = Match (AArch64::DUPv8i16lane, 1 , MCP::FMULv8i16_indexed_OP1);
4925
+ Found |= Match (AArch64::DUPv8i16lane, 2 , MCP::FMULv8i16_indexed_OP2);
4926
+ break ;
4927
+ }
4928
+
4929
+ return Found;
4930
+ }
4931
+
4883
4932
/// Return true when a code sequence can improve throughput. It
/// should be called only for instructions in loops.
/// \param Pattern - combiner pattern
@@ -4943,6 +4992,16 @@ bool AArch64InstrInfo::isThroughputPattern(
4943
4992
case MachineCombinerPattern::FMLSv2f64_OP2:
4944
4993
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
4945
4994
case MachineCombinerPattern::FMLSv4f32_OP2:
4995
+ case MachineCombinerPattern::FMULv2i32_indexed_OP1:
4996
+ case MachineCombinerPattern::FMULv2i32_indexed_OP2:
4997
+ case MachineCombinerPattern::FMULv2i64_indexed_OP1:
4998
+ case MachineCombinerPattern::FMULv2i64_indexed_OP2:
4999
+ case MachineCombinerPattern::FMULv4i16_indexed_OP1:
5000
+ case MachineCombinerPattern::FMULv4i16_indexed_OP2:
5001
+ case MachineCombinerPattern::FMULv4i32_indexed_OP1:
5002
+ case MachineCombinerPattern::FMULv4i32_indexed_OP2:
5003
+ case MachineCombinerPattern::FMULv8i16_indexed_OP1:
5004
+ case MachineCombinerPattern::FMULv8i16_indexed_OP2:
4946
5005
case MachineCombinerPattern::MULADDv8i8_OP1:
4947
5006
case MachineCombinerPattern::MULADDv8i8_OP2:
4948
5007
case MachineCombinerPattern::MULADDv16i8_OP1:
@@ -4999,6 +5058,8 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
4999
5058
if (getMaddPatterns (Root, Patterns))
5000
5059
return true ;
5001
5060
// Floating point patterns
5061
+ if (getFMULPatterns (Root, Patterns))
5062
+ return true ;
5002
5063
if (getFMAPatterns (Root, Patterns))
5003
5064
return true ;
5004
5065
@@ -5087,6 +5148,42 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
5087
5148
return MUL;
5088
5149
}
5089
5150
5151
+ // / Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
5152
+ static MachineInstr *
5153
+ genIndexedMultiply (MachineInstr &Root,
5154
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
5155
+ unsigned IdxDupOp, unsigned MulOpc,
5156
+ const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
5157
+ assert (((IdxDupOp == 1 ) || (IdxDupOp == 2 )) &&
5158
+ " Invalid index of FMUL operand" );
5159
+
5160
+ MachineFunction &MF = *Root.getMF ();
5161
+ const TargetInstrInfo *TII = MF.getSubtarget ().getInstrInfo ();
5162
+
5163
+ MachineInstr *Dup =
5164
+ MF.getRegInfo ().getUniqueVRegDef (Root.getOperand (IdxDupOp).getReg ());
5165
+
5166
+ Register DupSrcReg = Dup->getOperand (1 ).getReg ();
5167
+ MRI.clearKillFlags (DupSrcReg);
5168
+ MRI.constrainRegClass (DupSrcReg, RC);
5169
+
5170
+ unsigned DupSrcLane = Dup->getOperand (2 ).getImm ();
5171
+
5172
+ unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1 ;
5173
+ MachineOperand &MulOp = Root.getOperand (IdxMulOp);
5174
+
5175
+ Register ResultReg = Root.getOperand (0 ).getReg ();
5176
+
5177
+ MachineInstrBuilder MIB;
5178
+ MIB = BuildMI (MF, Root.getDebugLoc (), TII->get (MulOpc), ResultReg)
5179
+ .add (MulOp)
5180
+ .addReg (DupSrcReg)
5181
+ .addImm (DupSrcLane);
5182
+
5183
+ InsInstrs.push_back (MIB);
5184
+ return &Root;
5185
+ }
5186
+
5090
5187
/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
/// instructions.
///
@@ -6045,12 +6142,53 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
6045
6142
}
6046
6143
break ;
6047
6144
}
6145
+ case MachineCombinerPattern::FMULv2i32_indexed_OP1:
6146
+ case MachineCombinerPattern::FMULv2i32_indexed_OP2: {
6147
+ unsigned IdxDupOp =
6148
+ (Pattern == MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 : 2 ;
6149
+ genIndexedMultiply (Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
6150
+ &AArch64::FPR128RegClass, MRI);
6151
+ break ;
6152
+ }
6153
+ case MachineCombinerPattern::FMULv2i64_indexed_OP1:
6154
+ case MachineCombinerPattern::FMULv2i64_indexed_OP2: {
6155
+ unsigned IdxDupOp =
6156
+ (Pattern == MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 : 2 ;
6157
+ genIndexedMultiply (Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
6158
+ &AArch64::FPR128RegClass, MRI);
6159
+ break ;
6160
+ }
6161
+ case MachineCombinerPattern::FMULv4i16_indexed_OP1:
6162
+ case MachineCombinerPattern::FMULv4i16_indexed_OP2: {
6163
+ unsigned IdxDupOp =
6164
+ (Pattern == MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 : 2 ;
6165
+ genIndexedMultiply (Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
6166
+ &AArch64::FPR128_loRegClass, MRI);
6167
+ break ;
6168
+ }
6169
+ case MachineCombinerPattern::FMULv4i32_indexed_OP1:
6170
+ case MachineCombinerPattern::FMULv4i32_indexed_OP2: {
6171
+ unsigned IdxDupOp =
6172
+ (Pattern == MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1 : 2 ;
6173
+ genIndexedMultiply (Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
6174
+ &AArch64::FPR128RegClass, MRI);
6175
+ break ;
6176
+ }
6177
+ case MachineCombinerPattern::FMULv8i16_indexed_OP1:
6178
+ case MachineCombinerPattern::FMULv8i16_indexed_OP2: {
6179
+ unsigned IdxDupOp =
6180
+ (Pattern == MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 : 2 ;
6181
+ genIndexedMultiply (Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
6182
+ &AArch64::FPR128_loRegClass, MRI);
6183
+ break ;
6184
+ }
6048
6185
} // end switch (Pattern)
6049
6186
// Record MUL and ADD/SUB for deletion
6050
6187
// FIXME: This assertion fails in CodeGen/AArch64/tailmerging_in_mbp.ll and
6051
6188
// CodeGen/AArch64/urem-seteq-nonzero.ll.
6052
6189
// assert(MUL && "MUL was never set");
6053
- DelInstrs.push_back (MUL);
6190
+ if (MUL)
6191
+ DelInstrs.push_back (MUL);
6054
6192
DelInstrs.push_back (&Root);
6055
6193
}
6056
6194
0 commit comments