@@ -4917,6 +4917,55 @@ static bool getFMAPatterns(MachineInstr &Root,
4917
4917
return Found;
4918
4918
}
4919
4919
4920
+ static bool getFMULPatterns (MachineInstr &Root,
4921
+ SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4922
+ MachineBasicBlock &MBB = *Root.getParent ();
4923
+ bool Found = false ;
4924
+
4925
+ auto Match = [&](unsigned Opcode, int Operand,
4926
+ MachineCombinerPattern Pattern) -> bool {
4927
+ MachineRegisterInfo &MRI = MBB.getParent ()->getRegInfo ();
4928
+ MachineOperand &MO = Root.getOperand (Operand);
4929
+ MachineInstr *MI = nullptr ;
4930
+ if (MO.isReg () && Register::isVirtualRegister (MO.getReg ()))
4931
+ MI = MRI.getUniqueVRegDef (MO.getReg ());
4932
+ if (MI && MI->getOpcode () == Opcode) {
4933
+ Patterns.push_back (Pattern);
4934
+ return true ;
4935
+ }
4936
+ return false ;
4937
+ };
4938
+
4939
+ typedef MachineCombinerPattern MCP;
4940
+
4941
+ switch (Root.getOpcode ()) {
4942
+ default :
4943
+ return false ;
4944
+ case AArch64::FMULv2f32:
4945
+ Found = Match (AArch64::DUPv2i32lane, 1 , MCP::FMULv2i32_indexed_OP1);
4946
+ Found |= Match (AArch64::DUPv2i32lane, 2 , MCP::FMULv2i32_indexed_OP2);
4947
+ break ;
4948
+ case AArch64::FMULv2f64:
4949
+ Found = Match (AArch64::DUPv2i64lane, 1 , MCP::FMULv2i64_indexed_OP1);
4950
+ Found |= Match (AArch64::DUPv2i64lane, 2 , MCP::FMULv2i64_indexed_OP2);
4951
+ break ;
4952
+ case AArch64::FMULv4f16:
4953
+ Found = Match (AArch64::DUPv4i16lane, 1 , MCP::FMULv4i16_indexed_OP1);
4954
+ Found |= Match (AArch64::DUPv4i16lane, 2 , MCP::FMULv4i16_indexed_OP2);
4955
+ break ;
4956
+ case AArch64::FMULv4f32:
4957
+ Found = Match (AArch64::DUPv4i32lane, 1 , MCP::FMULv4i32_indexed_OP1);
4958
+ Found |= Match (AArch64::DUPv4i32lane, 2 , MCP::FMULv4i32_indexed_OP2);
4959
+ break ;
4960
+ case AArch64::FMULv8f16:
4961
+ Found = Match (AArch64::DUPv8i16lane, 1 , MCP::FMULv8i16_indexed_OP1);
4962
+ Found |= Match (AArch64::DUPv8i16lane, 2 , MCP::FMULv8i16_indexed_OP2);
4963
+ break ;
4964
+ }
4965
+
4966
+ return Found;
4967
+ }
4968
+
4920
4969
// / Return true when a code sequence can improve throughput. It
4921
4970
// / should be called only for instructions in loops.
4922
4971
// / \param Pattern - combiner pattern
@@ -4980,6 +5029,16 @@ bool AArch64InstrInfo::isThroughputPattern(
4980
5029
case MachineCombinerPattern::FMLSv2f64_OP2:
4981
5030
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
4982
5031
case MachineCombinerPattern::FMLSv4f32_OP2:
5032
+ case MachineCombinerPattern::FMULv2i32_indexed_OP1:
5033
+ case MachineCombinerPattern::FMULv2i32_indexed_OP2:
5034
+ case MachineCombinerPattern::FMULv2i64_indexed_OP1:
5035
+ case MachineCombinerPattern::FMULv2i64_indexed_OP2:
5036
+ case MachineCombinerPattern::FMULv4i16_indexed_OP1:
5037
+ case MachineCombinerPattern::FMULv4i16_indexed_OP2:
5038
+ case MachineCombinerPattern::FMULv4i32_indexed_OP1:
5039
+ case MachineCombinerPattern::FMULv4i32_indexed_OP2:
5040
+ case MachineCombinerPattern::FMULv8i16_indexed_OP1:
5041
+ case MachineCombinerPattern::FMULv8i16_indexed_OP2:
4983
5042
case MachineCombinerPattern::MULADDv8i8_OP1:
4984
5043
case MachineCombinerPattern::MULADDv8i8_OP2:
4985
5044
case MachineCombinerPattern::MULADDv16i8_OP1:
@@ -5036,6 +5095,8 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
5036
5095
if (getMaddPatterns (Root, Patterns))
5037
5096
return true ;
5038
5097
// Floating point patterns
5098
+ if (getFMULPatterns (Root, Patterns))
5099
+ return true ;
5039
5100
if (getFMAPatterns (Root, Patterns))
5040
5101
return true ;
5041
5102
@@ -5124,6 +5185,42 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
5124
5185
return MUL;
5125
5186
}
5126
5187
5188
+ // / Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
5189
+ static MachineInstr *
5190
+ genIndexedMultiply (MachineInstr &Root,
5191
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
5192
+ unsigned IdxDupOp, unsigned MulOpc,
5193
+ const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
5194
+ assert (((IdxDupOp == 1 ) || (IdxDupOp == 2 )) &&
5195
+ " Invalid index of FMUL operand" );
5196
+
5197
+ MachineFunction &MF = *Root.getMF ();
5198
+ const TargetInstrInfo *TII = MF.getSubtarget ().getInstrInfo ();
5199
+
5200
+ MachineInstr *Dup =
5201
+ MF.getRegInfo ().getUniqueVRegDef (Root.getOperand (IdxDupOp).getReg ());
5202
+
5203
+ Register DupSrcReg = Dup->getOperand (1 ).getReg ();
5204
+ MRI.clearKillFlags (DupSrcReg);
5205
+ MRI.constrainRegClass (DupSrcReg, RC);
5206
+
5207
+ unsigned DupSrcLane = Dup->getOperand (2 ).getImm ();
5208
+
5209
+ unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1 ;
5210
+ MachineOperand &MulOp = Root.getOperand (IdxMulOp);
5211
+
5212
+ Register ResultReg = Root.getOperand (0 ).getReg ();
5213
+
5214
+ MachineInstrBuilder MIB;
5215
+ MIB = BuildMI (MF, Root.getDebugLoc (), TII->get (MulOpc), ResultReg)
5216
+ .add (MulOp)
5217
+ .addReg (DupSrcReg)
5218
+ .addImm (DupSrcLane);
5219
+
5220
+ InsInstrs.push_back (MIB);
5221
+ return &Root;
5222
+ }
5223
+
5127
5224
// / genFusedMultiplyAcc - Helper to generate fused multiply accumulate
5128
5225
// / instructions.
5129
5226
// /
@@ -6082,12 +6179,53 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
6082
6179
}
6083
6180
break ;
6084
6181
}
6182
+ case MachineCombinerPattern::FMULv2i32_indexed_OP1:
6183
+ case MachineCombinerPattern::FMULv2i32_indexed_OP2: {
6184
+ unsigned IdxDupOp =
6185
+ (Pattern == MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 : 2 ;
6186
+ genIndexedMultiply (Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
6187
+ &AArch64::FPR128RegClass, MRI);
6188
+ break ;
6189
+ }
6190
+ case MachineCombinerPattern::FMULv2i64_indexed_OP1:
6191
+ case MachineCombinerPattern::FMULv2i64_indexed_OP2: {
6192
+ unsigned IdxDupOp =
6193
+ (Pattern == MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 : 2 ;
6194
+ genIndexedMultiply (Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
6195
+ &AArch64::FPR128RegClass, MRI);
6196
+ break ;
6197
+ }
6198
+ case MachineCombinerPattern::FMULv4i16_indexed_OP1:
6199
+ case MachineCombinerPattern::FMULv4i16_indexed_OP2: {
6200
+ unsigned IdxDupOp =
6201
+ (Pattern == MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 : 2 ;
6202
+ genIndexedMultiply (Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
6203
+ &AArch64::FPR128_loRegClass, MRI);
6204
+ break ;
6205
+ }
6206
+ case MachineCombinerPattern::FMULv4i32_indexed_OP1:
6207
+ case MachineCombinerPattern::FMULv4i32_indexed_OP2: {
6208
+ unsigned IdxDupOp =
6209
+ (Pattern == MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1 : 2 ;
6210
+ genIndexedMultiply (Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
6211
+ &AArch64::FPR128RegClass, MRI);
6212
+ break ;
6213
+ }
6214
+ case MachineCombinerPattern::FMULv8i16_indexed_OP1:
6215
+ case MachineCombinerPattern::FMULv8i16_indexed_OP2: {
6216
+ unsigned IdxDupOp =
6217
+ (Pattern == MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 : 2 ;
6218
+ genIndexedMultiply (Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
6219
+ &AArch64::FPR128_loRegClass, MRI);
6220
+ break ;
6221
+ }
6085
6222
} // end switch (Pattern)
6086
6223
// Record MUL and ADD/SUB for deletion
6087
6224
// FIXME: This assertion fails in CodeGen/AArch64/tailmerging_in_mbp.ll and
6088
6225
// CodeGen/AArch64/urem-seteq-nonzero.ll.
6089
6226
// assert(MUL && "MUL was never set");
6090
- DelInstrs.push_back (MUL);
6227
+ if (MUL)
6228
+ DelInstrs.push_back (MUL);
6091
6229
DelInstrs.push_back (&Root);
6092
6230
}
6093
6231
0 commit comments