@@ -862,6 +862,28 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(
   case X86::MMX_MOVD64rm:
   case X86::MMX_MOVQ64rm:
   // AVX-512
+  case X86::VPBROADCASTBZ128rm:
+  case X86::VPBROADCASTBZ256rm:
+  case X86::VPBROADCASTBZrm:
+  case X86::VBROADCASTF32X2Z256rm:
+  case X86::VBROADCASTF32X2Zrm:
+  case X86::VBROADCASTI32X2Z128rm:
+  case X86::VBROADCASTI32X2Z256rm:
+  case X86::VBROADCASTI32X2Zrm:
+  case X86::VPBROADCASTWZ128rm:
+  case X86::VPBROADCASTWZ256rm:
+  case X86::VPBROADCASTWZrm:
+  case X86::VPBROADCASTDZ128rm:
+  case X86::VPBROADCASTDZ256rm:
+  case X86::VPBROADCASTDZrm:
+  case X86::VBROADCASTSSZ128rm:
+  case X86::VBROADCASTSSZ256rm:
+  case X86::VBROADCASTSSZrm:
+  case X86::VPBROADCASTQZ128rm:
+  case X86::VPBROADCASTQZ256rm:
+  case X86::VPBROADCASTQZrm:
+  case X86::VBROADCASTSDZ256rm:
+  case X86::VBROADCASTSDZrm:
   case X86::VMOVSSZrm:
   case X86::VMOVSSZrm_alt:
   case X86::VMOVSDZrm:
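The first hunk only grows a whitelist: an AVX-512 broadcast load has no side effects, and re-executing it reproduces the same value, so the register allocator may re-emit ("rematerialize") it at a use point instead of spilling a vector register. The sketch below shows just that dispatch shape; the enum and helper are invented for the demo and are not LLVM's API.

// Hypothetical, simplified sketch, not LLVM code; it only illustrates the
// whitelist dispatch that the hunk above extends.
#include <cstdio>

enum Opcode { VPBROADCASTDZrm, VPBROADCASTQZrm, SomeStore };

// Broadcast loads can be rematerialized: cloning the load at a use point
// recomputes the identical value, so no spill slot is needed.
static bool isTriviallyRematerializable(Opcode Op) {
  switch (Op) {
  case VPBROADCASTDZrm:
  case VPBROADCASTQZrm:
    return true;
  default:
    return false; // stores and other side-effecting ops must not be cloned
  }
}

int main() {
  printf("%d %d\n", isTriviallyRematerializable(VPBROADCASTDZrm),
         isTriviallyRematerializable(SomeStore)); // prints "1 0"
}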
@@ -8067,6 +8089,39 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MOs.push_back(MachineOperand::CreateReg(0, false));
     break;
   }
+  case X86::VPBROADCASTBZ128rm:
+  case X86::VPBROADCASTBZ256rm:
+  case X86::VPBROADCASTBZrm:
+  case X86::VBROADCASTF32X2Z256rm:
+  case X86::VBROADCASTF32X2Zrm:
+  case X86::VBROADCASTI32X2Z128rm:
+  case X86::VBROADCASTI32X2Z256rm:
+  case X86::VBROADCASTI32X2Zrm:
+    // No instructions currently fuse with 8bits or 32bits x 2.
+    return nullptr;
+
+#define FOLD_BROADCAST(SIZE)                                                   \
+  MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands,          \
+             LoadMI.operands_begin() + NumOps);                                \
+  return foldMemoryBroadcast(MF, MI, Ops[0], MOs, InsertPt, /*Size=*/SIZE,     \
+                             Alignment, /*AllowCommute=*/true);
+  case X86::VPBROADCASTWZ128rm:
+  case X86::VPBROADCASTWZ256rm:
+  case X86::VPBROADCASTWZrm:
+    FOLD_BROADCAST(16);
+  case X86::VPBROADCASTDZ128rm:
+  case X86::VPBROADCASTDZ256rm:
+  case X86::VPBROADCASTDZrm:
+  case X86::VBROADCASTSSZ128rm:
+  case X86::VBROADCASTSSZ256rm:
+  case X86::VBROADCASTSSZrm:
+    FOLD_BROADCAST(32);
+  case X86::VPBROADCASTQZ128rm:
+  case X86::VPBROADCASTQZ256rm:
+  case X86::VPBROADCASTQZrm:
+  case X86::VBROADCASTSDZ256rm:
+  case X86::VBROADCASTSDZrm:
+    FOLD_BROADCAST(64);
   default: {
     if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
       return nullptr;
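In the second hunk, FOLD_BROADCAST exists so that several case labels can fall through into one shared append-the-address-operands-and-fold sequence, parameterized only by the broadcast element width, while the 8-bit and 32x2 forms return early because nothing fuses with them. Below is a self-contained sketch of that grouping pattern; the macro body and helper here are stand-ins for the real ones shown above, not LLVM code.

// Hypothetical demo of the case-groups-into-macro dispatch; the opcode names
// echo the patch, but everything else is a stand-in.
#include <cstdio>

enum Opcode { VPBROADCASTBZrm, VPBROADCASTWZrm, VPBROADCASTDZrm, VPBROADCASTQZrm };

// Stand-in for the real macro body, which appends the load's memory operands
// and tail-calls foldMemoryBroadcast with the element width.
#define FOLD_BROADCAST(SIZE) return SIZE

static unsigned broadcastFoldBits(Opcode Op) {
  switch (Op) {
  case VPBROADCASTBZrm:
    return 0; // 8-bit broadcasts have no fusible consumer, so the patch bails
  case VPBROADCASTWZrm:
    FOLD_BROADCAST(16);
  case VPBROADCASTDZrm:
    FOLD_BROADCAST(32);
  case VPBROADCASTQZrm:
    FOLD_BROADCAST(64);
  }
  return 0;
}

int main() {
  printf("%u %u\n", broadcastFoldBits(VPBROADCASTWZrm),
         broadcastFoldBits(VPBROADCASTQZrm)); // prints "16 64"
}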
@@ -8081,6 +8136,78 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
                                /*Size=*/0, Alignment, /*AllowCommute=*/true);
 }
 
+MachineInstr *X86InstrInfo::foldMemoryBroadcast(
+    MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
+    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+    unsigned BitsSize, Align Alignment, bool AllowCommute) const {
+
+  if (auto *I = lookupBroadcastFoldTable(MI.getOpcode(), OpNum))
+    return matchBroadcastSize(*I, BitsSize)
+               ? FuseInst(MF, I->DstOp, OpNum, MOs, InsertPt, MI, *this)
+               : nullptr;
+
+  // TODO: Share code with foldMemoryOperandImpl for the commute
+  if (AllowCommute) {
+    unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
+    if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
+      bool HasDef = MI.getDesc().getNumDefs();
+      Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
+      Register Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
+      Register Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
+      bool Tied1 =
+          0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+      bool Tied2 =
+          0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
+
+      // If either of the commutable operands are tied to the destination
+      // then we can not commute + fold.
+      if ((HasDef && Reg0 == Reg1 && Tied1) ||
+          (HasDef && Reg0 == Reg2 && Tied2))
+        return nullptr;
+
+      MachineInstr *CommutedMI =
+          commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+      if (!CommutedMI) {
+        // Unable to commute.
+        return nullptr;
+      }
+      if (CommutedMI != &MI) {
+        // New instruction. We can't fold from this.
+        CommutedMI->eraseFromParent();
+        return nullptr;
+      }
+
+      // Attempt to fold with the commuted version of the instruction.
+      MachineInstr *NewMI = foldMemoryBroadcast(MF, MI, CommuteOpIdx2, MOs,
+                                                InsertPt, BitsSize, Alignment,
+                                                /*AllowCommute=*/false);
+      if (NewMI)
+        return NewMI;
+
+      // Folding failed again - undo the commute before returning.
+      MachineInstr *UncommutedMI =
+          commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+      if (!UncommutedMI) {
+        // Unable to commute.
+        return nullptr;
+      }
+      if (UncommutedMI != &MI) {
+        // New instruction. It doesn't need to be kept.
+        UncommutedMI->eraseFromParent();
+        return nullptr;
+      }
+
+      // Return here to prevent duplicate fuse failure report.
+      return nullptr;
+    }
+  }
+
+  // No fusion
+  if (PrintFailedFusing && !MI.isCopy())
+    dbgs() << "We failed to fuse operand " << OpNum << " in " << MI;
+  return nullptr;
+}
+
 static SmallVector<MachineMemOperand *, 2>
 extractLoadMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
   SmallVector<MachineMemOperand *, 2> LoadMMOs;
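The commute path in foldMemoryBroadcast follows a try, commute, retry, undo discipline: probe the fold table for the requested operand; if that fails and the instruction is commutable, swap the operands in place, retry once with AllowCommute=false so the recursion cannot loop, and swap back when the retry also fails so the caller sees the instruction unchanged. A minimal toy sketch of that control flow follows; Inst, tryFoldAt, and the two-operand indexing are invented for the demo and are not LLVM's API.

// Toy illustration of the commute-and-retry fold.
#include <cstdio>
#include <utility>

struct Inst {
  int Src1, Src2; // the two commutable source operands
};

// Stand-in for the broadcast fold-table probe: pretend only an operand in
// position 1 can be folded.
static bool tryFoldAt(const Inst &, unsigned OpNum) { return OpNum == 1; }

static bool foldWithCommute(Inst &I, unsigned OpNum, bool AllowCommute) {
  if (tryFoldAt(I, OpNum))
    return true;
  if (!AllowCommute)
    return false; // second attempt: do not recurse again
  unsigned OtherOp = (OpNum == 1) ? 2 : 1;
  std::swap(I.Src1, I.Src2); // commute the instruction in place
  if (foldWithCommute(I, OtherOp, /*AllowCommute=*/false))
    return true;             // fold succeeded; keep the commuted form
  std::swap(I.Src1, I.Src2); // fold failed again: undo the commute
  return false;
}

int main() {
  Inst I{10, 20};
  bool Folded = foldWithCommute(I, /*OpNum=*/2, /*AllowCommute=*/true);
  printf("folded=%d src1=%d src2=%d\n", Folded, I.Src1, I.Src2);
  // prints "folded=1 src1=20 src2=10": the commuted form is kept on success
}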