[X86][CodeGen] Support folding memory broadcast in X86InstrInfo::foldMemoryOperandImpl #79761
@llvm/pr-subscribers-backend-x86

Author: Shengchen Kan (KanRobert)

Changes

Patch is 360.40 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/79761.diff

9 Files Affected:
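To illustrate the effect of the fold (a hand-written sketch, not taken from the patch's tests): when a broadcast load feeds an instruction that has an AVX-512 embedded-broadcast form, the load can be folded into the consumer as a `{1toN}` memory operand.

```asm
# Before folding: a separate broadcast load (hypothetical input).
vpbroadcastd (%rdi), %zmm1
vpaddd       %zmm1, %zmm0, %zmm0

# After folding: the broadcast becomes an embedded memory operand.
vpaddd       (%rdi){1to16}, %zmm0, %zmm0
```

The diff below wires this up by marking the EVEX broadcast loads as rematerializable/foldable and routing them through the broadcast fold tables.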
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index d4af94c7f92ee7..037a745d632fbc 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -406,14 +406,14 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
unsigned OpNoBcst32 = 0, OpNoBcst64 = 0;
if (OpSrc32) {
if (const X86FoldTableEntry *Mem2Bcst =
- llvm::lookupBroadcastFoldTable(OpSrc32, 32)) {
+ llvm::lookupBroadcastFoldTableBySize(OpSrc32, 32)) {
OpBcst32 = Mem2Bcst->DstOp;
OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK;
}
}
if (OpSrc64) {
if (const X86FoldTableEntry *Mem2Bcst =
- llvm::lookupBroadcastFoldTable(OpSrc64, 64)) {
+ llvm::lookupBroadcastFoldTableBySize(OpSrc64, 64)) {
OpBcst64 = Mem2Bcst->DstOp;
OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK;
}
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index bb5e22c7142793..b588f660e2744e 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -1067,7 +1067,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
MaskInfo.RC:$src0))],
DestInfo.ExeDomain>, T8, PD, EVEX, EVEX_K, Sched<[SchedRR]>;
- let hasSideEffects = 0, mayLoad = 1 in
+ let hasSideEffects = 0, mayLoad = 1, isReMaterializable = 1, canFoldAsLoad = 1 in
def rm : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
(ins SrcInfo.ScalarMemOp:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 63136af2295f4b..b27936d381b6ee 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -145,6 +145,23 @@ llvm::lookupFoldTable(unsigned RegOp, unsigned OpNum) {
return lookupFoldTableImpl(FoldTable, RegOp);
}
+const X86FoldTableEntry *
+llvm::lookupBroadcastFoldTable(unsigned RegOp, unsigned OpNum) {
+ ArrayRef<X86FoldTableEntry> FoldTable;
+ if (OpNum == 1)
+ FoldTable = ArrayRef(BroadcastTable1);
+ else if (OpNum == 2)
+ FoldTable = ArrayRef(BroadcastTable2);
+ else if (OpNum == 3)
+ FoldTable = ArrayRef(BroadcastTable3);
+ else if (OpNum == 4)
+ FoldTable = ArrayRef(BroadcastTable4);
+ else
+ return nullptr;
+
+ return lookupFoldTableImpl(FoldTable, RegOp);
+}
+
namespace {
// This class stores the memory unfolding tables. It is instantiated as a
@@ -288,8 +305,8 @@ struct X86BroadcastFoldTable {
};
} // namespace
-static bool matchBroadcastSize(const X86FoldTableEntry &Entry,
- unsigned BroadcastBits) {
+bool llvm::matchBroadcastSize(const X86FoldTableEntry &Entry,
+ unsigned BroadcastBits) {
switch (Entry.Flags & TB_BCAST_MASK) {
case TB_BCAST_W:
case TB_BCAST_SH:
@@ -305,7 +322,7 @@ static bool matchBroadcastSize(const X86FoldTableEntry &Entry,
}
const X86FoldTableEntry *
-llvm::lookupBroadcastFoldTable(unsigned MemOp, unsigned BroadcastBits) {
+llvm::lookupBroadcastFoldTableBySize(unsigned MemOp, unsigned BroadcastBits) {
static X86BroadcastFoldTable BroadcastFoldTable;
auto &Table = BroadcastFoldTable.Table;
for (auto I = llvm::lower_bound(Table, MemOp);
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.h b/llvm/lib/Target/X86/X86InstrFoldTables.h
index e3890d6aa8eb0f..5fb5b17ef6125a 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.h
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.h
@@ -44,14 +44,20 @@ const X86FoldTableEntry *lookupTwoAddrFoldTable(unsigned RegOp);
// operand OpNum.
const X86FoldTableEntry *lookupFoldTable(unsigned RegOp, unsigned OpNum);
+// Look up the broadcast folding table entry for folding a broadcast with
+// operand OpNum.
+const X86FoldTableEntry *lookupBroadcastFoldTable(unsigned RegOp,
+ unsigned OpNum);
+
// Look up the memory unfolding table entry for this instruction.
const X86FoldTableEntry *lookupUnfoldTable(unsigned MemOp);
// Look up the broadcast folding table entry for this instruction from
// the regular memory instruction.
-const X86FoldTableEntry *lookupBroadcastFoldTable(unsigned MemOp,
+const X86FoldTableEntry *lookupBroadcastFoldTableBySize(unsigned MemOp,
unsigned BroadcastBits);
+bool matchBroadcastSize(const X86FoldTableEntry &Entry, unsigned BroadcastBits);
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 9a95464287c5dc..e71407b727b644 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -862,6 +862,28 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
// AVX-512
+ case X86::VPBROADCASTBZ128rm:
+ case X86::VPBROADCASTBZ256rm:
+ case X86::VPBROADCASTBZrm:
+ case X86::VBROADCASTF32X2Z256rm:
+ case X86::VBROADCASTF32X2Zrm:
+ case X86::VBROADCASTI32X2Z128rm:
+ case X86::VBROADCASTI32X2Z256rm:
+ case X86::VBROADCASTI32X2Zrm:
+ case X86::VPBROADCASTWZ128rm:
+ case X86::VPBROADCASTWZ256rm:
+ case X86::VPBROADCASTWZrm:
+ case X86::VPBROADCASTDZ128rm:
+ case X86::VPBROADCASTDZ256rm:
+ case X86::VPBROADCASTDZrm:
+ case X86::VBROADCASTSSZ128rm:
+ case X86::VBROADCASTSSZ256rm:
+ case X86::VBROADCASTSSZrm:
+ case X86::VPBROADCASTQZ128rm:
+ case X86::VPBROADCASTQZ256rm:
+ case X86::VPBROADCASTQZrm:
+ case X86::VBROADCASTSDZ256rm:
+ case X86::VBROADCASTSDZrm:
case X86::VMOVSSZrm:
case X86::VMOVSSZrm_alt:
case X86::VMOVSDZrm:
@@ -8063,6 +8085,39 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MOs.push_back(MachineOperand::CreateReg(0, false));
break;
}
+ case X86::VPBROADCASTBZ128rm:
+ case X86::VPBROADCASTBZ256rm:
+ case X86::VPBROADCASTBZrm:
+ case X86::VBROADCASTF32X2Z256rm:
+ case X86::VBROADCASTF32X2Zrm:
+ case X86::VBROADCASTI32X2Z128rm:
+ case X86::VBROADCASTI32X2Z256rm:
+ case X86::VBROADCASTI32X2Zrm:
+ // No instructions currently fuse with 8bits or 32bits x 2.
+ return nullptr;
+
+#define FOLD_BROADCAST(SIZE) \
+ MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands, \
+ LoadMI.operands_begin() + NumOps); \
+ return foldMemoryBroadcast(MF, MI, Ops[0], MOs, InsertPt, /*Size=*/SIZE, \
+ Alignment, /*AllowCommute=*/true);
+ case X86::VPBROADCASTWZ128rm:
+ case X86::VPBROADCASTWZ256rm:
+ case X86::VPBROADCASTWZrm:
+ FOLD_BROADCAST(16);
+ case X86::VPBROADCASTDZ128rm:
+ case X86::VPBROADCASTDZ256rm:
+ case X86::VPBROADCASTDZrm:
+ case X86::VBROADCASTSSZ128rm:
+ case X86::VBROADCASTSSZ256rm:
+ case X86::VBROADCASTSSZrm:
+ FOLD_BROADCAST(32);
+ case X86::VPBROADCASTQZ128rm:
+ case X86::VPBROADCASTQZ256rm:
+ case X86::VPBROADCASTQZrm:
+ case X86::VBROADCASTSDZ256rm:
+ case X86::VBROADCASTSDZrm:
+ FOLD_BROADCAST(64);
default: {
if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
return nullptr;
@@ -8077,6 +8132,80 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
/*Size=*/0, Alignment, /*AllowCommute=*/true);
}
+MachineInstr *X86InstrInfo::foldMemoryBroadcast(
+ MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
+ ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+ unsigned BitsSize, Align Alignment, bool AllowCommute) const {
+
+ const X86FoldTableEntry *I = lookupBroadcastFoldTable(MI.getOpcode(), OpNum);
+
+ if (I)
+ return matchBroadcastSize(*I, BitsSize)
+ ? FuseInst(MF, I->DstOp, OpNum, MOs, InsertPt, MI, *this)
+ : nullptr;
+
+ // TODO: Share code with foldMemoryOperandImpl for the commute
+ if (AllowCommute) {
+ unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
+ if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
+ bool HasDef = MI.getDesc().getNumDefs();
+ Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
+ Register Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
+ Register Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
+ bool Tied1 =
+ 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+ bool Tied2 =
+ 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
+
+ // If either of the commutable operands are tied to the destination
+ // then we can not commute + fold.
+ if ((HasDef && Reg0 == Reg1 && Tied1) ||
+ (HasDef && Reg0 == Reg2 && Tied2))
+ return nullptr;
+
+ MachineInstr *CommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!CommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (CommutedMI != &MI) {
+ // New instruction. We can't fold from this.
+ CommutedMI->eraseFromParent();
+ return nullptr;
+ }
+
+ // Attempt to fold with the commuted version of the instruction.
+ MachineInstr *NewMI = foldMemoryBroadcast(MF, MI, CommuteOpIdx2, MOs,
+ InsertPt, BitsSize, Alignment,
+ /*AllowCommute=*/false);
+ if (NewMI)
+ return NewMI;
+
+ // Folding failed again - undo the commute before returning.
+ MachineInstr *UncommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!UncommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (UncommutedMI != &MI) {
+ // New instruction. It doesn't need to be kept.
+ UncommutedMI->eraseFromParent();
+ return nullptr;
+ }
+
+ // Return here to prevent duplicate fuse failure report.
+ return nullptr;
+ }
+ }
+
+ // No fusion
+ if (PrintFailedFusing && !MI.isCopy())
+ dbgs() << "We failed to fuse operand " << OpNum << " in " << MI;
+ return nullptr;
+}
+
static SmallVector<MachineMemOperand *, 2>
extractLoadMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
SmallVector<MachineMemOperand *, 2> LoadMMOs;
@@ -8130,6 +8259,18 @@ static unsigned getBroadcastOpcode(const X86FoldTableEntry *I,
switch (I->Flags & TB_BCAST_MASK) {
default:
llvm_unreachable("Unexpected broadcast type!");
+ case TB_BCAST_W:
+ switch (SpillSize) {
+ default:
+ llvm_unreachable("Unknown spill size");
+ case 16:
+ return X86::VPBROADCASTWZ128rm;
+ case 32:
+ return X86::VPBROADCASTWZ256rm;
+ case 64:
+ return X86::VPBROADCASTWZrm;
+ }
+ break;
case TB_BCAST_D:
switch (SpillSize) {
default:
@@ -8191,7 +8332,11 @@ bool X86InstrInfo::unfoldMemoryOperand(
unsigned Index = I->Flags & TB_INDEX_MASK;
bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
bool FoldedStore = I->Flags & TB_FOLDED_STORE;
- bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
+ unsigned BCastType = I->Flags & TB_FOLDED_BCAST;
+ // FIXME: Support TB_BCAST_SH in getBroadcastOpcode?
+ if (BCastType == TB_BCAST_SH)
+ return false;
+
if (UnfoldLoad && !FoldedLoad)
return false;
UnfoldLoad &= FoldedLoad;
@@ -8231,7 +8376,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
unsigned Opc;
- if (FoldedBCast) {
+ if (BCastType) {
Opc = getBroadcastOpcode(I, RC, Subtarget);
} else {
unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
@@ -8341,7 +8486,10 @@ bool X86InstrInfo::unfoldMemoryOperand(
unsigned Index = I->Flags & TB_INDEX_MASK;
bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
bool FoldedStore = I->Flags & TB_FOLDED_STORE;
- bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
+ unsigned BCastType = I->Flags & TB_FOLDED_BCAST;
+ // FIXME: Support TB_BCAST_SH in getBroadcastOpcode?
+ if (BCastType == TB_BCAST_SH)
+ return false;
const MCInstrDesc &MCID = get(Opc);
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
@@ -8377,7 +8525,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
// memory access is slow above.
unsigned Opc;
- if (FoldedBCast) {
+ if (BCastType) {
Opc = getBroadcastOpcode(I, RC, Subtarget);
} else {
unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 0cb69050656109..3a1f98a005ca3a 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -643,6 +643,13 @@ class X86InstrInfo final : public X86GenInstrInfo {
MachineBasicBlock::iterator InsertPt,
unsigned Size, Align Alignment) const;
+ MachineInstr *foldMemoryBroadcast(MachineFunction &MF, MachineInstr &MI,
+ unsigned OpNum,
+ ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ unsigned BitsSize, Align Alignment,
+ bool AllowCommute) const;
+
/// isFrameOperand - Return true and the FrameIndex if the specified
/// operand and follow operands form a reference to the stack frame.
bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index c562e9d9a32808..85dd0dcd0d4daf 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -14967,14 +14967,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm0
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
; AVX512-FCP-NEXT: vpermd %zmm4, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm29
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm26
; AVX512-FCP-NEXT: vpermd %zmm5, %zmm3, %zmm4
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm20
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm22
; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm5
; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm6
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm21
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm23
; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -14985,11 +14985,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqa 672(%rdi), %xmm7
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm6
-; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm27
+; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm20
; AVX512-FCP-NEXT: vpbroadcastw 700(%rdi), %xmm7
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm30 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm28
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm28[0,1,0,2]
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm28 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,1,0,2]
; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6
@@ -15006,8 +15006,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm15
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm18
; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm19
@@ -15017,146 +15017,145 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3
; AVX512-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm11[2],ymm3[3,4,5],ymm11[6],ymm3[7]
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm2[2],ymm12[3,4,5],ymm2[6],ymm12[7]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm14[1],xmm15[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm10
+; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm7
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm13
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT: vpermd %zmm20, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm22
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpermd %zmm22, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
; AVX512-FCP-NEXT: vpor %ymm1, %ymm8, %ymm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm0
-; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512-FCP-NEXT: ...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
…tore-*.ll to suppress warnings

Suppress warnings like:

WARNING: Prefix AVX had conflicting output from different RUN lines for all functions in test vector-interleaved-store-i16-stride-7.ll
WARNING: Prefix AVX1 had conflicting output from different RUN lines for all functions in test vector-interleaved-store-i16-stride-7.ll
WARNING: Prefix AVX2 had conflicting output from different RUN lines for all functions in test vector-interleaved-store-i16-stride-7.ll
WARNING: Prefix AVX2-ONLY had conflicting output from different RUN lines for all functions in test vector-interleaved-store-i16-stride-7.ll
WARNING: Prefix AVX512 had conflicting output from different RUN lines for all functions in test vector-interleaved-store-i16-stride-7.ll
WARNING: Prefix AVX512F had conflicting output from different RUN lines for all functions in test vector-interleaved-store-i16-stride-7.ll
WARNING: Prefix AVX512F-ONLY had conflicting output from different RUN lines for all functions in test vector-interleaved-store-i16-stride-7.ll
WARNING: Prefix AVX512-FAST had conflicting output from different RUN lines for all functions in test vector-interleaved-store-i16-stride-7.ll
WARNING: Prefix AVX512DQ-ONLY had conflicting output from different RUN lines for all functions in test vector-interleaved-store-i16-stride-7.ll
…eBySize

Address RKSimon's comments in #79761
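After the rename, `lookupBroadcastFoldTable` takes an operand index (mirroring `lookupFoldTable`), while the old by-broadcast-width lookup becomes `lookupBroadcastFoldTableBySize`. A condensed sketch of how the new `foldMemoryBroadcast` hook consults the table, simplified from the diff above (the commute fallback and failure reporting are omitted):

```cpp
// Sketch only: assumes the surrounding X86InstrInfo context from the patch.
// Fold register operand OpNum of MI into a broadcast memory operand when a
// table entry exists and its broadcast width matches the load being folded.
if (const X86FoldTableEntry *I =
        lookupBroadcastFoldTable(MI.getOpcode(), OpNum)) {
  if (matchBroadcastSize(*I, BitsSize))
    return FuseInst(MF, I->DstOp, OpNum, MOs, InsertPt, MI, *this);
  return nullptr; // Entry found, but the broadcast width does not match.
}
```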
Force-pushed from 5d868c1 to bc071ff
@@ -143,6 +143,23 @@ const X86FoldTableEntry *llvm::lookupFoldTable(unsigned RegOp, unsigned OpNum) {
  return lookupFoldTableImpl(FoldTable, RegOp);
}

const X86FoldTableEntry *
llvm::lookupBroadcastFoldTable(unsigned RegOp, unsigned OpNum) {
clang-format this
Done
…ctions

To share code for folding broadcast in #79761
LGTM
Thanks @RKSimon!