Commit 3391c1a

Merge pull request llvm#471 from AMD-Lightning-Internal/amd/dev/rnimmaka/fix-merges-04022025
merge main into amd-staging
2 parents 837724e + 4430640 commit 3391c1a

27 files changed: +836 -363 lines changed

lldb/source/Target/Process.cpp

Lines changed: 2 additions & 5 deletions
@@ -6677,11 +6677,8 @@ static void GetUserSpecifiedCoreFileSaveRanges(Process &process,
 
   for (const auto &range : regions) {
     auto entry = option_ranges.FindEntryThatContains(range.GetRange());
-    if (entry) {
-      ranges.Append(range.GetRange().GetRangeBase(),
-                    range.GetRange().GetByteSize(),
-                    CreateCoreFileMemoryRange(range));
-    }
+    if (entry)
+      AddRegion(range, true, ranges);
   }
 }
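In words: the inline three-argument Append is now routed through an AddRegion helper. A standalone sketch of the shape of that refactor, with stand-in types rather than LLDB's; the bool parameter mirrors the `true` literal at the new call site, but its actual meaning is not visible in this hunk:

#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in types; not LLDB's MemoryRegionInfo/CoreFileMemoryRanges.
struct Range {
  uint64_t base, size;
};

// The repeated inline append of base/size/descriptor moves behind one
// helper, so every call site stays a single line.
static void AddRegion(const Range &region, bool include,
                      std::vector<Range> &ranges) {
  if (include)
    ranges.push_back(region); // stands in for ranges.Append(base, size, ...)
}

int main() {
  std::vector<Range> ranges;
  AddRegion({0x1000, 0x2000}, true, ranges);
  std::printf("saved %zu region(s)\n", ranges.size());
}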

lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py

Lines changed: 39 additions & 0 deletions
@@ -636,3 +636,42 @@ def minidump_saves_fs_base_region(self):
         self.assertTrue(self.dbg.DeleteTarget(target))
         if os.path.isfile(tls_file):
             os.unlink(tls_file)
+
+    @skipUnlessPlatform(["linux"])
+    @skipUnlessArch("x86_64")
+    def test_invalid_custom_regions_not_included(self):
+        options = lldb.SBSaveCoreOptions()
+        self.build()
+        exe = self.getBuildArtifact("a.out")
+        output_file = self.getBuildArtifact("no_empty_regions.dmp")
+        try:
+            target = self.dbg.CreateTarget(exe)
+            process = target.LaunchSimple(
+                None, None, self.get_process_working_directory()
+            )
+            self.assertState(process.GetState(), lldb.eStateStopped)
+            options.SetPluginName("minidump")
+            options.SetOutputFile(lldb.SBFileSpec(output_file))
+            options.SetStyle(lldb.eSaveCoreCustomOnly)
+            region_one = lldb.SBMemoryRegionInfo()
+            process.GetMemoryRegions().GetMemoryRegionAtIndex(0, region_one)
+            options.AddMemoryRegionToSave(region_one)
+            empty_region = lldb.SBMemoryRegionInfo(
+                "empty region", 0x0, 0x0, 3, True, False
+            )
+            options.AddMemoryRegionToSave(empty_region)
+            region_with_no_permissions = lldb.SBMemoryRegionInfo(
+                "no permissions", 0x2AAA, 0x2BBB, 0, True, False
+            )
+            options.AddMemoryRegionToSave(region_with_no_permissions)
+            error = process.SaveCore(options)
+            self.assertTrue(error.Success(), error.GetCString())
+            core_target = self.dbg.CreateTarget(None)
+            core_process = core_target.LoadCore(output_file)
+            self.assertNotIn(
+                region_with_no_permissions, core_process.GetMemoryRegions()
+            )
+            self.assertNotIn(empty_region, core_process.GetMemoryRegions())
+        finally:
+            if os.path.isfile(output_file):
+                os.unlink(output_file)

llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp

Lines changed: 5 additions & 6 deletions
@@ -967,21 +967,20 @@ void DWARFVerifier::verifyDebugLineStmtOffsets() {
       // here because we validate this in the .debug_info verifier.
       continue;
     }
-    auto Iter = StmtListToDie.find(LineTableOffset);
-    if (Iter != StmtListToDie.end()) {
+    auto [Iter, Inserted] = StmtListToDie.try_emplace(LineTableOffset, Die);
+    if (!Inserted) {
       ++NumDebugLineErrors;
+      const auto &OldDie = Iter->second;
       ErrorCategory.Report("Identical DW_AT_stmt_list section offset", [&]() {
         error() << "two compile unit DIEs, "
-                << format("0x%08" PRIx64, Iter->second.getOffset()) << " and "
+                << format("0x%08" PRIx64, OldDie.getOffset()) << " and "
                 << format("0x%08" PRIx64, Die.getOffset())
                 << ", have the same DW_AT_stmt_list section offset:\n";
-        dump(Iter->second);
+        dump(OldDie);
         dump(Die) << '\n';
       });
       // Already verified this line table before, no need to do it again.
-      continue;
     }
-    StmtListToDie[LineTableOffset] = Die;
   }
 }

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 14 additions & 11 deletions
@@ -4217,18 +4217,21 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
   // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
   if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
     if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
-      if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
-        SDValue BV = stripBitcast(Src.getOperand(0));
-        if (BV.getOpcode() == ISD::BUILD_VECTOR &&
-            BV.getValueType().getVectorNumElements() == 2) {
-          SDValue SrcElt = BV.getOperand(1);
-          EVT SrcEltVT = SrcElt.getValueType();
-          if (SrcEltVT.isFloatingPoint()) {
-            SrcElt = DAG.getNode(ISD::BITCAST, SL,
-                                 SrcEltVT.changeTypeToInteger(), SrcElt);
+      SDValue BV = stripBitcast(Src.getOperand(0));
+      if (BV.getOpcode() == ISD::BUILD_VECTOR) {
+        EVT SrcEltVT = BV.getOperand(0).getValueType();
+        unsigned SrcEltSize = SrcEltVT.getSizeInBits();
+        unsigned BitIndex = K->getZExtValue();
+        unsigned PartIndex = BitIndex / SrcEltSize;
+
+        if (PartIndex * SrcEltSize == BitIndex &&
+            PartIndex < BV.getNumOperands()) {
+          if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
+            SDValue SrcElt =
+                DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
+                            BV.getOperand(PartIndex));
+            return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
           }
-
-          return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
         }
       }
     }
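The rewrite generalizes the old special case (a shift by exactly half the source width, selecting the high element of a two-element build_vector) to any shift amount that lands on an element boundary of a build_vector of any width. A standalone sketch of just the index arithmetic, with an assumed helper name:

#include <cassert>
#include <cstdio>
#include <optional>

// A right-shift by BitIndex selects build_vector operand BitIndex/EltSize,
// but only when the shift lands exactly on an element boundary and the
// element exists. This mirrors the PartIndex checks in the new code.
static std::optional<unsigned> selectBuildVectorOperand(unsigned BitIndex,
                                                        unsigned EltSize,
                                                        unsigned NumElts) {
  unsigned PartIndex = BitIndex / EltSize;
  if (PartIndex * EltSize != BitIndex || PartIndex >= NumElts)
    return std::nullopt; // shift straddles elements or runs off the vector
  return PartIndex;
}

int main() {
  // v2i16 bitcast to i32, shifted right by 16 -> element 1 (the old pattern).
  assert(*selectBuildVectorOperand(16, 16, 2) == 1);
  // v4i16 shifted by 32 -> element 2 (newly handled).
  assert(*selectBuildVectorOperand(32, 16, 4) == 2);
  // A shift of 8 straddles an i16 element: no fold.
  assert(!selectBuildVectorOperand(8, 16, 4));
  std::puts("ok");
}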

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 18 additions & 41 deletions
@@ -416,8 +416,6 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
   return 1024;
 }
 
-// FIXME: Should we use narrower types for local/region, or account for when
-// unaligned access is legal?
 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
     LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
     unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -426,29 +424,12 @@
   if (AtomicElementSize)
     return Type::getIntNTy(Context, *AtomicElementSize * 8);
 
-  Align MinAlign = std::min(SrcAlign, DestAlign);
-
-  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
-  // hardware into byte accesses. If you assume all alignments are equally
-  // probable, it's more efficient on average to use short accesses for this
-  // case.
-  if (MinAlign == Align(2))
-    return Type::getInt16Ty(Context);
-
-  // Not all subtargets have 128-bit DS instructions, and we currently don't
-  // form them by default.
-  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
-      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
-    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
-  }
-
-  // Global memory works best with 16-byte accesses.
+  // 16-byte accesses achieve the highest copy throughput.
   // If the operation has a fixed known length that is large enough, it is
   // worthwhile to return an even wider type and let legalization lower it into
-  // multiple accesses, effectively unrolling the memcpy loop. Private memory
-  // also hits this, although accesses may be decomposed.
+  // multiple accesses, effectively unrolling the memcpy loop.
+  // We also rely on legalization to decompose into smaller accesses for
+  // subtargets and address spaces where it is necessary.
   //
   // Don't unroll if Length is not a constant, since unrolling leads to worse
   // performance for length values that are smaller or slightly larger than the
@@ -473,26 +454,22 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
       OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
       DestAlign, AtomicCpySize);
 
-  Align MinAlign = std::min(SrcAlign, DestAlign);
-
-  if (MinAlign != Align(2)) {
-    Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
-    while (RemainingBytes >= 16) {
-      OpsOut.push_back(I32x4Ty);
-      RemainingBytes -= 16;
-    }
+  Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
+  while (RemainingBytes >= 16) {
+    OpsOut.push_back(I32x4Ty);
+    RemainingBytes -= 16;
+  }
 
-    Type *I64Ty = Type::getInt64Ty(Context);
-    while (RemainingBytes >= 8) {
-      OpsOut.push_back(I64Ty);
-      RemainingBytes -= 8;
-    }
+  Type *I64Ty = Type::getInt64Ty(Context);
+  while (RemainingBytes >= 8) {
+    OpsOut.push_back(I64Ty);
+    RemainingBytes -= 8;
+  }
 
-    Type *I32Ty = Type::getInt32Ty(Context);
-    while (RemainingBytes >= 4) {
-      OpsOut.push_back(I32Ty);
-      RemainingBytes -= 4;
-    }
+  Type *I32Ty = Type::getInt32Ty(Context);
+  while (RemainingBytes >= 4) {
+    OpsOut.push_back(I32Ty);
+    RemainingBytes -= 4;
+  }
   }
 
   Type *I16Ty = Type::getInt16Ty(Context);
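With the Align(2) special case gone, the residual lowering is a plain greedy decomposition into 16-, 8-, 4-byte (and, per the surrounding context, smaller) chunks. A standalone sketch of that greedy loop, assuming the chunk ladder continues down through 2 and 1 bytes:

#include <cstdio>
#include <initializer_list>
#include <vector>

// Greedily cover the remaining bytes with the widest chunk that still fits;
// under-aligned wide accesses are now left for legalization to split.
static std::vector<unsigned> residualChunks(unsigned RemainingBytes) {
  std::vector<unsigned> Chunks;
  for (unsigned Size : {16u, 8u, 4u, 2u, 1u})
    while (RemainingBytes >= Size) {
      Chunks.push_back(Size);
      RemainingBytes -= Size;
    }
  return Chunks;
}

int main() {
  // 31 residual bytes -> one v4i32, one i64, one i32, one i16, one i8.
  for (unsigned Size : residualChunks(31))
    std::printf("%u ", Size); // prints: 16 8 4 2 1
  std::puts("");
}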

llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp

Lines changed: 2 additions & 2 deletions
@@ -287,10 +287,10 @@ bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
                                                 RegSeqInfo &CompatibleRSI,
                                                 std::vector<std::pair<unsigned, unsigned>> &RemapChan) {
   unsigned NeededUndefs = 4 - RSI.UndefReg.size();
-  if (PreviousRegSeqByUndefCount[NeededUndefs].empty())
-    return false;
   std::vector<MachineInstr *> &MIs =
       PreviousRegSeqByUndefCount[NeededUndefs];
+  if (MIs.empty())
+    return false;
   CompatibleRSI = PreviousRegSeq[MIs.back()];
   tryMergeVector(&CompatibleRSI, &RSI, RemapChan);
   return true;
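A small cleanup: the bucket is bound to a reference once and then tested, instead of indexing PreviousRegSeqByUndefCount for the emptiness check and again for the access. A standalone sketch, with std::map standing in for the pass's map:

#include <cstdio>
#include <map>
#include <vector>

int main() {
  std::map<unsigned, std::vector<int>> PreviousRegSeqByUndefCount;
  PreviousRegSeqByUndefCount[2] = {10, 20, 30};

  unsigned NeededUndefs = 2;
  // One lookup: bind the bucket, then test and use it through the reference.
  std::vector<int> &MIs = PreviousRegSeqByUndefCount[NeededUndefs];
  if (MIs.empty())
    return 1;
  std::printf("last candidate: %d\n", MIs.back()); // prints 30
}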

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 3 additions & 3 deletions
@@ -2944,7 +2944,7 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model CM,
 }
 
 /// Return true if the condition is an signed comparison operation.
-static bool isX86CCSigned(unsigned X86CC) {
+static bool isX86CCSigned(X86::CondCode X86CC) {
   switch (X86CC) {
   default:
     llvm_unreachable("Invalid integer condition!");
@@ -22975,7 +22975,7 @@ static bool isProfitableToUseFlagOp(SDValue Op) {
 
 /// Emit nodes that will be selected as "test Op0,Op0", or something
 /// equivalent.
-static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
+static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl,
                         SelectionDAG &DAG, const X86Subtarget &Subtarget) {
   // CF and OF aren't always set the way we want. Determine which
   // of these we need.
@@ -23085,7 +23085,7 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
 
 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
 /// equivalent.
-static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC,
                        const SDLoc &dl, SelectionDAG &DAG,
                        const X86Subtarget &Subtarget) {
   if (isNullConstant(Op1))
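All three signature changes tighten `unsigned X86CC` to `X86::CondCode`. A standalone sketch of why the enum type is preferable, with a stand-in CondCode rather than the real X86 enumeration:

#include <cstdio>

// An enum parameter documents the expected values and lets the compiler
// reject stray integers; a bare unsigned accepts anything.
namespace X86 {
enum CondCode { COND_E, COND_L, COND_B };
}

static bool isSignedCC(X86::CondCode CC) {
  switch (CC) {
  case X86::COND_L:
    return true; // signed less-than
  case X86::COND_E:
  case X86::COND_B:
    return false; // equality / unsigned below
  }
  return false;
}

int main() {
  std::printf("%d\n", isSignedCC(X86::COND_L)); // prints 1
  // isSignedCC(42);            // would now fail to compile
  // isSignedCC(X86::COND_L+1); // int no longer converts implicitly
}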

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 43 additions & 0 deletions
@@ -126,6 +126,7 @@ class VectorCombine {
   bool foldShuffleFromReductions(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
+  bool foldInterleaveIntrinsics(Instruction &I);
   bool shrinkType(Instruction &I);
 
   void replaceValue(Value &Old, Value &New) {
@@ -3204,6 +3205,47 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
   return true;
 }
 
+/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
+/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
+/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
+/// before casting it back into `<vscale x 16 x i32>`.
+bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
+  const APInt *SplatVal0, *SplatVal1;
+  if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
+                     m_APInt(SplatVal0), m_APInt(SplatVal1))))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
+                    << "\n");
+
+  auto *VTy =
+      cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
+  auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
+  unsigned Width = VTy->getElementType()->getIntegerBitWidth();
+
+  // Just in case the costs of the interleave2 intrinsic and the bitcast are
+  // both invalid, in which case we want to bail out, we use <= rather
+  // than < here. Even if they both have valid and equal costs, it's probably
+  // not a good idea to emit a high-cost constant splat.
+  if (TTI.getInstructionCost(&I, CostKind) <=
+      TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
+                           TTI::CastContextHint::None, CostKind)) {
+    LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
+                      << *I.getType() << " is too high.\n");
+    return false;
+  }
+
+  APInt NewSplatVal = SplatVal1->zext(Width * 2);
+  NewSplatVal <<= Width;
+  NewSplatVal |= SplatVal0->zext(Width * 2);
+  auto *NewSplat = ConstantVector::getSplat(
+      ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
+
+  IRBuilder<> Builder(&I);
+  replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
+  return true;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@@ -3248,6 +3290,7 @@ bool VectorCombine::run() {
     MadeChange |= scalarizeBinopOrCmp(I);
     MadeChange |= scalarizeLoadExtract(I);
     MadeChange |= scalarizeVPIntrinsic(I);
+    MadeChange |= foldInterleaveIntrinsics(I);
   }
 
   if (Opcode == Instruction::Store)
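The arithmetic in the new fold packs the two element splats into one double-width splat. A standalone sketch of that packing at a fixed 32-bit width, with uint64_t standing in for APInt (lane order shown for a little-endian layout):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Interleaving splats of Lo and Hi (element width W) equals one splat of the
// 2W-bit value (Hi << W) | Lo, reinterpreted as twice as many W-bit lanes.
static uint64_t mergeSplats(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  // The doc comment's example: splats of 666 and 777.
  uint64_t Merged = mergeSplats(666, 777);
  assert((Merged >> 32) == 777 && (Merged & 0xffffffffu) == 666);

  // Each 64-bit lane, read back as two 32-bit lanes, yields 666 then 777.
  uint32_t Lanes[2];
  std::memcpy(Lanes, &Merged, sizeof(Merged));
  std::printf("%u %u\n", Lanes[0], Lanes[1]); // 666 777 on little-endian
}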
