Skip to content

Commit d53d133

Browse files
SC llvm team
authored and committed
Merged main:077e0c134a31 into amd-gfx:895bae00ec58
Local branch amd-gfx 895bae0 Merged main:93fcef3048b4 into amd-gfx:78b534575bbe Remote branch main 077e0c1 AMDGPU: Generalize truncate of shift of cast build_vector combine (llvm#125617)
2 parents 895bae0 + 077e0c1 commit d53d133

File tree

17 files changed

+452
-102
lines changed

17 files changed

+452
-102
lines changed

lldb/source/Target/Process.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6677,11 +6677,8 @@ static void GetUserSpecifiedCoreFileSaveRanges(Process &process,
66776677

66786678
for (const auto &range : regions) {
66796679
auto entry = option_ranges.FindEntryThatContains(range.GetRange());
6680-
if (entry) {
6681-
ranges.Append(range.GetRange().GetRangeBase(),
6682-
range.GetRange().GetByteSize(),
6683-
CreateCoreFileMemoryRange(range));
6684-
}
6680+
if (entry)
6681+
AddRegion(range, true, ranges);
66856682
}
66866683
}
66876684

lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -636,3 +636,42 @@ def minidump_saves_fs_base_region(self):
636636
self.assertTrue(self.dbg.DeleteTarget(target))
637637
if os.path.isfile(tls_file):
638638
os.unlink(tls_file)
639+
640+
@skipUnlessPlatform(["linux"])
641+
@skipUnlessArch("x86_64")
642+
def test_invalid_custom_regions_not_included(self):
643+
options = lldb.SBSaveCoreOptions()
644+
self.build()
645+
exe = self.getBuildArtifact("a.out")
646+
output_file = self.getBuildArtifact("no_empty_regions.dmp")
647+
try:
648+
target = self.dbg.CreateTarget(exe)
649+
process = target.LaunchSimple(
650+
None, None, self.get_process_working_directory()
651+
)
652+
self.assertState(process.GetState(), lldb.eStateStopped)
653+
options.SetPluginName("minidump")
654+
options.SetOutputFile(lldb.SBFileSpec(output_file))
655+
options.SetStyle(lldb.eSaveCoreCustomOnly)
656+
region_one = lldb.SBMemoryRegionInfo()
657+
process.GetMemoryRegions().GetMemoryRegionAtIndex(0, region_one)
658+
options.AddMemoryRegionToSave(region_one)
659+
empty_region = lldb.SBMemoryRegionInfo(
660+
"empty region", 0x0, 0x0, 3, True, False
661+
)
662+
options.AddMemoryRegionToSave(empty_region)
663+
region_with_no_permissions = lldb.SBMemoryRegionInfo(
664+
"no permissions", 0x2AAA, 0x2BBB, 0, True, False
665+
)
666+
options.AddMemoryRegionToSave(region_with_no_permissions)
667+
error = process.SaveCore(options)
668+
self.assertTrue(error.Success(), error.GetCString())
669+
core_target = self.dbg.CreateTarget(None)
670+
core_process = core_target.LoadCore(output_file)
671+
self.assertNotIn(
672+
region_with_no_permissions, core_process.GetMemoryRegions()
673+
)
674+
self.assertNotIn(empty_region, core_process.GetMemoryRegions())
675+
finally:
676+
if os.path.isfile(output_file):
677+
os.unlink(output_file)

llvm/include/llvm/Config/llvm-config.h.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
/* Indicate that this is LLVM compiled from the amd-gfx branch. */
1818
#define LLVM_HAVE_BRANCH_AMD_GFX
19-
#define LLVM_MAIN_REVISION 526134
19+
#define LLVM_MAIN_REVISION 526143
2020

2121
/* Define if LLVM_ENABLE_DUMP is enabled */
2222
#cmakedefine LLVM_ENABLE_DUMP

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4217,18 +4217,21 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
42174217
// trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
42184218
if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
42194219
if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4220-
if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4221-
SDValue BV = stripBitcast(Src.getOperand(0));
4222-
if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4223-
BV.getValueType().getVectorNumElements() == 2) {
4224-
SDValue SrcElt = BV.getOperand(1);
4225-
EVT SrcEltVT = SrcElt.getValueType();
4226-
if (SrcEltVT.isFloatingPoint()) {
4227-
SrcElt = DAG.getNode(ISD::BITCAST, SL,
4228-
SrcEltVT.changeTypeToInteger(), SrcElt);
4220+
SDValue BV = stripBitcast(Src.getOperand(0));
4221+
if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4222+
EVT SrcEltVT = BV.getOperand(0).getValueType();
4223+
unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4224+
unsigned BitIndex = K->getZExtValue();
4225+
unsigned PartIndex = BitIndex / SrcEltSize;
4226+
4227+
if (PartIndex * SrcEltSize == BitIndex &&
4228+
PartIndex < BV.getNumOperands()) {
4229+
if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4230+
SDValue SrcElt =
4231+
DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4232+
BV.getOperand(PartIndex));
4233+
return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
42294234
}
4230-
4231-
return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
42324235
}
42334236
}
42344237
}

llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4298,6 +4298,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
42984298
case Intrinsic::abs:
42994299
handleAbsIntrinsic(I);
43004300
break;
4301+
case Intrinsic::bitreverse:
4302+
handleIntrinsicByApplyingToShadow(I, I.getIntrinsicID(),
4303+
/*trailingVerbatimArgs*/ 0);
4304+
break;
43014305
case Intrinsic::is_fpclass:
43024306
handleIsFpClass(I);
43034307
break;

llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,15 @@ LegalityAnalysis::notVectorizableBasedOnOpcodesAndTypes(
116116
return std::nullopt;
117117
return ResultReason::DiffOpcodes;
118118
}
119-
case Instruction::Opcode::Select:
119+
case Instruction::Opcode::Select: {
120+
auto *Sel0 = cast<SelectInst>(Bndl[0]);
121+
auto *Cond0 = Sel0->getCondition();
122+
if (VecUtils::getNumLanes(Cond0) != VecUtils::getNumLanes(Sel0))
123+
// TODO: For now we don't vectorize if the lanes in the condition don't
124+
// match those of the select instruction.
125+
return ResultReason::Unimplemented;
126+
return std::nullopt;
127+
}
120128
case Instruction::Opcode::FNeg:
121129
case Instruction::Opcode::Add:
122130
case Instruction::Opcode::FAdd:

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ class VectorCombine {
126126
bool foldShuffleFromReductions(Instruction &I);
127127
bool foldCastFromReductions(Instruction &I);
128128
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
129+
bool foldInterleaveIntrinsics(Instruction &I);
129130
bool shrinkType(Instruction &I);
130131

131132
void replaceValue(Value &Old, Value &New) {
@@ -3204,6 +3205,47 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
32043205
return true;
32053206
}
32063207

3208+
/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
3209+
/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
3210+
/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
3211+
/// before casting it back into `<vscale x 16 x i32>`.
3212+
bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
3213+
const APInt *SplatVal0, *SplatVal1;
3214+
if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
3215+
m_APInt(SplatVal0), m_APInt(SplatVal1))))
3216+
return false;
3217+
3218+
LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
3219+
<< "\n");
3220+
3221+
auto *VTy =
3222+
cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
3223+
auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
3224+
unsigned Width = VTy->getElementType()->getIntegerBitWidth();
3225+
3226+
// Just in case the cost of interleave2 intrinsic and bitcast are both
3227+
// invalid, in which case we want to bail out, we use <= rather
3228+
// than < here. Even they both have valid and equal costs, it's probably
3229+
// not a good idea to emit a high-cost constant splat.
3230+
if (TTI.getInstructionCost(&I, CostKind) <=
3231+
TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
3232+
TTI::CastContextHint::None, CostKind)) {
3233+
LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
3234+
<< *I.getType() << " is too high.\n");
3235+
return false;
3236+
}
3237+
3238+
APInt NewSplatVal = SplatVal1->zext(Width * 2);
3239+
NewSplatVal <<= Width;
3240+
NewSplatVal |= SplatVal0->zext(Width * 2);
3241+
auto *NewSplat = ConstantVector::getSplat(
3242+
ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
3243+
3244+
IRBuilder<> Builder(&I);
3245+
replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
3246+
return true;
3247+
}
3248+
32073249
/// This is the entry point for all transforms. Pass manager differences are
32083250
/// handled in the callers of this function.
32093251
bool VectorCombine::run() {
@@ -3248,6 +3290,7 @@ bool VectorCombine::run() {
32483290
MadeChange |= scalarizeBinopOrCmp(I);
32493291
MadeChange |= scalarizeLoadExtract(I);
32503292
MadeChange |= scalarizeVPIntrinsic(I);
3293+
MadeChange |= foldInterleaveIntrinsics(I);
32513294
}
32523295

32533296
if (Opcode == Instruction::Store)
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
3+
4+
; extract element 0 as shift
5+
define i32 @cast_v4i32_to_i128_trunc_i32(<4 x i32> %arg) {
6+
; CHECK-LABEL: cast_v4i32_to_i128_trunc_i32:
7+
; CHECK: ; %bb.0:
8+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9+
; CHECK-NEXT: s_setpc_b64 s[30:31]
10+
%bigint = bitcast <4 x i32> %arg to i128
11+
%trunc = trunc i128 %bigint to i32
12+
ret i32 %trunc
13+
}
14+
15+
; extract element 1 as shift
16+
define i32 @cast_v4i32_to_i128_lshr_32_trunc_i32(<4 x i32> %arg) {
17+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_32_trunc_i32:
18+
; CHECK: ; %bb.0:
19+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20+
; CHECK-NEXT: v_mov_b32_e32 v0, v1
21+
; CHECK-NEXT: s_setpc_b64 s[30:31]
22+
%bigint = bitcast <4 x i32> %arg to i128
23+
%srl = lshr i128 %bigint, 32
24+
%trunc = trunc i128 %srl to i32
25+
ret i32 %trunc
26+
}
27+
28+
; extract element 2 as shift
29+
define i32 @cast_v4i32_to_i128_lshr_64_trunc_i32(<4 x i32> %arg) {
30+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_64_trunc_i32:
31+
; CHECK: ; %bb.0:
32+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33+
; CHECK-NEXT: v_mov_b32_e32 v0, v2
34+
; CHECK-NEXT: s_setpc_b64 s[30:31]
35+
%bigint = bitcast <4 x i32> %arg to i128
36+
%srl = lshr i128 %bigint, 64
37+
%trunc = trunc i128 %srl to i32
38+
ret i32 %trunc
39+
}
40+
41+
; extract element 3 as shift
42+
define i32 @cast_v4i32_to_i128_lshr_96_trunc_i32(<4 x i32> %arg) {
43+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_96_trunc_i32:
44+
; CHECK: ; %bb.0:
45+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46+
; CHECK-NEXT: v_mov_b32_e32 v0, v3
47+
; CHECK-NEXT: s_setpc_b64 s[30:31]
48+
%bigint = bitcast <4 x i32> %arg to i128
49+
%srl = lshr i128 %bigint, 96
50+
%trunc = trunc i128 %srl to i32
51+
ret i32 %trunc
52+
}
53+
54+
; Shift not aligned to element, not a simple extract
55+
define i32 @cast_v4i32_to_i128_lshr_33_trunc_i32(<4 x i32> %arg) {
56+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_33_trunc_i32:
57+
; CHECK: ; %bb.0:
58+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
59+
; CHECK-NEXT: v_alignbit_b32 v0, v2, v1, 1
60+
; CHECK-NEXT: s_setpc_b64 s[30:31]
61+
%bigint = bitcast <4 x i32> %arg to i128
62+
%srl = lshr i128 %bigint, 33
63+
%trunc = trunc i128 %srl to i32
64+
ret i32 %trunc
65+
}
66+
67+
; extract misaligned element
68+
define i32 @cast_v4i32_to_i128_lshr_31_trunc_i32(<4 x i32> %arg) {
69+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_31_trunc_i32:
70+
; CHECK: ; %bb.0:
71+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
72+
; CHECK-NEXT: v_alignbit_b32 v0, v1, v0, 31
73+
; CHECK-NEXT: s_setpc_b64 s[30:31]
74+
%bigint = bitcast <4 x i32> %arg to i128
75+
%srl = lshr i128 %bigint, 31
76+
%trunc = trunc i128 %srl to i32
77+
ret i32 %trunc
78+
}
79+
80+
; extract misaligned element
81+
define i32 @cast_v4i32_to_i128_lshr_48_trunc_i32(<4 x i32> %arg) {
82+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_48_trunc_i32:
83+
; CHECK: ; %bb.0:
84+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
85+
; CHECK-NEXT: s_mov_b32 s4, 0x1000706
86+
; CHECK-NEXT: v_perm_b32 v0, v1, v2, s4
87+
; CHECK-NEXT: s_setpc_b64 s[30:31]
88+
%bigint = bitcast <4 x i32> %arg to i128
89+
%srl = lshr i128 %bigint, 48
90+
%trunc = trunc i128 %srl to i32
91+
ret i32 %trunc
92+
}
93+
94+
; extract elements 1 and 2 with shift
95+
define i64 @cast_v4i32_to_i128_lshr_32_trunc_i64(<4 x i32> %arg) {
96+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_32_trunc_i64:
97+
; CHECK: ; %bb.0:
98+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
99+
; CHECK-NEXT: v_mov_b32_e32 v0, v1
100+
; CHECK-NEXT: v_mov_b32_e32 v1, v2
101+
; CHECK-NEXT: s_setpc_b64 s[30:31]
102+
%bigint = bitcast <4 x i32> %arg to i128
103+
%srl = lshr i128 %bigint, 32
104+
%trunc = trunc i128 %srl to i64
105+
ret i64 %trunc
106+
}
107+
108+
; extract elements 2 and 3 with shift
109+
define i64 @cast_v4i32_to_i128_lshr_64_trunc_i64(<4 x i32> %arg) {
110+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_64_trunc_i64:
111+
; CHECK: ; %bb.0:
112+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113+
; CHECK-NEXT: v_mov_b32_e32 v1, v3
114+
; CHECK-NEXT: v_mov_b32_e32 v0, v2
115+
; CHECK-NEXT: s_setpc_b64 s[30:31]
116+
%bigint = bitcast <4 x i32> %arg to i128
117+
%srl = lshr i128 %bigint, 64
118+
%trunc = trunc i128 %srl to i64
119+
ret i64 %trunc
120+
}
121+
122+
; FIXME: We don't process this case because we see multiple bitcasts
123+
; before a 32-bit build_vector
124+
define i32 @build_vector_i16_to_shift(i16 %arg0, i16 %arg1, i16 %arg2, i16 %arg3) {
125+
; CHECK-LABEL: build_vector_i16_to_shift:
126+
; CHECK: ; %bb.0:
127+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128+
; CHECK-NEXT: s_mov_b32 s4, 0x5040100
129+
; CHECK-NEXT: v_perm_b32 v0, v3, v2, s4
130+
; CHECK-NEXT: s_setpc_b64 s[30:31]
131+
%ins.0 = insertelement <4 x i16> poison, i16 %arg0, i32 0
132+
%ins.1 = insertelement <4 x i16> %ins.0, i16 %arg1, i32 1
133+
%ins.2 = insertelement <4 x i16> %ins.1, i16 %arg2, i32 2
134+
%ins.3 = insertelement <4 x i16> %ins.2, i16 %arg3, i32 3
135+
136+
%cast = bitcast <4 x i16> %ins.3 to i64
137+
%srl = lshr i64 %cast, 32
138+
%trunc = trunc i64 %srl to i32
139+
ret i32 %trunc
140+
}

0 commit comments

Comments
 (0)