Skip to content

Commit d53d133

Browse files
SC llvm team
authored and committed
Merged main:077e0c134a31 into amd-gfx:895bae00ec58
Local branch amd-gfx 895bae0 Merged main:93fcef3048b4 into amd-gfx:78b534575bbe Remote branch main 077e0c1 AMDGPU: Generalize truncate of shift of cast build_vector combine (llvm#125617)
2 parents 895bae0 + 077e0c1 commit d53d133

File tree

17 files changed

+452
-102
lines changed

17 files changed

+452
-102
lines changed

lldb/source/Target/Process.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6677,11 +6677,8 @@ static void GetUserSpecifiedCoreFileSaveRanges(Process &process,
66776677

66786678
for (const auto &range : regions) {
66796679
auto entry = option_ranges.FindEntryThatContains(range.GetRange());
6680-
if (entry) {
6681-
ranges.Append(range.GetRange().GetRangeBase(),
6682-
range.GetRange().GetByteSize(),
6683-
CreateCoreFileMemoryRange(range));
6684-
}
6680+
if (entry)
6681+
AddRegion(range, true, ranges);
66856682
}
66866683
}
66876684

lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -636,3 +636,42 @@ def minidump_saves_fs_base_region(self):
636636
self.assertTrue(self.dbg.DeleteTarget(target))
637637
if os.path.isfile(tls_file):
638638
os.unlink(tls_file)
639+
640+
@skipUnlessPlatform(["linux"])
641+
@skipUnlessArch("x86_64")
642+
def test_invalid_custom_regions_not_included(self):
643+
options = lldb.SBSaveCoreOptions()
644+
self.build()
645+
exe = self.getBuildArtifact("a.out")
646+
output_file = self.getBuildArtifact("no_empty_regions.dmp")
647+
try:
648+
target = self.dbg.CreateTarget(exe)
649+
process = target.LaunchSimple(
650+
None, None, self.get_process_working_directory()
651+
)
652+
self.assertState(process.GetState(), lldb.eStateStopped)
653+
options.SetPluginName("minidump")
654+
options.SetOutputFile(lldb.SBFileSpec(output_file))
655+
options.SetStyle(lldb.eSaveCoreCustomOnly)
656+
region_one = lldb.SBMemoryRegionInfo()
657+
process.GetMemoryRegions().GetMemoryRegionAtIndex(0, region_one)
658+
options.AddMemoryRegionToSave(region_one)
659+
empty_region = lldb.SBMemoryRegionInfo(
660+
"empty region", 0x0, 0x0, 3, True, False
661+
)
662+
options.AddMemoryRegionToSave(empty_region)
663+
region_with_no_permissions = lldb.SBMemoryRegionInfo(
664+
"no permissions", 0x2AAA, 0x2BBB, 0, True, False
665+
)
666+
options.AddMemoryRegionToSave(region_with_no_permissions)
667+
error = process.SaveCore(options)
668+
self.assertTrue(error.Success(), error.GetCString())
669+
core_target = self.dbg.CreateTarget(None)
670+
core_process = core_target.LoadCore(output_file)
671+
self.assertNotIn(
672+
region_with_no_permissions, core_process.GetMemoryRegions()
673+
)
674+
self.assertNotIn(empty_region, core_process.GetMemoryRegions())
675+
finally:
676+
if os.path.isfile(output_file):
677+
os.unlink(output_file)

llvm/include/llvm/Config/llvm-config.h.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
/* Indicate that this is LLVM compiled from the amd-gfx branch. */
1818
#define LLVM_HAVE_BRANCH_AMD_GFX
19-
#define LLVM_MAIN_REVISION 526134
19+
#define LLVM_MAIN_REVISION 526143
2020

2121
/* Define if LLVM_ENABLE_DUMP is enabled */
2222
#cmakedefine LLVM_ENABLE_DUMP

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4217,18 +4217,21 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
42174217
// trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
42184218
if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
42194219
if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4220-
if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4221-
SDValue BV = stripBitcast(Src.getOperand(0));
4222-
if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4223-
BV.getValueType().getVectorNumElements() == 2) {
4224-
SDValue SrcElt = BV.getOperand(1);
4225-
EVT SrcEltVT = SrcElt.getValueType();
4226-
if (SrcEltVT.isFloatingPoint()) {
4227-
SrcElt = DAG.getNode(ISD::BITCAST, SL,
4228-
SrcEltVT.changeTypeToInteger(), SrcElt);
4220+
SDValue BV = stripBitcast(Src.getOperand(0));
4221+
if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4222+
EVT SrcEltVT = BV.getOperand(0).getValueType();
4223+
unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4224+
unsigned BitIndex = K->getZExtValue();
4225+
unsigned PartIndex = BitIndex / SrcEltSize;
4226+
4227+
if (PartIndex * SrcEltSize == BitIndex &&
4228+
PartIndex < BV.getNumOperands()) {
4229+
if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4230+
SDValue SrcElt =
4231+
DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4232+
BV.getOperand(PartIndex));
4233+
return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
42294234
}
4230-
4231-
return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
42324235
}
42334236
}
42344237
}

llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4298,6 +4298,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
42984298
case Intrinsic::abs:
42994299
handleAbsIntrinsic(I);
43004300
break;
4301+
case Intrinsic::bitreverse:
4302+
handleIntrinsicByApplyingToShadow(I, I.getIntrinsicID(),
4303+
/*trailingVerbatimArgs*/ 0);
4304+
break;
43014305
case Intrinsic::is_fpclass:
43024306
handleIsFpClass(I);
43034307
break;

llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,15 @@ LegalityAnalysis::notVectorizableBasedOnOpcodesAndTypes(
116116
return std::nullopt;
117117
return ResultReason::DiffOpcodes;
118118
}
119-
case Instruction::Opcode::Select:
119+
case Instruction::Opcode::Select: {
120+
auto *Sel0 = cast<SelectInst>(Bndl[0]);
121+
auto *Cond0 = Sel0->getCondition();
122+
if (VecUtils::getNumLanes(Cond0) != VecUtils::getNumLanes(Sel0))
123+
// TODO: For now we don't vectorize if the lanes in the condition don't
124+
// match those of the select instruction.
125+
return ResultReason::Unimplemented;
126+
return std::nullopt;
127+
}
120128
case Instruction::Opcode::FNeg:
121129
case Instruction::Opcode::Add:
122130
case Instruction::Opcode::FAdd:

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ class VectorCombine {
126126
bool foldShuffleFromReductions(Instruction &I);
127127
bool foldCastFromReductions(Instruction &I);
128128
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
129+
bool foldInterleaveIntrinsics(Instruction &I);
129130
bool shrinkType(Instruction &I);
130131

131132
void replaceValue(Value &Old, Value &New) {
@@ -3204,6 +3205,47 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
32043205
return true;
32053206
}
32063207

3208+
/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
3209+
/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
3210+
/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
3211+
/// before casting it back into `<vscale x 16 x i32>`.
3212+
bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
3213+
const APInt *SplatVal0, *SplatVal1;
3214+
if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
3215+
m_APInt(SplatVal0), m_APInt(SplatVal1))))
3216+
return false;
3217+
3218+
LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
3219+
<< "\n");
3220+
3221+
auto *VTy =
3222+
cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
3223+
auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
3224+
unsigned Width = VTy->getElementType()->getIntegerBitWidth();
3225+
3226+
// Just in case the cost of interleave2 intrinsic and bitcast are both
3227+
// invalid, in which case we want to bail out, we use <= rather
3228+
// than < here. Even they both have valid and equal costs, it's probably
3229+
// not a good idea to emit a high-cost constant splat.
3230+
if (TTI.getInstructionCost(&I, CostKind) <=
3231+
TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
3232+
TTI::CastContextHint::None, CostKind)) {
3233+
LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
3234+
<< *I.getType() << " is too high.\n");
3235+
return false;
3236+
}
3237+
3238+
APInt NewSplatVal = SplatVal1->zext(Width * 2);
3239+
NewSplatVal <<= Width;
3240+
NewSplatVal |= SplatVal0->zext(Width * 2);
3241+
auto *NewSplat = ConstantVector::getSplat(
3242+
ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
3243+
3244+
IRBuilder<> Builder(&I);
3245+
replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
3246+
return true;
3247+
}
3248+
32073249
/// This is the entry point for all transforms. Pass manager differences are
32083250
/// handled in the callers of this function.
32093251
bool VectorCombine::run() {
@@ -3248,6 +3290,7 @@ bool VectorCombine::run() {
32483290
MadeChange |= scalarizeBinopOrCmp(I);
32493291
MadeChange |= scalarizeLoadExtract(I);
32503292
MadeChange |= scalarizeVPIntrinsic(I);
3293+
MadeChange |= foldInterleaveIntrinsics(I);
32513294
}
32523295

32533296
if (Opcode == Instruction::Store)
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
3+
4+
; extract element 0 as shift
5+
define i32 @cast_v4i32_to_i128_trunc_i32(<4 x i32> %arg) {
6+
; CHECK-LABEL: cast_v4i32_to_i128_trunc_i32:
7+
; CHECK: ; %bb.0:
8+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9+
; CHECK-NEXT: s_setpc_b64 s[30:31]
10+
%bigint = bitcast <4 x i32> %arg to i128
11+
%trunc = trunc i128 %bigint to i32
12+
ret i32 %trunc
13+
}
14+
15+
; extract element 1 as shift
16+
define i32 @cast_v4i32_to_i128_lshr_32_trunc_i32(<4 x i32> %arg) {
17+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_32_trunc_i32:
18+
; CHECK: ; %bb.0:
19+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20+
; CHECK-NEXT: v_mov_b32_e32 v0, v1
21+
; CHECK-NEXT: s_setpc_b64 s[30:31]
22+
%bigint = bitcast <4 x i32> %arg to i128
23+
%srl = lshr i128 %bigint, 32
24+
%trunc = trunc i128 %srl to i32
25+
ret i32 %trunc
26+
}
27+
28+
; extract element 2 as shift
29+
define i32 @cast_v4i32_to_i128_lshr_64_trunc_i32(<4 x i32> %arg) {
30+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_64_trunc_i32:
31+
; CHECK: ; %bb.0:
32+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33+
; CHECK-NEXT: v_mov_b32_e32 v0, v2
34+
; CHECK-NEXT: s_setpc_b64 s[30:31]
35+
%bigint = bitcast <4 x i32> %arg to i128
36+
%srl = lshr i128 %bigint, 64
37+
%trunc = trunc i128 %srl to i32
38+
ret i32 %trunc
39+
}
40+
41+
; extract element 3 as shift
42+
define i32 @cast_v4i32_to_i128_lshr_96_trunc_i32(<4 x i32> %arg) {
43+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_96_trunc_i32:
44+
; CHECK: ; %bb.0:
45+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46+
; CHECK-NEXT: v_mov_b32_e32 v0, v3
47+
; CHECK-NEXT: s_setpc_b64 s[30:31]
48+
%bigint = bitcast <4 x i32> %arg to i128
49+
%srl = lshr i128 %bigint, 96
50+
%trunc = trunc i128 %srl to i32
51+
ret i32 %trunc
52+
}
53+
54+
; Shift not aligned to element, not a simple extract
55+
define i32 @cast_v4i32_to_i128_lshr_33_trunc_i32(<4 x i32> %arg) {
56+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_33_trunc_i32:
57+
; CHECK: ; %bb.0:
58+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
59+
; CHECK-NEXT: v_alignbit_b32 v0, v2, v1, 1
60+
; CHECK-NEXT: s_setpc_b64 s[30:31]
61+
%bigint = bitcast <4 x i32> %arg to i128
62+
%srl = lshr i128 %bigint, 33
63+
%trunc = trunc i128 %srl to i32
64+
ret i32 %trunc
65+
}
66+
67+
; extract misaligned element
68+
define i32 @cast_v4i32_to_i128_lshr_31_trunc_i32(<4 x i32> %arg) {
69+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_31_trunc_i32:
70+
; CHECK: ; %bb.0:
71+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
72+
; CHECK-NEXT: v_alignbit_b32 v0, v1, v0, 31
73+
; CHECK-NEXT: s_setpc_b64 s[30:31]
74+
%bigint = bitcast <4 x i32> %arg to i128
75+
%srl = lshr i128 %bigint, 31
76+
%trunc = trunc i128 %srl to i32
77+
ret i32 %trunc
78+
}
79+
80+
; extract misaligned element
81+
define i32 @cast_v4i32_to_i128_lshr_48_trunc_i32(<4 x i32> %arg) {
82+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_48_trunc_i32:
83+
; CHECK: ; %bb.0:
84+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
85+
; CHECK-NEXT: s_mov_b32 s4, 0x1000706
86+
; CHECK-NEXT: v_perm_b32 v0, v1, v2, s4
87+
; CHECK-NEXT: s_setpc_b64 s[30:31]
88+
%bigint = bitcast <4 x i32> %arg to i128
89+
%srl = lshr i128 %bigint, 48
90+
%trunc = trunc i128 %srl to i32
91+
ret i32 %trunc
92+
}
93+
94+
; extract elements 1 and 2 with shift
95+
define i64 @cast_v4i32_to_i128_lshr_32_trunc_i64(<4 x i32> %arg) {
96+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_32_trunc_i64:
97+
; CHECK: ; %bb.0:
98+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
99+
; CHECK-NEXT: v_mov_b32_e32 v0, v1
100+
; CHECK-NEXT: v_mov_b32_e32 v1, v2
101+
; CHECK-NEXT: s_setpc_b64 s[30:31]
102+
%bigint = bitcast <4 x i32> %arg to i128
103+
%srl = lshr i128 %bigint, 32
104+
%trunc = trunc i128 %srl to i64
105+
ret i64 %trunc
106+
}
107+
108+
; extract elements 2 and 3 with shift
109+
define i64 @cast_v4i32_to_i128_lshr_64_trunc_i64(<4 x i32> %arg) {
110+
; CHECK-LABEL: cast_v4i32_to_i128_lshr_64_trunc_i64:
111+
; CHECK: ; %bb.0:
112+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113+
; CHECK-NEXT: v_mov_b32_e32 v1, v3
114+
; CHECK-NEXT: v_mov_b32_e32 v0, v2
115+
; CHECK-NEXT: s_setpc_b64 s[30:31]
116+
%bigint = bitcast <4 x i32> %arg to i128
117+
%srl = lshr i128 %bigint, 64
118+
%trunc = trunc i128 %srl to i64
119+
ret i64 %trunc
120+
}
121+
122+
; FIXME: We don't process this case because we see multiple bitcasts
123+
; before a 32-bit build_vector
124+
define i32 @build_vector_i16_to_shift(i16 %arg0, i16 %arg1, i16 %arg2, i16 %arg3) {
125+
; CHECK-LABEL: build_vector_i16_to_shift:
126+
; CHECK: ; %bb.0:
127+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128+
; CHECK-NEXT: s_mov_b32 s4, 0x5040100
129+
; CHECK-NEXT: v_perm_b32 v0, v3, v2, s4
130+
; CHECK-NEXT: s_setpc_b64 s[30:31]
131+
%ins.0 = insertelement <4 x i16> poison, i16 %arg0, i32 0
132+
%ins.1 = insertelement <4 x i16> %ins.0, i16 %arg1, i32 1
133+
%ins.2 = insertelement <4 x i16> %ins.1, i16 %arg2, i32 2
134+
%ins.3 = insertelement <4 x i16> %ins.2, i16 %arg3, i32 3
135+
136+
%cast = bitcast <4 x i16> %ins.3 to i64
137+
%srl = lshr i64 %cast, 32
138+
%trunc = trunc i64 %srl to i32
139+
ret i32 %trunc
140+
}

0 commit comments

Comments
 (0)