GPUOpen-Drivers
diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DYLDRendezvous.cpp b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DYLDRendezvous.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/lldb/source/Target/TargetList.cpp b/lldb/source/Target/TargetList.cpp
Lines changed: 3 additions & 3 deletions
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
Lines changed: 1 addition & 1 deletion
diff --git a/llvm/include/llvm/Support/SwapByteOrder.h b/llvm/include/llvm/Support/SwapByteOrder.h
Lines changed: 2 additions & 14 deletions
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp
Lines changed: 1 addition & 3 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
Lines changed: 9 additions & 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Lines changed: 13 additions & 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
Lines changed: 11 additions & 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Lines changed: 10 additions & 4 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
Lines changed: 16 additions & 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
Lines changed: 5 additions & 3 deletions
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
Lines changed: 5 additions & 5 deletions
@@ -482,7 +482,7 @@ bool DYLDRendezvous::RemoveSOEntriesFromRemote(
 
     // Only add shared libraries and not the executable.
     if (!SOEntryIsMainExecutable(entry)) {
-      auto pos = std::find(m_soentries.begin(), m_soentries.end(), entry);
+      auto pos = llvm::find(m_soentries, entry);
       if (pos == m_soentries.end())
         return false;
 
 
@@ -370,7 +370,7 @@ Status TargetList::CreateTargetInternal(Debugger &debugger,
 
 bool TargetList::DeleteTarget(TargetSP &target_sp) {
   std::lock_guard<std::recursive_mutex> guard(m_target_list_mutex);
-  auto it = std::find(m_target_list.begin(), m_target_list.end(), target_sp);
+  auto it = llvm::find(m_target_list, target_sp);
   if (it == m_target_list.end())
     return false;
 
@@ -506,7 +506,7 @@ lldb::TargetSP TargetList::GetTargetAtIndex(uint32_t idx) const {
 
 uint32_t TargetList::GetIndexOfTarget(lldb::TargetSP target_sp) const {
   std::lock_guard<std::recursive_mutex> guard(m_target_list_mutex);
-  auto it = std::find(m_target_list.begin(), m_target_list.end(), target_sp);
+  auto it = llvm::find(m_target_list, target_sp);
   if (it != m_target_list.end())
     return std::distance(m_target_list.begin(), it);
   return UINT32_MAX;
@@ -533,7 +533,7 @@ void TargetList::SetSelectedTarget(uint32_t index) {
 
 void TargetList::SetSelectedTarget(const TargetSP &target_sp) {
   std::lock_guard<std::recursive_mutex> guard(m_target_list_mutex);
-  auto it = std::find(m_target_list.begin(), m_target_list.end(), target_sp);
+  auto it = llvm::find(m_target_list, target_sp); // NOTE(review): if target_sp is not in the list, it == end() and the distance computed below equals the list size - confirm SetSelectedTargetInternal accepts that.
   SetSelectedTargetInternal(std::distance(m_target_list.begin(), it));
 }
 
 
@@ -16,7 +16,7 @@
 
 /* Indicate that this is LLVM compiled from the amd-gfx branch. */
 #define LLVM_HAVE_BRANCH_AMD_GFX
-#define LLVM_MAIN_REVISION 475741
+#define LLVM_MAIN_REVISION 475744
 
 /* Define if LLVM_ENABLE_DUMP is enabled */
 #cmakedefine LLVM_ENABLE_DUMP
 
@@ -73,23 +73,11 @@ inline unsigned long long getSwappedBytes(unsigned long long C) { return llvm::b
 inline   signed long long getSwappedBytes(  signed long long C) { return llvm::byteswap(C); }
 
 inline float getSwappedBytes(float C) {
-  union {
-    uint32_t i;
-    float f;
-  } in, out;
-  in.f = C;
-  out.i = llvm::byteswap(in.i);
-  return out.f;
+  return llvm::bit_cast<float>(llvm::byteswap(llvm::bit_cast<uint32_t>(C))); // Reinterpret C as uint32_t, byte-swap that integer, then reinterpret the result as float.
 }
 
 inline double getSwappedBytes(double C) {
-  union {
-    uint64_t i;
-    double d;
-  } in, out;
-  in.d = C;
-  out.i = llvm::byteswap(in.i);
-  return out.d;
+  return llvm::bit_cast<double>(llvm::byteswap(llvm::bit_cast<uint64_t>(C))); // Reinterpret C as uint64_t, byte-swap that integer, then reinterpret the result as double.
 }
 
 template <typename T>
 
@@ -194,9 +194,7 @@ Error ExecutorSharedMemoryMapperService::deinitialize(
 
       // Remove the allocation from the allocation list of its reservation
       for (auto &Reservation : Reservations) {
-        auto AllocationIt =
-            std::find(Reservation.second.Allocations.begin(),
-                      Reservation.second.Allocations.end(), Base);
+        auto AllocationIt = llvm::find(Reservation.second.Allocations, Base);
         if (AllocationIt != Reservation.second.Allocations.end()) {
           Reservation.second.Allocations.erase(AllocationIt);
           break;
 
@@ -1692,6 +1692,15 @@ def HasTrue16BitInsts : Predicate<"Subtarget->hasTrue16BitInsts()">,
   AssemblerPredicate<(all_of FeatureTrue16BitInsts)>;
 def NotHasTrue16BitInsts : Predicate<"!Subtarget->hasTrue16BitInsts()">;
 
+// Predicates that choose between the two flavors of True16 instructions.
+// Real True16 instructions are the ones the ISA defines. Fake True16
+// instructions share the real encodings, but their syntax only accepts
+// 32-bit registers as operands and they use the low halves of those.
+def UseRealTrue16Insts : Predicate<"Subtarget->useRealTrue16Insts()">,
+  AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts)>; // The assembler side requires both features.
+def UseFakeTrue16Insts : Predicate<"Subtarget->hasTrue16BitInsts() && "
+                                   "!Subtarget->useRealTrue16Insts()">; // True16 is available but the real flavor is not in use; no AssemblerPredicate is attached here.
+
 def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
   AssemblerPredicate<(all_of FeatureVOP3P)>;
 
 
@@ -420,11 +420,14 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     // encodings
     if (isGFX11Plus() && Bytes.size() >= 12 ) {
       DecoderUInt128 DecW = eat12Bytes(Bytes);
-      Res = tryDecodeInst(DecoderTableDPP8GFX1196, MI, DecW, Address, CS);
+      Res =
+          tryDecodeInst(DecoderTableDPP8GFX1196, DecoderTableDPP8GFX11_FAKE1696,
+                        MI, DecW, Address, CS);
       if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
         break;
       MI = MCInst(); // clear
-      Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW, Address, CS);
+      Res = tryDecodeInst(DecoderTableDPPGFX1196, DecoderTableDPPGFX11_FAKE1696,
+                          MI, DecW, Address, CS);
       if (Res) {
         if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
           convertVOP3PDPPInst(MI);
@@ -463,15 +466,17 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
         break;
       MI = MCInst(); // clear
 
-      Res = tryDecodeInst(DecoderTableDPP8GFX1164, MI, QW, Address, CS);
+      Res = tryDecodeInst(DecoderTableDPP8GFX1164,
+                          DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS);
       if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
         break;
       MI = MCInst(); // clear
 
       Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS);
       if (Res) break;
 
-      Res = tryDecodeInst(DecoderTableDPPGFX1164, MI, QW, Address, CS);
+      Res = tryDecodeInst(DecoderTableDPPGFX1164, DecoderTableDPPGFX11_FAKE1664,
+                          MI, QW, Address, CS);
       if (Res) {
         if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
           convertVOPCDPPInst(MI);
@@ -532,7 +537,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS);
     if (Res) break;
 
-    Res = tryDecodeInst(DecoderTableGFX1132, MI, DW, Address, CS);
+    Res = tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
+                        Address, CS);
     if (Res) break;
 
     if (Bytes.size() < 4) break;
@@ -562,7 +568,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS);
     if (Res) break;
 
-    Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address, CS);
+    Res = tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
+                        Address, CS);
     if (Res)
       break;
 
 
@@ -144,6 +144,17 @@ class AMDGPUDisassembler : public MCDisassembler {
     return MCDisassembler::Fail;
   }
 
+  template <typename InsnType> // Two-table overload: forwards to the single-table tryDecodeInst once per table.
+  DecodeStatus tryDecodeInst(const uint8_t *Table1, const uint8_t *Table2,
+                             MCInst &MI, InsnType Inst, uint64_t Address,
+                             raw_ostream &Comments) const {
+    for (const uint8_t *T : {Table1, Table2}) { // Table1 is always tried before Table2.
+      if (DecodeStatus Res = tryDecodeInst(T, MI, Inst, Address, Comments))
+        return Res; // The first status that tests true is returned; any remaining table is skipped.
+    }
+    return MCDisassembler::Fail; // Neither table produced a status that tests true.
+  }
+
   std::optional<DecodeStatus>
   onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, ArrayRef<uint8_t> Bytes,
                 uint64_t Address, raw_ostream &CStream) const override;
 
@@ -1408,6 +1408,7 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
   case AMDGPU::V_MAX_F32_e64:
   case AMDGPU::V_MAX_F16_e64:
   case AMDGPU::V_MAX_F16_t16_e64:
+  case AMDGPU::V_MAX_F16_fake16_e64:
   case AMDGPU::V_MAX_F64_e64:
   case AMDGPU::V_PK_MAX_F16: {
     if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
@@ -1503,7 +1504,8 @@ static int getOModValue(unsigned Opc, int64_t Val) {
     }
   }
   case AMDGPU::V_MUL_F16_e64:
-  case AMDGPU::V_MUL_F16_t16_e64: {
+  case AMDGPU::V_MUL_F16_t16_e64:
+  case AMDGPU::V_MUL_F16_fake16_e64: {
     switch (static_cast<uint16_t>(Val)) {
     case 0x3800: // 0.5
       return SIOutMods::DIV2;
@@ -1530,12 +1532,14 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
   case AMDGPU::V_MUL_F64_e64:
   case AMDGPU::V_MUL_F32_e64:
   case AMDGPU::V_MUL_F16_t16_e64:
+  case AMDGPU::V_MUL_F16_fake16_e64:
   case AMDGPU::V_MUL_F16_e64: {
     // If output denormals are enabled, omod is ignored.
     if ((Op == AMDGPU::V_MUL_F32_e64 &&
          MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
         ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
-          Op == AMDGPU::V_MUL_F16_t16_e64) &&
+          Op == AMDGPU::V_MUL_F16_t16_e64 ||
+          Op == AMDGPU::V_MUL_F16_fake16_e64) &&
          MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
       return std::pair(nullptr, SIOutMods::NONE);
 
@@ -1565,12 +1569,14 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
   case AMDGPU::V_ADD_F64_e64:
   case AMDGPU::V_ADD_F32_e64:
   case AMDGPU::V_ADD_F16_e64:
-  case AMDGPU::V_ADD_F16_t16_e64: {
+  case AMDGPU::V_ADD_F16_t16_e64:
+  case AMDGPU::V_ADD_F16_fake16_e64: {
     // If output denormals are enabled, omod is ignored.
     if ((Op == AMDGPU::V_ADD_F32_e64 &&
          MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
         ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
-          Op == AMDGPU::V_ADD_F16_t16_e64) &&
+          Op == AMDGPU::V_ADD_F16_t16_e64 ||
+          Op == AMDGPU::V_ADD_F16_fake16_e64) &&
          MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
       return std::pair(nullptr, SIOutMods::NONE);
 
 
@@ -2262,6 +2262,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
   field list<ValueType> ArgVT = _ArgVT;
   field bit EnableClamp = _EnableClamp;
   field bit IsTrue16 = 0;
+  field bit IsRealTrue16 = 0;
 
   field ValueType DstVT = ArgVT[0];
   field ValueType Src0VT = ArgVT[1];
@@ -2453,6 +2454,21 @@ class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.NoPattern> : VOPProfile <p.
 // VOPC_Class_NoSdst_Profile_t16, and  VOP_MAC_F16_t16 do not inherit from this
 // class, so copy changes to this class in those profiles
 class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> {
+  let IsTrue16 = 1; // Flag the profile as True16 ...
+  let IsRealTrue16 = 1; // ... and also as the real flavor.
+  // Most DstVT are 16-bit, but not all.
+  let DstRC = getVALUDstForVT_t16<DstVT>.ret;
+  let DstRC64 = getVALUDstForVT<DstVT>.ret; // Uses the non-_t16 helper, unlike DstRC above.
+  let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>;
+  let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret; // The three DPP sources all go through the _t16 register helper.
+  let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret;
+  let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret;
+  let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret; // Likewise the three DPP source modifiers use the _t16 helper.
+  let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret;
+  let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret;
+}
+
+class VOPProfile_Fake16<VOPProfile P> : VOPProfile<P.ArgVT> {
   let IsTrue16 = 1;
   // Most DstVT are 16-bit, but not all
   let DstRC = getVALUDstForVT_t16<DstVT>.ret;
 
@@ -1673,8 +1673,10 @@ def : ClampPat<V_MAX_F32_e64, f32>;
 def : ClampPat<V_MAX_F64_e64, f64>;
 let SubtargetPredicate = NotHasTrue16BitInsts in
 def : ClampPat<V_MAX_F16_e64, f16>;
-let SubtargetPredicate = HasTrue16BitInsts in
+let SubtargetPredicate = UseRealTrue16Insts in // The _t16 opcode is now gated on the real-True16 predicate.
 def : ClampPat<V_MAX_F16_t16_e64, f16>;
+let SubtargetPredicate = UseFakeTrue16Insts in // New f16 clamp pattern for the _fake16 opcode.
+def : ClampPat<V_MAX_F16_fake16_e64, f16>;
 
 let SubtargetPredicate = HasVOP3PInsts in {
 def : GCNPat <
@@ -2789,12 +2791,12 @@ def : GCNPat<
 let OtherPredicates = [HasTrue16BitInsts] in {
 def : GCNPat<
   (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
-  (V_MUL_F16_t16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
+  (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) // NOTE(review): the fake16 opcode is selected under HasTrue16BitInsts here, while the f16 ClampPat gates its fake16 opcode on UseFakeTrue16Insts - confirm this is intended.
 >;
 
 def : GCNPat<
   (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
-  (V_MUL_F16_t16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
+  (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) // fneg variant: multiplies by CONST.FP16_NEG_ONE instead of CONST.FP16_ONE.
 >;
 } // End OtherPredicates
 
 
@@ -152,7 +152,7 @@ multiclass VOP1Inst_t16<string opName,
     defm NAME : VOP1Inst<opName, P, node>;
   }
   let OtherPredicates = [HasTrue16BitInsts] in {
-    defm _t16 : VOP1Inst<opName#"_t16", VOPProfile_True16<P>, node>;
+    defm _t16 : VOP1Inst<opName#"_t16", VOPProfile_Fake16<P>, node>;
   }
 }
 
@@ -170,7 +170,7 @@ class VOPProfileI2F<ValueType dstVt, ValueType srcVt> :
 }
 
 class VOPProfileI2F_True16<ValueType dstVt, ValueType srcVt> :
-  VOPProfile_True16<VOPProfile<[dstVt, srcVt, untyped, untyped]>> {
+  VOPProfile_Fake16<VOPProfile<[dstVt, srcVt, untyped, untyped]>> {
 
   let Ins64 = (ins Src0RC64:$src0, clampmod:$clamp, omod:$omod);
   let InsVOP3Base = (ins Src0VOP3DPP:$src0, clampmod:$clamp, omod:$omod);
@@ -199,7 +199,7 @@ class VOP_SPECIAL_OMOD_PROF<ValueType dstVt, ValueType srcVt> :
 def VOP_I32_F32_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i32, f32>;
 def VOP_I32_F64_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i32, f64>;
 def VOP_I16_F16_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i16, f16>;
-def VOP_I16_F16_SPECIAL_OMOD_t16 : VOPProfile_True16<VOP_I16_F16> {
+def VOP_I16_F16_SPECIAL_OMOD_t16 : VOPProfile_Fake16<VOP_I16_F16> { // Keeps its _t16 name but now derives from the Fake16 profile.
   let HasOMod = 1;
 }
 
@@ -292,13 +292,13 @@ let FPDPRounding = 1, isReMaterializable = 0 in {
   let OtherPredicates = [NotHasTrue16BitInsts] in
   defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, any_fpround>;
   let OtherPredicates = [HasTrue16BitInsts] in
-  defm V_CVT_F16_F32_t16 : VOP1Inst <"v_cvt_f16_f32_t16", VOPProfile_True16<VOP_F16_F32>, any_fpround>;
+  defm V_CVT_F16_F32_t16 : VOP1Inst <"v_cvt_f16_f32_t16", VOPProfile_Fake16<VOP_F16_F32>, any_fpround>;
 } // End FPDPRounding = 1, isReMaterializable = 0
 
 let OtherPredicates = [NotHasTrue16BitInsts] in
 defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, any_fpextend>;
 let OtherPredicates = [HasTrue16BitInsts] in
-defm V_CVT_F32_F16_t16 : VOP1Inst <"v_cvt_f32_f16_t16", VOPProfile_True16<VOP_F32_F16>, any_fpextend>;
+defm V_CVT_F32_F16_t16 : VOP1Inst <"v_cvt_f32_f16_t16", VOPProfile_Fake16<VOP_F32_F16>, any_fpextend>;
 
 let ReadsModeReg = 0, mayRaiseFPException = 0 in {
 defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;