rust-lang
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Lines changed: 5 additions & 4 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Lines changed: 31 additions & 12 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Lines changed: 5 additions & 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
Lines changed: 4 additions & 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.cpp b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp
Lines changed: 3 additions & 1 deletion
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Lines changed: 2 additions & 1 deletion
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Lines changed: 27 additions & 5 deletions
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Lines changed: 4 additions & 0 deletions
@@ -842,11 +842,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
     Ctx.diagnose(Diag);
   }
 
-  if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
+  if (MFI->getLDSSize() >
+      static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
     LLVMContext &Ctx = MF.getFunction().getContext();
-    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
-                                     MFI->getLDSSize(),
-                                     STM.getLocalMemorySize(), DS_Error);
+    DiagnosticInfoResourceLimit Diag(
+        MF.getFunction(), "local memory", MFI->getLDSSize(),
+        STM.getAddressableLocalMemorySize(), DS_Error);
     Ctx.diagnose(Diag);
   }
 
 
@@ -798,7 +798,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
     }
   }
 
-  LocalMemLimit = ST.getLocalMemorySize();
+  LocalMemLimit = ST.getAddressableLocalMemorySize();
   if (LocalMemLimit == 0)
     return false;
 
 
@@ -141,6 +141,12 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
       HasMovrel = true;
   }
 
+  AddressableLocalMemorySize = LocalMemorySize;
+
+  if (AMDGPU::isGFX10Plus(*this) &&
+      !getFeatureBits().test(AMDGPU::FeatureCuMode))
+    LocalMemorySize *= 2;
+
   // Don't crash on invalid devices.
   if (WavefrontSizeLog2 == 0)
     WavefrontSizeLog2 = 5;
@@ -304,19 +310,29 @@ bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
   }
 }
 
-unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
-  const Function &F) const {
-  if (NWaves == 1)
-    return getLocalMemorySize();
-  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
-  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
-  if (!WorkGroupsPerCu)
-    return 0;
-  unsigned MaxWaves = getMaxWavesPerEU();
-  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
+// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
+// allows the given function to achieve an occupancy of NWaves waves per
+// SIMD / EU, taking into account only the function's *maximum* workgroup size.
+unsigned
+AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
+                                                 const Function &F) const {
+  const unsigned WaveSize = getWavefrontSize();
+  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+  const unsigned WavesPerWorkgroup =
+      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);
+
+  const unsigned WorkGroupsPerCU =
+      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
+
+  return getLocalMemorySize() / WorkGroupsPerCU;
 }
 
 // FIXME: Should return min,max range.
+//
+// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
+// be achieved when only the given function is running on the machine; and
+// taking into account the overall number of wave slots, the (maximum) workgroup
+// size, and the per-workgroup LDS allocation size.
 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
   const Function &F) const {
   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
@@ -338,10 +354,13 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
 
   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
 
-  // Round to the number of waves.
-  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
+  // Round to the number of waves per CU.
+  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
 
+  // Number of waves per EU (SIMD).
+  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
+
   // Clamp to the maximum possible number of waves.
   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
 
 
@@ -64,6 +64,7 @@ class AMDGPUSubtarget {
   unsigned EUsPerCU = 4;
   unsigned MaxWavesPerEU = 10;
   unsigned LocalMemorySize = 0;
+  unsigned AddressableLocalMemorySize = 0;
   char WavefrontSizeLog2 = 0;
 
 public:
@@ -210,6 +211,10 @@ class AMDGPUSubtarget {
     return LocalMemorySize;
   }
 
+  unsigned getAddressableLocalMemorySize() const {
+    return AddressableLocalMemorySize;
+  }
+
   /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
   /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
   /// CU mode into account.
 
@@ -903,10 +903,12 @@ void GCNSchedStage::checkScheduling() {
     return;
   }
 
+  unsigned TargetOccupancy =
+      std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
   unsigned WavesAfter =
-      std::min(S.getTargetOccupancy(), PressureAfter.getOccupancy(ST));
+      std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
   unsigned WavesBefore =
-      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
+      std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
   LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                     << ", after " << WavesAfter << ".\n");
 
 
@@ -28,7 +28,9 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
       InstrInfo(*this),
       FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
       TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
-      InstrItins(getInstrItineraryForCPU(GPU)) {}
+      InstrItins(getInstrItineraryForCPU(GPU)) {
+  AddressableLocalMemorySize = LocalMemorySize;
+}
 
 R600Subtarget &R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                                               StringRef GPU,
 
@@ -12635,7 +12635,8 @@ void SITargetLowering::computeKnownBitsForTargetInstr(
       // We can report everything over the maximum size as 0. We can't report
       // based on the actual size because we don't know if it's accurate or not
       // at any given point.
-      Known.Zero.setHighBits(countLeadingZeros(getSubtarget()->getLocalMemorySize()));
+      Known.Zero.setHighBits(
+          countLeadingZeros(getSubtarget()->getAddressableLocalMemorySize()));
       break;
     }
     }
 
@@ -828,11 +828,26 @@ unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
 }
 
 unsigned getLocalMemorySize(const MCSubtargetInfo *STI) {
+  unsigned BytesPerCU = 0;
+  if (STI->getFeatureBits().test(FeatureLocalMemorySize32768))
+    BytesPerCU = 32768;
+  if (STI->getFeatureBits().test(FeatureLocalMemorySize65536))
+    BytesPerCU = 65536;
+
+  // "Per CU" really means "per whatever functional block the waves of a
+  // workgroup must share". So the effective local memory size is doubled in
+  // WGP mode on gfx10.
+  if (isGFX10Plus(*STI) && !STI->getFeatureBits().test(FeatureCuMode))
+    BytesPerCU *= 2;
+
+  return BytesPerCU;
+}
+
+unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
   if (STI->getFeatureBits().test(FeatureLocalMemorySize32768))
     return 32768;
   if (STI->getFeatureBits().test(FeatureLocalMemorySize65536))
     return 65536;
-
   return 0;
 }
 
@@ -852,11 +867,18 @@ unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
   assert(FlatWorkGroupSize != 0);
   if (STI->getTargetTriple().getArch() != Triple::amdgcn)
     return 8;
+  unsigned MaxWaves = getMaxWavesPerEU(STI) * getEUsPerCU(STI);
   unsigned N = getWavesPerWorkGroup(STI, FlatWorkGroupSize);
-  if (N == 1)
-    return 40;
-  N = 40 / N;
-  return std::min(N, 16u);
+  if (N == 1) {
+    // Single-wave workgroups don't consume barrier resources.
+    return MaxWaves;
+  }
+
+  unsigned MaxBarriers = 16;
+  if (isGFX10Plus(*STI) && !STI->getFeatureBits().test(FeatureCuMode))
+    MaxBarriers = 32;
+
+  return std::min(MaxWaves / N, MaxBarriers);
 }
 
 unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
 
@@ -192,6 +192,10 @@ unsigned getWavefrontSize(const MCSubtargetInfo *STI);
 /// \returns Local memory size in bytes for given subtarget \p STI.
 unsigned getLocalMemorySize(const MCSubtargetInfo *STI);
 
+/// \returns Maximum addressable local memory size in bytes for given subtarget
+/// \p STI.
+unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI);
+
 /// \returns Number of execution units per compute unit for given subtarget \p
 /// STI.
 unsigned getEUsPerCU(const MCSubtargetInfo *STI);
Original file line number	Diff line number	Diff line change
`@@ -798,7 +798,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {`
`798`	`798`	`}`
`799`	`799`	`}`
`800`	`800`
`801`		`- LocalMemLimit = ST.getLocalMemorySize();`
	`801`	`+ LocalMemLimit = ST.getAddressableLocalMemorySize();`
`802`	`802`	`if (LocalMemLimit == 0)`
`803`	`803`	`return false;`
`804`	`804`
Original file line number	Diff line number	Diff line change
`@@ -12635,7 +12635,8 @@ void SITargetLowering::computeKnownBitsForTargetInstr(`
`12635`	`12635`	`// We can report everything over the maximum size as 0. We can't report`
`12636`	`12636`	`// based on the actual size because we don't know if it's accurate or not`
`12637`	`12637`	`// at any given point.`
`12638`		`- Known.Zero.setHighBits(countLeadingZeros(getSubtarget()->getLocalMemorySize()));`
	`12638`	`+ Known.Zero.setHighBits(`
	`12639`	`+ countLeadingZeros(getSubtarget()->getAddressableLocalMemorySize()));`
`12639`	`12640`	`break;`
`12640`	`12641`	`}`
`12641`	`12642`	`}`