@@ -770,8 +770,15 @@ struct AMDGPUKernelTy : public GenericKernelTy {
770
770
uint64_t LoopTripCount, uint32_t ThreadLimitClause[3 ]) const override {
771
771
uint32_t NumThreads = BlockSize;
772
772
773
- // If there is an override already, do nothing
774
- if (NumThreads != GenericDevice.getDefaultNumThreads () &&
773
+ // If there is an override already, do nothing. Note the different
774
+ // default for Xteam Reductions.
775
+ if (!isXTeamReductionsMode () &&
776
+ NumThreads != GenericDevice.getDefaultNumThreads () &&
777
+ NumThreads != ConstWGSize)
778
+ return std::make_pair (false , NumThreads);
779
+
780
+ if (isXTeamReductionsMode () &&
781
+ NumThreads != llvm::omp::xteam_red::DefaultBlockSize &&
775
782
NumThreads != ConstWGSize)
776
783
return std::make_pair (false , NumThreads);
777
784
@@ -788,13 +795,21 @@ struct AMDGPUKernelTy : public GenericKernelTy {
788
795
if ((ThreadLimitClause[0 ] > 0 ) && (ThreadLimitClause[0 ] != (uint32_t )-1 ))
789
796
return std::make_pair (false , NumThreads);
790
797
791
- // If generic, generic-SPMD, or Xteam reduction kernel, do nothing.
792
- if (isGenericMode () || isGenericSPMDMode () || isXTeamReductionsMode () )
798
+ // If generic or generic-SPMD kernel, do nothing.
799
+ if (isGenericMode () || isGenericSPMDMode ())
793
800
return std::make_pair (false , NumThreads);
794
801
795
802
// Reduce the blocksize as long as it is above the tunable limit.
796
803
while (NumThreads > GenericDevice.getOMPXSmallBlockSize ())
797
804
NumThreads >>= 1 ;
805
+
806
+ if (NumThreads == 0 )
807
+ return std::make_pair (false , BlockSize);
808
+
809
+ if (isXTeamReductionsMode ())
810
+ return std::make_pair (true ,
811
+ llvm::omp::getBlockSizeAsPowerOfTwo (NumThreads));
812
+
798
813
return std::make_pair (true , NumThreads);
799
814
}
800
815
0 commit comments