Scale XBLOCK in triton reduction configs to avoid hitting max grid (#1434)

jataylo · pruthvistony · commit 65d1e7930fcc · 2024-08-12T00:20:58.000-05:00
diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py
@@ -1348,7 +1348,7 @@ def triton_config_reduction(size_hints, x, r, num_stages=1, num_warps=None) -> C
     if num_warps is None:
         num_warps = conditional_product(x, r) // 128
     # On AMD GPU each warp has 64 lanes which is double the size on NV GPU,
-    # therefore using half the number of warps here correspondingly.
+    # therefore using half the number of warps here correspondingly.
     default_num_warps = 4 if torch.version.hip else 8
     min_num_warps = 1 if torch.version.hip else 2
     num_warps = next_power_of_2(min(max(num_warps, min_num_warps), default_num_warps))