@@ -1324,14 +1324,31 @@ def triton_config_reduction(size_hints, x, r, num_stages=1, num_warps=None) -> C
1324
1324
while r < size_hints [1 ] and conditional_product (x , r ) < target :
1325
1325
r *= 2
1326
1326
1327
- cfg = {"XBLOCK" : x , "RBLOCK" : r }
1328
1327
if num_warps is None :
1329
1328
num_warps = conditional_product (x , r ) // 128
1330
1329
# On AMD GPU each warp has 64 lanes which is double the size on NV GPU,
1331
- # therefore using half the number of warps here correspondingly.
1330
+ # therefore using half the number of warps here correspondingly.
1332
1331
default_num_warps = 4 if torch .version .hip else 8
1333
1332
min_num_warps = 1 if torch .version .hip else 2
1334
1333
num_warps = next_power_of_2 (min (max (num_warps , min_num_warps ), default_num_warps ))
1334
+
1335
+ # Check if maxGridSize is exceeded - if so then must scale XBLOCK further
1336
+ max_grid_x = 4294967295 if torch .version .hip else 2147483647
1337
+ warp_size = 64 if torch .version .hip else 32
1338
+ num_blocks = int ((size_hints [0 ] + x - 1 ) // x )
1339
+ while (num_blocks * num_warps * warp_size ) > max_grid_x :
1340
+ if (x >= TRITON_MAX_BLOCK ["X" ]):
1341
+ if num_warps == 1 :
1342
+ break # If no more scaling possible then break
1343
+ num_warps = int (num_warps / 2 ) # If max XBLOCK then scale down warps as last resort
1344
+ x *= 2 # Scale up XBLOCK if grid exceeds limits
1345
+ num_blocks = int (num_blocks / 2 )
1346
+ while conditional_product (x , r ) > target :
1347
+ r = int (r / 2 )
1348
+ if r == 1 :
1349
+ break
1350
+
1351
+ cfg = {"XBLOCK" : x , "RBLOCK" : r }
1335
1352
check_config (cfg , xnumel = size_hints [0 ])
1336
1353
assert r <= TRITON_MAX_BLOCK ["R" ], f"increase TRITON_MAX_BLOCK['r'] to { r } "
1337
1354
return Config (cfg , num_warps = num_warps , num_stages = num_stages )
0 commit comments