Expand inductor codegen dtype asserts, fix scan (pytorch#146067)

eellison · pytorchmergebot · commit 71e8a2bda406 · 2025-02-07T06:35:47.000Z
We were codegening intermediary dtype asserts in some places but not all. expands assertions, fixes newly failing assertion in `TORCHINDUCTOR_COMPILE_THREADS=1 TORCH_LOGS="output_code" PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=1 python test/inductor/test_torchinductor_opinfo.py TestInductorOpInfoCUDA.test_comprehensive_logcumsumexp_cuda_float16` for scan. Pull Request resolved: pytorch#146067 Approved by: https://github.com/shunting314, https://github.com/jansel
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
@@ -52,6 +52,7 @@
     sympy_dot,
     sympy_index_symbol,
     sympy_subs,
+    triton_type,
     unique,
 )
 from ..virtualized import ops, OpsHandler, OpsValue, ReductionType, StoreMode, V
@@ -1784,6 +1785,15 @@ def generate(
                     else:
                         line = f"{expr}{self.suffix}"
                     buffer.writeline(line)
+
+                    if (
+                        assignment
+                        and config.test_configs.runtime_triton_dtype_assert
+                        and dtype is not None
+                    ):
+                        assert_line = f"tl.static_assert({self.prefix}{var}.dtype == {triton_type(dtype)})"
+                        buffer.writeline(assert_line)
+
         else:
             var.bounds = var.bounds.tighten(bounds)
             var.use_count += 1
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
@@ -1322,12 +1322,19 @@ def index_expr(cls, expr, dtype):
         # we only respect non int32-int64 dtypes and otherwise use current kernel indexing dtype
         index_dtype = torch.int32 if V.kernel.index_dtype == "tl.int32" else torch.int64
         dtype = dtype if dtype not in (torch.int32, torch.int64) else index_dtype
-        var = V.kernel.cse.generate(
-            V.kernel.compute,
-            indexing.index_str,
-            bounds=get_bounds_index_expr(expr),
-            dtype=dtype,
-        )
+
+        # after we emit this var we cast it to the correct dtype
+        orig = config.test_configs.runtime_triton_dtype_assert
+        try:
+            config.test_configs.runtime_triton_dtype_assert = False
+            var = V.kernel.cse.generate(
+                V.kernel.compute,
+                indexing.index_str,
+                bounds=get_bounds_index_expr(expr),
+                dtype=dtype,
+            )
+        finally:
+            config.test_configs.runtime_triton_dtype_assert = orig
 
         if dtype not in (torch.int32, torch.int64):
             var = V.kernel.cse.generate(
@@ -2491,7 +2498,9 @@ def _mask_value(value, default) -> CSEVariable:
                     self.cse.generate(
                         self.compute,
                         f"tl.broadcast_to({reduction_range_prefix}index, {masked_value}.shape)",
-                        dtype=torch.int64,
+                        dtype=torch.int32
+                        if V.kernel.index_dtype == "tl.int32"
+                        else torch.int64,
                     )
                 )
                 root_op = {"argmax": "max", "argmin": "min"}[reduction_type]
@@ -2905,6 +2914,7 @@ def scan(
         broadcasted_values = []
         accumulators = []
 
+        dtypes = tuple(upcast_compute_type(dtype) for dtype in dtypes)
         cse_compute = functools.partial(self.cse.generate, self.compute)
         combine_helper_fn = self._lift_helper(combine_fn, len(values), dtypes)
         dim = self.triton_tensor_ndim() - self.num_reduction_dims
@@ -2913,19 +2923,19 @@ def scan(
             value_dtype = self.cse.generate(
                 self.compute,
                 f"{value}.to({triton_compute_type(dtype)})",
-                dtype=upcast_compute_type(dtype),
+                dtype=dtype,
             )
             value = self.cse.generate(
                 self.compute,
                 f"tl.broadcast_to({value_dtype}, {self.dense_size_str()})",
-                dtype=upcast_compute_type(dtype),
+                dtype=dtype,
             )
             broadcasted_values.append(value)
 
             acc_type = triton_acc_type(dtype)
 
             if not self.persistent_reduction:
-                accumulator = self.cse.newvar(dtype=upcast_compute_type(dtype))
+                accumulator = self.cse.newvar(dtype=dtype)
                 reduced_size = self.dense_size_list()
                 reduced_size[-1] = "1"
                 reduced_size = f"[{', '.join(reduced_size)}]"
@@ -2959,7 +2969,7 @@ def cse_multiple(line, values, masks, dtypes):
             f"tl.associative_scan(({csv(broadcasted_values)}), {dim}, {combine_helper_fn})",
             values,
             masks,
-            (upcast_compute_type(dtype) for dtype in dtypes),
+            dtypes,
         )
 
         if not self.persistent_reduction:
@@ -3017,6 +3027,7 @@ def sort(
         cse_compute = functools.partial(self.cse.generate, self.compute)
         dim = self.triton_tensor_ndim() - self.num_reduction_dims
 
+        dtypes = tuple(upcast_compute_type(dtype) for dtype in dtypes)
         assert len(dtypes) == len(values)
         broadcasted_values = [
             cse_compute(
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
@@ -1470,24 +1470,26 @@ def default_accumulator(
             if is_float_dtype(dtype):
                 return float("-inf")
             elif is_boolean_dtype(dtype):
-                return 0
+                return False
             else:
                 return torch.iinfo(dtype).min
         if reduction_type in ("min", "argmin"):
             if is_float_dtype(dtype):
                 return float("inf")
             elif is_boolean_dtype(dtype):
-                return 1
+                return True
             else:
                 return torch.iinfo(dtype).max
 
+        zero = False if is_boolean_dtype(dtype) else 0
+        one = True if is_boolean_dtype(dtype) else 1
         return {
-            "sum": 0,
-            "prod": 1,
-            "xor_sum": 0,
-            "any": 0,
-            "welford_reduce": (0, 0, 0),
-            "welford_combine": (0, 0, 0),
+            "sum": zero,
+            "prod": one,
+            "xor_sum": zero,
+            "any": zero,
+            "welford_reduce": (zero, zero, zero),
+            "welford_combine": (zero, zero, zero),
         }[reduction_type]
 
     @staticmethod