
Commit e5cfd6d

misc: unused import cleanup (#1092)
## 📌 Description

Short unused import code cleanup.

## 🔍 Related Issues

N/A

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.
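For context, a minimal sketch of the kind of finding those hooks surface. The commit page does not show which linters the hooks run, so the rule name below (F401, the usual unused-import code in flake8/Ruff) is an assumption; the diffs that follow simply delete the flagged import lines.

```python
# Hypothetical example (not from this repo): an unused-import finding of the
# kind cleaned up in this commit. `json` is imported but never referenced,
# which an unused-import hook (e.g. flake8/Ruff rule F401 -- an assumption,
# the actual hook configuration isn't shown here) would report; the fix is
# simply to delete that line.
import json  # unused: nothing below refers to it
import math


def circle_area(radius: float) -> float:
    # Only `math` is actually used, so only `math` needs to be imported.
    return math.pi * radius**2


if __name__ == "__main__":
    print(circle_area(2.0))
```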
1 parent 8daa44f commit e5cfd6d

12 files changed: +22 −56 lines changed

benchmarks/bench_groupwise_gemm_fp8_blackwell.py

Lines changed: 1 addition & 3 deletions
```diff
@@ -14,14 +14,12 @@
 limitations under the License.
 """
 
-import pytest
 import torch
 import triton
 import triton.language as tl
 from triton.testing import do_bench
 
-import flashinfer
-from flashinfer.gemm import gemm_fp8_nt_blockscaled, gemm_fp8_nt_groupwise
+from flashinfer.gemm import gemm_fp8_nt_groupwise
 
 
 @triton.jit
```

benchmarks/bench_groupwise_grouped_gemm_fp8_blackwell.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -14,7 +14,6 @@
 limitations under the License.
 """
 
-import numpy as np
 import torch
 from triton.testing import do_bench
 
```

benchmarks/bench_pad_ragged_tensor.py

Lines changed: 0 additions & 3 deletions
```diff
@@ -1,6 +1,3 @@
-import argparse
-from typing import cast
-
 import torch
 from triton.testing import do_bench
 
```

benchmarks/bench_persistent_gemm.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -1,4 +1,3 @@
-import pytest
 import torch
 import triton
 from triton.testing import do_bench
```

benchmarks/bench_rope.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -6,8 +6,7 @@
 $ python bench_rope.py
 """
 
-import math
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
```

flashinfer/jit/attention/pytorch.py

Lines changed: 12 additions & 12 deletions
```diff
@@ -243,10 +243,10 @@ def gen_batch_decode_mla_module(
         and dtype_kv == torch.float16
         and dtype_o == torch.float16
     ):
-        logger.info(f"Use tensor-core SM80 version of MLA decode kernel.")
+        logger.info("Use tensor-core SM80 version of MLA decode kernel.")
         arc = "sm80"
     else:
-        logger.info(f"Fall back to cuda-core version of MLA decode kernel.")
+        logger.info("Fall back to cuda-core version of MLA decode kernel.")
         arc = "cuda_core"
 
     uri = get_batch_decode_mla_uri(
@@ -424,7 +424,7 @@ def gen_single_decode_module(
         ], # additional_scalar_names
         ["double", "double", "double", "double"], # additional_scalar_dtypes
         f"DefaultAttention<false, {str(use_sliding_window).lower()}, {str(use_logits_soft_cap).lower()}, {str(pos_encoding_mode == 2).lower()}>", # variant_name
-        f"#include<flashinfer/attention/variants.cuh>", # variant_decl
+        "#include<flashinfer/attention/variants.cuh>", # variant_decl
         pos_encoding_mode=pos_encoding_mode,
         use_sliding_window=use_sliding_window,
         use_logits_soft_cap=use_logits_soft_cap,
@@ -473,22 +473,22 @@ def gen_single_prefill_module(
         ]
         additional_scalar_dtypes = ["double", "double", "double", "double"]
         variant_name = f"DefaultAttention<use_custom_mask, {str(use_sliding_window).lower()}, {str(use_logits_soft_cap).lower()}, {str(pos_encoding_mode == 2).lower()}>"
-        variant_decl = f"#include<flashinfer/attention/variants.cuh>"
+        variant_decl = "#include<flashinfer/attention/variants.cuh>"
     else:
         if not fp8_enabled:
             additional_tensor_names = []
             additional_tensor_dtypes = []
             additional_scalar_names = ["logits_soft_cap", "sm_scale"]
             additional_scalar_dtypes = ["double", "double"]
             variant_name = f"DefaultAttention<{str(use_logits_soft_cap).lower()}>"
-            variant_decl = f"#include<flashinfer/attention/hopper/variants.cuh>"
+            variant_decl = "#include<flashinfer/attention/hopper/variants.cuh>"
         else:
            additional_tensor_names = ["scale_q", "scale_k", "scale_v"]
            additional_tensor_dtypes = ["float", "float", "float"]
            additional_scalar_names = ["sm_scale"]
            additional_scalar_dtypes = ["double"]
-           variant_name = f"DefaultFP8Attention"
-           variant_decl = f"#include<flashinfer/attention/hopper/variants.cuh>"
+           variant_name = "DefaultFP8Attention"
+           variant_decl = "#include<flashinfer/attention/hopper/variants.cuh>"
 
     return gen_customize_single_prefill_module(
         backend,
@@ -551,7 +551,7 @@ def gen_pod_module(
     additional_scalar_dtypes = ["float", "float", "float", "float"]
     variant_name_p = f"DefaultAttention<use_custom_mask_p, {str(use_sliding_window_p).lower()}, {str(use_logits_soft_cap_p).lower()}, {str(pos_encoding_mode_p == 2).lower()}>"
    variant_name_d = f"DefaultAttention<use_custom_mask_d, {str(use_sliding_window_d).lower()}, {str(use_logits_soft_cap_d).lower()}, {str(pos_encoding_mode_d == 2).lower()}>"
-    variant_decl = f"#include<flashinfer/attention/variants.cuh>"
+    variant_decl = "#include<flashinfer/attention/variants.cuh>"
 
     return gen_customize_pod_module(
         uri,
@@ -717,7 +717,7 @@ def gen_batch_decode_module(
         ], # additional_scalar_names
         ["double", "double", "double", "double"], # additional_scalar_dtypes
         f"DefaultAttention<false, {str(use_sliding_window).lower()}, {str(use_logits_soft_cap).lower()}, {str(pos_encoding_mode == 2).lower()}>", # variant_name
-        f"#include<flashinfer/attention/variants.cuh>", # variant_decl
+        "#include<flashinfer/attention/variants.cuh>", # variant_decl
         pos_encoding_mode=pos_encoding_mode,
         use_sliding_window=use_sliding_window,
         use_logits_soft_cap=use_logits_soft_cap,
@@ -799,14 +799,14 @@ def gen_batch_prefill_module(
             ]
             additional_scalar_dtypes = ["double", "double", "int64_t"]
             variant_name = f"DefaultAttention<{str(use_logits_soft_cap).lower()}>"
-            variant_decl = f"#include<flashinfer/attention/hopper/variants.cuh>"
+            variant_decl = "#include<flashinfer/attention/hopper/variants.cuh>"
         else:
             additional_tensor_names = ["scale_q", "scale_k", "scale_v"]
             additional_tensor_dtypes = ["float", "float", "float"]
             additional_scalar_names = ["sm_scale"]
             additional_scalar_dtypes = ["double"]
-            variant_name = f"DefaultFP8Attention"
-            variant_decl = f"#include<flashinfer/attention/hopper/variants.cuh>"
+            variant_name = "DefaultFP8Attention"
+            variant_decl = "#include<flashinfer/attention/hopper/variants.cuh>"
 
     return gen_customize_batch_prefill_module(
         backend,
```
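The edits above drop the `f` prefix from strings that contain no placeholders. A minimal standalone sketch of why the prefix is redundant there; the rule name (F541) is an assumption about the linter, since the commit only shows the resulting edits:

```python
# An f-string without any {placeholders} evaluates to the same value as a
# plain string literal, so the `f` prefix is dead weight (commonly flagged
# as F541 by Ruff/flake8-style linters -- naming the rule is an assumption).
msg_f = f"#include<flashinfer/attention/variants.cuh>"
msg_plain = "#include<flashinfer/attention/variants.cuh>"
assert msg_f == msg_plain  # identical at runtime

# The prefix only matters once something is interpolated:
use_sliding_window = True
variant_name = f"DefaultAttention<false, {str(use_sliding_window).lower()}>"
print(variant_name)  # DefaultAttention<false, true>
```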

flashinfer/mla.py

Lines changed: 2 additions & 10 deletions
```diff
@@ -14,22 +14,14 @@
 limitations under the License.
 """
 
-import functools
-from types import SimpleNamespace
-from typing import List, Literal, Optional, Tuple, Union, overload
+from typing import Literal, Optional, Tuple, Union, overload
 
 import torch
 
 from .jit import JitSpec
 from .jit import env as jit_env
 from .jit import gen_batch_mla_module, gen_jit_spec, sm100a_nvcc_flags
-from .utils import (
-    MaskMode,
-    _check_shape_dtype_device,
-    determine_mla_backend,
-    register_custom_op,
-    register_fake_op,
-)
+from .utils import MaskMode, _check_shape_dtype_device, determine_mla_backend
 
 
 def _check_cutlass_shape(q_nope_pe, ckv_kpe_cache, kv_len, page_table):
```

flashinfer/pod.py

Lines changed: 4 additions & 18 deletions
```diff
@@ -14,26 +14,16 @@
 limitations under the License.
 """
 
-import functools
-import logging
 import math
 from types import SimpleNamespace
-from typing import Any, List, Literal, Optional, Tuple, Union, overload
+from typing import Any, List, Optional, Tuple, Union
 
 import torch
 
-from .decode import get_batch_decode_module
-from .jit import (
-    gen_batch_decode_module,
-    gen_batch_prefill_module,
-    gen_customize_batch_prefill_module,
-    gen_pod_module,
-    gen_single_prefill_module,
-    get_pod_uri,
-)
-from .page import block_sparse_indices_to_vector_sparse_offsets, get_seq_lens
+from .jit import gen_pod_module
+from .page import get_seq_lens
 from .prefill import get_batch_prefill_module
-from .quantization import packbits, segment_packbits
+from .quantization import packbits
 from .utils import (
     MaskMode,
     PosEncodingMode,
@@ -46,10 +36,6 @@
     _get_range_buf,
     _unpack_paged_kv_cache,
     canonicalize_torch_dtype,
-    determine_attention_backend,
-    is_float8,
-    register_custom_op,
-    register_fake_op,
 )
 
 _pod_modules = {}
```

flashinfer/prefill.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -84,7 +84,7 @@ def get_fmha_module(
             use_logits_soft_cap,
         ).build_and_load()
     else:
-        raise ValueError(f"SM100A is not supported on this device")
+        raise ValueError("SM100A is not supported on this device")
 
 
 def get_single_prefill_module(backend):
```

flashinfer/sparse.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -352,7 +352,7 @@ def plan(
         if (
             R * (num_qo_heads // num_kv_heads) < 4
             and mask_mode != MaskMode.CUSTOM.value
-            and not q_data_type in [torch.float8_e4m3fn, torch.float8_e5m2]
+            and q_data_type not in [torch.float8_e4m3fn, torch.float8_e5m2]
         ):
             # If the operation is not compute-bound, we use the cuda-core implementation
             self._use_tensor_cores = False
```
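This change rewrites `not x in seq` as the idiomatic `x not in seq`. Both spellings are equivalent at runtime; a minimal check below, assuming a PyTorch build that exposes the fp8 dtypes used in the diff (the E713 rule name is likewise an assumption):

```python
# `not x in seq` and `x not in seq` are equivalent, but the latter is the
# idiomatic spelling (typically reported as E713 by pycodestyle/Ruff -- an
# assumption; the commit only shows the rewritten condition).
import torch

fp8_dtypes = [torch.float8_e4m3fn, torch.float8_e5m2]
q_data_type = torch.float16

assert (not q_data_type in fp8_dtypes) == (q_data_type not in fp8_dtypes)
print(q_data_type not in fp8_dtypes)  # True: fp16 queries are not fp8
```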

flashinfer/triton/norm.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -1,4 +1,3 @@
-from collections.abc import Mapping
 from typing import Optional
 
 import torch
```

flashinfer/triton/page.py

Lines changed: 0 additions & 3 deletions
```diff
@@ -14,9 +14,6 @@
 limitations under the License.
 """
 
-from typing import Optional, Tuple, Union
-
-import torch
 import triton
 import triton.language as tl
 
```
