
Commit 3f6ca86

drisspg authored and xuhancn committed
Fix IMAs in FlexAttention + autotuning (pytorch#130352)
# Summary

Makes the error message better for non-divisible sequence lengths.

Update: this PR was blocked by two IMAs (illegal memory accesses).

- The first: when the kv indices end up being an 'arange' (i.e. there are no sparse blocks and every KV block is present), we end up loading off the end of the table at `kv_indices + indices_idx + 1` (a minimal sketch of this off-by-one follows the commit message).
- The second I don't really have a clear answer for. We were hitting an IMA here: https://github.com/pytorch/pytorch/blob/9f401187c708d0f11122f0409254ddfc76befaf9/torch/_inductor/kernel/flex_attention.py#L846
  I noticed that for our inputs (2048 with q_blocksize = 128) we were again at exactly 16 blocks. Something felt fishy. I suspect we launch one extra sparse_q block, but why only during autotuning...

### Repro
https://gist.github.com/drisspg/f312a66426f3440b7756c6c0cc037f4c

### After this change
```
========= COMPUTE-SANITIZER
AUTOTUNE flex_attention(1x1x2048x64, 1x1x2048x64, 1x1x2048x64, 1x1x2048, 1x1x16, 1x1x16x16)
  triton_flex_attention_0 2.1118 ms 100.0% BLOCK_DMODEL=64, BLOCK_M=128, BLOCK_N=128, OUTPUT_LOGSUMEXP=True, PRESCALE_QK=False, ROWS_GUARANTEED_SAFE=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=3, num_warps=4
  triton_flex_attention_3 2.4306 ms 86.9% BLOCK_DMODEL=64, BLOCK_M=64, BLOCK_N=128, OUTPUT_LOGSUMEXP=True, PRESCALE_QK=False, ROWS_GUARANTEED_SAFE=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=3, num_warps=4
  triton_flex_attention_1 2.5729 ms 82.1% BLOCK_DMODEL=64, BLOCK_M=128, BLOCK_N=64, OUTPUT_LOGSUMEXP=True, PRESCALE_QK=False, ROWS_GUARANTEED_SAFE=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=3, num_warps=4
  triton_flex_attention_4 2.8035 ms 75.3% BLOCK_DMODEL=64, BLOCK_M=64, BLOCK_N=64, OUTPUT_LOGSUMEXP=True, PRESCALE_QK=False, ROWS_GUARANTEED_SAFE=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=3, num_warps=4
  triton_flex_attention_2 2.8837 ms 73.2% BLOCK_DMODEL=64, BLOCK_M=128, BLOCK_N=128, OUTPUT_LOGSUMEXP=True, PRESCALE_QK=False, ROWS_GUARANTEED_SAFE=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=3, num_warps=4
SingleProcess AUTOTUNE benchmarking takes 0.7225 seconds and 1.5218 seconds precompiling
AUTOTUNE flex_attention_backward(1x1x2048x64, 1x1x2048x64, 1x1x2048x64, 1x1x2048, 1x1x2048, 1x1x2048x64, 1x1x2048x64, 1x1x2048x64, 1x1x16, 1x1x16x16, 1x1x16, 1x1x16x16)
  triton_flex_attention_backward_30 2.7763 ms 100.0% BLOCK_DMODEL=64, BLOCK_M1=64, BLOCK_M2=64, BLOCK_N1=64, BLOCK_N2=64, PRESCALE_QK=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=1, num_warps=4
  triton_flex_attention_backward_15 3.1404 ms 88.4% BLOCK_DMODEL=64, BLOCK_M1=32, BLOCK_M2=64, BLOCK_N1=64, BLOCK_N2=32, PRESCALE_QK=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=3, num_warps=4
  triton_flex_attention_backward_14 3.2604 ms 85.2% BLOCK_DMODEL=64, BLOCK_M1=32, BLOCK_M2=64, BLOCK_N1=64, BLOCK_N2=32, PRESCALE_QK=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=1, num_warps=4
  triton_flex_attention_backward_7 3.4176 ms 81.2% BLOCK_DMODEL=64, BLOCK_M1=32, BLOCK_M2=32, BLOCK_N1=32, BLOCK_N2=32, PRESCALE_QK=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=3, num_warps=4
  triton_flex_attention_backward_8 3.4182 ms 81.2% BLOCK_DMODEL=64, BLOCK_M1=32, BLOCK_M2=32, BLOCK_N1=32, BLOCK_N2=32, PRESCALE_QK=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=4, num_warps=4
  triton_flex_attention_backward_34 3.4939 ms 79.5% BLOCK_DMODEL=64, BLOCK_M1=64, BLOCK_M2=64, BLOCK_N1=64, BLOCK_N2=64, PRESCALE_QK=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=1, num_warps=8
  triton_flex_attention_backward_6 3.6517 ms 76.0% BLOCK_DMODEL=64, BLOCK_M1=32, BLOCK_M2=32, BLOCK_N1=32, BLOCK_N2=32, PRESCALE_QK=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=1, num_warps=4
  triton_flex_attention_backward_26 3.7000 ms 75.0% BLOCK_DMODEL=64, BLOCK_M1=32, BLOCK_M2=128, BLOCK_N1=128, BLOCK_N2=32, PRESCALE_QK=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=1, num_warps=8
  triton_flex_attention_backward_22 4.0120 ms 69.2% BLOCK_DMODEL=64, BLOCK_M1=32, BLOCK_M2=128, BLOCK_N1=128, BLOCK_N2=32, PRESCALE_QK=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=1, num_warps=4
  triton_flex_attention_backward_18 4.5052 ms 61.6% BLOCK_DMODEL=64, BLOCK_M1=32, BLOCK_M2=64, BLOCK_N1=64, BLOCK_N2=32, PRESCALE_QK=False, SM_SCALE=0.125, SPARSE_KV_BLOCK_SIZE=128, SPARSE_Q_BLOCK_SIZE=128, num_stages=1, num_warps=8
SingleProcess AUTOTUNE benchmarking takes 6.6558 seconds and 6.3567 seconds precompiling
torch.Size([1, 1, 2048, 64])
Test completed successfully!
========= ERROR SUMMARY: 0 errors
```

Pull Request resolved: pytorch#130352
Approved by: https://github.com/Skylion007, https://github.com/Chillee
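To make the first IMA concrete, here is a minimal pure-Python sketch of the off-by-one (illustrative only; `seq_len`, `num_kv_blocks`, and the loop are stand-ins, not the Triton kernel): with a sequence length of 2048 and a sparse KV block size of 128 there are exactly 16 block-table entries, so a dense 'arange' table makes the `indices_idx + 1` lookup fall one past the end on the last iteration, which is what the masked load in the diff below guards against.

```python
# Illustrative sketch of the off-by-one, not the actual kernel code.
seq_len = 2048
sparse_kv_block_size = 128
num_kv_blocks = seq_len // sparse_kv_block_size   # exactly 16 entries
kv_indices = list(range(num_kv_blocks))           # dense "arange": no blocks skipped

for indices_idx in range(num_kv_blocks):
    cur_block = kv_indices[indices_idx]
    # Unguarded lookup: kv_indices[indices_idx + 1] is out of bounds when
    # indices_idx == num_kv_blocks - 1, the access compute-sanitizer flags.
    # Guarded lookup, mirroring `mask=indices_idx + 1 < sparse_kv_num_blocks`
    # in the fix below:
    in_bounds = indices_idx + 1 < num_kv_blocks
    next_block = kv_indices[indices_idx + 1] if in_bounds else cur_block
```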
1 parent 96aa07c commit 3f6ca86

3 files changed: 60 additions & 35 deletions

test/inductor/test_flex_attention.py

Lines changed: 34 additions & 11 deletions
@@ -1176,7 +1176,7 @@ def func(q, k, v, score_mod, block_mask):
     def test_aot_eager_gradcheck(self, score_mod):
         make_tensor = functools.partial(
             torch.randn,
-            (2, 2, 8, 4),
+            (2, 2, 128, 4),
             device="cuda",
             dtype=torch.float64,
             requires_grad=True,
@@ -1199,7 +1199,7 @@ def test_captured_score_mod_aot_eager_gradcheck(
     ):
         make_tensor = functools.partial(
             torch.randn,
-            (2, 2, 8, 4),
+            (2, 2, 128, 4),
             device="cuda",
             dtype=torch.float64,
             requires_grad=True,
@@ -1336,7 +1336,7 @@ def test_fw_bw_graph_correctness(self):
         cnt = CompileCounterWithBackend("aot_eager")
         make_tensor = functools.partial(
             torch.randn,
-            (2, 2, 8, 4),
+            (2, 2, 128, 4),
             device="cuda",
             dtype=torch.float64,
             requires_grad=True,
@@ -1355,7 +1355,7 @@ def test_fw_bw_graph_correctness(self):
             norm_graph,
             """\
 class GraphModule(torch.nn.Module):
-    def forward(self, L_args_0_: "f64[2, 2, 8, 4]", L_args_1_: "f64[2, 2, 8, 4]", L_args_2_: "f64[2, 2, 8, 4]"):
+    def forward(self, L_args_0_: "f64[2, 2, 128, 4]", L_args_1_: "f64[2, 2, 128, 4]", L_args_2_: "f64[2, 2, 128, 4]"):
         l_args_0_ = L_args_0_
         l_args_1_ = L_args_1_
         l_args_2_ = L_args_2_
@@ -1374,8 +1374,8 @@ def forward(self, L_args_0_: "f64[2, 2, 8, 4]", L_args_1_: "f64[2, 2, 8, 4]", L_
         child_3: "i32[]" = l_args_0_.new_empty([], dtype = torch.int32)
         child_4: "i32[]" = l_args_0_.new_empty([], dtype = torch.int32)
         flex_attention_0 = self.flex_attention_0
-        flex_attention = torch.ops.higher_order.flex_attention(l_args_0_, l_args_1_, l_args_2_, flex_attention_0, (ones, zeros, ones_1, zeros_1, 8, 8), 0.5); l_args_0_ = l_args_1_ = l_args_2_ = flex_attention_0 = ones = zeros = ones_1 = zeros_1 = None
-        out: "f64[2, 2, 8, 4]" = flex_attention[0]; flex_attention = None
+        flex_attention = torch.ops.higher_order.flex_attention(l_args_0_, l_args_1_, l_args_2_, flex_attention_0, (ones, zeros, ones_1, zeros_1, 128, 128), 0.5); l_args_0_ = l_args_1_ = l_args_2_ = flex_attention_0 = ones = zeros = ones_1 = zeros_1 = None
+        out: "f64[2, 2, 128, 4]" = flex_attention[0]; flex_attention = None
         return (out,)

 class GraphModule(torch.nn.Module):
@@ -1405,13 +1405,13 @@ def debug_compile_fx_inner(graph, example_inputs, *args, **kwargs):
             joint_graph,
             """\
 class GraphModule(torch.nn.Module):
-    def forward(self, primals_1: "f64[2, 2, 8, 4]", primals_2: "f64[2, 2, 8, 4]", primals_3: "f64[2, 2, 8, 4]", full_default: "i32[1, 1, 1]", full_default_1: "i32[1, 1, 1, 1]", getitem: "f64[2, 2, 8, 4]", getitem_1: "f32[2, 2, 8]", tangents_1: "f64[2, 2, 8, 4]"):
+    def forward(self, primals_1: "f64[2, 2, 128, 4]", primals_2: "f64[2, 2, 128, 4]", primals_3: "f64[2, 2, 128, 4]", full_default: "i32[1, 1, 1]", full_default_1: "i32[1, 1, 1, 1]", getitem: "f64[2, 2, 128, 4]", getitem_1: "f32[2, 2, 128]", tangents_1: "f64[2, 2, 128, 4]"):
         fw_graph = self.fw_graph
         joint_graph = self.joint_graph
-        flex_attention_backward = torch.ops.higher_order.flex_attention_backward(primals_1, primals_2, primals_3, getitem, getitem_1, tangents_1, fw_graph, joint_graph, (full_default, full_default_1, full_default, full_default_1, 8, 8), 0.5); primals_1 = primals_2 = primals_3 = getitem = getitem_1 = tangents_1 = fw_graph = joint_graph = full_default = full_default_1 = None
-        getitem_2: "f64[2, 2, 8, 4]" = flex_attention_backward[0]
-        getitem_3: "f64[2, 2, 8, 4]" = flex_attention_backward[1]
-        getitem_4: "f64[2, 2, 8, 4]" = flex_attention_backward[2]; flex_attention_backward = None
+        flex_attention_backward = torch.ops.higher_order.flex_attention_backward(primals_1, primals_2, primals_3, getitem, getitem_1, tangents_1, fw_graph, joint_graph, (full_default, full_default_1, full_default, full_default_1, 128, 128), 0.5); primals_1 = primals_2 = primals_3 = getitem = getitem_1 = tangents_1 = fw_graph = joint_graph = full_default = full_default_1 = None
+        getitem_2: "f64[2, 2, 128, 4]" = flex_attention_backward[0]
+        getitem_3: "f64[2, 2, 128, 4]" = flex_attention_backward[1]
+        getitem_4: "f64[2, 2, 128, 4]" = flex_attention_backward[2]; flex_attention_backward = None
         return [getitem_2, getitem_3, getitem_4]

 class <lambda>(torch.nn.Module):
@@ -1429,6 +1429,29 @@ def forward(self, arg0_1: "f64[]", arg1_1: "i32[]", arg2_1: "i32[]", arg3_1: "i3
             """, # noqa: B950
         )

+    @supported_platform
+    def test_nyi_for_non_divisible_seq_lens(self):
+        with self.assertRaisesRegex(
+            NotImplementedError, "NYI: L must be a multiple of 128"
+        ):
+            flex_attention(
+                torch.randn((2, 3, 4)),
+                torch.randn((2, 10, 5)),
+                torch.randn((2, 10, 5)),
+                score_mod=_identity,
+            )
+
+        with self.assertRaisesRegex(
+            NotImplementedError, "NYI: L must be a multiple of 128"
+        ):
+            compiled_flex = torch.compile(flex_attention)
+            compiled_flex(
+                torch.randn((2, 3, 4)),
+                torch.randn((2, 10, 5)),
+                torch.randn((2, 10, 5)),
+                score_mod=_identity,
+            )
+

 common_utils.instantiate_parametrized_tests(TestFlexAttention)
torch/_inductor/kernel/flex_attention.py

Lines changed: 19 additions & 16 deletions
@@ -283,7 +283,7 @@ def convert_output_node_to_buffer(output):
         indices_idx = start_n // SPARSE_KV_MULTIPLE

         cur_block = tl.load(kv_indices + indices_idx, eviction_policy="evict_last")
-        next_block = tl.load(kv_indices + indices_idx + 1, eviction_policy="evict_last")
+        next_block = tl.load(kv_indices + indices_idx + 1, eviction_policy="evict_last", mask=indices_idx + 1 < sparse_kv_num_blocks)
         needs_jump = (start_n + 1) % SPARSE_KV_MULTIPLE == 0
         jump_to_block = (next_block - cur_block ) * SPARSE_KV_BLOCK_SIZE - (SPARSE_KV_MULTIPLE - 1) * BLOCK_N
@@ -554,8 +554,8 @@ def flex_attention(*args, **kwargs):
         sparse_kv_indices,
     ] + list(other_buffers)
     input_gen_fns = {
-        4: create_num_blocks_fake_generator(sparse_kv_indices),  # sparse_kv_num_blocks
-        5: create_indices_fake,  # sparse_kv_indices
+        4: create_num_blocks_fake_generator(sparse_kv_indices),
+        5: create_indices_fake,
     }
     return (
         autotune_select_algorithm(
@@ -779,7 +779,7 @@ def flex_attention_backward_grid(
         # Increment pointers.
         indices_idx = start_n // SPARSE_KV_MULTIPLE
         cur_block = tl.load(kv_indices + indices_idx)
-        next_block = tl.load(kv_indices + indices_idx + 1)
+        next_block = tl.load(kv_indices + indices_idx + 1, mask=indices_idx + 1 < sparse_kv_num_blocks)
         needs_jump = (start_n + 1) % SPARSE_KV_MULTIPLE == 0
         jump_to_block = (next_block - cur_block ) * SPARSE_KV_BLOCK_SIZE - (SPARSE_KV_MULTIPLE - 1) * BLOCK_N2
         offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK_N2
@@ -886,7 +886,7 @@ def flex_attention_backward_grid(
         # Increment pointers.
         indices_idx = start_m // SPARSE_Q_MULTIPLE
         cur_block = tl.load(q_indices + indices_idx)
-        next_block = tl.load(q_indices + indices_idx + 1)
+        next_block = tl.load(q_indices + indices_idx + 1, mask=indices_idx + 1 < sparse_q_num_blocks)
         needs_jump = (start_m + 1) % SPARSE_Q_MULTIPLE == 0
         jump_to_block = (next_block - cur_block ) * SPARSE_Q_BLOCK_SIZE - (SPARSE_Q_MULTIPLE - 1) * BLOCK_M1
         offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK_M1
@@ -999,13 +999,16 @@ def flex_attention_backward(*args, **kwargs):
     configs: List[Tuple[int, int, int, int]] = []
     configs.append(_get_default_config_bwd(query))
     if config.max_autotune:
-        for BLOCK1 in [32, 64]:
-            for BLOCK2 in [32, 64, 128]:
-                if BLOCK2 % BLOCK1 != 0:
-                    continue
-                for w in [4, 8]:
-                    for s in [1, 3, 4, 5]:
-                        configs.append((BLOCK1, BLOCK2, w, s))
+        configs.extend(
+            [
+                (BLOCK1, BLOCK2, w, s)
+                for BLOCK1 in [32, 64]
+                for BLOCK2 in [32, 64, 128]
+                for w in [4, 8]
+                for s in [1, 3, 4, 5]
+                if BLOCK2 % BLOCK1 == 0
+            ]
+        )

     for BLOCK1, BLOCK2, num_warps, num_stages in configs:
         if (
@@ -1066,10 +1069,10 @@ def flex_attention_backward(*args, **kwargs):
         sparse_q_indices,
     ] + list(other_buffers)
     input_gen_fns = {
-        9: create_num_blocks_fake_generator(sparse_kv_indices),  # sparse_kv_num_blocks
-        10: create_indices_fake,
-        11: create_num_blocks_fake_generator(sparse_q_indices),  # sparse_q_num_blocks
-        12: create_indices_fake,
+        8: create_num_blocks_fake_generator(sparse_kv_indices),
+        9: create_indices_fake,
+        10: create_num_blocks_fake_generator(sparse_q_indices),
+        11: create_indices_fake,
     }

     grad_key = autotune_select_algorithm(
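For reference, a minimal standalone Triton sketch of the masked-load pattern applied in the hunks above (the kernel and buffer names here are invented for illustration; only the `mask=` idiom mirrors the fix): the mask suppresses the read that would otherwise run one entry past the end of the block-index table.

```python
import torch
import triton
import triton.language as tl


@triton.jit
def _next_block_delta(kv_indices_ptr, out_ptr, num_blocks):
    # One program instance per block-table entry.
    idx = tl.program_id(0)
    cur_block = tl.load(kv_indices_ptr + idx)
    # Masked load: when idx + 1 == num_blocks the read is suppressed and
    # `other=0` is returned instead of touching memory past the table.
    next_block = tl.load(kv_indices_ptr + idx + 1, mask=idx + 1 < num_blocks, other=0)
    tl.store(out_ptr + idx, next_block - cur_block)


# 2048 tokens with a 128-wide sparse block give exactly 16 entries, the
# boundary case from the commit message where an unmasked load ran off the end.
num_blocks = 16
kv_indices = torch.arange(num_blocks, device="cuda", dtype=torch.int32)
deltas = torch.empty_like(kv_indices)
_next_block_delta[(num_blocks,)](kv_indices, deltas, num_blocks)
```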

torch/nn/attention/_flex_attention.py

Lines changed: 7 additions & 8 deletions
@@ -476,6 +476,13 @@ def score_mod(
     Read more about feature classification at: https://pytorch.org/blog/pytorch-feature-classification-changes/#prototype

     """
+    # Some basic input validation
+    _validate_sdpa_input(query, key, value)
+    if query.size(-2) >= 32:  # use Attention Kernel
+        if query.size(-2) >= 128 & query.size(-2) % 128 != 0:
+            raise NotImplementedError("NYI: S must be <128 or a multiple of 128")
+        if key.size(-2) % 128 != 0:
+            raise NotImplementedError("NYI: L must be a multiple of 128")

     if block_mask is None:
         block_mask = _create_empty_block_mask(query, key, value)
@@ -490,14 +497,6 @@ def score_mod(
         )
         return out

-    # Some basic input validation
-    _validate_sdpa_input(query, key, value)
-    if query.size(-2) >= 32:  # use Attention Kernel
-        if query.size(-2) >= 128 & query.size(-2) % 128 != 0:
-            raise ValueError("NYI: S must be <128 or a multiple of 128")
-        if key.size(-2) % 128 != 0:
-            raise ValueError("NYI: L must be a multiple of 128")
-
     if not torch._dynamo.is_dynamo_supported():
         raise RuntimeError("flex_attention requires dynamo support.")
