
Commit 96aa07c

yf225 authored and xuhancn committed
[Inductor][PatternMatcher] Always prevent match across mutations (pytorch#130584)
Preventing match across mutations should always be the safe thing to do. This is especially important for Traceable FSDP2, because in that case we do have mutation ops (`.set_` and `.resize_(0)`) in the middle of the graph for both the joint graph and the post-grad graph, so the pattern-matcher passes must work well with middle-of-graph mutation ops.

Q: Why can't we move these mutation ops to the end of the graph to make pass writing easier?
A: We attempted that in pytorch#129852, but the custom FX passes (in `torch/_functorch/_aot_autograd/fx_passes.py`) for the re-functionalization are complicated to maintain, and the changes to the partitioner (in `torch/_functorch/partitioners.py`) also feel hacky. Hence we preserve these mutation ops in the middle of the graph to avoid that complexity.

Test commands:
- `pytest -rA test/inductor/test_pattern_matcher.py::TestPatternMatcher::test_uint4x2_mixed_mm`
- `pytest -rA test/inductor/test_pattern_matcher.py::TestPatternMatcher::test_serialized_patterns_up_to_date`

Pull Request resolved: pytorch#130584
Approved by: https://github.com/jansel
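For callers of the pattern matcher, the visible change is the constructor: `prevent_match_across_mutations` is gone, and the mutation-region check in `PatternMatcherPass.apply` is applied unconditionally. A minimal sketch of what call sites look like after this commit (the pass name below is just an illustrative example):

```python
from torch._inductor.pattern_matcher import PatternMatcherPass

# Mutation-region checking now always happens; there is nothing to opt into.
my_pass = PatternMatcherPass(pass_name="my_illustrative_pass")

# The old spelling would now fail, because the keyword argument was removed:
#   PatternMatcherPass(prevent_match_across_mutations=True)  # TypeError
```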
1 parent 22ab428 commit 96aa07c

5 files changed (+78, -37 lines)

test/inductor/test_pattern_matcher.py

Lines changed: 60 additions & 6 deletions
@@ -16,6 +16,7 @@
     Arg,
     CallFunction,
     gen_pattern,
+    is_mutation_op,
     KeywordArg,
     Match,
     PatternMatcherPass,
@@ -1000,9 +1001,7 @@ def foo(x, y):

     def test_match_with_mutation(self):
         counter = 0
-        test_pass = PatternMatcherPass(
-            prevent_match_across_mutations=True, pass_name="test"
-        )
+        test_pass = PatternMatcherPass(pass_name="test")

         @register_graph_pattern(
             CallFunction(
@@ -1159,7 +1158,7 @@ def remap_fake_tensor(x):

     def test_match_equivalent_function_invocations1(self):
         counter = 0
-        test_pass = PatternMatcherPass(prevent_match_across_mutations=True)
+        test_pass = PatternMatcherPass()

         args = [
             torch.randn(20, device="cuda"),
@@ -1215,7 +1214,7 @@ def repl(inp, x1, x2):

     def test_match_equivalent_function_invocations2(self):
         counter = 0
-        test_pass = PatternMatcherPass(prevent_match_across_mutations=True)
+        test_pass = PatternMatcherPass()

         args = [
             torch.randn(20, device="cuda"),
@@ -1260,7 +1259,7 @@ def repl(inp, x1, x2):

     def test_match_equivalent_function_invocations3(self):
         counter = 0
-        test_pass = PatternMatcherPass(prevent_match_across_mutations=True)
+        test_pass = PatternMatcherPass()

         args = [
             torch.randn(20, device="cuda"),
@@ -1371,6 +1370,61 @@ def div_softmax(x, inv_scale):
         self.common(mul_softmax, (scale, x), 0, 0)
         self.common(div_softmax, (x, scale), 0, 0)

+    def test_mutation_op_matching(self):
+        def check(type, func_name, args, kwargs, expect=True):
+            assert type in ["call_function", "call_method"]
+            graph = torch.fx.Graph()
+            getattr(graph, type)(func_name, args, kwargs)
+            res = is_mutation_op(next(iter(graph.nodes)))
+            if expect:
+                self.assertTrue(res)
+            else:
+                self.assertFalse(res)
+
+        t = torch.randn(1)
+        check("call_function", torch._C._set_grad_enabled, (False,), {})
+        check("call_method", "copy_", (t, t), {})
+        check("call_method", "relu_", (t,), {})
+        check("call_function", torch.manual_seed, (0,), {})
+        check("call_function", torch.ops.aten.set_.source_Tensor, (t, t), {})
+        check(
+            "call_function",
+            torch.amp.autocast_mode._enter_autocast,
+            ("cuda", None, True, None),
+            {},
+        )
+        check("call_function", torch.amp.autocast_mode._exit_autocast, (None,), {})
+        check(
+            "call_function",
+            torch.ops._c10d_functional.all_gather_into_tensor_out,
+            (t, 2, "0"),
+            {"out": t},
+        )
+        check("call_function", torch.ops.inductor.resize_storage_bytes_, (t, 0), {})
+        check(
+            "call_function",
+            torch.ops.inductor.resize_storage_bytes_.default,
+            (t, 0),
+            {},
+        )
+        check(
+            "call_function",
+            torch.ops.fsdp.split_with_sizes_copy,
+            (t, [64, 128, 8, 8]),
+            {"dim": 1, "out": [t, t, t, t]},
+        )
+        check("call_function", torch.ops.fsdp.set_, (t, t), {})
+        check(
+            "call_function", torch.ops.aten.__rshift__.Scalar, (t, 2), {}, expect=False
+        )
+        check(
+            "call_function",
+            torch.ops._c10d_functional.all_gather_into_tensor,
+            (t, 2, "0"),
+            {},
+            expect=False,
+        )
+

 if __name__ == "__main__":
     if IS_LINUX and HAS_CUDA:

torch/_inductor/fx_passes/b2b_gemm.py

Lines changed: 0 additions & 1 deletion
@@ -100,7 +100,6 @@ def b2b_gemm_grid(M, P, meta):


 B2B_GEMM_PASS = PatternMatcherPass(
-    prevent_match_across_mutations=True,
     pass_name="b2b_gemm_pass",
 )


torch/_inductor/fx_passes/pre_grad.py

Lines changed: 9 additions & 16 deletions
@@ -33,35 +33,28 @@
 log = logging.getLogger(__name__)

 efficient_conv_bn_eval_pass = PatternMatcherPass(
-    prevent_match_across_mutations=True, pass_name="efficient_conv_bn_eval_pass"
+    pass_name="efficient_conv_bn_eval_pass"
 )

 fuse_split_linear_add_pass = PatternMatcherPass(
-    prevent_match_across_mutations=True,
     pass_name="fuse_split_linear_add_pass",
 )
 fuse_chunk_squeeze_cat_pass = PatternMatcherPass(
-    prevent_match_across_mutations=True,
     pass_name="fuse_chunk_squeeze_cat_pass",
 )
 remove_reshape_pass = PatternMatcherPass(
-    prevent_match_across_mutations=True,
     pass_name="remove_reshape_pass",
 )

 # based on predispatch aten IR
-normalization_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True)
-merge_splits_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True)
-split_cat_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True)
-unbind_stack_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True)
-merge_getitem_cat_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True)
-merge_stack_tahn_unbind_pass_aten = PatternMatcherPass(
-    prevent_match_across_mutations=True
-)
-mutate_cat_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True)
-remove_split_with_size_one_pass_aten = PatternMatcherPass(
-    prevent_match_across_mutations=True
-)
+normalization_pass_aten = PatternMatcherPass()
+merge_splits_pass_aten = PatternMatcherPass()
+split_cat_pass_aten = PatternMatcherPass()
+unbind_stack_pass_aten = PatternMatcherPass()
+merge_getitem_cat_pass_aten = PatternMatcherPass()
+merge_stack_tahn_unbind_pass_aten = PatternMatcherPass()
+mutate_cat_pass_aten = PatternMatcherPass()
+remove_split_with_size_one_pass_aten = PatternMatcherPass()


 def save_inductor_dict(pass_to_compare=None):

torch/_inductor/fx_passes/split_cat.py

Lines changed: 0 additions & 2 deletions
@@ -67,7 +67,6 @@
     if pass_name in PRE_GRAD_FUSIONS:
         continue
     PRE_GRAD_PATTERNS[pass_name] = PatternMatcherPass(
-        prevent_match_across_mutations=True,
         pass_name=pass_name,
     )

@@ -77,7 +76,6 @@
     if pass_name in POST_GRAD_FUSIONS:
         continue
     POST_GRAD_PATTERNS[pass_name] = PatternMatcherPass(
-        prevent_match_across_mutations=True,
        pass_name=pass_name,
     )
8381

torch/_inductor/pattern_matcher.py

Lines changed: 9 additions & 12 deletions
@@ -1600,8 +1600,9 @@ def is_start_of_fx_graph(graph: torch.fx.Graph, node: torch.fx.Node) -> bool:
     return node is next(iter(graph.nodes))


-# match: copy_, relu_, _set_grad_enabled, manual_seed, enter_functional_autocast, etc
-_mutation_op_re = re.compile(r"_$|_[.]|(\b|_)(set|enter|exit|seed)(\b|_)")
+# match: copy_, relu_, _set_grad_enabled, manual_seed, _enter_autocast, etc
+# doesn't match: __rshift__, etc
+_mutation_op_re = re.compile(r"(?<!_)(_$|_[.]|(\b|_)(set|enter|exit|seed)(\b|_))(?!_)")


 def is_mutation_op(node: torch.fx.Node) -> bool:
@@ -1642,14 +1643,12 @@ def compute_mutation_region_ids(graph: torch.fx.GraphModule) -> None:
 class PatternMatcherPass:
     def __init__(
         self,
-        prevent_match_across_mutations: bool = False,
         pass_name: Optional[str] = None,
     ) -> None:
         super().__init__()
         self.patterns: DefaultDict[
             Tuple[str, torch.fx.node.Target], List[PatternEntry]
         ] = defaultdict(list)
-        self.prevent_match_across_mutations = prevent_match_across_mutations
         self.pass_name = pass_name

     def __getitem__(self, item: Tuple[str, torch.fx.node.Target]) -> List[PatternEntry]:
@@ -1667,12 +1666,11 @@ def apply(self, gm: torch.fx.GraphModule) -> int:
             raise RuntimeError(
                 f"The input to PatternMatcherPass must be a GraphModule or a Graph, but got {type(gm)}"
             )
-        if self.prevent_match_across_mutations:
-            if should_compute_mutation_region_ids(graph):
-                compute_mutation_region_ids(graph)
-            get_mutation_region_id_partial = functools.partial(
-                get_mutation_region_id, graph
-            )
+        if should_compute_mutation_region_ids(graph):
+            compute_mutation_region_ids(graph)
+        get_mutation_region_id_partial = functools.partial(
+            get_mutation_region_id, graph
+        )
         count = 0
         nodes = []
         has_call_module = False
@@ -1705,8 +1703,7 @@ def apply(self, gm: torch.fx.GraphModule) -> int:
                 m = entry.pattern.match(node)
                 # pattern match crosses mutation barrier - discard
                 if (
-                    self.prevent_match_across_mutations
-                    and is_match(m)
+                    is_match(m)
                     and len(set(map(get_mutation_region_id_partial, m.nodes))) != 1  # type: ignore[possibly-undefined]
                 ):
                     continue
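Besides making the check unconditional, the other substantive change here is the regex: wrapping the old pattern in a negative lookbehind `(?<!_)` and lookahead `(?!_)` means an underscore that is part of a double underscore no longer counts, so dunder operators such as `__rshift__` stop being classified as mutation ops while genuine in-place and state-mutating names still are. A minimal, self-contained sketch of how the new pattern behaves on a few illustrative name strings (the literals below are examples only; the real `is_mutation_op` derives the string it checks from the FX node's target):

```python
import re

# Same pattern as the new _mutation_op_re above.  The (?<!_) / (?!_) guards
# reject matches where the underscore belongs to a double underscore, so
# dunder names are not treated as mutations.
mutation_op_re = re.compile(r"(?<!_)(_$|_[.]|(\b|_)(set|enter|exit|seed)(\b|_))(?!_)")

examples = [
    ("copy_", True),                    # trailing "_": in-place tensor op
    ("relu_", True),
    ("set_.source_Tensor", True),       # "_." and "set" both hit
    ("_set_grad_enabled", True),        # global-state mutation
    ("manual_seed", True),              # RNG-state mutation
    ("__rshift__", False),              # dunder operator, not a mutation
    ("all_gather_into_tensor", False),  # functional collective, no mutation
]

for name, expected in examples:
    assert bool(mutation_op_re.search(name)) == expected, name
```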
