Commit 3c8c7e2

jansel authored and pytorchmergebot committed
[dynamo] Tweak naming for module hook bw_state (pytorch#121609)
Some minor changes not related to the other PRs in the stack.

Pull Request resolved: pytorch#121609
Approved by: https://github.com/yanboliang
1 parent 7a68e0a commit 3c8c7e2

3 files changed: +12, -6 lines

torch/_dynamo/output_graph.py
Lines changed: 2 additions & 2 deletions

@@ -433,8 +433,8 @@ def __init__(
         self.backward_state_proxy: Optional[torch.fx.Proxy] = None
         self.backward_state_var: Optional[str] = None

-    def add_backward_state_hook(self, hook: VariableTracker):
-        name = f"hook{len(self.backward_state)}"
+    def add_backward_state_hook(self, hook: VariableTracker, prefix="hook"):
+        name = f"{prefix}{len(self.backward_state)}"
         assert name not in self.backward_state
         self.backward_state[name] = hook
         return name, self.get_backward_state_proxy()
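
For orientation, a minimal standalone sketch of the naming scheme this hunk introduces. The bare dict and the lambda hooks below are illustrative only; in the PR this logic is a method on torch._dynamo.output_graph.OutputGraph that also returns a backward-state proxy. The "mod" prefix shown here is the one passed at the call site in the next file.

# Sketch only, not the real OutputGraph method.
backward_state = {}

def add_backward_state_hook(hook, prefix="hook"):
    # Key is the prefix plus the number of entries registered so far.
    name = f"{prefix}{len(backward_state)}"
    assert name not in backward_state
    backward_state[name] = hook
    return name

print(add_backward_state_hook(lambda grad: grad, "mod"))  # mod0  (module entry)
print(add_backward_state_hook(lambda grad: grad))         # hook1 (user pre-hooks)
print(add_backward_state_hook(lambda grad: grad))         # hook2 (user hooks)

With the configurable prefix, module entries get "mod*" keys while plain user hooks keep the old "hook*" naming, which appears to be the point of the tweak.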

torch/_dynamo/variables/distributed.py
Lines changed: 1 addition & 1 deletion

@@ -334,7 +334,7 @@ def _in_graph_bw_hooks(bw_state: BackwardState):
                ),
            )

-        module_name, bw_state_proxy = tx.output.add_backward_state_hook(module)
+        module_name, bw_state_proxy = tx.output.add_backward_state_hook(module, "mod")
         user_pre_hooks_name, _ = tx.output.add_backward_state_hook(user_pre_hooks)
         user_hooks_name, _ = tx.output.add_backward_state_hook(user_hooks)
         proxy = tx.output.create_proxy(

torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py
Lines changed: 9 additions & 3 deletions

@@ -10,7 +10,7 @@
 import logging
 from contextlib import nullcontext
 from functools import wraps
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Sequence

 import torch
 import torch.utils.dlpack
@@ -175,6 +175,12 @@ def rng_functionalization_wrapper(args):
     return compiled_fn


+def _output_node(gm: torch.fx.GraphModule) -> torch.fx.Node:
+    """Return the output node of a graph"""
+    # reversed() since we expect output at end of graph
+    return next(n for n in reversed(gm.graph.nodes) if n.op == "output")
+
+
 def aot_dispatch_autograd(
     flat_fn,
     flat_args: List[Any],
@@ -295,8 +301,8 @@ def aot_dispatch_autograd(
     # and we will end up with a zero grad at x.
     # If we later backprop through the second output, this will also require backprop'ing through x.
     # Meaning we'll need to use `retain_graph=True` to be able to backprop through x the second time.
-    _indices_of_inps_to_detach = []
-    bw_outs = next(n for n in bw_module.graph.nodes if n.op == "output").args[0]
+    _indices_of_inps_to_detach: List[int] = []
+    bw_outs: Sequence[torch.fx.Node] = _output_node(bw_module).args[0]  # type: ignore[assignment]

     # TODO: we should apply the below "detach inputs if their gradients are statically known to be None"
     # optimization even if we have subclass inputs/outputs (we do not handle this today).
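
A usage sketch of the new _output_node helper; the toy traced function below is illustrative and not part of the PR. Tracing with torch.fx produces a GraphModule whose last node is the graph output, which the helper finds with the same reversed() scan used in the diff.

import torch
import torch.fx


def _output_node(gm: torch.fx.GraphModule) -> torch.fx.Node:
    """Return the output node of a graph"""
    # reversed() since we expect output at end of graph
    return next(n for n in reversed(gm.graph.nodes) if n.op == "output")


def f(x):
    return x + 1


gm = torch.fx.symbolic_trace(f)
out = _output_node(gm)
print(out.op)       # "output"
print(out.args[0])  # whatever the graph returns; here the single add node

In the last hunk, bw_outs takes args[0] of the backward module's output node, i.e. the collection of backward outputs, hence the Sequence[torch.fx.Node] annotation and the type: ignore for fx's looser typing of args.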
