
Commit 26f9efa

sxu authored and facebook-github-bot committed
Support custom mem_id in MemoryPlanningPass algos
Summary: Allow customizing memory pools while still leveraging the existing memory planning algos. The algorithms still default to mem_id 1.

Reviewed By: ydwu4

Differential Revision: D48159257

fbshipit-source-id: 96ddc78f52f42e0de6f3b08feacdad188718c18a
1 parent: 5462df7

3 files changed: +124 −15 lines
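Before the diffs, the usage pattern this unlocks — condensed from the CustomPoolMemoryPlanningPass test added in exir/tests/test_memory_planning.py below, with the submodule walk and mul handling elided, and with the MemoryPlanningPass, GraphModule, and PassResult imports as in the test file — is a MemoryPlanningPass subclass that tags specs with a pool id and then hands off to the stock algorithm:

class CustomPoolMemoryPlanningPass(MemoryPlanningPass):
    def call(self, graph_module: GraphModule) -> PassResult:
        for node in graph_module.graph.nodes:
            if node.op == "placeholder":
                node.meta["spec"].mem_id = 1   # inputs go to pool 1
            elif node.op == "call_function" and node.target == torch.ops.aten.add.out:
                node.meta["spec"].mem_id = 3   # add outputs go to pool 3
        # Specs left untagged still default to mem_id 1 inside the algorithm.
        return super().call(graph_module)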

exir/memory_planning.py (20 additions, 15 deletions)

@@ -464,7 +464,7 @@ def greedy(
     alloc_graph_output: bool = True,
 ) -> List[int]:
     spec2obj = {}
-    shared_objects = []
+    shared_objects = defaultdict(list)
     # Don't do assertion in collect_specs_from_nodes if we have already encountered
     # and ignored some to_out_variant errors.
     do_assertion = not getattr(graph_module, "encounter_to_out_var_failure", False)
@@ -477,23 +477,29 @@ def greedy(
         ignore_graph_input=not alloc_graph_input,
         ignore_graph_output=not alloc_graph_output,
     ):
-        spec.mem_id = 1
+        if spec.mem_id is None:
+            spec.mem_id = 1
         spec.realign(alignment)
-        spec2obj[spec] = pick_shared_obj(shared_objects, spec)
-
-    input_total_size = 0
-    if bufsizes := getattr(graph_module, "input_mem_buffer_sizes", None):
-        input_total_size = bufsizes[1]
+        spec2obj[spec] = pick_shared_obj(shared_objects[spec.mem_id], spec)
+
+    total_sizes = [0] * (max(shared_objects.keys()) + 1)
+    for mem_id in shared_objects:
+        input_total_size = 0
+        if bufsizes := getattr(graph_module, "input_mem_buffer_sizes", None):
+            if len(bufsizes) > mem_id:
+                input_total_size = bufsizes[mem_id]
+        total_sizes[mem_id] = materialize_buffer(
+            shared_objects[mem_id], input_total_size
+        )

     # Since we now know the number of shared objects we need and the size of
     # each shared object, we can assign offset in the memory buffer for each
     # shared object.
-    total_size = materialize_buffer(shared_objects, input_total_size)
     for spec, sobj in spec2obj.items():
         spec.mem_offset = sobj.offset

-    logging.debug(f"greedy algorithm returns bufsizes: {total_size}")
-    return [0, total_size]
+    logging.debug(f"greedy algorithm returns bufsizes: {total_sizes}")
+    return total_sizes


 @register_algo
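In isolation, greedy's new bookkeeping is: one shared-object free list per mem_id, and one output size per mem_id, with total_sizes indexed by mem_id (hence the max(...) + 1 sizing). The following is a self-contained toy of that idea — SharedObj, pick_obj, and the offset loop are illustrative stand-ins, not executorch's SharedObject, pick_shared_obj, or materialize_buffer:

from collections import defaultdict
from typing import List

class SharedObj:
    def __init__(self, size: int) -> None:
        self.size, self.offset, self.free = size, 0, True

def pick_obj(pool: List[SharedObj], size: int) -> SharedObj:
    for obj in pool:  # reuse a freed object in this pool if one fits
        if obj.free and obj.size >= size:
            obj.free = False
            return obj
    obj = SharedObj(size)  # otherwise grow this pool
    obj.free = False
    pool.append(obj)
    return obj

pools = defaultdict(list)   # keyed by mem_id, as in the diff
a = pick_obj(pools[1], 4)   # lands in pool 1
b = pick_obj(pools[3], 4)   # lands in pool 3, independent of pool 1
a.free = True               # a's lifetime ends
e = pick_obj(pools[1], 4)   # reuses a's storage within pool 1
assert e is a

total_sizes = [0] * (max(pools.keys()) + 1)   # indexed by mem_id
for mem_id, pool in pools.items():
    offset = 0
    for obj in pool:        # stand-in for materialize_buffer
        obj.offset, offset = offset, offset + obj.size
    total_sizes[mem_id] = offset
print(total_sizes)          # [0, 4, 0, 4]

Because each pool keeps its own free list, a tensor in pool 3 can never alias storage with a tensor in pool 1, which is the point of the feature.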
@@ -506,10 +512,8 @@ def naive(
     # allocate 'allocated' bytes from buffer with id mem_id.
     # return the starting offset of the allocated buffer.
     def _allocate_buf(bufsizes: List[int], mem_id: int, allocated: int) -> int:
-        internal_assert(
-            mem_id >= 0 and mem_id < len(bufsizes),
-            f"Tensor mem_id should be between 0 and {len(bufsizes)}, but it was {mem_id}",
-        )
+        if mem_id >= len(bufsizes):
+            bufsizes.extend([0] * (mem_id - len(bufsizes) + 1))
         ret = bufsizes[mem_id]
         bufsizes[mem_id] += allocated
         return ret
@@ -525,7 +529,8 @@ def _allocate_buf(bufsizes: List[int], mem_id: int, allocated: int) -> int:
         ignore_graph_output=not alloc_graph_output,
     ):
         # assume a single memory layer which has mem_id 1
-        spec.mem_id = 1
+        if spec.mem_id is None:
+            spec.mem_id = 1
         # allocate spec.allocated_memory bytes in the buffer
         # with the corresponding mem_id
         spec.realign(alignment)
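The naive algorithm gets the same flexibility through _allocate_buf, which now grows bufsizes on demand instead of asserting that mem_id is already in range. Exercising that function standalone (same body as in the diff):

from typing import List

def _allocate_buf(bufsizes: List[int], mem_id: int, allocated: int) -> int:
    # Grow the list so mem_id is a valid index; new pools start at size 0.
    if mem_id >= len(bufsizes):
        bufsizes.extend([0] * (mem_id - len(bufsizes) + 1))
    ret = bufsizes[mem_id]
    bufsizes[mem_id] += allocated
    return ret

bufsizes = [0, 0]
assert _allocate_buf(bufsizes, 3, 16) == 0   # pool 3 created on demand
assert _allocate_buf(bufsizes, 3, 8) == 16   # bump-allocated within pool 3
assert bufsizes == [0, 0, 0, 24]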

exir/tests/TARGETS (4 additions, 0 deletions)

@@ -171,12 +171,16 @@ python_unittest(
     preload_deps = [
         "//executorch/kernels/portable:custom_ops_generated_lib",
     ],
+    # Static listing does not support tests generated with parameterized
+    supports_static_listing = False,
     deps = [
+        "fbsource//third-party/pypi/parameterized:parameterized",
         ":asr_joiner",
         "//caffe2:torch",
         "//executorch/backends/qnnpack/partition:qnnpack_partitioner",
         "//executorch/exir:lib",
         "//executorch/exir:memory_planning",
+        "//executorch/exir:pass_base",
         "//executorch/exir:pass_manager",
         "//executorch/exir:print_program",
         "//executorch/exir:schema",

exir/tests/test_memory_planning.py (100 additions, 0 deletions)

@@ -18,6 +18,7 @@
 from executorch.backends.qnnpack.partition.qnnpack_partitioner import QnnpackPartitioner
 from executorch.exir.backend.backend_api import to_backend, validation_disabled
 from executorch.exir.memory_planning import filter_nodes, Verifier
+from executorch.exir.pass_base import PassResult
 from executorch.exir.pass_manager import PassManager
 from executorch.exir.passes import ( # noqa
     ConstPropPass,
@@ -29,6 +30,7 @@
 )
 from executorch.exir.print_program import print_program
 from executorch.exir.tests.asr_joiner import ASRJoiner
+from parameterized import parameterized

 from torch import nn
 from torch.ao.quantization import ( # @manual=//caffe2:torch
@@ -157,6 +159,47 @@ def extra_check(
     testcase.assertTrue(getitem_spec.lifetime[1] >= cat_specs[0].lifetime[0])


+class CustomPoolMemoryPlanningPass(MemoryPlanningPass):
+    def call(self, graph_module: GraphModule) -> PassResult:
+        for subgm in graph_module.modules():
+            if not isinstance(subgm, GraphModule):
+                continue
+            for node in subgm.graph.nodes:
+                # mem_id = 1 for placeholders and outputs of mul
+                # mem_id = 3 for outputs of add
+                # the parent class will copy the specs to the alloc nodes
+                if node.op == "placeholder":
+                    node.meta["spec"].mem_id = 1
+                    continue
+
+                if node.op != "call_function":
+                    continue
+
+                if node.target == torch.ops.aten.add.out:
+                    node.meta["spec"].mem_id = 3
+                elif node.target == torch.ops.aten.mul.out:
+                    node.meta["spec"].mem_id = 1
+
+        return super().call(graph_module)
+
+
+class MultiplePoolsToyModel(torch.nn.Module):
+    def forward(self, a: torch.Tensor) -> torch.Tensor:
+        # a: mem_id = 1, offset = 0
+        # b: mem_id = 3, offset = 0
+        # c: mem_id = 1, offset = 4
+        # d: mem_id = 3, offset = 4
+        # greedy:
+        # e: mem_id = 1, offset = 0
+        # naive:
+        # e: mem_id = 1, offset = 8
+        b = a + a
+        c = a * b
+        d = c + b
+        e = c * d
+        return e
+
+
 def maketest(
     module_cls: Type[torch.nn.Module],
     criteria: Optional[List[Tuple[str, bool]]] = None,
@@ -463,3 +506,60 @@ def test_asr_joiner(self) -> None:
         )

         self.assertEqual(3, ncheck)
+
+    # pyre-ignore
+    @parameterized.expand(
+        [
+            (
+                "naive",
+                [(1, 0), (3, 0), (1, 4), (3, 4), (1, 8)],
+                [0, 12, 0, 8],
+            ),
+            (
+                "greedy",
+                [(1, 0), (3, 0), (1, 4), (3, 4), (1, 0)],
+                [0, 8, 0, 8],
+            ),
+        ]
+    )
+    def test_multiple_pools(
+        self,
+        algo: str,
+        expected_allocs: List[Tuple[int, int]],
+        expected_bufsizes: List[int],
+    ) -> None:
+        edge_program = exir.capture(
+            MultiplePoolsToyModel(),
+            (torch.ones(1),),
+            exir.CaptureConfig(pt2_mode=True),
+        ).to_edge(exir.EdgeCompileConfig(_check_ir_validity=False))
+
+        program = edge_program.to_executorch(
+            exir.ExecutorchBackendConfig(
+                memory_planning_pass=CustomPoolMemoryPlanningPass(
+                    memory_planning_algo=algo,
+                    alignment=1,
+                )
+            )
+        )
+        graph_module = program.dump_graph_module()
+
+        verifier = Verifier(
+            graph_module,
+            alloc_graph_input=True,
+            alloc_graph_output=True,
+        )
+        verifier.verify_storage_reuse()
+        verifier.verify_graph_input_output()
+
+        idx = 0
+        for node in graph_module.graph.nodes:
+            if node.op == "placeholder" or (
+                node.op == "call_function"
+                and node.target in (torch.ops.aten.add.out, torch.ops.aten.mul.out)
+            ):
+                mem_id, mem_offset = expected_allocs[idx]
+                self.assertEqual(node.meta["spec"].mem_id, mem_id)
+                self.assertEqual(node.meta["spec"].mem_offset, mem_offset)
+                idx += 1
+        self.assertEqual(graph_module.meta["non_const_buffer_sizes"], expected_bufsizes)
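The expected numbers follow from each tensor in the toy model being a single float (4 bytes, given the torch.ones(1) input and alignment=1). Pool 1 holds a, c, e and pool 3 holds b, d. The naive algorithm bump-allocates every tensor, so pool 1 is a@0, c@4, e@8 (12 bytes) and pool 3 is b@0, d@4 (8 bytes), giving [0, 12, 0, 8]. Greedy sees that a is dead once c = a * b has run, so e reuses a's slot at offset 0 and pool 1 shrinks to 8 bytes, giving [0, 8, 0, 8]. Pools 0 and 2 are unused and stay at size 0.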
