Update on "[ET][Memory planning] Improve greedy memory planning."
This diff replaces the old greedy algorithm, which produced plans about 35% worse than the theoretical optimum. This matters even more for long-context models, where the additional overhead can be a few hundred MB.

For example, the theoretical optimum for llama3_2 8B, a 4-bit quantized model with a context length of 2k, needs about 1 GB of memory. This theoretical peak can be observed by looking at the peaks in the memory profile. The current algorithm resulted in about 1.6 GB of planned memory; the new algorithm reduces that to about 1.1 GB.
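To make the numbers concrete, here is a minimal sketch of greedy lifetime-aware memory planning: each tensor is placed at the lowest offset in a shared arena that does not collide, in both address range and lifetime, with already-placed tensors. The `Spec` type, the size-descending order, and the lowest-offset placement policy are illustrative assumptions, not the actual ExecuTorch implementation.

```python
from dataclasses import dataclass

@dataclass
class Spec:
    size: int   # bytes
    start: int  # index of first op that uses the tensor
    end: int    # index of last op that uses the tensor

def greedy_plan(specs):
    """Place each tensor (largest first) at the lowest arena offset that
    does not overlap any previously placed tensor with an overlapping
    lifetime. Returns the total planned arena size."""
    placed = []  # list of (offset, spec)
    total = 0
    for spec in sorted(specs, key=lambda s: s.size, reverse=True):
        # Address ranges blocked by tensors alive at the same time.
        blocked = sorted(
            (off, off + p.size)
            for off, p in placed
            if not (p.end < spec.start or spec.end < p.start)
        )
        offset = 0
        for lo, hi in blocked:
            if offset + spec.size <= lo:
                break  # found a gap before this blocked range
            offset = max(offset, hi)
        placed.append((offset, spec))
        total = max(total, offset + spec.size)
    return total

# Two large tensors with disjoint lifetimes share one slot; only the
# small tensor that overlaps both needs extra space.
specs = [Spec(100, 0, 1), Spec(100, 2, 3), Spec(50, 1, 2)]
print(greedy_plan(specs))  # 150, vs 250 if every tensor got its own slot
```

Reusing address ranges across non-overlapping lifetimes is what closes the gap toward the theoretical peak, which is just the maximum total size of tensors alive at any single point in the graph.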
Differential Revision: [D68448332](https://our.internmc.facebook.com/intern/diff/D68448332/)
[ghstack-poisoned]