[ET][Memory planning] Improve greedy memory planning. #7926

Merged: 5 commits, Jan 28, 2025

8 changes: 6 additions & 2 deletions backends/vulkan/vulkan_preprocess.py
@@ -6,6 +6,8 @@

# pyre-strict

from functools import partial

from typing import Any, Dict, final, List

import executorch.backends.vulkan.utils as utils
@@ -17,7 +19,6 @@
from executorch.backends.transforms.fuse_conv_with_clamp import FuseClampPass
from executorch.backends.transforms.fuse_dequant_linear import FuseDequantLinearPass
from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform

from executorch.backends.vulkan._passes import (
insert_prepack_nodes,
RemoveLocalScalarDenseOpsTransform,
@@ -41,6 +42,8 @@
PreprocessResult,
)
from executorch.exir.backend.utils import DelegateMappingBuilder

from executorch.exir.memory_planning import greedy
from executorch.exir.pass_base import ExportPass, PassBase

from executorch.exir.passes import MemoryPlanningPass, SpecPropPass
@@ -189,11 +192,12 @@ def preprocess( # noqa: C901

# Finally, apply dynamic shape passes and memory planning pass. These passes
# must be applied only when the graph structure is finalized.
greedy_memory_planning = partial(greedy, allow_overlapping_allocations=False)
program = apply_passes(
program,
[
ConstraintBasedSymShapeEvalPass(),
MemoryPlanningPass(),
MemoryPlanningPass(memory_planning_algo=greedy_memory_planning),
],
)
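
For context, here is a minimal sketch of how the same non-overlapping greedy planner could be supplied by a user at export time instead of inside the Vulkan preprocess. Only partial(greedy, allow_overlapping_allocations=False) and MemoryPlanningPass(memory_planning_algo=...) come from this diff; the ExecutorchBackendConfig / to_executorch wiring is assumed for illustration.

# Hypothetical usage sketch -- not part of this PR. Assumes the standard
# executorch.exir export flow; adjust imports for your setup.
from functools import partial

from executorch.exir import ExecutorchBackendConfig, to_edge
from executorch.exir.memory_planning import greedy
from executorch.exir.passes import MemoryPlanningPass

# Same planner the Vulkan preprocess builds above: greedy packing with
# overlapping allocations within a shared object disabled.
non_overlapping_greedy = partial(greedy, allow_overlapping_allocations=False)

config = ExecutorchBackendConfig(
    memory_planning_pass=MemoryPlanningPass(
        memory_planning_algo=non_overlapping_greedy,
    ),
)
# executorch_program = to_edge(exported_program).to_executorch(config)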

225 changes: 199 additions & 26 deletions exir/memory_planning.py
@@ -11,7 +11,7 @@
import operator
import typing
from collections import defaultdict
from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union

import torch
@@ -117,6 +117,17 @@ def storage_overlap(cls, lhs_spec: TensorSpec, rhs_spec: TensorSpec) -> bool:

return has_overlap

@classmethod
def _debug_message_from_specs(
cls, lhs_spec: TensorSpec, rhs_spec: TensorSpec
) -> str:
message = (
f"lhs life time: {lhs_spec.lifetime}, rhs lifetime: {rhs_spec.lifetime} "
)
message += f"lhs: mem_id {lhs_spec.mem_id} storage: {lhs_spec.mem_offset}, {lhs_spec.allocated_memory} "
message += f"rhs: mem_id {rhs_spec.mem_id} storage: {rhs_spec.mem_offset}, {rhs_spec.allocated_memory}"
return message

def verify_storage_reuse(
self, allow_lifetime_and_storage_overlap: bool = False
) -> int:
@@ -159,7 +170,7 @@ def verify_storage_reuse(
lhs_spec, rhs_spec
):
raise InternalError(
f"Unexpected storage overlap: lhs {lhs_spec}, rhs {rhs_spec}"
f"Unexpected storage overlap: {Verifier._debug_message_from_specs(lhs_spec, rhs_spec)}"
)

# Check that each mem_obj_id is consistent with whether the tensors have
@@ -454,6 +465,18 @@ def update_all_tensors_lifetime(
return specs


@dataclass
class AllocationSpec:
"""
AllocationSpec is used to represent the allocation of a tensor.
"""

# The offset of the tensor in the shared object/pool.
offset: int
# TensorSpec
spec: TensorSpec


@dataclass
class SharedObject:
r"""
@@ -470,8 +493,15 @@ class SharedObject:
offset: int
# size of this shared object in bytes
size: int
# When the object is first created
first_used_index: int
# the object will be available for index (last_used_index + 1)
last_used_index: int
# list of allocations belonging to this shared object
allocations: List[AllocationSpec] = field(default_factory=list)

def __repr__(self) -> str:
return f"SharedObject(idx={self.idx}, offset={self.offset}, size={self.size}, lifetime=[{self.first_used_index, self.last_used_index}])"


def materialize_buffer(
@@ -489,35 +519,124 @@ def materialize_buffer(
return total_size


def _size_abs_dif(sobj: SharedObject, spec: TensorSpec) -> int:
def _does_not_overlap(sobj: SharedObject, spec: TensorSpec) -> bool:
r"""
Calculate the absolute different between the size of a shared object and
a tensor.
Check if a shared object and a tensor do not overlap.
"""
return abs(sobj.size - spec.allocated_memory)
for alloc in sobj.allocations:
if not (
spec.lifetime[1] < alloc.spec.lifetime[0]
or spec.lifetime[0] > alloc.spec.lifetime[1]
):
return False
return True


def _find_max_overlapping_allocations_offset(
sobj: SharedObject, spec: TensorSpec
) -> int:
max_offset = 0
for alloc in sobj.allocations:
if (
spec.lifetime[1] < alloc.spec.lifetime[0]
or spec.lifetime[0] > alloc.spec.lifetime[1]
):
continue
max_offset = max(alloc.offset + alloc.spec.allocated_memory, max_offset)
return max_offset
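
As a quick sanity check on the two helpers above, here is a standalone sketch of the same inclusive-interval logic on plain tuples instead of TensorSpec / AllocationSpec. The helper names lifetimes_overlap and next_free_offset are made up for illustration and are not part of this diff.

# Standalone illustration of the interval logic used by _does_not_overlap and
# _find_max_overlapping_allocations_offset, on plain tuples. Hypothetical
# helper names; lifetimes are inclusive (first_use, last_use) node indices.
from typing import List, Tuple

Alloc = Tuple[int, int, int, int]  # (first_use, last_use, offset, size)

def lifetimes_overlap(a: Tuple[int, int], b: Tuple[int, int]) -> bool:
    # Inclusive intervals overlap unless one ends before the other starts.
    return not (a[1] < b[0] or a[0] > b[1])

def next_free_offset(allocs: List[Alloc], lifetime: Tuple[int, int]) -> int:
    # Max (offset + size) over allocations whose lifetime overlaps `lifetime`;
    # 0 means the new tensor can sit at the start of the shared object.
    offset = 0
    for first, last, off, size in allocs:
        if lifetimes_overlap((first, last), lifetime):
            offset = max(offset, off + size)
    return offset

allocs = [(0, 3, 0, 128), (2, 5, 128, 64)]
assert next_free_offset(allocs, (4, 7)) == 192  # overlaps only the second alloc
assert next_free_offset(allocs, (6, 9)) == 0    # overlaps neither allocation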


def pick_shared_obj(
shared_objects: List[SharedObject], spec: TensorSpec
shared_objects: List[SharedObject],
spec: TensorSpec,
allow_overlapping_allocations: bool = True,
) -> SharedObject:
r"""
Pick the available shared object with closest size to the tensor.
If there are no available shared object left, create a new one.
Pick the available shared object to which to assign this spec,
or create a new one.
Algorithm details:
Previous: Look at every spec in chronological order. Check whether a previously
allocated object can fit it. If not, allocate a new object.
New:
- Sort all the specs by allocation size
- Process the specs in order
- If the spec's size is smaller than previously allocated buckets:
- Conditions under which previously allocated bucket can be used:
- Lifetime of the spec does not overlap with lifetime of the bucket.
- In this case allocate spec to that bucket and expand its lifetime.
- Spec is allocated at offset = 0 in this bucket.
- Add this spec to allocated object's list of specs.
- Lifetime of the spec overlaps with lifetime of the bucket,
partially or fully (e.g. spec's lifetime subset of bucket's lifetime)
- If none of the specs in the bucket overlaps with spec's lifetime.
- Allocate spec to the bucket at offset = 0.
- Add this spec to the bucket's list of specs.
- Expand bucket's lifetime accounting for added spec's lifetime.
- If one or more specs in the bucket overlap with the spec's lifetime:
- Collect the offsets (at which each overlapping spec is allocated in the bucket)
of all the overlapping specs, and find the max offset.
- Allocate spec to the bucket at offset = max_offset + max_offset_spec_size.
- Add this spec to the bucket's list of specs.
- Expand bucket's lifetime accounting for added spec's lifetime.
- If none of these conditions are met, allocate a new bucket.
- Add spec to this bucket.
- Update bucket's lifetime to that of the spec.
- If the spec's size is larger than previously allocated buckets, allocate a new bucket.
- Size and lifetime of this bucket is that of the spec

Proof of correctness:
- If allocating a new bucket, it is correct.
- If allocating spec to an existing bucket, whose lifetime does not overlap with any
of the previously allocated specs' lifetime, then the allocation is correct.
Proof of correctness by induction when adding spec to an existing bucket:
- If all previous allocations in the given bucket are correct:
- Then the new one being added must be correct because when the requested allocation
overlaps with one or more previous allocations, we find the largest offset among
all the overlapping allocations, and allocate the new spec at that offset. Hence,
the allocation at such an offset, will not overlap with any previous allocations.
Base case: A newly added allocation within a bucket with single allocation is correct:
because a) it must fit and b) its lifetime must not overlap with object's lifetime.
This holds true because of the following invariants:
- Once a bucket is created, it is never resized.
- All the allocations within a bucket follow this:
- The spans (offset to offset + size) of two allocations may overlap
only if their lifetimes do not overlap.
(A standalone worked example of this assignment scheme follows the function below.)
"""
# TODO: do better than linear scan
picked = None
for sobj in shared_objects:
if spec.lifetime[0] > sobj.last_used_index:
if picked is None or _size_abs_dif(sobj, spec) < _size_abs_dif(
picked, spec
):
picked = sobj
sobj.last_used_index = spec.lifetime[1]
sobj.size = max(sobj.size, spec.allocated_memory)
if _does_not_overlap(sobj, spec):
assert sobj.size >= spec.allocated_memory, "Allocation specs are not sorted"
picked = sobj
sobj.first_used_index = min(sobj.first_used_index, spec.lifetime[0])
sobj.last_used_index = max(sobj.last_used_index, spec.lifetime[1])
allocation_spec = AllocationSpec(0, spec)
picked.allocations.append(allocation_spec)
break

if picked is None and allow_overlapping_allocations:
for sobj in shared_objects:
max_offset = _find_max_overlapping_allocations_offset(sobj, spec)
if max_offset > 0:
if max_offset + spec.allocated_memory <= sobj.size:
picked = sobj
sobj.first_used_index = min(sobj.first_used_index, spec.lifetime[0])
sobj.last_used_index = max(sobj.last_used_index, spec.lifetime[1])
allocation_spec = AllocationSpec(max_offset, spec)
picked.allocations.append(allocation_spec)
break

if picked is None:
picked = SharedObject(
len(shared_objects), -1, spec.allocated_memory, spec.lifetime[1]
len(shared_objects),
-1,
spec.allocated_memory,
spec.lifetime[0],
spec.lifetime[1],
)
allocation_spec = AllocationSpec(0, spec)
picked.allocations.append(allocation_spec)
picked.first_used_index = spec.lifetime[0]
picked.last_used_index = spec.lifetime[1]
shared_objects.append(picked)

return picked
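
Below is the standalone worked example referenced in the docstring: the bucket assignment described above, re-implemented on plain tuples under simplified assumptions (specs already sorted by size descending, inclusive lifetimes). It is illustrative only, not the PR's code; names such as overlaps, buckets, and placement are invented.

# Hypothetical illustration of the assignment scheme, not the PR's code.
# Specs are (name, size, first_use, last_use), pre-sorted by size descending,
# mirroring how greedy() feeds pick_shared_obj().
from typing import Dict, List, Tuple

def overlaps(a: Tuple[int, int], b: Tuple[int, int]) -> bool:
    # Inclusive lifetime intervals overlap unless one ends before the other starts.
    return not (a[1] < b[0] or a[0] > b[1])

specs = [
    ("a", 256, 0, 2),
    ("b", 128, 1, 4),
    ("c", 128, 3, 6),
    ("d", 64, 4, 5),
]

buckets: List[dict] = []  # each bucket: {"size": int, "allocs": [(first, last, offset, size)]}
placement: Dict[str, Tuple[int, int]] = {}  # name -> (bucket index, offset)

for name, size, first, last in specs:
    chosen = offset = None
    # 1) A bucket where nothing is live at the same time: reuse it at offset 0.
    for i, bkt in enumerate(buckets):
        if size <= bkt["size"] and all(
            not overlaps((first, last), (f, l)) for f, l, _, _ in bkt["allocs"]
        ):
            chosen, offset = i, 0
            break
    # 2) Otherwise stack above the overlapping allocations, if it still fits.
    if chosen is None:
        for i, bkt in enumerate(buckets):
            top = max(
                (off + sz for f, l, off, sz in bkt["allocs"] if overlaps((first, last), (f, l))),
                default=0,
            )
            if 0 < top and top + size <= bkt["size"]:
                chosen, offset = i, top
                break
    # 3) Otherwise open a new bucket sized for this spec.
    if chosen is None:
        buckets.append({"size": size, "allocs": []})
        chosen, offset = len(buckets) - 1, 0
    buckets[chosen]["allocs"].append((first, last, offset, size))
    placement[name] = (chosen, offset)

print(placement)
# {'a': (0, 0), 'b': (1, 0), 'c': (0, 0), 'd': (0, 128)}
# Two buckets of 256 and 128 bytes (384 total); 'd' overlaps only 'c', so it is
# stacked at offset 128 inside bucket 0 instead of forcing a third bucket.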
@@ -550,13 +669,50 @@ def get_node_tensor_specs(
]


# A little hacky: check whether the graph contains an XNNPACK delegate.
# Why? See the 64-byte padding note in greedy() below.


def _contains_xnnpack_delegate(graph_module: torch.fx.GraphModule) -> bool:
for node in graph_module.graph.nodes:
if node.target == executorch_call_delegate:
lowered_module = getattr(
graph_module.graph.owning_module, node.args[0].target
)
if "xnnpack" in lowered_module.backend_id.lower():
return True
return False


def greedy(
graph_module: torch.fx.GraphModule,
alignment: int,
graph_signature: Optional[ExportGraphSignature] = None,
alloc_graph_input: bool = True,
alloc_graph_output: bool = True,
allow_overlapping_allocations: bool = True,
) -> List[int]:
r"""Greedy algorithm to allocate memory for tensors in the graph.
alloc_graph_input: If set to true, the algorithm will allocate memory for graph input.
alloc_graph_output: If set to true, the algorithm will allocate memory for graph output.
allow_overlapping_allocations: If set to true, allows for allocations that overlap
in their lifetime but are at different offsets in the storage. By default true.
This flag is added to allow Vulkan to use MemoryPlanningPass with overlapping
allocations disabled.
"""
# Pad allocations with 64 bytes.
# This requirement really comes from the XNNPACK backend, which can read
# beyond the end of a tensor as a performance optimization.
# While accounting for a backend-specific requirement is not the right choice
# in backend-agnostic memory planning, we do it here as it seems most appropriate.
# Right now this applies to greedy only, so any other
# algorithm that plans memory for the XNNPACK backend will
# not have this padding.
extra_padded_bytes = 0
if _contains_xnnpack_delegate(graph_module):
extra_padded_bytes = 64
spec2obj = {}
shared_objects = defaultdict(list)
# Don't do assertion in collect_specs_from_nodes if we have already encountered
@@ -565,24 +721,34 @@
# For each tensor, pick the available shared object with closest size to
# the tensor. If there are no available shared object left, create a new
# one.
import bisect

sorted_specs = []
for spec in collect_specs_from_nodes(
graph_module.graph.nodes,
graph_signature,
do_assertion=do_assertion,
ignore_graph_input=not alloc_graph_input,
ignore_graph_output=not alloc_graph_output,
):
bisect.insort(sorted_specs, spec, key=lambda x: x.allocated_memory)
sorted_specs.reverse()

for spec in sorted_specs:
if spec.mem_id is None:
spec.mem_id = 1
spec.realign(alignment)
spec2obj[spec] = pick_shared_obj(shared_objects[spec.mem_id], spec)
spec2obj[spec] = pick_shared_obj(
shared_objects[spec.mem_id], spec, allow_overlapping_allocations
)

if len(shared_objects) == 0:
# Cannot find any tensor in the graph that needs to be allocated.
# Return [0, 0] to be consistent with default behavior of naive.
total_sizes = [0, 0]
else:
total_sizes = [0] * (max(shared_objects.keys()) + 1)
num_specs_processed = 0
for mem_id in shared_objects:
input_total_size = 0
if bufsizes := getattr(graph_module, "input_mem_buffer_sizes", None):
@@ -594,13 +760,20 @@
total_sizes[mem_id] = materialize_buffer(
shared_objects[mem_id], input_total_size
)

# Since we now know the number of shared objects we need and the size of
# each shared object, we can assign offset in the memory buffer for each
# shared object.
for spec, sobj in spec2obj.items():
spec.mem_obj_id = sobj.idx
spec.mem_offset = sobj.offset
total_sizes[mem_id] += extra_padded_bytes

# Since we now know the number of shared objects we need and the size of
# each shared object, we can assign offset in the memory buffer for each
# shared object.
for sobj in shared_objects[mem_id]:
for alloc in sobj.allocations:
spec = alloc.spec
alloc.spec.mem_obj_id = sobj.idx
alloc.spec.mem_offset = sobj.offset + alloc.offset
num_specs_processed += 1
assert (
len(spec2obj) == num_specs_processed
), f"All specs should be processed but there were {len(spec2obj)} specs and processed {num_specs_processed} specs"

logging.debug(f"greedy algorithm returns bufsizes: {total_sizes}")
return total_sizes
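
One note on the ordering above: inserting each spec with bisect.insort keyed on allocated_memory and then reversing processes tensors from largest to smallest, which is what makes the "Allocation specs are not sorted" assertion in pick_shared_obj safe for existing buckets. A tiny standalone check of that ordering, with made-up sizes:

# Minimal sketch (not the PR's code) of the descending-size ordering in greedy().
# Sizes are made up; bisect.insort(..., key=...) requires Python 3.10+.
import bisect

sizes = [64, 256, 128, 32, 128]  # stand-ins for spec.allocated_memory

ordered = []
for s in sizes:
    bisect.insort(ordered, s, key=lambda x: x)  # ascending insert, as in greedy()
ordered.reverse()  # largest first

assert ordered == sorted(sizes, reverse=True) == [256, 128, 128, 64, 32]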