Initial Implementation of MediaTek Backend for Executorch #3571

Merged: 42 commits (Aug 14, 2024)

Commits (42):
cce2ef1 - MediaTek Neuron ExecuTorch Backend (neuropilot-captain, May 8, 2024)
c2fefd0 - Set NeuronDelegateSetting from compile specs (neuropilot-captain, May 10, 2024)
eac37ac - Add MediaTek Llama runner (neuropilot-captain, May 10, 2024)
345181c - Merge branch 'pytorch:main' into main (neuropilot-captain, May 17, 2024)
baa359b - Fix typos and correct variable references in MediaTek backend (neuropilot-captain, May 22, 2024)
a2d505c - Update README.md (neuropilot-captain, May 22, 2024)
04f14b9 - Update README.md (neuropilot-captain, May 22, 2024)
d671511 - Update README.md (neuropilot-captain, May 23, 2024)
dc59f82 - Update README.md (neuropilot-captain, May 23, 2024)
3fb9461 - Update README.md (neuropilot-captain, May 23, 2024)
2524c11 - Merge branch 'pytorch:main' into main (neuropilot-captain, May 23, 2024)
22d0e16 - Merge branch 'pytorch:main' into main (neuropilot-captain, May 29, 2024)
151e6fc - Refactor NeuronBackend constants and registration (neuropilot-captain, May 29, 2024)
216c680 - Fix comment (neuropilot-captain, Jun 4, 2024)
e8e1f52 - Add MTK AoT backend and AoT Flow with llama as example (neuropilot-captain, Jul 5, 2024)
1b074ce - Merge branch 'pytorch:main' into main (neuropilot-captain, Jul 9, 2024)
527971e - Fix 1t model export bug (neuropilot-captain, Jul 14, 2024)
a56aeef - Move AOT code to `backends/mediatek` and add skip ops mechanism (neuropilot-captain, Jul 16, 2024)
7db85b8 - Add calibration flow and update compile options (neuropilot-captain, Jul 22, 2024)
af61e4f - Update llama runner (neuropilot-captain, Jul 23, 2024)
09aea5d - Update llama runner sample run script (neuropilot-captain, Jul 24, 2024)
ec9f7a2 - Add embedding bin dumping for cmdline (neuropilot-captain, Jul 24, 2024)
5c81d7f - Upload llama3 8B instruct model config and tokenizer config files for… (neuropilot-captain, Jul 25, 2024)
863d698 - Update README.md (neuropilot-captain, Jul 26, 2024)
5ecc00b - Add `op_names_to_skip` argument to NeuropilotPartitioner (neuropilot-captain, Jul 29, 2024)
9c53f4b - Add script to build MediaTek examples. (Aug 5, 2024)
b2d116e - Leverage temp allocator for memory management (Aug 6, 2024)
208bfce - Update README and remove mtk_neuron and mtk_converter from requiremen… (neuropilot-captain, Aug 6, 2024)
6c1a0a9 - Respect the model input/output data types (quantized or float) in Neu… (neuropilot-captain, Aug 6, 2024)
5de3a4a - Update README.md (neuropilot-captain, Aug 7, 2024)
263a92a - Replace delete with destructor in backend destroy (Aug 6, 2024)
5dcdb83 - Fix neuron backend linked library (neuropilot-captain, Aug 8, 2024)
b2c155c - Use temp allocator in neuron backend (neuropilot-captain, Aug 8, 2024)
04fb790 - Add missing module import for `FakeQuantize` class (neuropilot-captain, Aug 9, 2024)
68f72a5 - Fix Python linter error (neuropilot-captain, Aug 9, 2024)
37a56bf - Update mask_builder.h (neuropilot-captain, Aug 9, 2024)
58a75f8 - Fix lint errors of newlines (Aug 9, 2024)
9f97b3a - fix tokenizer lint error (neuropilot-captain, Aug 12, 2024)
2bc3dee - Apply lintrunner patches (neuropilot-captain, Aug 12, 2024)
1f9b610 - fix argument mismatch (neuropilot-captain, Aug 12, 2024)
8079d71 - Merge branch 'pytorch:main' into main (neuropilot-captain, Aug 13, 2024)
7786412 - Merge branch 'pytorch:main' into main (neuropilot-captain, Aug 14, 2024)
6 changes: 6 additions & 0 deletions CMakeLists.txt
@@ -179,6 +179,8 @@ option(EXECUTORCH_BUILD_GTESTS "Build googletest based test binaries" OFF)

option(EXECUTORCH_BUILD_MPS "Build the MPS backend" OFF)

option(EXECUTORCH_BUILD_NEURON "Build the backends/mediatek directory" OFF)

option(EXECUTORCH_BUILD_PYBIND "Build the Python Bindings" OFF)

option(EXECUTORCH_BUILD_QNN "Build the Qualcomm backend" OFF)
@@ -624,6 +626,10 @@ if(EXECUTORCH_BUILD_EXTENSION_MODULE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
endif()

if(EXECUTORCH_BUILD_NEURON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/mediatek)
endif()

if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util)
endif()
1 change: 1 addition & 0 deletions LICENSE
@@ -6,6 +6,7 @@ Copyright (c) Meta Platforms, Inc. and affiliates.
Copyright 2023 Arm Limited and/or its affiliates.
Copyright (c) Qualcomm Innovation Center, Inc.
Copyright (c) 2023 Apple Inc.
Copyright (c) 2024 MediaTek Inc.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
50 changes: 50 additions & 0 deletions backends/mediatek/CMakeLists.txt
@@ -0,0 +1,50 @@
#[[
/*
* Copyright (c) 2024 MediaTek Inc.
*
* Licensed under the BSD License (the "License"); you may not use this file
* except in compliance with the License. See the license file in the root
* directory of this source tree for more details.
*/
]]

# Let include paths be referenced as "executorch/..."
set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
set(NEURON_BUFFER_ALLOCATOR_LIB "" CACHE PATH "Path to Neuron Buffer Allocator library")
message(STATUS "Looking for neuron_buffer_allocator in ${NEURON_BUFFER_ALLOCATOR_LIB}")

include_directories(
BEFORE
${_common_include_directories}
)

# shortcut include directory for neuron headers
include_directories(
BEFORE
${CMAKE_CURRENT_SOURCE_DIR}/runtime/include
)

# targets
add_library(neuron_backend SHARED)
target_link_libraries(neuron_backend
PRIVATE
executorch_no_prim_ops
android
log
${NEURON_BUFFER_ALLOCATOR_LIB}
)
target_sources(neuron_backend
INTERFACE
${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBackend.h
${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBufferAllocator.h
${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronExecutor.h
${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronLog.h
${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/APUWareUtilsLib.h
${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/NeuronAdapterShim.h
PRIVATE
${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronBackend.cpp
${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronExecutor.cpp
)
target_link_options_shared_lib(neuron_backend)

install(TARGETS neuron_backend DESTINATION lib)
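
Note: to pull this backend into the top-level build, the new EXECUTORCH_BUILD_NEURON option is turned on at configure time (for example, -DEXECUTORCH_BUILD_NEURON=ON), and NEURON_BUFFER_ALLOCATOR_LIB should point at the Neuron buffer allocator shared library shipped with the MediaTek NeuroPilot SDK. Both names come from the diffs above; the exact library filename depends on the SDK installation.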
5 changes: 5 additions & 0 deletions backends/mediatek/__init__.py
@@ -0,0 +1,5 @@
from .partitioner import NeuropilotPartitioner
from .preprocess import NeuropilotBackend
from .quantizer import NeuropilotQuantizer, Precision

__all__ = ["NeuropilotBackend", "NeuropilotPartitioner", "NeuropilotQuantizer", "Precision"]
101 changes: 101 additions & 0 deletions backends/mediatek/partitioner.py
@@ -0,0 +1,101 @@
# Copyright (c) 2024 MediaTek Inc.
#
# Licensed under the BSD License (the "License"); you may not use this file
# except in compliance with the License. See the license file in the root
# directory of this source tree for more details.

from typing import Callable, final, List, Optional, Tuple

import torch
from executorch.backends.mediatek.preprocess import NeuropilotBackend
from executorch.exir.backend.backend_details import CompileSpec
from executorch.exir.backend.partitioner import (
DelegationSpec,
Partitioner,
PartitionResult,
)
from executorch.exir.backend.utils import tag_constant_data

from mtk_converter.python.converters.pytorch import importer_v2
from torch.export.exported_program import ExportedProgram
from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
from torch.fx.passes.operator_support import OperatorSupportBase


class NeuropilotOperatorsSupport(OperatorSupportBase):

def __init__(
self,
op_types_to_skip: Optional[set] = None,
op_names_to_skip: Optional[set] = None,
) -> None:
if op_types_to_skip is None:
op_types_to_skip = set()
if op_names_to_skip is None:
op_names_to_skip = set()

self._op_types_to_skip = op_types_to_skip
self._op_names_to_skip = op_names_to_skip

def is_node_supported(self, _, node: torch.fx.Node) -> bool:
# Handle 'call_function' only, because 'placeholder' and 'output' nodes cannot be tagged.
# Ref: https://github.com/pytorch/executorch/pull/1398
if node.op != "call_function":
return False

op_type = node.target.__name__
if op_type in self._op_types_to_skip or node.name in self._op_names_to_skip:
print(
f"[Neuropilot Backend] The {op_type} operator with name '{node.name}' is skipped."
)
return False

return importer_v2.is_fx_node_supported(node)


@final
class NeuropilotPartitioner(Partitioner):

def __init__(
self,
compile_spec: List[CompileSpec],
op_types_to_skip: Optional[set] = None,
op_names_to_skip: Optional[set] = None,
) -> None:
self.delegation_spec = DelegationSpec(NeuropilotBackend.__name__, compile_spec)
self._op_types_to_skip = op_types_to_skip
self._op_names_to_skip = op_names_to_skip

def ops_to_not_decompose(
self,
ep: ExportedProgram,
) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
ops_not_decompose = [
torch.ops.aten.pixel_shuffle.default,
torch.ops.aten.upsample_bilinear2d.default,
torch.ops.aten.upsample_bilinear2d.vec,
torch.ops.aten.upsample_nearest2d.default,
torch.ops.aten.upsample_nearest2d.vec,
]
return (ops_not_decompose, None)

def partition(self, exported_program: ExportedProgram) -> PartitionResult:
capability_partitioner = CapabilityBasedPartitioner(
exported_program.graph_module,
NeuropilotOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip),
allows_single_node_partition=True,
)
partition_list = capability_partitioner.propose_partitions()

partition_tags = {}
for partition in partition_list:
for node in partition.nodes:
tag = f"tag{partition.id}"
node.meta["delegation_tag"] = tag
partition_tags[tag] = self.delegation_spec

tag_constant_data(exported_program)

return PartitionResult(
tagged_exported_program=exported_program, partition_tags=partition_tags
)
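
For orientation, here is a minimal sketch of how this partitioner slots into the usual ExecuTorch lowering flow. It assumes the MediaTek NeuroPilot SDK (mtk_converter) is installed; the empty compile-spec list and the skipped node name below are placeholders, not values taken from this PR.

import torch
from executorch.backends.mediatek import NeuropilotPartitioner
from executorch.exir import to_edge


class AddRelu(torch.nn.Module):
    def forward(self, x, y):
        return torch.nn.functional.relu(x + y)


example_inputs = (torch.randn(1, 8), torch.randn(1, 8))
exported = torch.export.export(AddRelu(), example_inputs)

# Lower to the Edge dialect, then delegate every subgraph the Neuropilot
# backend supports, skipping any node whose name is listed explicitly.
edge = to_edge(exported)
edge = edge.to_backend(
    NeuropilotPartitioner(compile_spec=[], op_names_to_skip={"aten_add_tensor"})
)
executorch_program = edge.to_executorch()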
Empty file.
@@ -0,0 +1,79 @@
# Copyright (c) 2024 MediaTek Inc.
#
# Licensed under the BSD License (the "License"); you may not use this file
# except in compliance with the License. See the license file in the root
# directory of this source tree for more details.

import torch

from executorch.exir.pass_base import ExportPass, PassResult
from torch._decomp import get_decompositions
from torch.fx import Graph
from torch.fx.experimental.proxy_tensor import make_fx


def _get_input_node_names(graph: Graph):
input_names = []
for node in graph.nodes:
if node.op == "placeholder":
input_names.append(node.name)
return input_names


class DecomposeScaledDotProductAttention(ExportPass):
"""Decompose the single SDPA operator."""

def call(self, graph_module: torch.fx.GraphModule):
graph = graph_module.graph
for node in graph.nodes:
if node.target != torch.ops.aten.scaled_dot_product_attention.default:
continue

decom_mappings = get_decompositions(
[torch.ops.aten._scaled_dot_product_flash_attention_for_cpu.default]
)
input_tensors = (arg.meta["val"] for arg in node.args)
decomposed_module = make_fx(node.target, decom_mappings, "fake", True)(
*input_tensors
)
decomposed_input_names = _get_input_node_names(decomposed_module.graph)
with graph.inserting_before(node):
name_to_input_tensor_map = {}
for idx, arg in enumerate(node.args):
name_to_input_tensor_map[decomposed_input_names[idx]] = arg

decomposed_node_to_subgraph_node = {}
for decomposed_node in decomposed_module.graph.nodes:
if decomposed_node.op == "placeholder":
decomposed_node_to_subgraph_node[decomposed_node] = (
name_to_input_tensor_map[decomposed_node.name]
)

# Copy node from decompose graph module
for decomposed_node in decomposed_module.graph.nodes:
if decomposed_node.op == "placeholder":
continue
if decomposed_node.op == "output":
for user in node.users.copy():
new_node = decomposed_node_to_subgraph_node[
decomposed_node.args[0]
]
user.replace_input_with(node, new_node)
continue

subgraph_node = graph.node_copy(
decomposed_node,
arg_transform=lambda x, d=decomposed_node_to_subgraph_node: d[
x
],
)
subgraph_node.meta["source_fn_stack"] = [
(subgraph_node, subgraph_node.target)
]
decomposed_node_to_subgraph_node[decomposed_node] = subgraph_node

graph.erase_node(node)

graph.eliminate_dead_code()
graph_module.recompile()
return PassResult(graph_module, True)
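
As a usage sketch, an ExportPass like this one is callable and returns a PassResult carrying the rewritten graph. The import path below is hypothetical, since this excerpt does not show the pass's filename.

import torch

# NOTE: hypothetical import path; the diff above does not name the module file.
from executorch.backends.mediatek.passes import DecomposeScaledDotProductAttention


def decompose_sdpa(graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
    # Calling the pass instance runs `call()` above and wraps the rewritten
    # graph in a PassResult.
    result = DecomposeScaledDotProductAttention()(graph_module)
    return result.graph_module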
73 changes: 73 additions & 0 deletions backends/mediatek/preprocess.py
@@ -0,0 +1,73 @@
# Copyright (c) 2024 MediaTek Inc.
#
# Licensed under the BSD License (the "License"); you may not use this file
# except in compliance with the License. See the license file in the root
# directory of this source tree for more details.

import contextlib
import struct

from typing import final, List

import mtk_converter
import mtk_neuron
import torch
from executorch.exir.backend.backend_details import (
BackendDetails,
ExportedProgram,
PreprocessResult,
)
from executorch.exir.backend.compile_spec_schema import CompileSpec

SKIP_COMPILE_SPEC_KEYS = {"ImportForever"}


@final
class NeuropilotBackend(BackendDetails):

@classmethod
def preprocess(
cls, edge_program: ExportedProgram, module_compile_spec: List[CompileSpec]
) -> PreprocessResult:

name_to_node_mappings = {node.name: node for node in edge_program.graph.nodes}
input_names = edge_program.graph_signature.user_inputs
output_names = edge_program.graph_signature.user_outputs
fp_input_indices = [
idx
for idx, name in enumerate(input_names)
if name_to_node_mappings[name].meta["val"].dtype == torch.float32
]
fp_output_indices = [
idx
for idx, name in enumerate(output_names)
if name_to_node_mappings[name].meta["val"].dtype == torch.float32
]

# These default compile options are only for the MT6989 SoC
compile_options = ["--arch=mdla5.1,edpa1.0", "--relax-fp32", "--opt=3"]
for spec in module_compile_spec:
if spec.key in SKIP_COMPILE_SPEC_KEYS:
continue
if spec.value == b"":
compile_options.append(f"--{spec.key}")
else:
value = spec.value.decode("utf-8")
compile_options.append(f"--{spec.key}={value}")

converter = mtk_converter.PyTorchV2Converter.from_exported_program(edge_program)
converter.quantize = True
converter.input_quantization_bitwidths = None
converter.allow_missing_quantization_ranges = True
converter.prepend_input_quantize_ops = True
converter.prepend_input_quantize_ops_indices = fp_input_indices
converter.append_output_dequantize_ops = True
converter.append_output_dequantize_ops_indices = fp_output_indices
with contextlib.redirect_stdout(None):
mlir_str = converter.convert_to_mlir()
model_bytes = mtk_neuron.compile(mlir_str, " ".join(compile_options))

num_inputs = len(input_names)
num_outputs = len(output_names)
header = struct.pack("<BIII", 1, num_inputs, num_outputs, len(model_bytes))
return PreprocessResult(processed_bytes=bytes(header + model_bytes))
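
The struct.pack("<BIII", ...) call above fixes the payload layout: one version byte, then three little-endian uint32 fields (input count, output count, compiled model size), followed by the compiled model itself. Below is a small sketch of the inverse, useful when sanity-checking a lowered payload; the helper name is ours, not part of this PR.

import struct

HEADER_FORMAT = "<BIII"  # version, num_inputs, num_outputs, model byte size
HEADER_SIZE = struct.calcsize(HEADER_FORMAT)  # 13 bytes, no padding


def parse_neuron_payload(payload: bytes):
    """Split a NeuropilotBackend blob back into its header fields and model."""
    version, num_inputs, num_outputs, model_size = struct.unpack(
        HEADER_FORMAT, payload[:HEADER_SIZE]
    )
    model_bytes = payload[HEADER_SIZE : HEADER_SIZE + model_size]
    return version, num_inputs, num_outputs, model_bytes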
4 changes: 4 additions & 0 deletions backends/mediatek/quantizer/__init__.py
@@ -0,0 +1,4 @@
from .qconfig import Precision
from .quantizer import NeuropilotQuantizer

__all__ = ["NeuropilotQuantizer", "Precision"]
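
For completeness, a rough sketch of driving this quantizer through the standard PT2E flow. The setup_precision call and the Precision.A8W8 value are assumptions about the quantizer's configuration surface, since its implementation is not part of this excerpt.

import torch
from executorch.backends.mediatek import NeuropilotQuantizer, Precision
from torch._export import capture_pre_autograd_graph
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

model = torch.nn.Linear(8, 8).eval()
example_inputs = (torch.randn(1, 8),)

quantizer = NeuropilotQuantizer()
quantizer.setup_precision(Precision.A8W8)  # assumed configuration hook

# Capture, insert observers, calibrate with representative data, then convert.
graph = capture_pre_autograd_graph(model, example_inputs)
prepared = prepare_pt2e(graph, quantizer)
prepared(*example_inputs)  # calibration pass
quantized = convert_pt2e(prepared, fold_quantize=False)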