|
12 | 12 | import logging
|
13 | 13 | import operator
|
14 | 14 | import os
|
| 15 | +import struct |
| 16 | +import subprocess |
15 | 17 | import tempfile
|
16 | 18 | from typing import final, List
|
17 | 19 |
|
@@ -136,13 +138,89 @@ def dbg_tosa_dump(tosa_fb, path):
|
136 | 138 | fb = tosa_fb.serialize()
|
137 | 139 | js = tosa_fb.writeJson(filename)
|
138 | 140 |
|
139 |
| - f = open(path + filename, "wb") |
140 |
| - f.write(fb) |
141 |
| - f.close() |
| 141 | + with open(path + filename, "wb") as f: |
| 142 | + f.write(fb) |
142 | 143 |
|
143 |
| - f = open(path + "desc.json", "w") |
144 |
| - f.write(js) |
145 |
| - f.close() |
| 144 | + with open(path + "desc.json", "w") as f: |
| 145 | + f.write(js) |
| 146 | + |
| 147 | + |
# Output to Vela with current file-based compilation
# WARNING: if this changes, the runtime reader also needs to change
def vela_compile(tosa_fb):
    """Compile a serialized TOSA graph to a ``vela_bin_stream`` binary.

    Writes the TOSA flatbuffer to a temporary directory, runs the ``vela``
    compiler on it, then repackages the regions of the resulting NPZ file
    into the stream format the Arm runtime reader consumes:

        16-byte header string | one or more blocks | 16-byte footer string

    where each block is:
        - 16-byte null-padded block-name string
        - 16-byte little-endian block length (low 4 bytes hold the int32
          length, the high 12 bytes are zero)
        - block data, zero-padded to a 16-byte boundary

    Args:
        tosa_fb: TOSA serializer object exposing ``serialize()``.

    Returns:
        bytes: the complete vela_bin_stream binary.

    Raises:
        RuntimeError: if the compiled network declares more than one output.
        subprocess.CalledProcessError: if the vela invocation fails.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tosaname = "out.tosa"
        flatbuffer = tosa_fb.serialize()
        with open(os.path.join(tmpdir, tosaname), "wb") as f:
            f.write(flatbuffer)

        # Invoke vela with an argument list and cwd= instead of a
        # "cd {tmpdir}; vela ..." string under shell=True: no shell
        # involvement (so no injection via the path), and check=True
        # still raises on a non-zero exit code.
        subprocess.run(
            ["vela", "--accelerator-config", "ethos-u55-128", tosaname],
            cwd=tmpdir,
            check=True,
        )

        np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
        blocks = b""
        with np.load(np_path, allow_pickle=False) as data:
            # Emit the NPZ regions as:
            # - 16 byte block name null terminated string (padded to 16 if name shorter)
            # - 4 bytes of int32 block length and 12 bytes of 0's
            # - block data (padded to 16 byte alignment at end)
            # Repeat for all blocks
            for key in data.keys():
                block_name = bytes(key, "utf8")[:15]
                block_name = block_name + b"\x00" * (16 - len(block_name))

                block_data = b""
                if key in ("input_shape", "output_shape"):
                    inputs = data[key]
                    # Encode a struct of int len; and one or more int x,y,z,w shape;
                    input_struct = struct.pack("<i", len(inputs))
                    for inp in inputs:
                        assert len(inp) <= 4
                        inp_pad = inp.tolist() + [0] * (4 - len(inp))
                        input_struct = input_struct + struct.pack("<iiii", *inp_pad)
                    block_data = input_struct
                elif key in ("input_offset", "output_offset"):
                    inputs = data[key]
                    if key == "output_offset" and len(inputs) > 1:
                        raise RuntimeError(
                            "Currently only support one output in Vela ArmBackend"
                        )
                    offset_struct = struct.pack("<i", len(inputs))
                    for inp in inputs:
                        offset_struct = offset_struct + struct.pack("<i", inp)
                    block_data = offset_struct
                else:
                    block_data = data[key].tobytes()
                # We need the actual unpadded block lengths for hw setup
                block_length = len(block_data).to_bytes(16, "little")
                # pad block data to multiple of 16 bytes
                block_data = block_data + b"\x00" * (15 - (len(block_data) - 1) % 16)

                block = block_name + block_length + block_data
                blocks = blocks + block

            # Add a block for scratch, inputs and outputs
            # scratch shape is a 1 element array giving us size in bytes
            block_name = bytes("scratch_data", "utf8")[:15]
            block_name = block_name + b"\x00" * (16 - len(block_name))
            block_length = data["scratch_shape"][0].item()
            block_length = block_length + (15 - (block_length - 1) % 16)
            # NOTE(review): unlike the NPZ blocks above, the recorded scratch
            # length is the *padded* size — presumably what the runtime reader
            # expects; confirm before changing.
            block_data = b"\x00" * block_length
            block_length = block_length.to_bytes(16, "little")
            block = block_name + block_length + block_data
            blocks = blocks + block
            # TODO are these already in scratch shape? look to be
            # input_shape * input_elem_size
            # output_shape * output_elem_size
            # input_offset and output_offset specify the location these arrays are written from base of scratch

        # return 16 byte VELA bin header + blocks + footer
        header = bytes("vela_bin_stream", "utf-8") + b"\x00"
        footer = bytes("vela_end_stream", "utf-8") + b"\x00"
        return header + blocks + footer
146 | 224 |
|
147 | 225 |
|
148 | 226 | def dbg_fail(node, tosa_fb, path):
|
@@ -237,14 +315,13 @@ def preprocess( # noqa: C901
|
237 | 315 | # if a debug/test build capture output files from TOSA stage
|
238 | 316 | path = None
|
239 | 317 | debug_output = False
|
| 318 | + output_format = "vela" |
240 | 319 | for spec in compile_spec:
|
241 | 320 | if spec.key == "debug_tosa_path":
|
242 | 321 | path = spec.value.decode()
|
243 | 322 | debug_output = True
|
244 |
| - |
245 |
| - # in non debug builds we still pass files to vela |
246 |
| - if path is None: |
247 |
| - path = tempfile.mkdtemp(prefix="arm_tosa_") |
| 323 | + if spec.key == "output_format": |
| 324 | + output_format = spec.value.decode() |
248 | 325 |
|
249 | 326 | # Converted output for this subgraph, serializer needs path early as it emits
|
250 | 327 | # const data directly. Path created and data written only in debug builds.
|
@@ -890,6 +967,16 @@ def preprocess( # noqa: C901
|
890 | 967 | if debug_output is True:
|
891 | 968 | dbg_tosa_dump(tosa_fb, path)
|
892 | 969 |
|
893 |
| - # Serialize and return the tosa flatbuffer |
894 |
| - fb = tosa_fb.serialize() |
895 |
| - return PreprocessResult(processed_bytes=bytes(fb)) |
| 970 | + # Serialize and return the program. While we have always produced TOSA |
| 971 | + # output as an intermediate, some flows compile to device binaries in |
| 972 | + # preprocess and some consume TOSA fb directly. |
| 973 | + if output_format == "vela": |
| 974 | + # Emit vela_bin_stream format |
| 975 | + binary = vela_compile(tosa_fb) |
| 976 | + elif output_format == "tosa": |
| 977 | + # Emit TOSA flatbuffer |
| 978 | + binary = bytes(tosa_fb.serialize()) |
| 979 | + else: |
| 980 | + raise RuntimeError(f"Unknown format {output_format}") |
| 981 | + |
| 982 | + return PreprocessResult(processed_bytes=binary) |
0 commit comments