Skip to content

Initial framework of an ethos-u runtime backend #2 #595

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 25 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
521124b
[NOMERGE] stub out tests with codegen issues temporarily
robell Oct 3, 2023
2a6dce5
Initial Ethos-U runtime backend
robell Oct 3, 2023
2083a71
Fixed error messages on runtime init
robell Oct 4, 2023
f5cff0b
lintrunner cleanup
robell Oct 4, 2023
a1fbdb4
Enable logging in Release mode
digantdesai Oct 4, 2023
918c798
Move arm example dir under backends
digantdesai Oct 4, 2023
b2a395a
Add option specify a list of ops
digantdesai Oct 4, 2023
862e510
Add softmax as another toy model
digantdesai Oct 4, 2023
69c132d
patches and utils for setting up baremetal stack for cs300
digantdesai Oct 4, 2023
e87db01
Add example script to run on cpu
digantdesai Oct 4, 2023
afd8080
Add ArmBackend to example scripts
robell Oct 4, 2023
a29523f
Add delegate test and FVP output
robell Oct 4, 2023
8efd2e3
align builds on same cmake toolchain
robell Oct 4, 2023
52c1c09
Fix delegate runner patch
robell Oct 4, 2023
8ae524c
cmake compiler and log behaviour fixing
robell Oct 4, 2023
a64e835
Minimal example of AoT with ArmPartitioner+Vela
robell Oct 5, 2023
ec03f3c
Generate pte for delegate test on the fly
robell Oct 5, 2023
1ba71d7
Added support for variable input output patterns
robell Oct 5, 2023
f29715e
Handle multiple delegate inputs with SRAM offsets
robell Oct 5, 2023
3b35ff6
Add TOSA ref model and Vela dependencies
robell Oct 5, 2023
94c598e
Cleanup from lintrunner and other bits of tidyup
robell Oct 5, 2023
683d428
Removed ethos u driver build and cmsis dependency
robell Oct 5, 2023
3292199
renamed lib ethos_u to executorch_delegate_ethos_u
robell Oct 5, 2023
e6ede01
lintfix
robell Oct 5, 2023
76a393a
tidied delegate_runner output
robell Oct 5, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,6 @@
[submodule "examples/third-party/llama"]
path = examples/third-party/llama
url = https://github.com/facebookresearch/llama.git
[submodule "backends/arm/third-party/ethos-u-core-driver"]
path = backends/arm/third-party/ethos-u-core-driver
url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git
30 changes: 24 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,20 @@ endif()
# directory, before and after this command is invoked - targets in
# sub-directories added after this command is invoked
if(CMAKE_BUILD_TYPE STREQUAL "Release")
  # Logging strings can be large, so Release builds leave logging off unless
  # the user opts back in with -DEXECUTORCH_ENABLE_LOGGING_RELEASE_MODE=ON.
  option(EXECUTORCH_ENABLE_LOGGING_RELEASE_MODE
         "Enable logging in release mode" OFF)

  # Translate the boolean option into the 0/1 value the preprocessor expects.
  # The variable is tested by name: if(${VAR}) would expand it first and then
  # dereference the result a second time.
  if(EXECUTORCH_ENABLE_LOGGING_RELEASE_MODE)
    set(_ET_LOG_ENABLE 1)
  else()
    set(_ET_LOG_ENABLE 0)
  endif()

  # ET_LOG_ENABLED is defined exactly once; a second hard-coded
  # -DET_LOG_ENABLED=0 would put two conflicting definitions on every compile
  # line.
  add_definitions(-DET_LOG_ENABLED=${_ET_LOG_ENABLE})

  # Avoid pulling in the flatbuffer data verification logic, which can add
  # about 20kB.
  add_definitions(-DET_ENABLE_PROGRAM_VERIFICATION=0)
endif()

Expand Down Expand Up @@ -94,17 +104,21 @@ option(BUILD_SELECTIVE_BUILD_TEST

option(EXECUTORCH_BUILD_SIZE_TEST "Whether to build size test" OFF)

# Option to register op list. Declared outside the BUILD_SELECTIVE_BUILD_TEST
# guard so it exists whether or not the selective build test is enabled.
option(SELECT_OPS_LIST "Register the following list of ops" OFF)

if(BUILD_SELECTIVE_BUILD_TEST)
  # Option to register every op of the portable kernel library
  option(SELECT_ALL_OPS
         "Whether to register all ops defined in portable kernel library." OFF)

  # SELECT_OPS_LIST is already declared above; it is not repeated here.

  # Option to register ops from yaml file
  option(SELECT_OPS_YAML "Register all the ops from a given yaml file" OFF)
endif()

# Arm Baremetal backend (Cortex-M and Ethos-U); disabled unless requested.
option(
  EXECUTORCH_BUILD_ARM_BAREMETAL
  "Build the Arm Baremetal flow for Cortex-M and Ethos-U"
  OFF
)

# xnn_executor_runner, which depends on XNNPACK; disabled unless requested.
option(
  EXECUTORCH_BUILD_XNNPACK
  "Build xnn_executor_runner which depends on XNNPACK"
  OFF
)
Expand Down Expand Up @@ -303,6 +317,10 @@ if(EXECUTORCH_BUILD_XNNPACK)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack)
endif()

# Pull in the Arm backend only when EXECUTORCH_BUILD_ARM_BAREMETAL is enabled.
if(EXECUTORCH_BUILD_ARM_BAREMETAL)
  add_subdirectory(
    ${CMAKE_CURRENT_SOURCE_DIR}/backends/arm
  )
endif()

# Add selective build subdirectory
if(BUILD_SELECTIVE_BUILD_TEST)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/examples/selective_build)
Expand Down
37 changes: 37 additions & 0 deletions backends/arm/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright 2023 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
cmake_minimum_required(VERSION 3.19)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Source root directory for executorch. Defaults to two levels above this
# directory when the caller has not provided one.
if(NOT EXECUTORCH_ROOT)
  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
endif()

include(${EXECUTORCH_ROOT}/build/Utils.cmake)

set(_common_include_directories ${EXECUTORCH_ROOT}/..)
set(_common_compile_options -Wno-deprecated-declarations)

# Third-party locations used below (DRIVER_ETHOSU_INCLUDE_DIR).
include(cmake/Dependencies.cmake)

# Sources are listed relative to the executorch root and then made absolute.
set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp)
list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")

# Static library containing the Ethos-U delegate runtime.
add_library(executorch_delegate_ethos_u STATIC ${_arm_baremetal_sources})

# PUBLIC: consumers of the library get the same include paths.
target_include_directories(
  executorch_delegate_ethos_u
  PUBLIC
    ${_common_include_directories}
    ${DRIVER_ETHOSU_INCLUDE_DIR}
)

# Apply the common compile options; they were previously defined but never
# attached to any target. PRIVATE keeps them off consumers' compile lines.
target_compile_options(
  executorch_delegate_ethos_u
  PRIVATE
    ${_common_compile_options}
)
88 changes: 82 additions & 6 deletions backends/arm/arm_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import logging
import operator
import os
import struct
import subprocess
import tempfile
from typing import final, List

Expand Down Expand Up @@ -143,6 +145,82 @@ def dbg_tosa_dump(tosa_fb, path):
f.close()


# Output to Vela with current file-based compilation
# WARNING: if this changes, the runtime reader also needs to change
def _vela_pad16(payload):
    """Return ``payload`` with trailing zero bytes up to a multiple of 16."""
    return payload + b"\x00" * (-len(payload) % 16)


def _vela_block(name, length, payload):
    """Build one stream block: 16-byte name, 16-byte length, padded payload.

    The name is truncated to 15 bytes so it is always NUL terminated, and
    ``length`` is written as a 16-byte little-endian integer.
    """
    raw_name = bytes(name, "utf8")[:15]
    name_field = raw_name + b"\x00" * (16 - len(raw_name))
    return name_field + length.to_bytes(16, "little") + _vela_pad16(payload)


def vela_compile(tosa_fb):
    """Compile a TOSA flatbuffer with Vela and repack the result as a byte stream.

    The serialized TOSA graph is written to a temporary directory, the
    ``vela`` command line tool is run on it, and the regions of the produced
    ``output/out_sg0_vela.npz`` are emitted as a flat binary:

    - 16 byte header ``vela_bin_stream`` (NUL terminated)
    - for every NPZ entry, one block made of
        - 16 byte block name, NUL terminated and NUL padded
        - 16 byte little-endian block length (the unpadded payload size)
        - block data, zero padded at the end to a 16 byte boundary
    - one extra ``scratch_data`` block of zeros, whose size is taken from
      ``scratch_shape[0]`` rounded up to a multiple of 16 (for this block the
      length field holds the rounded-up size)
    - 16 byte footer ``vela_end_stream`` (NUL terminated)

    ``input_shape``/``output_shape`` entries are encoded as an int32 count
    followed by one ``<iiii`` shape per entry (zero padded to 4 dims);
    ``input_offset``/``output_offset`` as an int32 count followed by one
    int32 per entry. Every other entry is written as its raw array bytes.

    Args:
        tosa_fb: serializer object whose ``serialize()`` returns the TOSA
            flatbuffer bytes.

    Returns:
        bytes: header + blocks + footer as described above.

    Raises:
        subprocess.CalledProcessError: if ``vela`` exits with a non-zero status.
        AssertionError: if a shape has more than 4 dimensions.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tosaname = "out.tosa"
        with open(os.path.join(tmpdir, tosaname), "wb") as f:
            f.write(tosa_fb.serialize())

        # invoke vela
        # TODO target ethos-u55-128
        # Run with cwd=tmpdir and an argument list instead of a shell string,
        # so the temporary path is never interpreted by a shell.
        subprocess.run(
            ["vela", "--accelerator-config", "ethos-u55-128", tosaname],
            cwd=tmpdir,
            check=True,
        )

        np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
        blocks = []
        with np.load(np_path, allow_pickle=False) as data:
            for key in data.keys():
                if key in ("input_shape", "output_shape"):
                    shapes = data[key]
                    # Encode a struct of int len; and one or more int x,y,z,w shape;
                    block_data = struct.pack("<i", len(shapes))
                    for shape in shapes:
                        assert len(shape) <= 4
                        dims = shape.tolist() + [0] * (4 - len(shape))
                        block_data += struct.pack("<iiii", *dims)
                elif key in ("input_offset", "output_offset"):
                    offsets = data[key]
                    block_data = struct.pack("<i", len(offsets))
                    for offset in offsets:
                        block_data += struct.pack("<i", offset)
                else:
                    block_data = data[key].tobytes()

                # We need the actual unpadded block lengths for hw setup
                blocks.append(_vela_block(key, len(block_data), block_data))

            # Add a block for scratch, inputs and outputs
            # scratch shape is a 1 element array giving us size in bytes
            scratch_size = data["scratch_shape"][0].item()
            scratch_size = scratch_size + (-scratch_size % 16)
            blocks.append(
                _vela_block("scratch_data", scratch_size, b"\x00" * scratch_size)
            )
            # TODO are these already in scratch shape? look to be
            # input_shape * input_elem_size
            # output_shape * output_elem_size
            # input_offset and output_offset specify the location these arrays are written from base of scratch

        # return 16 byte VELA bin header + blocks + footer
        header = bytes("vela_bin_stream", "utf-8") + b"\x00"
        footer = bytes("vela_end_stream", "utf-8") + b"\x00"
        return header + b"".join(blocks) + footer


def dbg_fail(node, tosa_fb, path):
dbg_tosa_dump(tosa_fb, path)
logger.warn("Internal error due to poorly handled node:")
Expand Down Expand Up @@ -240,10 +318,6 @@ def preprocess( # noqa: C901
path = spec.value.decode()
debug_output = True

# in non debug builds we still pass files to vela
if path is None:
path = tempfile.mkdtemp(prefix="arm_tosa_")

# Converted output for this subgraph, serializer needs path early as it emits
# const data directly. Path created and data written only in debug builds.
tosa_fb = ts.TosaSerializer(path)
Expand Down Expand Up @@ -881,5 +955,7 @@ def preprocess( # noqa: C901
dbg_tosa_dump(tosa_fb, path)

# Serialize and return the tosa flatbuffer
fb = tosa_fb.serialize()
return PreprocessResult(processed_bytes=bytes(fb))
# fb = bytes(tosa_fb.serialize())
binary = vela_compile(tosa_fb)

return PreprocessResult(processed_bytes=binary)
10 changes: 10 additions & 0 deletions backends/arm/cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Copyright 2023 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Third-party checkouts live in "third-party" under the current source
# directory.
set(THIRD_PARTY_ROOT
    "${CMAKE_CURRENT_SOURCE_DIR}/third-party"
)

# Ethos-U driver: record the header location and add it to the include path
# of the current directory scope.
set(DRIVER_ETHOSU_INCLUDE_DIR
    "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include"
)
include_directories(${DRIVER_ETHOSU_INCLUDE_DIR})
90 changes: 90 additions & 0 deletions backends/arm/cmake/arm-none-eabi-gcc.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Copyright 2023 Arm Limited and/or its affiliates.
Copy link
Contributor

@digantdesai digantdesai Oct 4, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this same as - core_platform/cmake/toolchain/arm-none-eabi-gcc.cmake? But will respect cmdline vars?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd anticipate further changes and cleaning but it was derived from there, yes.

It has a similar override but for m55 our only current supported target. The way the toolchain file is constructed, some of the general ./configure like step in cmake was invoking it without an appropriate value so the default prevents the -mfpu flag which was the default set on an m4 target messing things up.

#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Target CPU, overridable with -DTARGET_CPU=<cpu>. CMAKE_SYSTEM_PROCESSOR is
# always derived from it (lower-cased), so a CMAKE_SYSTEM_PROCESSOR value
# given on the command line is replaced here.
set(TARGET_CPU "cortex-m55" CACHE STRING "Target CPU")
string(TOLOWER ${TARGET_CPU} CMAKE_SYSTEM_PROCESSOR)

# Bare-metal target built with the arm-none-eabi GNU tools found on PATH.
set(CMAKE_SYSTEM_NAME Generic)
set(CMAKE_C_COMPILER "arm-none-eabi-gcc")
set(CMAKE_CXX_COMPILER "arm-none-eabi-g++")
set(CMAKE_ASM_COMPILER "arm-none-eabi-gcc")
set(CMAKE_LINKER "arm-none-eabi-ld")

# Executables get an .elf suffix; compiler checks build a static library
# instead of linking an executable.
set(CMAKE_EXECUTABLE_SUFFIX ".elf")
set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)

# find_program() never searches the find-root path; libraries and headers are
# searched only there.
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

# Select C/C++ version
set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_STANDARD 14)

# CPU name handed to GCC: same as the system processor, except that
# cortex-m85 is rewritten to cortex-m55.
# NOTE(review): presumably because the toolchain lacks an m85 target - confirm.
string(REPLACE "cortex-m85" "cortex-m55" GCC_CPU ${CMAKE_SYSTEM_PROCESSOR})

# Flags for every compile: CPU selection, Thumb code, DWARF-3 debug info in
# Debug builds, no unwind tables/RTTI/exceptions for C++, and one section per
# function and data item.
add_compile_options(
  -mcpu=${GCC_CPU}
  -mthumb
  "$<$<CONFIG:DEBUG>:-gdwarf-3>"
  "$<$<COMPILE_LANGUAGE:CXX>:-fno-unwind-tables;-fno-rtti;-fno-exceptions>"
  -fdata-sections
  -ffunction-sections
)

# NDEBUG is defined for every configuration other than Debug.
add_compile_definitions("$<$<NOT:$<CONFIG:DEBUG>>:NDEBUG>")

# Flags for every link: same CPU/Thumb selection, plus the nosys specs file.
add_link_options(
  -mcpu=${GCC_CPU}
  -mthumb
  --specs=nosys.specs
)

# Choose the float ABI from CMAKE_SYSTEM_PROCESSOR. Branch order matters: an
# explicit "+fp" / "+nofp" suffix is checked before the per-core defaults.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+fp")
  set(FLOAT hard)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+nofp")
  set(FLOAT soft)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m(33|55|85)(\\+|$)")
  # These cores default to hard float with no explicit -mfpu.
  set(FLOAT hard)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m(4|7)(\\+|$)")
  # These cores default to hard float and also get an explicit FPU selection.
  set(FLOAT hard)
  set(FPU_CONFIG "fpv4-sp-d16")
  add_compile_options(-mfpu=${FPU_CONFIG})
  add_link_options(-mfpu=${FPU_CONFIG})
else()
  # Anything else falls back to soft float.
  set(FLOAT soft)
endif()

# Pass the chosen ABI to both the compiler and the linker.
if(FLOAT)
  add_compile_options(-mfloat-abi=${FLOAT})
  add_link_options(-mfloat-abi=${FLOAT})
endif()

# Linker flags forwarded through the compiler driver.
add_link_options(
  LINKER:--nmagic,--gc-sections
)

# Compilation warnings. Only the two suppressions below are active; the
# broader set (-Wall, -Wextra, -Wcast-align, -Wdouble-promotion, -Wformat,
# -Wmissing-field-initializers, -Wnull-dereference, -Wredundant-decls,
# -Wshadow, -Wswitch, -Wswitch-default, -Wunused) is currently not enabled.
add_compile_options(
  -Wno-redundant-decls
  -Wno-psabi
)
53 changes: 53 additions & 0 deletions backends/arm/cmake/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash
# Copyright 2023 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# Clean cross-build of the Arm baremetal libraries into ./cmake-corstone.
# The relative paths below are resolved against the current working
# directory, so the script expects to be started from the repository root.
set -e

#
# Setup toolchain
#
BASEDIR=$(realpath "$(dirname "$0")")
echo "building using build.sh in $BASEDIR"

# `uname -m` is the portable way to get the machine name; `uname -i` is a
# non-standard option that prints "unknown" on many systems.
ARCH=$(uname -m)
GCCPATH=${BASEDIR}/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi/bin/

echo "$GCCPATH"
if test -d "${GCCPATH}"; then
    echo Using exising compiler "${GCCPATH}"
else
    # Download and unpack the toolchain next to this script.
    pushd "${BASEDIR}/"
    ./toolchain.sh
    popd
fi
export PATH=${PATH}:${GCCPATH}

echo building with "$(arm-none-eabi-gcc -v 2>&1 | grep "^gcc")"


#
# Prepare and run clean build
#
rm -rf buck-out/ build/lib/ cmake-out/
rm -rf cmake-corstone
mkdir cmake-corstone
cd cmake-corstone

# NOTE(review): the toolchain file derives CMAKE_SYSTEM_PROCESSOR from
# TARGET_CPU, so the -DCMAKE_SYSTEM_PROCESSOR value passed here looks like it
# is overwritten - confirm whether -DTARGET_CPU was intended.
cmake -DFLATC_EXECUTABLE=flatc \
      -DEXECUTORCH_BUILD_XNNPACK=OFF \
      -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \
      -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
      -DCMAKE_SYSTEM_PROCESSOR=cortex-m55+nodsp+nofp \
      -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \
      --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake \
      -DCMAKE_BUILD_TYPE=Release \
      -DEXECUTORCH_ENABLE_LOGGING_RELEASE_MODE=ON \
      ..

cd ..
# The delegate library target is named executorch_delegate_ethos_u in
# backends/arm/CMakeLists.txt; requesting a target that does not exist makes
# the build fail. The Ethos-U driver is consumed as headers only, so it is
# not requested as a separate target.
cmake --build cmake-corstone -j9 --target executorch_delegate_ethos_u executorch portable_ops_lib portable_kernels
12 changes: 12 additions & 0 deletions backends/arm/cmake/toolchain.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# Copyright 2023 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# Downloads the Arm GNU 12.3.rel1 bare-metal toolchain for the host
# architecture into the current directory and unpacks it there.
set -e

# Cross compiler for Arm baremetal (e.g. Corstone-300 FVP or silicon)
# `uname -m` is the portable way to get the machine name; `uname -i` is a
# non-standard option that prints "unknown" on many systems.
ARCH=$(uname -m)
TOOLCHAIN=arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi

# -f: fail on HTTP errors instead of saving the error page as the archive.
# -L: follow redirects.
curl -fL -o gcc.tar.xz "https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/${TOOLCHAIN}.tar.xz"
tar xf gcc.tar.xz

# Use the directory that was actually downloaded (it depends on ${ARCH})
# rather than a hard-coded aarch64 path. The export only reaches the caller
# when this script is sourced; when executed, it affects this process only.
export PATH=${PATH}:$(cd "${TOOLCHAIN}/bin/" && pwd)
Loading