Pull request pytorch#10: Feature/EIEX-33 neutron backend

jirioc · jirioc · commit 42a05051a6d6 · 2024-10-31T14:38:45.000+01:00
Merge in AITEC/executorch from feature/EIEX-33-neutron-backend to main-nxp

* commit '4f455df638bd23b887c009384304bd6cd27630b5':
  Initial implementation of the NeutronBackend.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -147,6 +147,8 @@ option(EXECUTORCH_BUILD_ARM_BAREMETAL
        "Build the Arm Baremetal flow for Cortex-M and Ethos-U" OFF
 )
 
+option(EXECUTORCH_BUILD_NXP_NEUTRON "Build the NXP Neutron NPU library" OFF)
+
 option(EXECUTORCH_BUILD_COREML "Build the Core ML backend" OFF)
 
 option(EXECUTORCH_BUILD_KERNELS_CUSTOM "Build the custom kernels" OFF)
@@ -575,6 +577,10 @@ if(EXECUTORCH_BUILD_ARM_BAREMETAL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
 endif()
 
+if(EXECUTORCH_BUILD_NXP_NEUTRON)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/nxp)
+endif()
+
 if(EXECUTORCH_BUILD_MPS)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps)
 endif()
diff --git a/backends/nxp/CMakeLists.txt b/backends/nxp/CMakeLists.txt
@@ -0,0 +1,31 @@
+# Copyright 2024 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+cmake_minimum_required(VERSION 3.19)
+project(neutron_backend)
+
+# Write compile_commands.json into the build tree.
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# ExecuTorch source root; defaults to two directories above this one unless the
+# including project has already set it.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+
+# The sources include ExecuTorch headers as <executorch/...>, so the include
+# root is the parent directory of EXECUTORCH_ROOT.
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+# Backend sources, listed relative to EXECUTORCH_ROOT and made absolute here.
+set(_neutron_sources backends/nxp/runtime/NeutronBackend.cpp)
+list(TRANSFORM _neutron_sources PREPEND "${EXECUTORCH_ROOT}/")
+
+add_library(executorch_delegate_neutron STATIC ${_neutron_sources})
+target_include_directories(
+  executorch_delegate_neutron PUBLIC ${_common_include_directories}
+)
diff --git a/backends/nxp/runtime/NeutronBackend.cpp b/backends/nxp/runtime/NeutronBackend.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2024 NXP
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * Implementation of the backend for the NXP Neutron NPU.
+ */
+
+#include <cstdint>
+#include <new>
+
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+
+#include "NeutronDriver.h"
+#include "NeutronErrors.h"
+
+namespace torch {
+namespace executor {
+
+// Aggregates the Neutron model handle and the driver data structures into one
+// delegate handle. Inside this namespace the name hides the unrelated global
+// NeutronConfig typedef declared in NeutronDriver.h.
+struct NeutronConfig {
+  int numInputs = 0;
+  int numOutputs = 0;
+  NeutronModelConfig mcfg{};
+  NeutronDataConfig dcfg{};
+  NeutronModelHandle nmh = nullptr;
+};
+
+/// ExecuTorch delegate that runs a preprocessed Neutron program through the
+/// driver API declared in NeutronDriver.h.
+class NeutronBackend final : public PyTorchBackendInterface {
+ public:
+  NeutronBackend() {}
+  ~NeutronBackend() = default;
+
+  /// Always reports the backend as available; nothing is probed here.
+  bool is_available() const override {
+    return true;
+  }
+
+  /// Validates the header of the `processed` blob, builds a NeutronConfig in the
+  /// runtime allocator and prepares the model in the driver.
+  /// @return the NeutronConfig as delegate handle; Error::InvalidProgram for a
+  ///   malformed blob or a driver failure; Error::MemoryAllocationFailed when
+  ///   the runtime allocator cannot satisfy a request.
+  Result<DelegateHandle*> init(
+      BackendInitContext& context,
+      FreeableBuffer* processed,
+      ArrayRef<CompileSpec> compile_specs) const override {
+    // Indices of the 32-bit header words read below.
+    constexpr size_t kMicrocodeSizeWord = 6;
+    constexpr size_t kWeightsSizeWord = 7;
+    constexpr size_t kNumInputsWord = 9;
+    constexpr size_t kNumOutputsWord = 10;
+    constexpr uint32_t kMagicWord = 0x64434D6E;
+    // NeutronDriver.h documents 60 seconds as the default timeout.
+    constexpr uint32_t kDefaultTimeoutSeconds = 60;
+
+    // NOTE(review): assumes the blob is 4-byte aligned - confirm with the AoT flow.
+    const uint32_t* header = static_cast<const uint32_t*>(processed->data());
+    const size_t blobSize = processed->size();
+    // Evaluation order matters: the header is only read once it is known to be
+    // complete. Both counts must fit an int and the weights/kernels sections
+    // must not start past the end of the blob.
+    const bool valid = blobSize >= (kNumOutputsWord + 1) * sizeof(uint32_t) &&
+        header[0] == kMagicWord && header[kNumInputsWord] <= INT32_MAX &&
+        header[kNumOutputsWord] <= INT32_MAX &&
+        uint64_t{header[kMicrocodeSizeWord]} + header[kWeightsSizeWord] <= blobSize;
+    if (!valid) {
+      ET_LOG(Error, "Preprocessed buffer does not contain a valid Neutron microcode");
+      return Error::InvalidProgram;
+    }
+
+    MemoryAllocator* allocator = context.get_runtime_allocator();
+    auto* cfg = allocator->allocateInstance<NeutronConfig>();
+    if (cfg == nullptr) {
+      ET_LOG(Error, "Failed to allocate the Neutron delegate handle");
+      return Error::MemoryAllocationFailed;
+    }
+    // The allocator returns storage without running a constructor, so construct
+    // the object in place; NULL nmh/scratch pointers are meaningful to the driver.
+    cfg = new (cfg) NeutronConfig();
+    cfg->numInputs = static_cast<int>(header[kNumInputsWord]);
+    cfg->numOutputs = static_cast<int>(header[kNumOutputsWord]);
+    // The microcode starts the blob; weights and kernels follow back to back.
+    const uint8_t* blob = static_cast<const uint8_t*>(processed->data());
+    cfg->mcfg.microcode = blob;
+    cfg->mcfg.weights = blob + header[kMicrocodeSizeWord];
+    cfg->mcfg.kernels = blob + header[kMicrocodeSizeWord] + header[kWeightsSizeWord];
+    cfg->mcfg.timeoutSeconds = kDefaultTimeoutSeconds;
+
+    // Pointer tables handed to the driver; execute() refills them on every call.
+    cfg->dcfg.inputs = static_cast<const void**>(allocator->allocate(cfg->numInputs * sizeof(void*)));
+    cfg->dcfg.outputs = static_cast<void**>(allocator->allocate(cfg->numOutputs * sizeof(void*)));
+    if ((cfg->numInputs > 0 && cfg->dcfg.inputs == nullptr) ||
+        (cfg->numOutputs > 0 && cfg->dcfg.outputs == nullptr)) {
+      ET_LOG(Error, "Failed to allocate the Neutron input/output pointer tables");
+      return Error::MemoryAllocationFailed;
+    }
+
+    // nmh is NULL here, which NeutronDriver.h documents as "driver allocates".
+    NeutronError neutronRC = neutronModelPrepare(&cfg->mcfg, &cfg->nmh);
+    if (neutronRC != ENONE) {
+      ET_LOG(Error, "Neutron model preparation failed with error code %d", neutronRC);
+      return Error::InvalidProgram;
+    }
+    return cfg;
+  }
+
+  /// Points the driver's pointer tables at the tensors in `args` (inputs first,
+  /// then outputs) and runs the model in blocking mode.
+  /// @return Error::Ok, or Error::InvalidProgram when the driver reports an error.
+  Error execute(
+      BackendExecutionContext& context,
+      DelegateHandle* input_handle,
+      EValue** args) const override {
+    NeutronConfig* cfg = static_cast<NeutronConfig*>(input_handle);
+    for (int i = 0; i < cfg->numInputs; i++) {
+      cfg->dcfg.inputs[i] = args[i]->toTensor().const_data_ptr();
+    }
+    for (int i = 0; i < cfg->numOutputs; i++) {
+      cfg->dcfg.outputs[i] = args[cfg->numInputs + i]->toTensor().mutable_data_ptr();
+    }
+
+#ifndef NDEBUG
+    // NeutronDriver.h declares the trace API only when NDEBUG is not defined.
+    // TODO: Use trace from BackendExecutionContext.
+    NeutronTraceConfig trace_config{};  // traceConfig == 0, i.e. tracing is off
+    neutronSetTrace(cfg->nmh, &trace_config);
+#endif
+
+    NeutronError neutronRC = neutronRunBlocking(cfg->nmh, &cfg->dcfg);
+    if (neutronRC != ENONE) {
+      ET_LOG(Error, "Neutron model evaluation failed with error code %d", neutronRC);
+      return Error::InvalidProgram;
+    }
+    return Error::Ok;
+  }
+
+  /// Releases the driver-side resources of a handle created by init(). The
+  /// NeutronConfig and its pointer tables came from the runtime allocator and
+  /// are not freed here.
+  void destroy(DelegateHandle* handle) const override {
+    NeutronConfig* cfg = static_cast<NeutronConfig*>(handle);
+    if (cfg == nullptr) {
+      return;
+    }
+    NeutronError neutronRC = neutronModelUnprepare(cfg->nmh);
+    if (neutronRC != ENONE) {
+      ET_LOG(Error, "Neutron model unprepare failed with error code %d", neutronRC);
+    }
+  }
+};
+
+namespace {
+// Registers the backend with the runtime during static initialization.
+auto backend = NeutronBackend();
+Backend backend_id{"NeutronBackend", &backend};
+static auto registered = register_backend(backend_id);
+} // namespace
+
+} // namespace executor
+} // namespace torch
diff --git a/backends/nxp/runtime/NeutronDriver.h b/backends/nxp/runtime/NeutronDriver.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright 2024 NXP
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Interface for the NXP Neutron NPU driver.
+ */
+
+#ifndef NEUTRON_DRIVER_H
+#define NEUTRON_DRIVER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "NeutronErrors.h"
+
+/* Neutron Driver error category codes */
+typedef enum ERROR_CATEGORY_DRIVER {
+    ERROR_CATEGORY_DRIVER_GENERIC,        /* Generic error category */
+    ERROR_CATEGORY_DRIVER_UNSUPPORTED,    /* Unsupported function */
+    ERROR_CATEGORY_DRIVER_UCODE,          /* Microcode bad magic or version incompatible. */
+    ERROR_CATEGORY_DRIVER_INVALID,        /* Invalid arguments */
+    ERROR_CATEGORY_DRIVER_BAD_HANDLE,     /* Bad inference handle */
+    ERROR_CATEGORY_DRIVER_NO_MEMORY,      /* Not enough memory */
+    ERROR_CATEGORY_DRIVER_INTERNAL_FAULT, /* Internal error */
+    ERROR_CATEGORY_DRIVER_UNKNOWN_ARCH,   /* Unknown architecture */
+    ERROR_CATEGORY_DRIVER_TRACE_NOT_RUN,  /* Tracing did not run, but trace buffer was requested. */
+    ERROR_CATEGORY_DRIVER_TIMEOUT         /* Timeout error. */
+} ERROR_CATEGORY_DRIVER;
+
+/// Trace configuration to enable kernel level tracing.
+#define TRACE_CONFIG_KERNEL_LEVEL (1U << 0)
+
+/// Trace configuration to enable job level tracing.
+#define TRACE_CONFIG_JOB_LEVEL (1U << 1)
+
+// Macro to define where to allocate memory for NeutronCtx
+#ifndef NO_HEAP_USAGE
+#define NO_HEAP_USAGE 0
+#endif
+
+/* Neutron Driver errors */
+#define GEN_NEUTRON_DRIVER_ERROR(category, code) GEN_NEUTRON_ERROR(ERROR_COMPONENT_DRIVER, category, code)
+#define GEN_NEUTRON_DRIVER_GENERIC_ERROR()       GEN_NEUTRON_DRIVER_ERROR(ERROR_CATEGORY_DRIVER_GENERIC, __LINE__)
+
+/// Type definition for a Neutron model handle. This is an identifier used to uniquely identify a model.
+/// By convention, the value NEUTRON_INVALID_HANDLE denotes an invalid handle.
+typedef void *NeutronModelHandle;
+
+typedef struct {
+    /// Neutron microcode buffer address.
+    /// The Neutron microcode is generated by the Neutron converter tool.
+    /// The microcode buffer is allocated and initialized by the application or ML framework.
+    /// The microcode buffer is passed by reference to the Neutron firmware.
+    /// The microcode buffer is specific for a given ML model.
+    const void *microcode;
+
+    /// Neutron weights buffer address.
+    /// The Neutron weights is generated by the Neutron converter tool.
+    /// The weights buffer is allocated and initialized by the application or ML framework.
+    /// The weights buffer address is passed by reference to the Neutron-firmware.
+    /// The weights buffer is specific for a given ML model.
+    const void *weights;
+
+    /// Neutron kernels buffer address.
+    /// The Neutron kernels are generated by the Neutron converter tool.
+    /// The kernels buffer is allocated and initialized by the application or ML framework.
+    /// The kernels buffer address is passed by reference to the Neutron-firmware.
+    /// The kernels buffer is specific for a given ML model.
+    const void *kernels;
+
+    /// Timeout, in seconds, for running the microcode.
+    /// This is the upper limit on how long the user expects a run to take; default 60.
+    uint32_t timeoutSeconds;
+
+} NeutronModelConfig;
+
+typedef struct {
+    /// The input buffers of the model.
+    /// The input buffers are allocated and initialized by the application or ML framework.
+    /// The input buffers are passed by reference to the Neutron firmware.
+    const void **inputs;
+
+    /// The output buffers of the model.
+    /// The output buffers are allocated by the application or ML framework.
+    /// The output buffers are passed by reference to the Neutron firmware.
+    void **outputs;
+
+    /// Scratch buffer required for computing model intermediate results.
+    /// If NULL, this buffer has to be allocated by the driver.
+    void *scratch;
+
+    /// Scratch buffer required for prefetching model weights from FLASH to SRAM.
+    /// This buffer is used only for Neutron-C targets when the weight prefetch option was explicitly used.
+    /// If NULL, this buffer has to be allocated by the driver.
+    void *scratchWeights;
+
+} NeutronDataConfig;
+
+typedef struct {
+    /// Sets whether tracing should be executed during firmware run or not.
+    /// If set to 0, tracing will not run.
+    /// If set to 1 - kernel level tracing.
+    /// If set to 2 - job level tracing.
+    /// If set to 3 - mixed level tracing
+    uint32_t traceConfig;
+
+    /// Buffer to store collected trace data.
+    /// If it is NULL, the driver allocates the memory; otherwise the application provides it.
+    char *traceBuffer;
+
+    /// What is the allocated memory for buffer. Needed to check if appending string will be out of bounds.
+    /// Application should set this, if the buffer is allocated by application, otherwise driver will set the value.
+    size_t traceBufferSize;
+} NeutronTraceConfig;
+
+/// This structure contains the prototypes for functions that have a custom implementation.
+/// Any new functions or variables must be added at the end.
+typedef struct {
+    /// This function performs the copying from FLASH to SRAM.
+    void (*copy)(void *dst, void *src, uint32_t size, uint32_t channel);
+    /// This is a blocking function that checks if the current copy has finished.
+    void (*wait)(uint32_t channel);
+} NeutronConfig;
+
+/* Invalid handle, returned by neutronModelPrepare() if an error occurred. */
+#define NEUTRON_INVALID_HANDLE NULL
+
+/// - Initialize the Neutron Driver library: set initial values, allocate memory
+///   for internal data structures and set up memory mappings.
+NeutronError neutronInit(void);
+
+/// - Deinitialize the Neutron Driver library, releasing any resources acquired by neutronInit.
+NeutronError neutronDeinit(void);
+
+/// - Prepare Neutron execution for a model with the given configuration.
+/// - This function only prepares the execution by transferring the parameters to the firmware.
+/// - This function allows caching a model and then running the same model but with different
+///   input data (assuming the new input data replaces the old input data by reusing the same buffers).
+/// - In case external allocated memory shall be used for the ModelHandle, e.g. from the Tensorflow
+///   tensor arena, hdl shall be a pointer to the start of the allocated memory block.
+///   If a pointer to NULL is passed, memory will be allocated by the driver
+///   from HEAP. If no HEAP is available, an error will be thrown.
+NeutronError neutronModelPrepare(const NeutronModelConfig *mcfg, NeutronModelHandle *hdl);
+
+/// - Unprepare Neutron execution handle.
+/// - This function releases the internal context data structures and the reserved handle.
+NeutronError neutronModelUnprepare(NeutronModelHandle hdl);
+
+/// - Perform Neutron execution in blocking mode.
+NeutronError neutronRunBlocking(NeutronModelHandle hdl, const NeutronDataConfig *dcfg);
+
+/// - Perform Neutron execution in non-blocking mode.
+/// - This functionality is only available for Neutron-S.
+NeutronError neutronRunNonBlocking(NeutronModelHandle hdl, const NeutronDataConfig *dcfg);
+
+/// - Wait (block) for Neutron completion.
+/// - This functionality is only available for Neutron-S.
+NeutronError neutronWait(NeutronModelHandle hdl, const NeutronDataConfig *dcfg);
+
+/// - Query if the job is done by Neutron.
+/// - This functionality is only available for neutronRunNonBlocking.
+NeutronError neutronIsReady(NeutronModelHandle hdl, bool *isReady);
+
+#ifndef NDEBUG
+/// - Set tracing information.
+void neutronSetTrace(NeutronModelHandle hdl, NeutronTraceConfig *tcfg);
+
+/// - Get tracing result to buffer.
+NeutronError neutronGetTrace(NeutronModelHandle hdl, char **buffer, size_t *size);
+#endif
+
+/// - Perform power management to suspend Neutron hardware.
+/// - This function disables the clock for Neutron.
+NeutronError neutronSuspend(void);
+
+/// - Perform power management to resume Neutron hardware.
+/// - This function enables the clock for Neutron.
+NeutronError neutronResume(void);
+
+/// - Used to initialize custom API's or variables implemented by external application.
+NeutronError neutronSetConfig(NeutronConfig *config);
+
+/// - Used to get NeutronContext size.
+size_t neutronGetModelContextSize(void);
+
+/// Other functions to control the state of driver/firmware.
+#ifdef __cplusplus
+}
+#endif
+#endif // NEUTRON_DRIVER_H
diff --git a/backends/nxp/runtime/NeutronErrors.h b/backends/nxp/runtime/NeutronErrors.h