
Commit cc1a8bd

robell authored and facebook-github-bot committed
Tidied up Ethos-U delegate binaries (#1046)
Summary:
Fixed #677
- Simplified the binary format and made it more robust/validate-able
- Moved more processing to AoT
- Made runtime input/output support more general (arbitrary number of EValue inputs and outputs)

Pull Request resolved: #1046

Reviewed By: SS-JIA

Differential Revision: D50599184

Pulled By: digantdesai

fbshipit-source-id: 908d8598745b0b4b99c4001cc42c6a74c251e563
1 parent d8e9b26 commit cc1a8bd

File tree

7 files changed, +284 -224 lines changed


backends/arm/CMakeLists.txt

Lines changed: 4 additions & 1 deletion
@@ -20,7 +20,10 @@ set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
 set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include")
 include_directories( ${DRIVER_ETHOSU_INCLUDE_DIR} )
 
-set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp)
+set(_arm_baremetal_sources
+  backends/arm/runtime/ArmBackendEthosU.cpp
+  backends/arm/runtime/VelaBinStream.cpp
+)
 list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")
 
 add_library(

backends/arm/arm_backend.py

Lines changed: 52 additions & 46 deletions
@@ -145,8 +145,27 @@ def dbg_tosa_dump(tosa_fb, path):
         f.write(js)
 
 
-# Output to Vela with current file-based compilation
-# WARNING: if this changes, the runtime reader also needs to change
+# Pack either input or output tensor block, compose the related arrays into
+# per-io structs to simplify runtime use.
+def vela_bin_pack_io(prefix, data):
+    ios = struct.pack("<i", len(data[prefix + "_shape"]))
+    for i in range(len(data[prefix + "_shape"])):
+        io_shape = data[prefix + "_shape"][i]
+        io_elem_size = data[prefix + "_elem_size"][i]
+        io_offset = data[prefix + "_offset"][i]
+        io_region = data[prefix + "_region"][i]
+        assert len(io_shape) <= 4
+        inp_pad = io_shape.tolist() + [0] * (4 - len(io_shape))
+        io_struct = struct.pack(
+            "<iiiiiii", *inp_pad, io_elem_size, io_offset, io_region
+        )
+        ios += io_struct
+    return ios
+
+
+# Output via Vela to binary stream for ArmBackendEthosU
+# WARNING: Do not change this without changing VelaBinStream.cpp as that
+# function consumes this format and the two need to align.
 def vela_compile(tosa_fb):
     with tempfile.TemporaryDirectory() as tmpdir:
         tosaname = "out.tosa"
@@ -162,65 +181,52 @@ def vela_compile(tosa_fb):
 
         np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
         blocks = b""
+
         with np.load(np_path, allow_pickle=False) as data:
+            # Construct our modified output_blocks with data in a form easily
+            # digested on the device side
+            bin_blocks = {"vela_bin_stream": b""}
+
+            # copy command data through unmodified
+            bin_blocks["cmd_data"] = data["cmd_data"].tobytes()
+
+            # copy weight data through unmodified
+            bin_blocks["weight_data"] = data["weight_data"].tobytes()
+
+            # Add a block for scratch, inputs and outputs; scratch shape is a 1 element
+            # array giving us size in bytes so extract this and add a block of 0's.
+            # Currently we preallocated this on the host to provide SRAM for computation.
+            if len(data["scratch_shape"][0]) != 1:
+                raise RuntimeError("Expected scratch to be single array")
+            block_length = data["scratch_shape"][0].item()
+            bin_blocks["scratch_data"] = b"\x00" * block_length
+
+            # Capture inputs and outputs
+            bin_blocks["inputs"] = vela_bin_pack_io("input", data)
+            bin_blocks["outputs"] = vela_bin_pack_io("output", data)
+
+            bin_blocks["vela_end_stream"] = b""
+
             # Emit the NPZ regions as:
             #  - 16 byte block name null terminated string (padded to 16 if name shorter)
             #  - 4 bytes of int32 block length and 12 bytes of 0's
             #  - block data (padded to 16 byte alignment at end)
             # Repeat for all blocks
-            for key in data.keys():
+            for key in bin_blocks.keys():
                 block_name = bytes(key, "utf8")[:15]
                 block_name = block_name + b"\x00" * (16 - len(block_name))
 
-                block_data = b""
-                if key in ("input_shape", "output_shape"):
-                    inputs = data[key]
-                    # Encode a struct of int len; and one or more int x,y,z,w shape;
-                    input_struct = struct.pack("<i", len(inputs))
-                    for inp in inputs:
-                        assert len(inp) <= 4
-                        inp_pad = inp.tolist() + [0] * (4 - len(inp))
-                        input_struct = input_struct + struct.pack("<iiii", *inp_pad)
-                    block_data = input_struct
-                elif key in ("input_offset", "output_offset"):
-                    inputs = data[key]
-                    if key == "output_offset" and len(inputs) > 1:
-                        raise RuntimeError(
-                            "Currently only support one output in Vela ArmBackend"
-                        )
-                    offset_struct = struct.pack("<i", len(inputs))
-                    for inp in inputs:
-                        offset_struct = offset_struct + struct.pack("<i", inp)
-                    block_data = offset_struct
-                else:
-                    block_data = data[key].tobytes()
                 # We need the acual unpadded block lengths for hw setup
-                block_length = len(block_data).to_bytes(16, "little")
-                # pad block data to multiple of 16 bytes
+                block_length = struct.pack("<iiii", len(bin_blocks[key]), 0, 0, 0)
+
+                # Pad block data to multiple of 16 bytes
+                block_data = bin_blocks[key]
                 block_data = block_data + b"\x00" * (15 - (len(block_data) - 1) % 16)
 
                 block = block_name + block_length + block_data
                 blocks = blocks + block
 
-            # Add a block for scratch, inputs and outputs
-            # scratch shape is a 1 element array giving us size in bytes
-            block_name = bytes("scratch_data", "utf8")[:15]
-            block_name = block_name + b"\x00" * (16 - len(block_name))
-            block_length = data["scratch_shape"][0].item()
-            block_length = block_length + (15 - (block_length - 1) % 16)
-            block_data = b"\x00" * block_length
-            block_length = block_length.to_bytes(16, "little")
-            block = block_name + block_length + block_data
-            blocks = blocks + block
-            # TODO are these already in scratch shape? look to be
-            # input_shape * input_elem_size
-            # output_shape * output_elem_size
-            # input_offset and output_offset specify the location these arrays are written from base of scratch
-
-        # return 16 byte VELA bin header + blocks + footer
-        header = bytes("vela_bin_stream", "utf-8") + b"\x00"
-        footer = bytes("vela_end_stream", "utf-8") + b"\x00"
-        return header + blocks + footer
+        return blocks
 
 
 def dbg_fail(node, tosa_fb, path):
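
For reference, a minimal host-side sketch of reading back the stream emitted above, following only the block layout documented in the comments (16-byte null-padded name, 4-byte little-endian int32 length plus 12 zero bytes, data padded to a 16-byte boundary). The function name and return shape are illustrative only; the authoritative consumer of this format is VelaBinStream.cpp on the device.

import struct

def parse_vela_bin_stream(buf: bytes) -> dict:
    # Hypothetical host-side reader mirroring the layout written by vela_compile
    blocks = {}
    pos = 0
    while pos < len(buf):
        # 16-byte block name, null padded
        name = buf[pos : pos + 16].rstrip(b"\x00").decode("utf8")
        # 4-byte little-endian length, followed by 12 bytes of zeros
        (length,) = struct.unpack("<i", buf[pos + 16 : pos + 20])
        data_start = pos + 32
        blocks[name] = buf[data_start : data_start + length]
        # Data is padded out to the next 16-byte boundary
        padded = length + (15 - (length - 1) % 16)
        pos = data_start + padded
        if name == "vela_end_stream":
            break
    return blocks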

backends/arm/runtime/ArmBackendEthosU.cpp

Lines changed: 22 additions & 152 deletions
@@ -11,13 +11,13 @@
  */
 
 #include <cstring>
-#include <memory>
-#include <vector>
 
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
 
+#include <executorch/backends/arm/runtime/VelaBinStream.h>
+
 #include <ethosu_driver.h>
 #include <pmu_ethosu.h>
 
@@ -52,29 +52,14 @@ class ArmBackend final : public PyTorchBackendInterface {
 
     char* data = (char*)processed->data();
     size_t size = processed->size();
-    char* foot = data + size - 16;
+    char* foot = data + size - sizeof(VelaBinBlock);
 
-    // Header and footer both 16 bit aligned suggest valid structure and we
-    // wont walk off the end of the chunks and segfault
-    if (!((int)data == next_mul_16((uintptr_t)data))) {
-      ET_LOG(Error, "ArmBackend::init: Binary needs to be 16 byte unaligned");
-      return Error::InvalidProgram;
-    }
-    if (!((int)foot == next_mul_16((uintptr_t)foot))) {
-      ET_LOG(Error, "ArmBackend::init: Footer expected to be 16 byte aligned");
-      ET_LOG(
-          Error,
-          "ArmBackend::init: Program expected to be multiple of 16 bytes");
-      return Error::InvalidProgram;
-    }
-    if (!(0 == strncmp(data, "vela_bin_stream", 15))) {
-      ET_LOG(Error, "ArmBackend::init: Binary passed is not a vela_bin_stream");
-      return Error::InvalidProgram;
-    }
-    if (!(0 == strncmp(foot, "vela_end_stream", 15))) {
-      ET_LOG(Error, "ArmBackend::init: Binary passed missing vela_end_stream");
+    // Verify format of vela_bin
+    if (vela_bin_validate(data, size) == false) {
+      ET_LOG(Error, "Malformed vela_bin_stream found");
       return Error::InvalidProgram;
     }
+
     // Verify address range is accessible current expectation is the program
     // is wholly stored in SRAM
     // TODO: expect to improve capabilities here by supporting DRAM storage
@@ -108,7 +93,7 @@ class ArmBackend final : public PyTorchBackendInterface {
     char* data = (char*)processed->data();
 
     // Read key sections from the vela_bin_stream
-    if (!this->vela_read(data, &handles, processed->size())) {
+    if (vela_bin_read(data, &handles, processed->size()) == false) {
       ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout");
       return Error::InvalidProgram;
     }
@@ -124,8 +109,9 @@ class ArmBackend final : public PyTorchBackendInterface {
         handles.scratch_data_size);
 
     // Write inputs into SRAM scratch area defined by Vela
-    for (int i = 0; i < handles.input_shapes.size(); i++) {
-      const char* input_addr = handles.scratch_data + handles.input_offset[i];
+    for (int i = 0; i < handles.inputs->count; i++) {
+      const char* input_addr =
+          handles.scratch_data + handles.inputs->io[i].offset;
       // Process input EValue into scratch
       // TODO: Optimise into direct write from Vela into the SRAM or DRAM output
       // for compatible data layouts.
@@ -168,25 +154,17 @@ class ArmBackend final : public PyTorchBackendInterface {
       return Error::InvalidProgram;
     }
 
-    // output data from Ethos U
-    // We only handle one output at the moment
-    const char* output_addr = handles.scratch_data + handles.output_offset[0];
-    // Outputs are in the index immediately after inputs
-    int output_index = handles.input_shapes.size();
-
-    if (handles.output_shapes.size() != 1) {
-      ET_LOG(
-          Error,
-          "ArmBackend::execute: currently only support one return tensor");
-      return Error::InvalidProgram;
-    }
-    // Process results into EValue storage
-    // TODO: optimise into direct write for compatible, contig layout
-    int* output_address = (int*)output_addr;
-    auto tensor_out = args[output_index]->toTensor();
-    for (int j = 0; j < tensor_out.numel(); j++) {
-      // TODO: extend beyond tensors with 4 byte elements
-      tensor_out.mutable_data_ptr<int>()[j] = output_address[j];
+    // Write outputs from scratch into EValue pointers
+    for (int i = 0; i < handles.outputs->count; i++) {
+      const char* output_addr =
+          handles.scratch_data + handles.outputs->io[i].offset;
+      // Process input EValue into scratch
+      int* output_address = (int*)output_addr;
+      // Outputs are in the index immediately after inputs
+      auto tensor_out = args[handles.inputs->count + i]->toTensor();
+      for (int j = 0; j < tensor_out.numel(); j++) {
+        tensor_out.mutable_data_ptr<int>()[j] = output_address[j];
+      }
     }
 
     return Error::Ok;
@@ -195,114 +173,6 @@ class ArmBackend final : public PyTorchBackendInterface {
   void destroy(DelegateHandle* handle) const override {
     return;
   }
-
- private:
-  typedef struct {
-    const char* cmd_data;
-    size_t cmd_data_size;
-    const char* weight_data;
-    size_t weight_data_size;
-    const char* scratch_data;
-    size_t scratch_data_size;
-    vector<size_t> input_offset;
-    vector<vector<int>> input_shapes;
-    vector<size_t> output_offset;
-    vector<vector<int>> output_shapes;
-  } VelaHandles;
-
-  typedef struct {
-    char name[16];
-    uint32_t size;
-    char _pad[12];
-    char data[];
-  } VelaBinBlock;
-
-  typedef struct {
-    int count;
-    int shape[][4];
-  } VelaShapes;
-
-  typedef struct {
-    int count;
-    int offsets[];
-  } VelaOffsets;
-
-  static int next_mul_16(int n) {
-    return ((n - 1) | 15) + 1;
-  }
-
-  int vela_read(char* data, VelaHandles* handles, int size) const {
-    constexpr const size_t header_size = 16;
-
-    // Read header string
-    if (strncmp(data, "vela_bin_stream", 15)) {
-      return 0;
-    }
-    data += header_size;
-
-    // Expect one or more 'VelaBinBlock's
-    while (1) {
-      VelaBinBlock* b = (VelaBinBlock*)data;
-      data += sizeof(VelaBinBlock) + next_mul_16(b->size);
-
-      // Exit with success on finding end of stream
-      if (!strncmp(b->name, "vela_end_stream", strlen("vela_end_stream")))
-        return 1;
-
-      if (!strncmp(b->name, "cmd_data", strlen("cmd_data"))) {
-        // This magic header confirms a valid command stream in binary
-        if (strncmp(b->data, "COP1", strlen("COP1")))
-          return 0;
-        handles->cmd_data = b->data;
-        handles->cmd_data_size = b->size;
-      }
-      if (!strncmp(b->name, "weight_data", strlen("weight_data"))) {
-        handles->weight_data = b->data;
-        handles->weight_data_size = b->size;
-      }
-      if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) {
-        handles->scratch_data = b->data;
-        handles->scratch_data_size = b->size;
-      }
-
-      // capture inputs and outputs
-      if (!strncmp(b->name, "input_offset", strlen("input_offset"))) {
-        VelaOffsets* offsets = (VelaOffsets*)b->data;
-        for (int i = 0; i < offsets->count; i++) {
-          handles->input_offset.push_back(offsets->offsets[i]);
-        }
-      }
-      if (!strncmp(b->name, "output_offset", strlen("output_offset"))) {
-        VelaOffsets* offsets = (VelaOffsets*)b->data;
-        for (int i = 0; i < offsets->count; i++) {
-          handles->output_offset.push_back(offsets->offsets[i]);
-        }
-      }
-
-      if (!strncmp(b->name, "input_shape", strlen("input_shape"))) {
-        VelaShapes* shapes = (VelaShapes*)b->data;
-        for (int i = 0; i < shapes->count; i++) {
-          vector<int> s = {
-              shapes->shape[i][0],
-              shapes->shape[i][1],
-              shapes->shape[i][2],
-              shapes->shape[i][3]};
-          handles->input_shapes.push_back(s);
-        }
-      }
-      if (!strncmp(b->name, "output_shape", strlen("output_shape"))) {
-        VelaShapes* shapes = (VelaShapes*)b->data;
-        for (int i = 0; i < shapes->count; i++) {
-          vector<int> s = {
-              shapes->shape[i][0],
-              shapes->shape[i][1],
-              shapes->shape[i][2],
-              shapes->shape[i][3]};
-          handles->output_shapes.push_back(s);
-        }
-      }
-    }
-  }
 };
 
 namespace {
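
As a companion sketch, the "inputs"/"outputs" blocks packed by vela_bin_pack_io could be decoded off-device as below. The field names mirror how the runtime dereferences handles.inputs->io[i].offset above, but the exact C struct layout lives in VelaBinStream.h, which is not part of this diff, so treat these names as assumptions.

import struct
from typing import List, NamedTuple

class VelaIO(NamedTuple):
    # Field names are assumptions based on the runtime's io[i].offset usage
    shape: List[int]   # padded to 4 dims with zeros
    elem_size: int
    offset: int        # byte offset into the scratch region
    region: int

def unpack_io_block(block: bytes) -> List[VelaIO]:
    # A 4-byte count, then one 28-byte "<iiiiiii" record per input/output
    (count,) = struct.unpack_from("<i", block, 0)
    ios = []
    for i in range(count):
        fields = struct.unpack_from("<iiiiiii", block, 4 + i * 28)
        ios.append(VelaIO(list(fields[:4]), fields[4], fields[5], fields[6]))
    return ios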
