Skip to content

Commit dabb14e

Browse files
zingofreddan80
authored and committed
Arm backend: Add devtools support to example
New flags on run.sh: --etdump: build in etdump and profiling; the etdump is base64-encoded and put in the log. --debug_build: build debug instead of release. --extra_build_flags: extra flags to pass to cmake; this makes it possible, for example, to override the allocator pool size or other build-time cmake flags. The devtools build has been updated so that FLATCC_EXECUTABLE can be used to point to the flatcc executable. Signed-off-by: Zingo Andersen <[email protected]> Change-Id: Ic0fb1e48ee633c5fe91473bdc2db9e894b2fc4fa
1 parent 9d1a310 commit dabb14e

File tree

10 files changed

+379
-85
lines changed

10 files changed

+379
-85
lines changed

CMakeLists.txt

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -673,10 +673,17 @@ if(EXECUTORCH_BUILD_XNNPACK)
673673
endif()
674674

675675
if(EXECUTORCH_BUILD_DEVTOOLS)
676-
set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
677-
ON
678-
CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
679-
)
676+
if(NOT EXECUTORCH_BUILD_ARM_BAREMETAL)
677+
set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
678+
ON
679+
CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
680+
)
681+
else()
682+
set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
683+
OFF
684+
CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
685+
)
686+
endif()
680687
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
681688
endif()
682689

backends/arm/runtime/ArmBackendEthosU.cpp

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,39 @@
1515

1616
#include <ethosu_driver.h>
1717

18+
#if defined(ET_EVENT_TRACER_ENABLED)
19+
#include <executorch/runtime/core/event_tracer.h>
20+
#include <executorch/runtime/core/event_tracer_hooks.h>
21+
using executorch::runtime::EventTracer;
22+
using executorch::runtime::EventTracerEntry;
23+
24+
class EventTraceScope {
25+
public:
26+
EventTraceScope(EventTracer* event_tracer_, const char* name) {
27+
event_tracer = event_tracer_;
28+
event_tracer_entry_scope = event_tracer->start_profiling(name);
29+
}
30+
~EventTraceScope() {
31+
event_tracer->end_profiling(event_tracer_entry_scope);
32+
}
33+
34+
private:
35+
EventTracer* event_tracer;
36+
EventTracerEntry event_tracer_entry_scope;
37+
};
38+
#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME) \
39+
EventTraceScope event_tracer_scope = EventTraceScope(EVENTTRACER, NAME)
40+
#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME) \
41+
SCOPE = EVENTTRACER->start_profiling(NAME)
42+
#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE) \
43+
EVENTTRACER->end_profiling(SCOPE)
44+
45+
#else
46+
#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME)
47+
#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME)
48+
#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE)
49+
#endif
50+
1851
#include <executorch/backends/arm/runtime/VelaBinStream.h>
1952
#include <executorch/runtime/backend/interface.h>
2053
#include <executorch/runtime/core/error.h>
@@ -109,20 +142,38 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
109142
BackendExecutionContext& context,
110143
DelegateHandle* input_handle,
111144
EValue** args) const override {
145+
#if defined(ET_EVENT_TRACER_ENABLED)
146+
EventTracer* event_tracer = context.event_tracer();
147+
EventTracerEntry event_tracer_local_scope;
148+
#endif
149+
150+
EXECUTORCH_PROF_SCOPE(event_tracer, "ArmBackend::execute()");
151+
ArmBackendExecuteCallbacks ArmBackend_execute_callbacks;
152+
112153
ExecutionHandle* execution_handle = (ExecutionHandle*)input_handle;
113154
VelaHandles handles;
114155

115-
ArmBackendExecuteCallbacks ArmBackend_execute_callbacks;
116156
// Command stream - we know at this point it's aligned
157+
EXECUTORCH_PROF_START(
158+
event_tracer,
159+
event_tracer_local_scope,
160+
"+ArmBackend::execute()processed_data");
117161
char* data = (char*)execution_handle->processed->data();
162+
EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);
163+
118164
ET_LOG(Debug, "ArmBackend::execute %p", data);
119165

166+
EXECUTORCH_PROF_START(
167+
event_tracer,
168+
event_tracer_local_scope,
169+
"+ArmBackend::execute()vela_bin_read()");
120170
// Read key sections from the vela_bin_stream
121171
if (vela_bin_read(data, &handles, execution_handle->processed->size()) ==
122172
false) {
123173
ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout");
124174
return Error::InvalidProgram;
125175
}
176+
EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);
126177

127178
ET_LOG(
128179
Debug,
@@ -186,6 +237,9 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
186237

187238
// Select a compatible copy routine
188239
if (both_char and permuted_input_shape) {
240+
EXECUTORCH_PROF_SCOPE(
241+
event_tracer,
242+
"+ArmBackend::execute()handles.input.permute_CHW_to_HWC()");
189243
// permuted byte copy CHW to HWC
190244
permute_CHW_to_HWC(
191245
tensor_in.mutable_data_ptr<char>(),
@@ -194,6 +248,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
194248
tensor_in.size(2),
195249
tensor_in.size(3));
196250
} else if (both_char or both_int) {
251+
EXECUTORCH_PROF_SCOPE(
252+
event_tracer, "+ArmBackend::execute()handles.input.memcpy()");
197253
// Sizes match and elt size matches so memcpy
198254
memcpy(
199255
scratch_addr,
@@ -234,14 +290,18 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
234290
(uint64_t)handles.weight_data, (uint64_t)handles.scratch_data};
235291
size_t bases_size[2] = {
236292
handles.weight_data_size, handles.scratch_data_size};
237-
int result = ethosu_invoke_v3(
293+
int result = 0;
294+
EXECUTORCH_PROF_START(
295+
event_tracer, event_tracer_local_scope, "+ArmBackend::execute()NPU");
296+
result = ethosu_invoke_v3(
238297
driver.get(),
239298
(void*)handles.cmd_data,
240299
handles.cmd_data_size,
241300
bases,
242301
bases_size,
243302
2, /* fixed array of pointers to binary interface*/
244303
nullptr);
304+
EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);
245305

246306
if (result != 0) {
247307
ET_LOG(
@@ -277,6 +337,10 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
277337
&permuted_output_shape));
278338
if (tensor_out.scalar_type() == ScalarType::Char and
279339
permuted_output_shape) {
340+
EXECUTORCH_PROF_SCOPE(
341+
event_tracer,
342+
"+ArmBackend::execute()handles.output.permute_HWC_to_CHW()");
343+
280344
char* output_address = (char*)output_addr;
281345
permute_HWC_to_CHW(
282346
output_address,
@@ -285,6 +349,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
285349
tensor_out.size(2),
286350
tensor_out.size(3));
287351
} else {
352+
EXECUTORCH_PROF_SCOPE(
353+
event_tracer, "+ArmBackend::execute()handles.output.move()");
288354
for (int j = 0; j < tensor_out.numel(); j++) {
289355
if (tensor_out.scalar_type() == ScalarType::Char) {
290356
char* output_address = (char*)output_addr;

devtools/CMakeLists.txt

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ cmake_minimum_required(VERSION 3.19)
1313

1414
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
1515

16+
set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc)
17+
1618
if(NOT CMAKE_CXX_STANDARD)
1719
set(CMAKE_CXX_STANDARD 17)
1820
endif()
1921

2022
if(NOT FLATCC_EXECUTABLE)
21-
set(FLATCC_EXECUTABLE flatcc)
23+
set(FLATCC_EXECUTABLE ${_flatcc_source_dir}/bin/flatcc)
2224
endif()
2325

2426
# Source root directory for executorch.
@@ -66,7 +68,7 @@ set(FLATCC_DEBUG_CLANG_SANITIZE
6668
OFF
6769
CACHE BOOL ""
6870
)
69-
set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc)
71+
7072
add_subdirectory(${_flatcc_source_dir} ${CMAKE_BINARY_DIR}/third-party/flatcc)
7173

7274
# Fix for "relocation R_X86_64_32 against `.rodata' can not be used when making
@@ -163,7 +165,7 @@ add_custom_command(
163165
# Note that the flatcc project actually writes its outputs into the source
164166
# tree instead of under the binary directory, and there's no way to change
165167
# that behavior.
166-
${_flatcc_source_dir}/bin/flatcc -cwr -o
168+
${FLATCC_EXECUTABLE} -cwr -o
167169
${_program_schema__include_dir}/executorch/devtools/etdump
168170
${_etdump_schema__srcs}
169171
COMMAND rm -rf ${_etdump_schema_cleanup_paths}

examples/arm/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,8 @@ generate_bindings_for_kernels(
5757
gen_operators_lib(
5858
LIB_NAME "arm_portable_ops_lib" KERNEL_LIBS portable_kernels DEPS executorch
5959
)
60+
61+
if(EXECUTORCH_ENABLE_EVENT_TRACER)
62+
target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED)
63+
target_compile_options(portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED)
64+
endif()

examples/arm/aot_arm_compiler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ def get_compile_spec(
273273
target,
274274
system_config="Ethos_U55_High_End_Embedded",
275275
memory_mode="Shared_Sram",
276-
extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate",
276+
extra_flags="--debug-force-regor --output-format=raw --verbose-operators",
277277
)
278278
.set_permute_memory_format(True)
279279
.set_quantize_io(True)
@@ -286,7 +286,7 @@ def get_compile_spec(
286286
target,
287287
system_config="Ethos_U85_SYS_DRAM_Mid",
288288
memory_mode="Shared_Sram",
289-
extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate",
289+
extra_flags="--output-format=raw --verbose-operators",
290290
)
291291
.set_permute_memory_format(True)
292292
.set_quantize_io(True)

examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,12 @@ index b458fc6..8d4bc73 100644
2020

2121
KEEP(*(.eh_frame*))
2222
} > ITCM :rom_exec
23-
@@ -280,7 +280,7 @@ SECTIONS
23+
@@ -280,7 +280,8 @@ SECTIONS
2424
#endif
2525
* (expected_output_data_sec)
2626
* (sec_command_stream, sec_weight_data, sec_input_data)
2727
-
28+
+ *(.got*)
2829
+ *(.rodata*)
2930
* (ethosu_core_in_queue)
3031
* (ethosu_core_out_queue)

examples/arm/executor_runner/CMakeLists.txt

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ option(SEMIHOSTING "Enable semihosting" OFF)
1010
option(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size" OFF)
1111
option(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE to specify temp alloction pool size" OFF)
1212

13-
1413
if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING})
1514
message(
1615
FATAL_ERROR
@@ -220,10 +219,8 @@ target_sources(
220219
# Include the target's bare-metal linker script
221220
ethosu_eval_link_options(arm_executor_runner)
222221

223-
# Need whole-archive to ensure C++ ctor's are called - this may be wasteful for
224-
# bin size as we link in a number of other symbols
225-
target_link_libraries(
226-
arm_executor_runner
222+
set(arm_executor_runner_link)
223+
list(APPEND arm_executor_runner_link
227224
extension_runner_util
228225
ethosu_target_init
229226
executorch
@@ -237,6 +234,44 @@ target_link_libraries(
237234
-Xlinker -Map=arm_executor_runner.map
238235
)
239236

237+
if(EXECUTORCH_ENABLE_EVENT_TRACER)
238+
target_compile_options(arm_executor_runner PUBLIC -DET_EVENT_TRACER_ENABLED)
239+
240+
add_library(etdump STATIC IMPORTED)
241+
set_property(
242+
TARGET etdump
243+
PROPERTY IMPORTED_LOCATION
244+
"${ET_BUILD_DIR_PATH}/lib/libetdump.a"
245+
)
246+
247+
if(CMAKE_BUILD_TYPE MATCHES "Debug")
248+
set(FLATCCRT_LIB flatccrt_d)
249+
else()
250+
set(FLATCCRT_LIB flatccrt)
251+
endif()
252+
253+
add_library(${FLATCCRT_LIB} STATIC IMPORTED)
254+
set_property(
255+
TARGET ${FLATCCRT_LIB}
256+
PROPERTY IMPORTED_LOCATION
257+
"${ET_BUILD_DIR_PATH}/lib/lib${FLATCCRT_LIB}.a"
258+
)
259+
260+
list(APPEND arm_executor_runner_link
261+
etdump
262+
${FLATCCRT_LIB}
263+
)
264+
endif()
265+
266+
# Need whole-archive to ensure C++ ctor's are called - this may be wasteful for
267+
# bin size as we link in a number of other symbols
268+
target_link_libraries(
269+
arm_executor_runner
270+
${arm_executor_runner_link}
271+
)
272+
273+
target_link_options( arm_executor_runner PUBLIC LINKER:-Map=arm_executor_runner.map )
274+
240275
# ET headers and generated headers includes
241276
target_include_directories(
242277
arm_executor_runner PRIVATE ${ET_INCLUDE_PATH} ${CMAKE_CURRENT_BINARY_DIR}

0 commit comments

Comments
 (0)