Skip to content

Commit 248a3f6

Browse files
committed
Update on "[ET-VK][AOT] Define pass application order"
## Changes

The goal of this diff is to enforce a specific structure in how graph transform passes are applied during `vulkan_preprocess`. This will help ensure that certain passes are applied at the correct time, and that prerequisite conditions for passes are fulfilled before they are applied. See the comments in `vulkan_preprocess.py` for more details.

Differential Revision: [D65234843](https://our.internmc.facebook.com/intern/diff/D65234843/)

[ghstack-poisoned]
2 parents 6681329 + b25bd7d commit 248a3f6

39 files changed

+3127
-100
lines changed

.github/workflows/_android.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,16 @@ jobs:
6666
# avoid permission issue
6767
sudo chown -R "${USER}" /opt/android
6868
69+
- name: Download Artifacts
70+
shell: bash
71+
run: |
72+
set -eux
73+
curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/app-debug.apk
74+
curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/app-debug-androidTest.apk
75+
curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/fp32-xnnpack-custom/model.zip
76+
unzip model.zip
77+
mv *.pte model.pte
78+
6979
- name: Gradle cache
7080
uses: gradle/actions/setup-gradle@v3
7181

.github/workflows/pull.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ jobs:
9999
submodules: 'true'
100100
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
101101
timeout: 900
102+
upload-artifact: android-models
103+
upload-artifact-to-s3: true
102104
script: |
103105
# The generic Linux job chooses to use base env, not the one setup by the image
104106
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
@@ -107,13 +109,15 @@ jobs:
107109
DTYPE=${{ matrix.dtype }}
108110
BUILD_TOOL="cmake"
109111
MODE=${{ matrix.mode }}
112+
ARTIFACTS_DIR_NAME="artifacts-to-be-uploaded/${DTYPE}-${MODE}"
113+
ARTIFACTS_DIR_NAME="${ARTIFACTS_DIR_NAME/+/-}"
110114
111115
# Setup executorch
112116
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
113117
# Install requirements for export_llama
114118
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
115119
# Test llama2
116-
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}"
120+
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" "${ARTIFACTS_DIR_NAME}"
117121
118122
test-llama-runner-linux-android:
119123
name: test-llama-runner-linux-android
@@ -320,6 +324,7 @@ jobs:
320324
321325
android:
322326
uses: ./.github/workflows/_android.yml
327+
needs: test-llama-runner-linux
323328

324329
unittest:
325330
uses: ./.github/workflows/_unittest.yml

backends/cadence/aot/functions_hifi.yaml

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
- op: add.out
2626
kernels:
2727
- arg_meta: null
28-
kernel_name: torch::executor::add_out
28+
kernel_name: cadence::impl::HiFi::add_out
2929

3030
- op: bmm.out
3131
kernels:
@@ -45,12 +45,12 @@
4545
- op: div.out
4646
kernels:
4747
- arg_meta: null
48-
kernel_name: torch::executor::div_out
48+
kernel_name: cadence::impl::HiFi::div_out
4949

5050
- op: div.out_mode
5151
kernels:
5252
- arg_meta: null
53-
kernel_name: torch::executor::div_out_mode
53+
kernel_name: cadence::impl::HiFi::div_out_mode
5454

5555
- op: embedding.out
5656
kernels:
@@ -65,7 +65,7 @@
6565
- op: mul.out
6666
kernels:
6767
- arg_meta: null
68-
kernel_name: torch::executor::mul_out
68+
kernel_name: cadence::impl::HiFi::mul_out
6969

7070
- op: permute_copy.out
7171
kernels:
@@ -75,7 +75,7 @@
7575
- op: sigmoid.out
7676
kernels:
7777
- arg_meta: null
78-
kernel_name: torch::executor::sigmoid_out
78+
kernel_name: cadence::impl::HiFi::sigmoid_out
7979

8080
- op: slice_copy.Tensor_out
8181
kernels:
@@ -90,7 +90,12 @@
9090
- op: sub.out
9191
kernels:
9292
- arg_meta: null
93-
kernel_name: torch::executor::sub_out
93+
kernel_name: cadence::impl::HiFi::sub_out
94+
95+
- op: tanh.out
96+
kernels:
97+
- arg_meta: null
98+
kernel_name: cadence::impl::HiFi::tanh_out
9499

95100
- op: view_copy.out
96101
kernels:

backends/cadence/cadence.cmake

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ set(CMAKE_CXX_COMPILER ${TOOLCHAIN_HOME}/bin/${CROSS_COMPILE_TARGET}-clang++)
4343

4444
set(CMAKE_C_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls")
4545
set(CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls")
46+
#workaround for larger compilation time
47+
set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -fno-strict-aliasing")
48+
4649
set(CMAKE_SYSROOT ${TOOLCHAIN_HOME}/${SYSROOT_TARGET})
4750
set(CMAKE_LINKER ${TOOLCHAIN_HOME}/bin/xt-ld)
4851
add_link_options(-lm -stdlib=libc++ -Wl,--no-as-needed -static)

backends/cadence/hifi/kernels/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ add_library(
99
cadence_kernels
1010
kernels.cpp
1111
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
12+
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
13+
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
14+
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
15+
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
1216
)
1317
# Let files say "include <executorch/path/to/header.h>".
1418
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

backends/cadence/hifi/kernels/kernels.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,49 @@
1111
#include <inttypes.h>
1212
#include <stddef.h>
1313
#include <xa_type_def.h>
14+
/* For NNLIB APIs */
15+
#include "xa_nnlib_kernels_api.h"
16+
17+
/* Potential NNLIB function/APIs */
18+
extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
19+
FLOAT32* __restrict__ p_out,
20+
const WORD32* const p_out_shape,
21+
const FLOAT32* __restrict__ p_inp1,
22+
const WORD32* const p_inp1_shape,
23+
const FLOAT32* __restrict__ p_inp2,
24+
const WORD32* const p_inp2_shape);
25+
26+
extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(
27+
FLOAT32* __restrict__ p_out,
28+
const WORD32* const p_out_shape,
29+
const FLOAT32* __restrict__ p_inp1,
30+
const WORD32* const p_inp1_shape,
31+
const FLOAT32* __restrict__ p_inp2,
32+
const WORD32* const p_inp2_shape);
33+
34+
extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32(
35+
FLOAT32* __restrict__ p_out,
36+
const FLOAT32* __restrict__ p_inp1,
37+
const FLOAT32* __restrict__ p_inp2,
38+
WORD32 num_elm,
39+
WORD32 mode);
40+
41+
extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32(
42+
FLOAT32* __restrict__ p_out,
43+
const WORD32* const p_out_shape,
44+
const FLOAT32* __restrict__ p_inp1,
45+
const WORD32* const p_inp1_shape,
46+
const FLOAT32* __restrict__ p_inp2,
47+
const WORD32* const p_inp2_shape,
48+
WORD32 mode);
49+
50+
extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(
51+
FLOAT32* __restrict__ p_out,
52+
const WORD32* const p_out_shape,
53+
const FLOAT32* __restrict__ p_inp1,
54+
const WORD32* const p_inp1_shape,
55+
const FLOAT32* __restrict__ p_inp2,
56+
const WORD32* const p_inp2_shape);
1457

1558
namespace cadence {
1659
namespace impl {

backends/cadence/hifi/operators/CMakeLists.txt

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ endif()
2020

2121
# ATen compliant ops that are needed to run this model.
2222
set(_aten_ops__srcs
23+
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
24+
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
25+
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
26+
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
27+
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
28+
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
2329
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
2430
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
2531
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
@@ -29,24 +35,29 @@ set(_aten_ops__srcs
2935
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
3036
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
3137
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
32-
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_add.cpp"
3338
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
3439
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
3540
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
36-
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp"
3741
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
3842
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp"
39-
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp"
4043
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
41-
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sigmoid.cpp"
4244
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
4345
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
4446
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
45-
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp"
4647
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp"
4748
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp"
4849
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp"
49-
)
50+
"${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp"
51+
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
52+
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
53+
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
54+
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp"
55+
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp"
56+
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp"
57+
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
58+
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
59+
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
60+
)
5061
add_library(aten_ops_cadence ${_aten_ops__srcs})
5162
target_link_libraries(aten_ops_cadence PUBLIC executorch)
5263
target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels)

0 commit comments

Comments
 (0)