Skip to content

Commit 5f370c5

Browse files
committed
Update on "[ET-VK] Store unique ptr to Tensor in Value instead of inlined tensor object, to reduce Value struct size from 448 to 80 bytes."
This diff aims to reduce the size of the Value struct in the ExecuTorch Vulkan runtime by storing a unique pointer to the Tensor object instead of an inlined tensor object. This change reduces the size of the Value struct from 448 bytes to 80 bytes, which can improve performance and reduce memory usage. Differential Revision: [D66655991](https://our.internmc.facebook.com/intern/diff/D66655991/) [ghstack-poisoned]
2 parents de5b1f4 + 4d92ec8 commit 5f370c5

File tree

41 files changed

+1730
-267
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+1730
-267
lines changed

.github/scripts/extract_benchmark_results.py

Lines changed: 104 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,7 @@ def transform(
310310
workflow_run_attempt: int,
311311
job_name: str,
312312
job_id: int,
313+
schema_version: str,
313314
) -> List:
314315
"""
315316
Transform the benchmark results into the format writable into the benchmark database
@@ -319,45 +320,91 @@ def transform(
319320
for r in benchmark_results:
320321
r["deviceInfo"]["device"] = job_name
321322

322-
# TODO (huydhn): This is the current schema of the database oss_ci_benchmark_v2,
323-
# and I'm trying to fit ET benchmark results into it, which is kind of awkward.
324-
# However, the schema is going to be updated soon
325-
return [
326-
{
327-
# GH-info to identify where the benchmark is run
328-
"repo": repo,
329-
"head_branch": head_branch,
330-
"workflow_id": workflow_run_id,
331-
"run_attempt": workflow_run_attempt,
332-
"job_id": job_id,
333-
# The model
334-
"name": f"{r['benchmarkModel']['name']} {r['benchmarkModel'].get('backend', '')}".strip(),
335-
"dtype": (
336-
r["benchmarkModel"]["quantization"]
337-
if r["benchmarkModel"]["quantization"]
338-
else "unknown"
339-
),
340-
# The metric value
341-
"metric": r["metric"],
342-
"actual": r["actualValue"],
343-
"target": r["targetValue"],
344-
# The device
345-
"device": r["deviceInfo"]["device"],
346-
"arch": r["deviceInfo"].get("os", ""),
347-
# Not used here, just set it to something unique here
348-
"filename": workflow_name,
349-
"test_name": app_type,
350-
"runner": job_name,
351-
}
352-
for r in benchmark_results
353-
]
323+
if schema_version == "v2":
324+
# TODO (huydhn): Clean up this branch after ExecuTorch dashboard migrates to v3
325+
return [
326+
{
327+
# GH-info to identify where the benchmark is run
328+
"repo": repo,
329+
"head_branch": head_branch,
330+
"workflow_id": workflow_run_id,
331+
"run_attempt": workflow_run_attempt,
332+
"job_id": job_id,
333+
# The model
334+
"name": f"{r['benchmarkModel']['name']} {r['benchmarkModel'].get('backend', '')}".strip(),
335+
"dtype": (
336+
r["benchmarkModel"]["quantization"]
337+
if r["benchmarkModel"]["quantization"]
338+
else "unknown"
339+
),
340+
# The metric value
341+
"metric": r["metric"],
342+
"actual": r["actualValue"],
343+
"target": r["targetValue"],
344+
# The device
345+
"device": r["deviceInfo"]["device"],
346+
"arch": r["deviceInfo"].get("os", ""),
347+
# Not used here, just set it to something unique here
348+
"filename": workflow_name,
349+
"test_name": app_type,
350+
"runner": job_name,
351+
}
352+
for r in benchmark_results
353+
]
354+
elif schema_version == "v3":
355+
quantization = (
356+
r["benchmarkModel"]["quantization"]
357+
if r["benchmarkModel"]["quantization"]
358+
else "unknown"
359+
)
360+
# From https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
361+
return [
362+
{
363+
"benchmark": {
364+
"name": "ExecuTorch",
365+
"mode": "inference",
366+
"dtype": quantization,
367+
"extra_info": {
368+
"app_type": app_type,
369+
},
370+
},
371+
"model": {
372+
"name": r["benchmarkModel"]["name"],
373+
"type": "OSS model",
374+
"backend": r["benchmarkModel"].get("backend", ""),
375+
"extra_info": {
376+
"quantization": quantization,
377+
},
378+
},
379+
"metric": {
380+
"name": r["metric"],
381+
"benchmark_values": [r["actualValue"]],
382+
"target_value": r["targetValue"],
383+
"extra_info": {
384+
"method": r.get("method", ""),
385+
},
386+
},
387+
"runners": [
388+
{
389+
"name": r["deviceInfo"]["device"],
390+
"type": r["deviceInfo"]["os"],
391+
"avail_mem_in_gb": r["deviceInfo"].get("availMem", ""),
392+
"total_mem_in_gb": r["deviceInfo"].get("totalMem", ""),
393+
}
394+
],
395+
}
396+
for r in benchmark_results
397+
]
354398

355399

356400
def main() -> None:
357401
args = parse_args()
358402

359-
# Across all devices
360-
all_benchmark_results = []
403+
# Across all devices, keeping both schemas for now until ExecuTorch dashboard migrates to v3
404+
all_benchmark_results = {
405+
"v2": [],
406+
"v3": [],
407+
}
361408

362409
with open(args.artifacts) as f:
363410
for artifact in json.load(f):
@@ -384,23 +431,31 @@ def main() -> None:
384431
)
385432

386433
if benchmark_results:
387-
benchmark_results = transform(
388-
app_type,
389-
benchmark_results,
390-
args.repo,
391-
args.head_branch,
392-
args.workflow_name,
393-
args.workflow_run_id,
394-
args.workflow_run_attempt,
395-
job_name,
396-
extract_job_id(args.artifacts),
397-
)
398-
all_benchmark_results.extend(benchmark_results)
434+
for schema in all_benchmark_results.keys():
435+
results = transform(
436+
app_type,
437+
benchmark_results,
438+
args.repo,
439+
args.head_branch,
440+
args.workflow_name,
441+
args.workflow_run_id,
442+
args.workflow_run_attempt,
443+
job_name,
444+
extract_job_id(args.artifacts),
445+
schema,
446+
)
447+
all_benchmark_results[schema].extend(results)
448+
449+
for schema in all_benchmark_results.keys():
450+
if not all_benchmark_results.get(schema):
451+
continue
452+
453+
output_dir = os.path.join(args.output_dir, schema)
454+
os.mkdir(output_dir)
399455

400-
if all_benchmark_results:
401456
output_file = os.path.basename(args.artifacts)
402-
with open(f"{args.output_dir}/{output_file}", "w") as f:
403-
json.dump(all_benchmark_results, f)
457+
with open(f"{output_dir}/{output_file}", "w") as f:
458+
json.dump(all_benchmark_results[schema], f)
404459

405460

406461
if __name__ == "__main__":

.github/workflows/android-perf.yml

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -298,15 +298,25 @@ jobs:
298298
--workflow-run-attempt ${{ github.run_attempt }}
299299
done
300300
301-
ls -lah benchmark-results
302-
303-
for BENCHMARK_RESULTS in benchmark-results/*.json; do
304-
cat "${BENCHMARK_RESULTS}"
305-
echo
301+
for SCHEMA in v2 v3; do
302+
for BENCHMARK_RESULTS in benchmark-results/"${SCHEMA}"/*.json; do
303+
cat "${BENCHMARK_RESULTS}"
304+
echo
305+
done
306306
done
307307
308-
- name: Upload the benchmark results
308+
# TODO (huydhn): Remove v2 schema once the benchmark dashboard finishes the migration
309+
- name: Upload the benchmark results (v2)
310+
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
311+
with:
312+
benchmark-results-dir: benchmark-results/v2
313+
dry-run: false
314+
schema-version: v2
315+
316+
- name: Upload the benchmark results (v3)
309317
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
310318
with:
311-
benchmark-results-dir: 'benchmark-results'
319+
benchmark-results-dir: benchmark-results/v3
312320
dry-run: false
321+
schema-version: v3
322+
github-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/apple-perf.yml

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -372,15 +372,25 @@ jobs:
372372
--workflow-run-attempt ${{ github.run_attempt }}
373373
done
374374
375-
ls -lah benchmark-results
376-
377-
for BENCHMARK_RESULTS in benchmark-results/*.json; do
378-
cat "${BENCHMARK_RESULTS}"
379-
echo
375+
for SCHEMA in v2 v3; do
376+
for BENCHMARK_RESULTS in benchmark-results/"${SCHEMA}"/*.json; do
377+
cat "${BENCHMARK_RESULTS}"
378+
echo
379+
done
380380
done
381381
382-
- name: Upload the benchmark results
382+
# TODO (huydhn): Remove v2 schema once the benchmark dashboard finishes the migration
383+
- name: Upload the benchmark results (v2)
384+
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
385+
with:
386+
benchmark-results-dir: benchmark-results/v2
387+
dry-run: false
388+
schema-version: v2
389+
390+
- name: Upload the benchmark results (v3)
383391
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
384392
with:
385-
benchmark-results-dir: 'benchmark-results'
393+
benchmark-results-dir: benchmark-results/v3
386394
dry-run: false
395+
schema-version: v3
396+
github-token: ${{ secrets.GITHUB_TOKEN }}

CMakeLists.txt

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,21 @@ if(NOT CMAKE_BUILD_TYPE)
5656
set(CMAKE_BUILD_TYPE Debug)
5757
endif()
5858

59+
# Setup RPATH.
60+
# See https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling
61+
# Keep RPATH entries in binaries produced during the build phase
62+
set(CMAKE_SKIP_BUILD_RPATH OFF)
63+
# Build with the install-rpath already applied, so no relinking is needed at install time
64+
set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
65+
# Automatically add all linked folders that are NOT in the build directory to
66+
# the rpath (per library?)
67+
# TODO: Doesn't work for us right now because we are not installing .so's into the
68+
# correct locations. For example we have libcustom_ops_aot_lib.so depending on
69+
# _portable_lib.so, which was eventually put under <site-packages>/executorch/extension/pybindings/
70+
# but this rpath is not automatically added because at build time it seems `portable_lib`
71+
# is being built under the same directory, so no extra rpath is being added. To
72+
# properly fix this we need to install `portable_lib` into the correct path.
73+
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON)
5974
# ------------------------------ OPTIONS -------------------------------------
6075
# WARNING: Please don't add example specific options in this CMakeLists.txt.
6176
# Instead please use `find_package(executorch REQUIRED)` in the example
@@ -682,22 +697,6 @@ if(EXECUTORCH_BUILD_PTHREADPOOL
682697
endif()
683698

684699
if(EXECUTORCH_BUILD_PYBIND)
685-
# Setup RPATH.
686-
# See https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling
687-
if(APPLE)
688-
set(CMAKE_MACOSX_RPATH ON)
689-
set(_rpath_portable_origin "@loader_path")
690-
else()
691-
set(_rpath_portable_origin $ORIGIN)
692-
endif(APPLE)
693-
# Use separate rpaths during build and install phases
694-
set(CMAKE_SKIP_BUILD_RPATH FALSE)
695-
# Don't use the install-rpath during the build phase
696-
set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
697-
set(CMAKE_INSTALL_RPATH "${_rpath_portable_origin}")
698-
# Automatically add all linked folders that are NOT in the build directory to
699-
# the rpath (per library?)
700-
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
701700
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pybind11)
702701

703702
if(NOT EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)

backends/cadence/aot/compiler.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
to_edge,
2929
)
3030
from executorch.exir.pass_base import PassResult
31+
from torch._inductor.decomposition import remove_decompositions
3132
from torch.ao.quantization.pt2e.export_utils import model_is_exported
3233
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
3334

@@ -58,16 +59,33 @@ def convert_pt2(
5859
Returns a GraphModule with the converted model.
5960
"""
6061

62+
# Get default decompositions
63+
decomp_table = torch.export.default_decompositions()
64+
# Select ops to keep
65+
ops_to_keep = [
66+
torch.ops.aten.conv1d.default,
67+
torch.ops.aten.conv2d.default,
68+
torch.ops.aten.layer_norm.default,
69+
torch.ops.aten.linear.default,
70+
torch.ops.aten.matmul.default,
71+
]
72+
# Remove decompositions for the ops we want to keep
73+
# pyre-fixme[6]: For 1st argument expected `Dict[typing.Callable[..., typing.Any
74+
remove_decompositions(decomp_table, ops_to_keep)
6175
# Export with dynamo
62-
model_gm = torch.export.export_for_training(model, inputs).module()
76+
model_gm = (
77+
torch.export.export_for_training(model, inputs)
78+
.run_decompositions(decomp_table)
79+
.module()
80+
)
6381

64-
if model_gm_has_SDPA(model_gm): # pyre-fixme[6]
82+
if model_gm_has_SDPA(model_gm):
6583
# Decompose SDPA
66-
DecomposeScaledDotProductAttention(False)(model_gm) # pyre-fixme[6]
84+
DecomposeScaledDotProductAttention(False)(model_gm)
6785

6886
# Swap _safe_softmax with _softmax (see https://github.com/pytorch/pytorch/pull/133882
6987
# for details).
70-
result = ReplaceSafeSoftmaxWithSoftmax()(model_gm) # pyre-fixme[6]
88+
result = ReplaceSafeSoftmaxWithSoftmax()(model_gm)
7189
assert result is not None
7290
model_gm = result.graph_module
7391

backends/cadence/hifi/kernels/kernels.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,9 +92,9 @@ extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(
9292
const WORD32* const p_inp2_shape);
9393

9494
extern "C" void xa_nn_elm_pow_f32(
95-
FLOAT32* restrict z,
96-
const FLOAT32* restrict x,
97-
const FLOAT32* restrict y,
95+
FLOAT32* __restrict__ z,
96+
const FLOAT32* __restrict__ x,
97+
const FLOAT32* __restrict__ y,
9898
WORD32 N);
9999

100100
extern "C" WORD32 xa_nn_elm_where_f32xf32_f32(

backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@
1919
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
2020
2121
******************************************************************************/
22-
#include "nnlib-hifi4/xa_nnlib/include/xa_type_def.h"
23-
#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h"
24-
#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h"
25-
#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h"
26-
#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h"
27-
#include "nnlib-hifi4/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h"
22+
#include "xa_type_def.h"
23+
#include "xa_nnlib_common_fpu.h"
24+
#include "xa_nn_common.h"
25+
#include "xa_nnlib_err_chk.h"
26+
// #include "xa_nn_basic_state.h"
27+
#include "xa_nnlib_kernels_api.h"
2828

2929
#if !HAVE_VFPU
3030
DISCARD_FUN_FOR_NONVOID_RETURN(
@@ -844,4 +844,4 @@ WORD32 xa_nn_elm_minimum_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
844844
}
845845
return 0;
846846
}
847-
#endif
847+
#endif

backends/test/README.md

Whitespace-only changes.

backends/test/TARGETS

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Any targets that should be shared between fbcode and xplat must be defined in
2+
# targets.bzl. This file can contain fbcode-only targets.
3+
4+
load(":targets.bzl", "define_common_targets")
5+
6+
oncall("executorch")
7+
8+
define_common_targets(is_fbcode = True)

0 commit comments

Comments
 (0)