Commit 653cc15

Merge remote-tracking branch 'origin/main' into gh/swolchok/19/base
2 parents: 858e9fd + 0c6a77e

File tree: 166 files changed (+2674 / -1165 lines)

.ci/docker/ci_commit_pins/torchao.txt

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+0916b5b29b092afcbf2b898caae49abe80662bac

.ci/scripts/test_llava.sh

Lines changed: 1 addition & 1 deletion

@@ -91,7 +91,7 @@ run_and_verify() {
   RESULT=$(cat result.txt)
   # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes <unk> tokens.
   if [[ "$(uname)" == "Darwin" ]]; then
-    EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress on a basketball court. There are several players on the court, with one player in the foreground holding a basketball, and"
+    EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. One of the players is dribbling the ball, while the others are in various"
   else
     # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes <unk> tokens.
     EXPECTED_PREFIX="ASSISTANT:"

.github/workflows/trunk.yml

Lines changed: 28 additions & 27 deletions

@@ -270,33 +270,34 @@ jobs:
         # Test llama2
         PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}"

-  test-llava-runner-macos:
-    name: test-llava-runner-macos
-    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
-    strategy:
-      fail-fast: false
-    with:
-      runner: macos-m1-stable
-      python-version: '3.11'
-      submodules: 'true'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 900
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-macos.sh "cmake"
-
-        # install Llava requirements
-        bash examples/models/llama2/install_requirements.sh
-        bash examples/models/llava/install_requirements.sh
-
-        # run python unittest
-        python -m unittest examples.models.llava.test.test_llava
-
-        # run e2e (export, tokenizer and runner)
-        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh Release
+  # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner.
+  # test-llava-runner-macos:
+  #   name: test-llava-runner-macos
+  #   uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+  #   strategy:
+  #     fail-fast: false
+  #   with:
+  #     runner: macos-14-xlarge
+  #     python-version: '3.11'
+  #     submodules: 'true'
+  #     ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+  #     timeout: 900
+  #     script: |
+  #       BUILD_TOOL=cmake
+
+  #       bash .ci/scripts/setup-conda.sh
+  #       # Setup MacOS dependencies as there is no Docker support on MacOS atm
+  #       GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
+
+  #       # install Llava requirements
+  #       ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh
+  #       ${CONDA_RUN} bash examples/models/llava/install_requirements.sh
+
+  #       # run python unittest
+  #       ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava
+
+  #       # run e2e (export, tokenizer and runner)
+  #       PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh Release

   test-qnn-model:
     name: test-qnn-model

CMakeLists.txt

Lines changed: 38 additions & 38 deletions

@@ -505,7 +505,8 @@ if(EXECUTORCH_BUILD_PYBIND AND APPLE)
   )
   target_link_libraries(executorch_no_prim_ops_shared PRIVATE program_schema)
   if(DL_LIBRARY_EXISTS)
-    target_link_libraries(executorch_no_prim_ops_shared PRIVATE dl) # For dladdr()
+    # For dladdr()
+    target_link_libraries(executorch_no_prim_ops_shared PRIVATE dl)
   endif()
   target_include_directories(
     executorch_no_prim_ops_shared PUBLIC ${_common_include_directories}

@@ -541,7 +542,7 @@ target_link_options_shared_lib(executorch)
 # operators necessary for the models that will run.
 #
 if(BUILD_EXECUTORCH_PORTABLE_OPS)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
 endif()

 if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)

@@ -584,56 +585,56 @@ if(EXECUTORCH_BUILD_GTESTS)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/googletest)
 endif()

-if(EXECUTORCH_BUILD_SDK)
-  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
-      ON
-      CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
-  )
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
+if(EXECUTORCH_BUILD_ARM_BAREMETAL)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
 endif()

-if(EXECUTORCH_BUILD_EXTENSION_APPLE)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple)
+if(EXECUTORCH_BUILD_CADENCE)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cadence)
 endif()

-if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader)
+if(EXECUTORCH_BUILD_COREML)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml)
 endif()

-if(EXECUTORCH_BUILD_EXTENSION_MODULE)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
+if(EXECUTORCH_BUILD_MPS)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps)
 endif()

 if(EXECUTORCH_BUILD_NEURON)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/mediatek)
 endif()

-if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util)
+if(EXECUTORCH_BUILD_QNN)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm)
 endif()

 if(EXECUTORCH_BUILD_XNNPACK)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack)
 endif()

-if(EXECUTORCH_BUILD_QNN)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm)
+if(EXECUTORCH_BUILD_SDK)
+  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
+      ON
+      CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
+  )
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
 endif()

-if(EXECUTORCH_BUILD_ARM_BAREMETAL)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
+if(EXECUTORCH_BUILD_EXTENSION_APPLE)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple)
 endif()

-if(EXECUTORCH_BUILD_MPS)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps)
+if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader)
 endif()

-if(EXECUTORCH_BUILD_COREML)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml)
+if(EXECUTORCH_BUILD_EXTENSION_MODULE)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
 endif()

-if(EXECUTORCH_BUILD_CADENCE)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cadence)
+if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util)
 endif()

 if(EXECUTORCH_BUILD_PYBIND)

@@ -690,9 +691,8 @@ if(EXECUTORCH_BUILD_PYBIND)
   )
   # util lib
   add_library(
-    util
-    ${CMAKE_CURRENT_SOURCE_DIR}/extension/evalue_util/print_evalue.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/extension/aten_util/aten_bridge.cpp
+    util ${CMAKE_CURRENT_SOURCE_DIR}/extension/evalue_util/print_evalue.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/extension/aten_util/aten_bridge.cpp
   )
   target_include_directories(
     util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS}

@@ -741,12 +741,14 @@ if(EXECUTORCH_BUILD_PYBIND)
   else()
     set_target_properties(
       portable_lib
-      PROPERTIES # Assume <executorch> is the root `site-packages/executorch`
-                 # Need to add <executorch>/extension/llm/custom_ops for
-                 # libcustom_ops_aot_lib
-                 # Need to add <executorch>/kernels/quantized for
-                 # libquantized_ops_aot_lib
-                 BUILD_RPATH "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized"
+      PROPERTIES
+        # Assume <executorch> is the root `site-packages/executorch`
+        # Need to add <executorch>/extension/llm/custom_ops for
+        # libcustom_ops_aot_lib
+        # Need to add <executorch>/kernels/quantized for
+        # libquantized_ops_aot_lib
+        BUILD_RPATH
+        "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized"
     )
   endif()

@@ -757,9 +759,7 @@ endif()

 if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
   # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom
-  add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops
-  )
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops)
 endif()

 if(EXECUTORCH_BUILD_KERNELS_QUANTIZED)

backends/apple/coreml/CMakeLists.txt

Lines changed: 3 additions & 3 deletions

@@ -14,10 +14,10 @@ if(NOT EXECUTORCH_ROOT)
 endif()

 if(EXECUTORCH_BUILD_SDK)
-  # protobuf requires frtti
-  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frtti" )
+  # protobuf requires frtti
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frtti")
 endif()
-
+
 option(COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." OFF)

 # inmemoryfs sources

backends/apple/mps/test/test_mps_utils.py

Lines changed: 1 addition & 1 deletion

@@ -229,7 +229,7 @@ def lower_module_and_test_output(
     compile_specs = [CompileSpec("use_fp16", bytes([use_fp16]))]

     if use_partitioner:
-        logging.info(f"Edge IR graph:\n{edge_program.exported_program().graph}")
+        logging.info(f"Edge IR graph:\n{edge_program.exported_program()}")
         delegated_program = edge_program
         delegated_program = edge_program.to_backend(
             MPSPartitioner(compile_specs=compile_specs)

backends/arm/arm_partitioner.py

Lines changed: 1 addition & 0 deletions

@@ -40,6 +40,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
             exir_ops.edge.aten.addmm.default,
             exir_ops.edge.aten.expand_copy.default,
             exir_ops.edge.aten.cat.default,
+            exir_ops.edge.aten.bmm.default,
             exir_ops.edge.aten.permute_copy.default,
             exir_ops.edge.aten.hardtanh.default,
             exir_ops.edge.aten.convolution.default,

backends/arm/operators/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@
     op_addmm,
     op_avg_pool2d,
     op_batch_norm,
+    op_bmm,
     op_cat,
     op_conv2d,
     op_dequant,

backends/arm/operators/op_bmm.py

Lines changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import List
+
+import serializer.tosa_serializer as ts
+import torch.fx
+from executorch.backends.arm.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.arm.tosa_mapping import TosaArg
+from executorch.backends.arm.tosa_quant_utils import build_rescale, get_quant_node_args
+from executorch.backends.arm.tosa_utils import get_two_inputs
+from serializer.tosa_serializer import TosaOp
+
+
+@register_node_visitor
+class BMMVisitor(NodeVisitor):
+    target = "aten.bmm.default"
+
+    def __init__(self, *args):
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        tosa_graph: ts.TosaSerializer,
+        inputs: List[TosaArg],
+        output: TosaArg,
+        is_quant_node: bool,
+    ) -> None:
+        input0, input1 = get_two_inputs(node)
+
+        # aten.bmm maps directly to MATMUL
+        # NOTE: For now, only INT8 & FP32 is supported
+
+        # For INT8, we need to get the zero points and add an intermediate tensor
+        # for a later rescale.
+        if is_quant_node:
+            input0_zp = get_quant_node_args(input0).zp
+            input1_zp = get_quant_node_args(input1).zp
+            bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
+            bmm_output_name = bmm_result.name
+        else:
+            input0_zp, input1_zp = 0, 0
+            bmm_output_name = output.name
+
+        # Add the MATMUL to the TOSA graph.
+        attr = ts.TosaSerializerAttribute()
+        attr.MatMulAttribute(A_zp=input0_zp, B_zp=input1_zp)
+
+        tosa_graph.addOperator(
+            TosaOp.Op().MATMUL,
+            [input0.name, input1.name],
+            [bmm_output_name],
+            attr,
+        )
+
+        # As INT8 accumulates into INT32, we need to rescale it back to INT8
+        if is_quant_node:
+            input0_q_params = get_quant_node_args(input0)
+            input1_q_params = get_quant_node_args(input1)
+            output_q_params = get_quant_node_args(list(node.users)[0])
+
+            final_output_scale = (
+                input0_q_params.scale * input1_q_params.scale
+            ) / output_q_params.scale
+
+            build_rescale(
+                tosa_fb=tosa_graph,
+                scale=final_output_scale,
+                input_node=bmm_result,
+                output_name=output.name,
+                output_type=ts.DType.INT8,
+                output_shape=bmm_result.shape,
+                input_zp=0,
+                output_zp=output_q_params.zp,
+                is_double_round=False,
+            )

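The INT8 path above follows the standard quantized-matmul identity: the INT32 accumulator holds the product of two zero-point-corrected INT8 inputs, so converting it into the output's INT8 representation takes a single multiplier of (input0_scale * input1_scale) / output_scale, which is exactly the final_output_scale passed to build_rescale. A minimal sketch of that arithmetic, with made-up quantization parameters (only the formula itself comes from the diff):

    # Hypothetical quantization scales for illustration; not from the commit.
    input0_scale = 0.02   # scale of the first INT8 input
    input1_scale = 0.05   # scale of the second INT8 input
    output_scale = 0.004  # scale of the INT8 output

    # The INT32 accumulator is implicitly in units of input0_scale * input1_scale;
    # dividing by output_scale re-expresses it in the output's INT8 scale.
    final_output_scale = (input0_scale * input1_scale) / output_scale
    print(final_output_scale)  # 0.25

build_rescale then applies this multiplier, together with the output zero point, to bring the accumulator back into INT8 range.
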
backends/arm/operators/op_softmax.py

Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ def define_node(
         input_name = inputs[0].name
         dim_order = inputs[0].dim_order
         input_shape = tosa_shape(inputs[0].shape, dim_order)
-        dim_value = dim_order.index(inputs[1].number)
+        dim_value = dim_order.index(inputs[1].number % len(dim_order))

        ## softmax = exp(logits - max(logits)) / reduce_sum(exp(logits - max(logits)), -1)
        # FP32

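The modulo added here normalizes negative dimension indices before the dim_order lookup: softmax accepts dims such as -1, which .index() cannot find in a tuple of non-negative positions. A small self-contained illustration of the normalization (the rank and dim values are made up, not from the commit):

    # For a rank-4 tensor, dim=-1 and dim=3 refer to the same axis.
    rank = 4                  # plays the role of len(dim_order)
    dim_order = (0, 1, 2, 3)
    for dim in (-1, 3):
        normalized = dim % rank  # -1 % 4 == 3 in Python
        print(dim_order.index(normalized))  # prints 3 both times
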
backends/arm/quantizer/quantization_annotation/mm_annotator.py

Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ def _annotate_mm(
     quantization_config: QuantizationConfig,
     filter_fn: Optional[Callable[[Node], bool]] = None,
 ) -> Optional[List[List[Node]]]:
-    mm_partitions = get_source_partitions(gm.graph, [torch.mm], filter_fn)
+    mm_partitions = get_source_partitions(gm.graph, [torch.mm, torch.bmm], filter_fn)
     mm_partitions = list(itertools.chain.from_iterable(mm_partitions.values()))
     annotated_partitions = []
     for mm_partition in mm_partitions:

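For context on why torch.bmm joins torch.mm in the source-partition lookup: the two ops differ only in a leading batch dimension, so the same quantization annotation logic applies to both. A quick shape check (illustrative only, not part of the commit):

    import torch

    # torch.mm multiplies two 2-D matrices.
    a2d, b2d = torch.randn(3, 4), torch.randn(4, 5)
    print(torch.mm(a2d, b2d).shape)   # torch.Size([3, 5])

    # torch.bmm adds a leading batch dimension; aten.bmm is what the new
    # Arm BMMVisitor above lowers to TOSA MATMUL.
    a3d, b3d = torch.randn(8, 3, 4), torch.randn(8, 4, 5)
    print(torch.bmm(a3d, b3d).shape)  # torch.Size([8, 3, 5])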