Commit 07d1c26

Merge branch 'pytorch:main' into add-profiling-to-xnn-executor-runner-2
2 parents 34b6b3e + fedb035

File tree: 74 files changed, +1039 −2244 lines


.ci/scripts/utils.sh

Lines changed: 3 additions & 3 deletions
@@ -17,17 +17,17 @@ retry () {
}

clean_executorch_install_folders() {
-  ./install_requirements.sh --clean
+  ./install_executorch.sh --clean
}

install_executorch() {
  which pip
  # Install executorch, this assumes that Executorch is checked out in the
  # current directory.
  if [[ "${1:-}" == "use-pt-pinned-commit" ]]; then
-    ./install_requirements.sh --pybind xnnpack --use-pt-pinned-commit
+    ./install_executorch.sh --pybind xnnpack --use-pt-pinned-commit
  else
-    ./install_requirements.sh --pybind xnnpack
+    ./install_executorch.sh --pybind xnnpack
  fi
  # Just print out the list of packages for debugging
  pip list

.github/workflows/apple.yml

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ on:
    paths:
      - .ci/scripts/setup-ios.sh
      - .github/workflows/apple.yml
-      - install_requirements.sh
+      - install_executorch.sh
      - backends/apple/**
      - build/build_apple_frameworks.sh
      - build/build_apple_llm_demo.sh

.github/workflows/pull.yml

Lines changed: 8 additions & 5 deletions
@@ -200,7 +200,7 @@ jobs:
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
-        bash install_requirements.sh --pybind xnnpack
+        bash install_executorch.sh --pybind xnnpack

        # install Llava requirements
        bash examples/models/llama/install_requirements.sh
@@ -333,6 +333,9 @@ jobs:

  unittest-arm:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
    with:
      runner: linux.2xlarge
      docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -433,7 +436,7 @@ jobs:
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
-        bash install_requirements.sh --pybind xnnpack
+        bash install_executorch.sh --pybind xnnpack

        # install phi-3-mini requirements
        bash examples/models/phi-3-mini/install_requirements.sh
@@ -460,7 +463,7 @@ jobs:
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
-        bash install_requirements.sh --pybind xnnpack
+        bash install_executorch.sh --pybind xnnpack

        # install llama requirements
        bash examples/models/llama/install_requirements.sh
@@ -487,7 +490,7 @@ jobs:
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
-        bash install_requirements.sh --pybind xnnpack
+        bash install_executorch.sh --pybind xnnpack

        # install llama requirements
        bash examples/models/llama/install_requirements.sh
@@ -514,7 +517,7 @@ jobs:
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
-        bash install_requirements.sh --pybind xnnpack
+        bash install_executorch.sh --pybind xnnpack

        # install llama requirements
        bash examples/models/llama/install_requirements.sh

.github/workflows/trunk.yml

Lines changed: 6 additions & 0 deletions
@@ -132,6 +132,9 @@ jobs:
  test-arm-backend-delegation:
    name: test-arm-backend-delegation
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
    with:
      runner: linux.2xlarge
      docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -159,6+162,9 @@ jobs:
  test-arm-reference-delegation:
    name: test-arm-reference-delegation
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
    with:
      runner: linux.2xlarge
      docker-image: executorch-ubuntu-22.04-arm-sdk

backends/apple/mps/setup.md

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ I 00:00:00.122615 executorch:mps_executor_runner.mm:501] Model verified successf
### [Optional] Run the generated model directly using pybind
1. Make sure `pybind` MPS support was installed:
```bash
-./install_requirements.sh --pybind mps
+./install_executorch.sh --pybind mps
```
2. Run the `mps_example` script to trace the model and run it directly from python:
```bash

backends/cadence/build_cadence_fusionG3.sh

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ unset XTENSA_CORE
export XTENSA_CORE=FCV_FG3GP
git submodule sync
git submodule update --init
-./install_requirements.sh
+./install_executorch.sh

rm -rf cmake-out

backends/cadence/build_cadence_hifi4.sh

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ unset XTENSA_CORE
export XTENSA_CORE=nxp_rt600_RI23_11_newlib
git submodule sync
git submodule update --init
-./install_requirements.sh
+./install_executorch.sh

rm -rf cmake-out

backends/cadence/fusion_g3/operators/op_dequantize.cpp

Lines changed: 6 additions & 5 deletions
@@ -67,8 +67,8 @@ void check_dequantize_per_tensor_args(

  ET_CHECK_MSG(
      input.scalar_type() == dtype,
-      "input.scalar_type() %" PRId8 " is not matching dtype argumenta:",
-      static_cast<int8_t>(input.scalar_type()));
+      "input.scalar_type() %s is not matching dtype arguments:",
+      ::executorch::runtime::toString(input.scalar_type()));

  if (out_dtype.has_value()) {
    ET_CHECK_MSG(
@@ -561,11 +561,12 @@ Tensor& dequantize_per_tensor_out(
    const Tensor& input,
    double scale,
    int64_t zero_point,
-    int64_t quant_min,
-    int64_t quant_max,
+    __ET_UNUSED int64_t quant_min,
+    __ET_UNUSED int64_t quant_max,
    ScalarType dtype,
-    ::executorch::aten::optional<ScalarType> out_dtype,
    Tensor& out) {
+  constexpr ScalarType out_dtype = ScalarType::Float;
+
#ifdef OP_ARG_CHECK
  torch::executor::Error err = resize_tensor(out, input.sizes());
  ET_CHECK_MSG(

backends/cadence/reference/operators/quantized_conv_out.cpp

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic(
                if (((_h + d0 * _wh - p0) >= 0) &&
                    ((_h + d0 * _wh - p0) < h) &&
                    ((_w + d1 * _ww - p1) >= 0) &&
-                    ((_w + d1 * _ww - p1 < w))) {
+                    ((_w + d1 * _ww - p1) < w)) {
                  int ioff =
                      (_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1);
                  int woff = _wh * ww + _ww;

backends/qualcomm/_passes/insert_requantize.py

Lines changed: 0 additions & 6 deletions
@@ -89,15 +89,9 @@ def _single_output_annotation(
        requantize_dict = n.meta.pop(QCOM_REQUANTIZE)
        # {quant_attr: user_node_name_list}
        group_quant_attr_dict = self._invert_dict(requantize_dict)
-        # TODO: If users of the node contain output node,
-        # we replace the node with to_copy op. However, it would
-        # be problem when the node has multiple to_copy ops
-        add_output = len(group_quant_attr_dict) == 1

        for hashable_quant_attr, user_nodes in group_quant_attr_dict.items():
            user_nodes_copy = user_nodes.copy()
-            if add_output:
-                user_nodes_copy.append("output")
            self._insert_to_copy(gm, n, dict(hashable_quant_attr), user_nodes_copy)

    def _insert(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 68 additions & 6 deletions
@@ -14,17 +14,80 @@
    QuantizationConfig,
)
from executorch.exir.dialects._ops import ops as exir_ops
-from torch.ao.quantization.observer import MinMaxObserver
+from torch.ao.quantization.observer import FixedQParamsObserver, MinMaxObserver
from torch.ao.quantization.quantizer import (
    QuantizationAnnotation,
+    QuantizationSpec,
    SharedQuantizationSpec,
)
from torch.fx import Node


-def annotate_matmul_16a8w(  # noqa: C901
-    gm: torch.fx.GraphModule, traverse_input1=True
-) -> None:
+def annotate_linear_16a8w_in_affine_layer(gm: torch.fx.GraphModule) -> None:
+    def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None:
+        input_qspec_map = {}
+        input_act = node.args[0]
+        input_spec = quantization_config.input_activation
+        input_qspec_map[input_act] = input_spec
+
+        weight = node.args[1]
+        input_qspec_map[weight] = quantization_config.weight
+
+        node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=quantization_config.output_activation,
+            _annotated=True,
+        )
+
+    quantization_config_16a8w_per_channel = get_ptq_per_channel_quant_config(
+        torch.uint16, weight_dtype=torch.int8, act_observer=MinMaxObserver
+    )
+    for node in gm.graph.nodes:
+        if node.op == "call_function" and node.target == torch.ops.aten.conv2d.default:
+            if "nn_module_stack" in node.meta:
+                module_values_list = list(node.meta["nn_module_stack"].values())
+                full_qualified_name = module_values_list[-1][0]
+                if full_qualified_name == "output.conv":
+                    annotate_conv2d(
+                        node, quantization_config=quantization_config_16a8w_per_channel
+                    )
+
+
+def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):
+    for node in gm.graph.nodes:
+        if node.op == "output":
+            for index, prefill_output in enumerate(node.args[0]):
+                kv_quant_attr = kv_quant_attrs[index]
+                fixed_observer = FixedQParamsObserver.with_args(
+                    scale=kv_quant_attr[0],
+                    zero_point=kv_quant_attr[1],
+                    quant_min=kv_quant_attr[2],
+                    quant_max=kv_quant_attr[3],
+                    dtype=kv_quant_attr[4],
+                    qscheme=torch.torch.per_tensor_affine,
+                )
+
+                fixed_output_spec = QuantizationSpec(
+                    quant_min=kv_quant_attr[2],
+                    quant_max=kv_quant_attr[3],
+                    dtype=kv_quant_attr[4],
+                    ch_axis=0,
+                    observer_or_fake_quant_ctr=fixed_observer,
+                )
+
+                input_qspec_map = {}
+                for input in prefill_output.args:
+                    if isinstance(input, Node):
+                        input_qspec_map[input] = fixed_output_spec
+
+                prefill_output.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+                    input_qspec_map=input_qspec_map,
+                    output_qspec=fixed_output_spec,
+                    _annotated=True,
+                )
+
+
+def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None:  # noqa: C901
    """
    This function is specific for matmul op 16a8w.
    For k, we will tag such as the below, and
@@ -142,8 +205,7 @@ def annotate_matmul_input1(node: Node):
    for node in gm.graph.nodes:
        if node.op == "call_function" and node.target == torch.ops.aten.matmul.default:
            annotate_matmul(node, quantization_config_16a8w)
-            if traverse_input1:
-                annotate_matmul_input1(node.args[1])
+            annotate_matmul_input1(node.args[1])


def custom_annotate_llama_matmul_16a8w(gm: torch.fx.GraphModule) -> None:  # noqa: C901
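For context on how annotators like the ones added above are typically wired in, here is a hedged usage sketch. It assumes the `QnnQuantizer` class and its `add_custom_quant_annotations` hook from the ExecuTorch Qualcomm backend behave as in other examples in the repository; the import paths, the method name, and the `kv_quant_attrs` placeholder are assumptions for illustration, not something this commit introduces.

```python
# Hedged usage sketch (assumed API, not part of this commit): register the
# custom annotators with the Qualcomm quantizer before quantization.
from functools import partial

from executorch.backends.qualcomm.quantizer.custom_annotation import (
    annotate_linear_16a8w_in_affine_layer,
    annotate_matmul_16a8w,
    annotate_prefill_kv_output,
)
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer

quantizer = QnnQuantizer()

# Each entry is a callable that receives the torch.fx.GraphModule; extra
# arguments (e.g. kv_quant_attrs for annotate_prefill_kv_output) can be
# bound ahead of time with functools.partial.
kv_quant_attrs = {}  # hypothetical placeholder: index -> (scale, zp, qmin, qmax, dtype)
quantizer.add_custom_quant_annotations(
    (
        annotate_matmul_16a8w,
        annotate_linear_16a8w_in_affine_layer,
        partial(annotate_prefill_kv_output, kv_quant_attrs=kv_quant_attrs),
    )
)
```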

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 3 additions & 1 deletion
@@ -3280,7 +3280,7 @@ def test_stories_single_llama(self):

        cmds = [
            "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama2/llama.py",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
            "--artifact",
            self.artifact_dir,
            "--build_folder",
@@ -3307,6 +3307,8 @@ def test_stories_single_llama(self):
            "16a4w",
            "--temperature",
            "0",
+            "--llama_model",
+            "stories110m",
        ]
        if self.host:
            cmds.extend(["--host", self.host])

backends/vulkan/docs/android_demo.md

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ First, build and install ExecuTorch libraries, then build the LLaMA runner
binary using the Android NDK toolchain.

```shell
-./install_requirements.sh --clean
+./install_executorch.sh --clean
(mkdir cmake-android-out && \
  cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \
    -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \

backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl

Lines changed: 3 additions & 1 deletion
@@ -33,7 +33,9 @@ ${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
-shared ivec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];
+// 64 is the number of threads in the local wg
+$num_shared = 64 * TILE_SIZE * TILE_SIZE
+shared ivec2 pos_shared[${num_shared}];

/*
 * Computes a 2D pointwise convolution of an NxN output tile. Calculating an

backends/vulkan/test/op_tests/linear_weight_int4_test.cpp

Lines changed: 32 additions & 5 deletions
@@ -30,16 +30,38 @@ at::Tensor linear_weight_int4_reference_impl(
  const size_t ndim = original_x_size.size();
  const int64_t out_features = weights_4x2.size(0);
  const at::Tensor x_flattened = x.reshape({-1, original_x_size[ndim - 1]});
-  const at::Tensor packed_weights =
-      at::_convert_weight_to_int4pack(weights_4x2, inner_k_tiles);
-  at::Tensor out = at::_weight_int4pack_mm(
-      x_flattened, packed_weights, groupsize, scales_and_zeros);
+  at::Tensor out = at::_weight_int4pack_mm_for_cpu(
+      x_flattened, weights_4x2, groupsize, scales_and_zeros);
  std::vector<int64_t> out_shape(
      original_x_size.begin(), original_x_size.end());
  out_shape.at(ndim - 1) = out_features;
  return out.reshape(out_shape);
}

+at::Tensor unpack_weights_4x2(const at::Tensor& weights_4x2) {
+  std::vector<int64_t> weights_shape(weights_4x2.sizes().vec());
+  weights_shape[1] *= 2;
+
+  at::Tensor weights_unpacked =
+      at::empty(weights_shape, at::device(at::kCPU).dtype(at::kInt));
+
+  const int64_t N = weights_unpacked.size(0);
+  const int64_t K = weights_unpacked.size(1);
+
+  for (int n = 0; n < N; n++) {
+    for (int k = 0; k < K; k += 2) {
+      const uint8_t packed_val = weights_4x2[n][k / 2].item().to<uint8_t>();
+      const uint8_t second_val = packed_val & 0x0F;
+      const uint8_t first_val = (packed_val & 0xF0) >> 4;
+
+      weights_unpacked[n][k] = int(first_val);
+      weights_unpacked[n][k + 1] = int(second_val);
+    }
+  }
+
+  return weights_unpacked;
+}
+
at::Tensor dequantize_and_linear(
    const at::Tensor& x,
    const at::Tensor& weights_4x2,
@@ -91,13 +113,18 @@ void test_reference_linear_int4(
  at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat));
  at::Tensor weights_4x2 =
      at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte));
+  at::Tensor weights_int = unpack_weights_4x2(weights_4x2);

  const int k_groups = K / group_size;
  at::Tensor scales_and_zeros =
      at::rand({k_groups, N, 2}, at::device(at::kCPU).dtype(at::kFloat));

  at::Tensor out = linear_weight_int4_reference_impl(
-      x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles);
+      x,
+      at::_convert_weight_to_int4pack_for_cpu(weights_int, group_size),
+      group_size,
+      scales_and_zeros,
+      inner_k_tiles);

  at::Tensor out_ref = dequantize_and_linear(
      x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles);
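As a side note on the packing convention the new `unpack_weights_4x2` helper above relies on: each byte stores two 4-bit weights, with the first of the pair in the high nibble and the second in the low nibble. A minimal Python sketch of that convention (illustrative names only, not part of the test) is:

```python
# Illustrative sketch of the 4x2 nibble packing assumed by unpack_weights_4x2:
# one byte holds two 4-bit values, first value in the high nibble.

def pack_pair(first_val: int, second_val: int) -> int:
    """Pack two 4-bit values (0..15) into one byte, first value in the high nibble."""
    assert 0 <= first_val < 16 and 0 <= second_val < 16
    return (first_val << 4) | second_val

def unpack_pair(packed_val: int) -> tuple[int, int]:
    """Invert pack_pair: high nibble comes out first, low nibble second."""
    return (packed_val & 0xF0) >> 4, packed_val & 0x0F

if __name__ == "__main__":
    # Round-trip every possible pair to confirm the convention.
    for a in range(16):
        for b in range(16):
            assert unpack_pair(pack_pair(a, b)) == (a, b)
    print("nibble packing round-trip ok")
```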

backends/xnnpack/README.md

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ After exporting the XNNPACK Delegated model, we can now try running it with exam
cd executorch

# Get a clean cmake-out directory
-./install_requirements.sh --clean
+./install_executorch.sh --clean
mkdir cmake-out

# Configure cmake

build/test_ios.sh

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ say "Installing Requirements"

pip install --upgrade cmake pip setuptools wheel zstd

-./install_requirements.sh --pybind coreml mps xnnpack
+./install_executorch.sh --pybind coreml mps xnnpack
export PATH="$(realpath third-party/flatbuffers/cmake-out):$PATH"
./build/install_flatc.sh
