Skip to content

Commit 03c52d7

Browse files
authored
Merge branch 'main' into main
2 parents d3b7f18 + 334af4a commit 03c52d7

File tree

77 files changed

+1769
-490
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

77 files changed

+1769
-490
lines changed

.ci/scripts/gather_benchmark_configs.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
"samsung_galaxy_s24": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db",
2525
"google_pixel_8_pro": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a",
2626
"google_pixel_3_private_rooted": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98d23ca8-ea9e-4fb7-b725-d402017b198d",
27+
"apple_iphone_15_private": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/55929353-2f28-4ee5-bdff-d1a95f58cb28",
2728
}
2829

2930
# Predefined benchmark configurations
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
name: apple-perf (private devices)
2+
3+
on:
4+
# TODO (huydhn): Disable the schedule run until we land the change to add device pool and device name
5+
# to separate between public and private iOS devices
6+
# schedule:
7+
# - cron: 0 0,4,8,12,16,20 * * *
8+
pull_request:
9+
paths:
10+
- .github/workflows/apple-perf-private-device-experiment.yml
11+
# push:
12+
# branches:
13+
# - main
14+
# paths:
15+
# - .github/workflows/apple-perf-private-device-experiment.yml
16+
# Note: GitHub has an upper limit of 10 inputs
17+
workflow_dispatch:
18+
inputs:
19+
models:
20+
description: Models to be benchmarked
21+
required: false
22+
type: string
23+
default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
24+
devices:
25+
description: Target devices to run benchmark
26+
required: false
27+
type: string
28+
default: apple_iphone_15_private
29+
benchmark_configs:
30+
description: The list of configs used the benchmark
31+
required: false
32+
type: string
33+
workflow_call:
34+
inputs:
35+
models:
36+
description: Models to be benchmarked
37+
required: false
38+
type: string
39+
default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
40+
devices:
41+
description: Target devices to run benchmark
42+
required: false
43+
type: string
44+
default: apple_iphone_15_private
45+
benchmark_configs:
46+
description: The list of configs used the benchmark
47+
required: false
48+
type: string
49+
50+
concurrency:
51+
group: apple-perf-private-devices-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
52+
cancel-in-progress: true
53+
54+
jobs:
55+
apple:
56+
uses: ./.github/workflows/apple-perf.yml
57+
secrets: inherit
58+
permissions:
59+
id-token: write
60+
contents: read
61+
with:
62+
models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }}
63+
devices: apple_iphone_15_private
64+
benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/pull.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ jobs:
399399
size=${arr[4]}
400400
# threshold=48120 on devserver with gcc11.4
401401
# todo(lfq): update once binary size is below 50kb.
402-
threshold="51504"
402+
threshold="51408"
403403
if [[ "$size" -le "$threshold" ]]; then
404404
echo "Success $size <= $threshold"
405405
else
@@ -436,7 +436,7 @@ jobs:
436436
size=${arr[4]}
437437
# threshold=48120 on devserver with gcc11.4
438438
# todo(lfq): update once binary size is below 50kb.
439-
threshold="51784"
439+
threshold="47552"
440440
if [[ "$size" -le "$threshold" ]]; then
441441
echo "Success $size <= $threshold"
442442
else

backends/cadence/aot/TARGETS

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@
66

77
load("@fbcode_macros//build_defs:export_files.bzl", "export_file")
88
load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
9+
load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
910
load(
1011
"@fbsource//tools/build_defs:default_platform_defs.bzl",
1112
"CXX",
1213
)
1314
load("@fbsource//xplat/executorch/codegen:codegen.bzl", "executorch_generated_lib")
14-
load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
1515

1616
oncall("odai_jarvis")
1717

@@ -36,18 +36,18 @@ python_library(
3636
"compiler.py",
3737
],
3838
deps = [
39-
":passes",
40-
":utils",
39+
":memory_planning",
4140
":ops_registrations",
41+
":passes",
4242
":replace_ops",
43-
":memory_planning",
43+
":utils",
4444
"//caffe2:torch",
4545
"//executorch/backends/cadence/aot/quantizer:fusion_pass",
4646
"//executorch/backends/cadence/aot/quantizer:quantizer",
4747
"//executorch/backends/transforms:decompose_sdpa",
4848
"//executorch/backends/transforms:remove_clone_ops",
49-
"//executorch/exir:lib",
5049
"//executorch/devtools:lib",
50+
"//executorch/exir:lib",
5151
],
5252
)
5353

@@ -57,19 +57,19 @@ python_library(
5757
"export_example.py",
5858
],
5959
deps = [
60-
":passes",
61-
":utils",
6260
":ops_registrations",
61+
":passes",
6362
":replace_ops",
63+
":utils",
6464
"//caffe2:torch",
6565
"//executorch/backends/cadence/aot/quantizer:fusion_pass",
66-
"//executorch/backends/cadence/runtime:runtime",
6766
"//executorch/backends/cadence/aot/quantizer:quantizer",
68-
"//executorch/backends/xnnpack/quantizer:xnnpack_quantizer",
67+
"//executorch/backends/cadence/runtime:runtime",
6968
"//executorch/backends/transforms:decompose_sdpa",
7069
"//executorch/backends/transforms:remove_clone_ops",
71-
"//executorch/exir:lib",
70+
"//executorch/backends/xnnpack/quantizer:xnnpack_quantizer",
7271
"//executorch/devtools:lib",
72+
"//executorch/exir:lib",
7373
],
7474
)
7575

@@ -94,12 +94,12 @@ python_library(
9494
"passes.py",
9595
],
9696
deps = [
97-
":utils",
9897
":fuse_ops",
99-
":simplify_ops",
100-
":replace_ops",
101-
":reorder_ops",
10298
":remove_ops",
99+
":reorder_ops",
100+
":replace_ops",
101+
":simplify_ops",
102+
":utils",
103103
"//caffe2:torch",
104104
"//executorch/exir:pass_base",
105105
"//executorch/exir/dialects:lib",
@@ -131,7 +131,6 @@ python_library(
131131
],
132132
)
133133

134-
135134
export_file(name = "functions.yaml")
136135

137136
executorch_generated_lib(
@@ -191,9 +190,9 @@ python_library(
191190
],
192191
typing = True,
193192
deps = [
194-
"//caffe2:torch",
195-
":ops_registrations",
196193
":compiler_utils",
194+
":ops_registrations",
195+
"//caffe2:torch",
197196
"//executorch/backends/cadence/aot:pass_utils",
198197
"//executorch/backends/cadence/aot:utils",
199198
"//executorch/exir:pass_base",
@@ -228,11 +227,11 @@ python_library(
228227
"//caffe2:torch",
229228
"//executorch/backends/cadence/aot:pass_utils",
230229
"//executorch/backends/cadence/aot:simplify_ops",
230+
"//executorch/backends/transforms:remove_clone_ops",
231231
"//executorch/exir:pass_base",
232232
"//executorch/exir/dialects:lib",
233233
"//executorch/exir/dialects/edge:lib",
234234
"//executorch/exir/passes:spec_prop_pass",
235-
"//executorch/backends/transforms:remove_clone_ops"
236235
],
237236
)
238237

@@ -283,13 +282,13 @@ python_unittest(
283282
],
284283
typing = True,
285284
deps = [
285+
":ops_registrations",
286286
"//caffe2:torch",
287287
"//executorch/backends/cadence/aot:graph_builder",
288288
"//executorch/backends/cadence/aot:pass_utils",
289289
"//executorch/exir:pass_base",
290290
"//executorch/exir/dialects:lib",
291291
"//later:lib",
292-
":ops_registrations"
293292
],
294293
)
295294

@@ -319,8 +318,10 @@ python_unittest(
319318
srcs = [
320319
"tests/test_fusion_ops_passes.py",
321320
],
321+
supports_static_listing = False,
322322
typing = True,
323323
deps = [
324+
"fbsource//third-party/pypi/parameterized:parameterized",
324325
":compiler",
325326
"//caffe2:torch",
326327
"//executorch/backends/cadence/aot:compiler",
@@ -391,7 +392,6 @@ python_unittest(
391392
],
392393
)
393394

394-
395395
python_library(
396396
name = "memory_planning",
397397
srcs = [
@@ -409,7 +409,6 @@ python_library(
409409
],
410410
)
411411

412-
413412
python_library(
414413
name = "memory_constraints",
415414
srcs = [
@@ -425,7 +424,6 @@ python_library(
425424
],
426425
)
427426

428-
429427
python_unittest(
430428
name = "test_memory_passes",
431429
srcs = [

backends/cadence/aot/fuse_ops.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -901,9 +901,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
901901
@register_cadence_pass(CadencePassAttribute(opt_level=1))
902902
class FuseTransposeOpPairsPass(FuseOpPairsAcrossBranchesPass):
903903
"""
904-
Fuse dequantize-quantize op pairs to a single requantize op.
905-
For the special case where quant params match, this will remove
906-
both dequant and quant ops.
904+
Fuse transpose op pairs to a single view op.
907905
"""
908906

909907
# A list of ops that can be bypassed when looking for a
@@ -915,6 +913,7 @@ class FuseTransposeOpPairsPass(FuseOpPairsAcrossBranchesPass):
915913
exir_ops.edge.cadence.dequantize_per_tensor.default,
916914
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
917915
exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
916+
exir_ops.edge.cadence.quantized_relu.per_tensor,
918917
}
919918

920919
def can_fuse_for_chain(

backends/cadence/aot/replace_ops.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2259,6 +2259,34 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
22592259
return result
22602260

22612261

2262+
2263+
@register_cadence_pass(CadencePassAttribute(opt_level=1))
2264+
class ReplacePowWithMullPass(ExportPass):
2265+
"""
2266+
Replace the pow op with degree 2 for a mul op.
2267+
"""
2268+
2269+
def call_operator(
2270+
self,
2271+
op,
2272+
args: Tuple[Argument, ...],
2273+
kwargs: Dict[str, Argument],
2274+
meta: NodeMetadata,
2275+
) -> ProxyValue:
2276+
# TODO(eigen): Add support for other degrees.
2277+
if op not in {
2278+
exir_ops.edge.aten.pow.Scalar,
2279+
} or args[0] != 2:
2280+
return super().call_operator(op, args, kwargs, meta)
2281+
2282+
return super().call_operator(
2283+
exir_ops.edge.aten.mul.Tensor,
2284+
(args[1], args[1]),
2285+
{},
2286+
meta,
2287+
)
2288+
2289+
22622290
# This class encapsulates all the functions that replace/switch one op in the
22632291
# graph with another.
22642292
class CadenceReplaceOpsInGraph:
@@ -2299,4 +2327,5 @@ class CadenceReplaceOpsInGraph:
22992327
ReplaceWhereWithFullArgsWithWhereScalar,
23002328
ReplaceGeluWithApproximateGeluPass,
23012329
ReplaceSplitWithSlicePass,
2330+
ReplacePowWithMullPass,
23022331
]

backends/cadence/aot/tests/test_fusion_ops_passes.py

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
from executorch.backends.cadence.aot.pass_utils import count_node, op_counts_match
2424
from executorch.exir.dialects._ops import ops as exir_ops
2525
from executorch.exir.dialects.edge._ops import EdgeOpOverload
26+
from executorch.exir.pass_base import ProxyValue
27+
from parameterized import parameterized
2628
from torch import nn
2729

2830

@@ -485,39 +487,60 @@ def test_fuse_then_transpose_pass(self):
485487

486488

487489
class TestFuseTransposeOpPairsPass(TestFusionPassesBase):
488-
def test_fuse_transpose_pairs(self):
490+
def _create_operator(
491+
self, builder: GraphBuilder, op: torch._ops.OpOverload, x: ProxyValue
492+
) -> ProxyValue:
493+
if op == exir_ops.edge.quantized_decomposed.quantize_per_tensor.default:
494+
return builder.call_operator(
495+
op=op,
496+
args=(x, 1.2, 3, 0, 127, torch.int8),
497+
)
498+
elif op == exir_ops.edge.cadence.quantized_relu.per_tensor:
499+
return builder.call_operator(
500+
op=op,
501+
args=(x, 0, 0, 0, 0),
502+
)
503+
else:
504+
raise ValueError(f"Unsupported op: {op}")
505+
506+
@parameterized.expand(
507+
[
508+
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
509+
exir_ops.edge.cadence.quantized_relu.per_tensor,
510+
],
511+
)
512+
def test_fuse_transpose_pairs(self, op: torch._ops.OpOverload):
489513
# Create a graph with transpose -> quant -> transpose.
490514
builder = GraphBuilder()
491515
x = builder.placeholder("x", torch.randn(2, 3))
492516
transpose_node = builder.call_operator(
493517
op=exir_ops.edge.aten.transpose_copy.int,
494518
args=(x, 0, 1),
495519
)
496-
quant_node = builder.call_operator(
497-
op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
498-
args=(transpose_node, 1.2, 3, 0, 127, torch.int8),
499-
)
520+
quant_node = self._create_operator(builder, op, transpose_node)
500521
transpose_node = builder.call_operator(
501522
op=exir_ops.edge.aten.transpose_copy.int,
502523
args=(quant_node, 0, 1),
503524
)
504-
builder.output(transpose_node)
525+
builder.output([transpose_node])
505526
gm = builder.get_graph_module()
506527
self.check_op_counts(
507528
gm,
508529
expected_op_counts={
509530
exir_ops.edge.aten.transpose_copy.int: 2,
510-
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 1,
531+
op: 1,
511532
},
512533
)
513534

514535
# Check that the pass fuses the two transpose ops.
515-
gm_after_pass = FuseTransposeOpPairsPass()(gm).graph_module
536+
fusion_pass_result = FuseTransposeOpPairsPass()(gm)
537+
self.assertIsNotNone(fusion_pass_result)
538+
gm_after_pass = fusion_pass_result.graph_module
516539
self.check_op_counts(
517540
gm_after_pass,
518541
expected_op_counts={
519542
exir_ops.edge.aten.transpose_copy.int: 0,
520-
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 1,
543+
op: 1,
521544
},
522545
)
523546

0 commit comments

Comments
 (0)