
Commit f12edc8

Merge remote-tracking branch 'origin/main' into tiktoken

2 parents 97fcc30 + 19ed018

File tree: 25 files changed (+526 −241 lines)


backends/transforms/TARGETS

Lines changed: 1 addition & 0 deletions

@@ -119,6 +119,7 @@ runtime.python_library(
     visibility = [
         "//executorch/backends/...",
         "//executorch/examples/...",
+        "//executorch/extension/llm/...",
     ],
     deps = [
         "//caffe2:torch",

docs/source/extension-module.md

Lines changed: 1 addition & 1 deletion

@@ -136,7 +136,7 @@ Use [ExecuTorch Dump](sdk-etdump.md) to trace model execution. Create an instanc
 
 using namespace ::torch::executor;
 
-Module module("/path/to/model.pte", Module::MlockConfig::UseMlock, std::make_unique<ETDumpGen>());
+Module module("/path/to/model.pte", Module::LoadMode::MmapUseMlock, std::make_unique<ETDumpGen>());
 
 // Execute a method, e.g. module.forward(...); or module.execute("my_method", ...);
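
As context for the rename above, here is a minimal sketch of constructing a Module with the new LoadMode enum and an ETDumpGen tracer. The include paths and the Error::Ok check are assumptions based on the repo layout at this commit, not part of the diff.

// Minimal sketch: Module::LoadMode replaces the old Module::MlockConfig.
// Values seen in this commit: File, MmapUseMlock, MmapUseMlockIgnoreErrors.
#include <executorch/extension/module/module.h>
#include <executorch/sdk/etdump/etdump_flatcc.h>  // assumed ETDumpGen location

#include <memory>

using namespace ::torch::executor;

int main() {
  // Mmap the program and mlock its pages, as in the updated doc snippet.
  Module module(
      "/path/to/model.pte",
      Module::LoadMode::MmapUseMlock,
      std::make_unique<ETDumpGen>());
  return module.load() == Error::Ok ? 0 : 1;
}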

docs/source/llm/getting-started.md

Lines changed: 6 additions & 5 deletions

@@ -313,6 +313,8 @@ penalties for repeated tokens, and biases to prioritize or de-prioritize specifi
 ```cpp
 // main.cpp
 
+using namespace torch::executor;
+
 int main() {
   // Set up the prompt. This provides the seed text for the model to elaborate.
   std::cout << "Enter model prompt: ";
@@ -327,7 +329,7 @@ int main() {
   BasicSampler sampler = BasicSampler();
 
   // Load the exported nanoGPT program, which was generated via the previous steps.
-  Module model("nanogpt.pte", torch::executor::Module::MlockConfig::UseMlockIgnoreErrors);
+  Module model("nanogpt.pte", Module::LoadMode::MmapUseMlockIgnoreErrors);
 
   const auto max_input_tokens = 1024;
   const auto max_output_tokens = 30;
@@ -787,15 +789,14 @@ Include the ETDump header in your code.
 
 Create an Instance of the ETDumpGen class and pass it to the Module constructor.
 ```cpp
-std::unique_ptr<torch::executor::ETDumpGen> etdump_gen_ = std::make_unique<torch::executor::ETDumpGen>();
-Module model("nanogpt.pte", torch::executor::Module::MlockConfig::UseMlockIgnoreErrors, std::move(etdump_gen_));
+std::unique_ptr<ETDumpGen> etdump_gen_ = std::make_unique<ETDumpGen>();
+Module model("nanogpt.pte", Module::LoadMode::MmapUseMlockIgnoreErrors, std::move(etdump_gen_));
 ```
 
 After calling `generate()`, save the ETDump to a file. You can capture multiple
 model runs in a single trace, if desired.
 ```cpp
-torch::executor::ETDumpGen* etdump_gen =
-    static_cast<torch::executor::ETDumpGen*>(model.event_tracer());
+ETDumpGen* etdump_gen = static_cast<ETDumpGen*>(model.event_tracer());
 
 ET_LOG(Info, "ETDump size: %zu blocks", etdump_gen->get_num_blocks());
 etdump_result result = etdump_gen->get_etdump_data();
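
The doc snippet ends at get_etdump_data(). As a minimal sketch of the next step, here is one way to persist that buffer to disk; the etdump_result field names (buf, size) and the caller-owns-buffer convention are assumptions drawn from the SDK headers of this era, and save_etdump is a hypothetical helper name.

// Minimal sketch (assumed field names and ownership, see note above).
#include <cstdio>
#include <cstdlib>

#include <executorch/sdk/etdump/etdump_flatcc.h>  // assumed ETDumpGen location

void save_etdump(torch::executor::ETDumpGen& etdump_gen, const char* path) {
  auto result = etdump_gen.get_etdump_data();
  if (result.buf != nullptr && result.size > 0) {
    FILE* f = std::fopen(path, "w+");
    if (f != nullptr) {
      // Write the serialized trace so the SDK tooling can inspect it later.
      std::fwrite(result.buf, 1, result.size, f);
      std::fclose(f);
    }
    std::free(result.buf);  // assumption: get_etdump_data() hands ownership to the caller
  }
}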

examples/llm_manual/main.cpp

Lines changed: 1 addition & 3 deletions

@@ -110,9 +110,7 @@ int main() {
 
   // Load the exported nanoGPT program, which was generated via the previous
   // steps.
-  Module model(
-      "nanogpt.pte",
-      torch::executor::Module::MlockConfig::UseMlockIgnoreErrors);
+  Module model("nanogpt.pte", Module::LoadMode::MmapUseMlockIgnoreErrors);
 
   const auto max_input_tokens = 1024;
   const auto max_output_tokens = 30;

examples/models/llama2/TARGETS

Lines changed: 0 additions & 4 deletions

@@ -64,7 +64,6 @@ runtime.python_binary(
 runtime.python_library(
     name = "export_library",
     srcs = [
-        "builder.py",
         "export_llama.py",
         "export_llama_lib.py",
         "model.py",
@@ -82,13 +81,10 @@ runtime.python_library(
     ],
     deps = [
         "//caffe2:torch",
-        "//executorch/backends/transforms:duplicate_dynamic_quant_chain",
         "//executorch/examples/models:model_base",
         "//executorch/examples/models:models",
         "//executorch/examples/models/llama2/custom_ops:custom_ops_aot_py",
-        "//executorch/exir:lib",
         "//executorch/extension/llm/export:export_lib",
-        "//executorch/extension/export_util:export_util",
         # one definition has to be included in the user of the libarary
         # depending on what library the client wants to use
         # "//executorch/extension/pybindings:aten_lib",

examples/models/llama2/eval_llama_lib.py

Lines changed: 3 additions & 2 deletions

@@ -19,9 +19,10 @@
     Tokenizer as SentencePieceTokenizer,
 )
 
+from executorch.extension.llm.export import LLMEdgeManager
+
 from lm_eval.api.model import LM
 
-from .builder import LlamaEdgeManager
 from .export_llama_lib import (
     _prepare_for_llama_export,
     build_args_parser as _build_args_parser,
@@ -130,7 +131,7 @@ def gen_eval_wrapper(
 
     pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
     # GPTFastEvalWrapper: Create a wrapper around a pre-exported model
-    manager: LlamaEdgeManager = _prepare_for_llama_export(model_name, args)
+    manager: LLMEdgeManager = _prepare_for_llama_export(model_name, args)
 
     if len(quantizers) != 0:
         manager = manager.capture_pre_autograd_graph().pt2e_quantize(quantizers)

examples/models/llama2/export_llama_lib.py

Lines changed: 11 additions & 12 deletions

@@ -22,6 +22,8 @@
 
 from executorch.examples.models.llama2.llama_transformer import ModelArgs
 
+from executorch.extension.llm.export.builder import DType, LLMEdgeManager
+
 from executorch.extension.llm.export.partitioner_lib import (
     get_coreml_partitioner,
     get_mps_partitioner,
@@ -40,8 +42,6 @@
 from executorch.util.activation_memory_profiler import generate_memory_trace
 
 from ..model_factory import EagerModelFactory
-
-from .builder import DType, LlamaEdgeManager
 from .source_transformation.quantize import (
     get_quant_embedding_transform,
     get_quant_weight_transform,
@@ -333,12 +333,12 @@ def export_llama(modelname, args) -> str:
     return filename
 
 
-def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
+def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager:
     """
     Helper function for export_llama. Loads the model from checkpoint and params,
-    and sets up a LlamaEdgeManager with initial transforms and dtype conversion.
+    and sets up a LLMEdgeManager with initial transforms and dtype conversion.
 
-    Returns a LlamaEdgeManager prior to calling export_to_edge with quantizers
+    Returns a LLMEdgeManager prior to calling export_to_edge with quantizers
     """
 
     # load model from checkpoint and params.json
@@ -429,7 +429,7 @@ def _validate_args(args):
         )
 
 
-def _export_llama(modelname, args) -> LlamaEdgeManager:  # noqa: C901
+def _export_llama(modelname, args) -> LLMEdgeManager:  # noqa: C901
     _validate_args(args)
     pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
 
@@ -579,12 +579,12 @@ def _load_llama_model(
     verbose: bool = False,
     max_seq_len: int = 128,
     metadata_str: Optional[str] = None,
-) -> "LlamaEdgeManager":
+) -> "LLMEdgeManager":
     """
-    A helper util that builds a Llama2 model. It returns a LlamaEdgeManager that
+    A helper util that builds a Llama2 model. It returns a LLMEdgeManager that
     can help further lower the model to ExecuTorch.
     Returns:
-        An instance of LlamaEdgeManager which contains the eager mode model.
+        An instance of LLMEdgeManager which contains the eager mode model.
     """
     assert (
         checkpoint or checkpoint_dir
@@ -622,13 +622,12 @@ def _load_llama_model(
     else:
         raise ValueError(f"Unsupported dtype {dtype}")
 
-    return LlamaEdgeManager(
+    return LLMEdgeManager(
         model=model,
         modelname=modelname,
-        weight_type=weight_type,
+        max_seq_len=model.params.max_seq_len,
         dtype=dtype,
         use_kv_cache=use_kv_cache,
-        use_sdpa_with_kv_cache=use_sdpa_with_kv_cache,
         example_inputs=example_inputs,
         enable_dynamic_shape=enable_dynamic_shape,
         verbose=verbose,

examples/models/llama2/runner/runner.cpp

Lines changed: 5 additions & 12 deletions

@@ -15,7 +15,6 @@
 #else /* BPE */
 #include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
 #endif /* ET_USE_TIKTOKEN*/
-#include <executorch/extension/data_loader/file_data_loader.h>
 #include <executorch/extension/evalue_util/print_evalue.h>
 #include <executorch/extension/runner_util/managed_tensor.h>
 
@@ -43,7 +42,10 @@ Runner::Runner(
     const std::string& model_path,
     const std::string& tokenizer_path,
     const float temperature)
-    : model_path_(model_path),
+    // NOTE: we observed ~2x loading performance increase on iPhone 15
+    // and a ~5% improvement on Galaxy S22 by switching to
+    // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
+    : module_(std::make_unique<Module>(model_path, Module::LoadMode::File)),
       tokenizer_path_(tokenizer_path),
       temperature_(temperature) {
   ET_LOG(
@@ -54,22 +56,13 @@ Runner::Runner(
 }
 
 bool Runner::is_loaded() const {
-  return module_ && module_->is_loaded() && tokenizer_ && sampler_;
+  return module_->is_loaded() && tokenizer_ && sampler_;
 }
 
 Error Runner::load() {
   if (is_loaded()) {
     return Error::Ok;
   }
-  // NOTE: we observed ~2x loading performance increase on iPhone 15
-  // and a ~5% improvement on Galaxy S22 by switching to
-  // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
-  auto data_loader_result = util::FileDataLoader::from(model_path_.c_str());
-  if (!data_loader_result.ok()) {
-    return data_loader_result.error();
-  }
-  module_ = std::make_unique<Module>(
-      std::make_unique<util::FileDataLoader>(std::move(*data_loader_result)));
   ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward"));
 
   // Read out metadata: vocab_size (expected by the model), BOS, EOS, n_BOS,
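
For readers following the runner change: a minimal sketch of the post-change constructor shape, abridged to the members this diff touches (everything else in Runner is omitted, and the class here is illustrative rather than the full upstream definition).

// Sketch only: module_ is now built in the initializer list with
// LoadMode::File, so load() no longer constructs a FileDataLoader and
// is_loaded() can drop its null check on module_.
#include <executorch/extension/module/module.h>

#include <memory>
#include <string>

class Runner {
 public:
  explicit Runner(const std::string& model_path)
      : module_(std::make_unique<torch::executor::Module>(
            model_path,
            torch::executor::Module::LoadMode::File)) {}

  bool is_loaded() const {
    // module_ is always non-null after construction.
    return module_->is_loaded();
  }

 private:
  std::unique_ptr<torch::executor::Module> module_;
};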

examples/models/llama2/runner/targets.bzl

Lines changed: 0 additions & 3 deletions

@@ -31,9 +31,6 @@ def define_common_targets():
             visibility = [
                 "@EXECUTORCH_CLIENTS",
             ],
-            deps = [
-                "//executorch/extension/data_loader:file_data_loader",
-            ],
             exported_deps = [
                 "//executorch/backends/xnnpack:xnnpack_backend",
                 "//executorch/examples/models/llama2/sampler:sampler" + aten_suffix,

examples/models/llama2/source_transformation/quantize.py

Lines changed: 2 additions & 2 deletions

@@ -12,9 +12,9 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from sentencepiece import SentencePieceProcessor
+from executorch.extension.llm.export.builder import DType
 
-from ..builder import DType
+from sentencepiece import SentencePieceProcessor
 
 try:
     from fairseq2.nn.embedding import (

examples/models/phi-3-mini/export_model.py

Lines changed: 0 additions & 65 deletions

This file was deleted.

Lines changed: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.extension.llm.export.builder import DType, LLMEdgeManager
+
+from executorch.extension.llm.export.partitioner_lib import get_xnnpack_partitioner
+from executorch.extension.llm.export.quantizer_lib import (
+    DynamicQuantLinearOptions,
+    get_pt2e_quantizers,
+    PT2EQuantOptions,
+)
+
+from transformers import Phi3ForCausalLM
+
+
+def main() -> None:
+    torch.manual_seed(42)
+
+    # pyre-ignore: Undefined attribute [16]: Module `transformers` has no attribute `Phi3ForCausalLM`
+    model = Phi3ForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+
+    modelname = "phi-3-mini"
+
+    (
+        LLMEdgeManager(
+            model=model,
+            modelname=modelname,
+            max_seq_len=128,
+            dtype=DType.fp32,
+            use_kv_cache=False,
+            example_inputs=(torch.randint(0, 100, (1, 100), dtype=torch.long),),
+            enable_dynamic_shape=True,
+            verbose=True,
+        )
+        .set_output_dir(".")
+        .capture_pre_autograd_graph()
+        .pt2e_quantize(
+            get_pt2e_quantizers(PT2EQuantOptions(None, DynamicQuantLinearOptions()))
+        )
+        .export_to_edge()
+        .to_backend([get_xnnpack_partitioner()])
+        .to_executorch()
+        .save_to_pte(f"{modelname}.pte")
+    )
+
+
+if __name__ == "__main__":
+    main()

examples/models/phi-3-mini/main.cpp

Lines changed: 1 addition & 1 deletion

@@ -83,7 +83,7 @@ int main() {
 
   SentencePieceTokenizer tokenizer("tokenizer.model");
 
-  Module model("phi-3-mini.pte", Module::MlockConfig::UseMlockIgnoreErrors);
+  Module model("phi-3-mini.pte", Module::LoadMode::MmapUseMlockIgnoreErrors);
 
   const auto max_output_tokens = 128;
   generate(model, prompt, tokenizer, max_output_tokens);

examples/qualcomm/llama2/llama.py

Lines changed: 1 addition & 1 deletion

@@ -30,7 +30,6 @@
     generate_htp_compiler_spec,
     generate_qnn_executorch_compiler_spec,
 )
-from executorch.examples.models.llama2.builder import DType
 from executorch.examples.qualcomm.llama2.model.static_llama import LlamaModel, ModelArgs
 from executorch.examples.qualcomm.scripts.utils import (
     make_output_dir,
@@ -41,6 +40,7 @@
 from executorch.exir.capture._config import ExecutorchBackendConfig
 from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass
 from executorch.exir.program._program import _get_updated_graph_signature
+from executorch.extension.llm.export.builder import DType
 
 from sentencepiece import SentencePieceProcessor
 from torch.ao.quantization.observer import MinMaxObserver
