
Commit 4936dc4

Gasoonjia authored and facebook-github-bot committed
update doc for dynamic export nanogpt
Differential Revision: D56365041
1 parent 023ca07 commit 4936dc4


docs/source/llm/getting-started.md

Lines changed: 70 additions & 13 deletions
@@ -141,13 +141,23 @@ model = GPT.from_pretrained('gpt2')
 
 # Create example inputs. This is used in the export process to provide
 # hints on the expected shape of the model input.
-example_inputs = (torch.randint(0, 100, (1, 8), dtype=torch.long), )
+example_inputs = (torch.randint(0, 100, (1, model.config.block_size - 1), dtype=torch.long), )
+
+# Set up the dynamic shape configuration, which allows the sizes of the input
+# tensors at runtime to differ from the sizes in `example_inputs`, as long as
+# they follow the rules the configuration specifies. Here we let the 0th
+# input's 1st dimension range over [0, model.config.block_size - 1]. Details
+# of dynamic shapes and how to customize them can be found in
+# [ExecuTorch Concepts](https://pytorch.org/executorch/0.2/concepts.html#dynamic-shapes).
+dynamic_shape = (
+    {1: torch.export.Dim("token_dim", max=model.config.block_size - 1)},
+)
 
 # Trace the model, converting it to a portable intermediate representation.
 # The torch.no_grad() call tells PyTorch to exclude training-specific logic.
 with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
-    m = capture_pre_autograd_graph(model, example_inputs)
-    traced_model = export(m, example_inputs)
+    m = capture_pre_autograd_graph(model, example_inputs, dynamic_shapes=dynamic_shape)
+    traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape)
 
 # Convert the model into a runnable ExecuTorch program.
 edge_config = EdgeCompileConfig(_check_ir_validity=False)
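
The `dynamic_shape` value above is a tuple with one entry per model input, where each entry maps a dimension index to a `torch.export.Dim` range. To see the mechanism in isolation, here is a minimal, self-contained sketch; the toy embedding model and all names and sizes in it are illustrative stand-ins, not part of this commit:

```python
import torch
from torch.export import Dim, export

# Toy stand-in for nanoGPT: embed a batch of token ids and pool them.
class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = torch.nn.Embedding(100, 16)

    def forward(self, tokens):
        return self.embed(tokens).sum(dim=1)

model = TinyModel()
example_inputs = (torch.randint(0, 100, (1, 7), dtype=torch.long),)

# One dict per model input: dimension 1 (the token count) may vary up to 7.
dynamic_shape = ({1: Dim("token_dim", max=7)},)

program = export(model, example_inputs, dynamic_shapes=dynamic_shape)

# The exported program accepts other sequence lengths within the declared range.
print(program.module()(torch.randint(0, 100, (1, 3), dtype=torch.long)).shape)
```

Without `dynamic_shapes`, export specializes on the example input's sequence length, and calling the program with any other length fails at runtime.
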
@@ -204,17 +214,21 @@ output token by token. Each generated token is passed as input for the next run.
 ```cpp
 // main.cpp
 
+#define ENDOFTEXT 50256
+
 std::string generate(
     Module& llm_model,
     std::string& prompt,
     BasicTokenizer& tokenizer,
     BasicSampler& sampler,
+    size_t max_input_length,
     size_t max_output_length) {
 
   // Convert the input text into a list of integers (tokens) that represents
   // it, using the string-to-token mapping that the model was trained on.
   // Each token is an integer that represents a word or part of a word.
   std::vector<int64_t> input_tokens = tokenizer.encode(prompt);
   std::vector<int64_t> output_tokens;
 
   for (auto i = 0u; i < max_output_length; i++) {
@@ -223,6 +237,7 @@ std::string generate(
     ManagedTensor tensor_tokens(
         input_tokens.data(),
         {1, static_cast<int>(input_tokens.size())},
         ScalarType::Long);
     std::vector<EValue> inputs = {tensor_tokens.get_tensor()};
 
@@ -237,14 +252,23 @@ std::string generate(
 
     // Sample the next token from the logits.
     int64_t next_token = sampler.sample(logits);
+
+    // Break if we reached the end of the text.
+    if (next_token == ENDOFTEXT) {
+      break;
+    }
+
+    // Add the next token to the output.
     output_tokens.push_back(next_token);
 
     std::cout << tokenizer.decode({ next_token });
     std::cout.flush();
 
     // Update next input.
-    input_tokens.erase(input_tokens.begin());
     input_tokens.push_back(next_token);
+    if (input_tokens.size() > max_input_length) {
+      input_tokens.erase(input_tokens.begin());
+    }
   }
 
   std::cout << std::endl;
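
Two behaviors change in this loop: generation stops early once the model emits GPT-2's end-of-text token (id 50256, the `ENDOFTEXT` macro above), and the context becomes a sliding window that is trimmed from the front only after it exceeds `max_input_length`, instead of on every step. Below is a minimal Python sketch of the same bookkeeping; `toy_model` is a hypothetical stand-in, not the ExecuTorch runtime API:

```python
import random

END_OF_TEXT = 50256  # GPT-2's <|endoftext|> token id

def toy_model(tokens):
    # Hypothetical stand-in for the exported model: random scores over the vocab.
    return [random.random() for _ in range(50257)]

def generate(model, input_tokens, max_input_length, max_output_length):
    output_tokens = []
    for _ in range(max_output_length):
        logits = model(input_tokens)
        # Greedy sampling: take the highest-scoring token.
        next_token = max(range(len(logits)), key=logits.__getitem__)
        # Stop early at end-of-text.
        if next_token == END_OF_TEXT:
            break
        output_tokens.append(next_token)
        # Slide the context window: append the new token, then trim the
        # oldest token once the window is over the limit.
        input_tokens.append(next_token)
        if len(input_tokens) > max_input_length:
            input_tokens.pop(0)
    return output_tokens

print(generate(toy_model, [1, 2, 3], max_input_length=8, max_output_length=5))
```

Because the trim happens after the append, the window passed to the model can reach `max_input_length` tokens, so that value should stay within the maximum sequence length declared at export time.
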
@@ -278,7 +302,9 @@ penalties for repeated tokens, and biases to prioritize or de-prioritize specific tokens.
 
 int main() {
   // Set up the prompt. This provides the seed text for the model to elaborate.
-  std::string prompt = "Once upon a time, there was a";
+  std::cout << "Prompt: ";
+  std::string prompt;
+  std::getline(std::cin, prompt);
 
   // The tokenizer is used to convert between tokens (used by the model) and
   // human-readable strings.
@@ -290,9 +316,10 @@ int main() {
   // Load the exported nanoGPT program, which was generated via the previous steps.
   Module model("nanogpt.pte", torch::executor::Module::MlockConfig::UseMlockIgnoreErrors);
 
+  const auto max_input_tokens = 1024;
   const auto max_output_tokens = 30;
   std::cout << prompt;
-  generate(model, prompt, tokenizer, sampler, max_output_tokens);
+  generate(model, prompt, tokenizer, sampler, max_input_tokens, max_output_tokens);
 }
 ```

@@ -363,10 +390,19 @@ cmake --build cmake-out -j10
 ./cmake-out/nanogpt_runner
 ```
 
-You should see something like the following:
+You should see a message like the following, asking you to enter the initial prompt:
 
 ```
-Once upon a time, there was a man who was a member of the military...
+Prompt:
+```
+
+Here we use "Hello world!" as the example prompt. After you enter your prompt and press enter:
+
+```
+Prompt: Hello world!
+Hello world!
+
+I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a very popular game in
 ```
 
 At this point, it is likely to run very slowly. This is because ExecuTorch hasn't been told to optimize for specific hardware (delegation), and because it is doing all of the calculations in 32-bit floating point (no quantization).
@@ -423,14 +459,24 @@ model = GPT.from_pretrained('gpt2')
 # Create example inputs. This is used in the export process to provide
 # hints on the expected shape of the model input.
 example_inputs = (
-    torch.randint(0, 100, (1, 8), dtype=torch.long),
+    torch.randint(0, 100, (1, model.config.block_size - 1), dtype=torch.long),
 )
 
+# Set up the dynamic shape configuration, which allows the sizes of the input
+# tensors at runtime to differ from the sizes in `example_inputs`, as long as
+# they follow the rules the configuration specifies. Here we let the 0th
+# input's 1st dimension range over [0, model.config.block_size - 1]. Details
+# of dynamic shapes and how to customize them can be found in
+# [ExecuTorch Concepts](https://pytorch.org/executorch/0.2/concepts.html#dynamic-shapes).
+dynamic_shape = (
+    {1: torch.export.Dim("token_dim", max=model.config.block_size - 1)},
+)
+
 # Trace the model, converting it to a portable intermediate representation.
 # The torch.no_grad() call tells PyTorch to exclude training-specific logic.
 with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
-    m = capture_pre_autograd_graph(model, example_inputs)
-    traced_model = export(m, example_inputs)
+    m = capture_pre_autograd_graph(model, example_inputs, dynamic_shapes=dynamic_shape)
+    traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape)
 
 # Convert the model into a runnable ExecuTorch program.
 # To be further lowered to Xnnpack backend, `traced_model` needs xnnpack-specific edge compile config
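
For context, the XNNPACK-specific edge compile config mentioned above feeds the usual ExecuTorch lowering flow: convert the traced program to edge dialect, delegate the subgraphs XNNPACK supports, then emit the final program. The sketch below assumes ExecuTorch 0.2-era APIs (`to_edge`, `to_backend`, `XnnpackPartitioner`); the module paths may differ in other releases, and the tiny linear model is an illustrative stand-in for nanoGPT:

```python
import torch
from torch._export import capture_pre_autograd_graph
from torch.export import export
from executorch.exir import to_edge
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner

# Toy stand-in model; linear and relu are both ops XNNPACK can delegate.
model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU()).eval()
example_inputs = (torch.randn(1, 4),)

m = capture_pre_autograd_graph(model, example_inputs)
traced = export(m, example_inputs)

# Convert to edge dialect, delegate supported subgraphs, emit the program.
edge = to_edge(traced)
edge = edge.to_backend(XnnpackPartitioner())
et_program = edge.to_executorch()

with open("toy_xnnpack.pte", "wb") as f:
    f.write(et_program.buffer)
```
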
@@ -512,12 +558,23 @@ cmake --build cmake-out -j10
 ./cmake-out/nanogpt_runner
 ```
 
-You should see something like the following:
+
+You should see a message like the following, asking you to enter the initial prompt:
+
+```
+Prompt:
+```
+
+Here we use "Hello world!" as the example prompt. After you enter your prompt and press enter:
 
 ```
-Once upon a time, there was a man who was a member of the military...
+Prompt: Hello world!
+Hello world!
+
+I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a very popular game in
 ```
 
+Generation should now be noticeably faster than without delegation.
 
 For more information regarding backend delegation, see the ExecuTorch guides
 for the
