@@ -141,13 +141,23 @@ model = GPT.from_pretrained('gpt2')
# Create example inputs. This is used in the export process to provide
# hints on the expected shape of the model input.
- example_inputs = (torch.randint(0, 100, (1, 8), dtype=torch.long), )
+ example_inputs = (torch.randint(0, 100, (1, model.config.block_size - 1), dtype=torch.long), )
+
+ # Set up dynamic shape configuration. This allows the sizes of the input
+ # tensors at runtime to differ from the sizes of the tensors in
+ # `example_inputs`, as long as they follow the rules the dynamic shape
+ # configuration specifies. Here we set the range of the 0th model input's
+ # 1st dimension to [0, model.config.block_size - 1].
+ # Details of dynamic shapes and how to create custom ones are covered in
+ # [ExecuTorch Concepts](https://pytorch.org/executorch/0.2/concepts.html#dynamic-shapes).
+ dynamic_shape = (
+     {1: torch.export.Dim("token_dim", max=model.config.block_size - 1)},
+ )

# Trace the model, converting it to a portable intermediate representation.
# The torch.no_grad() call tells PyTorch to exclude training-specific logic.
with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
-     m = capture_pre_autograd_graph(model, example_inputs)
-     traced_model = export(m, example_inputs)
+     m = capture_pre_autograd_graph(model, example_inputs, dynamic_shapes=dynamic_shape)
+     traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape)

# Convert the model into a runnable ExecuTorch program.
edge_config = EdgeCompileConfig(_check_ir_validity=False)
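
To make the new `dynamic_shape` argument concrete, here is a minimal, self-contained sketch of how `torch.export.Dim` marks a dimension as dynamic. The toy module and sizes are illustrative, not part of the tutorial:

```python
import torch

# Toy module standing in for the GPT model; only the shape behavior matters here.
class Toy(torch.nn.Module):
    def forward(self, x):
        return x * 2

# Declare dimension 1 of the first input as dynamic, with an upper bound.
token_dim = torch.export.Dim("token_dim", max=1023)
ep = torch.export.export(
    Toy(),
    (torch.randint(0, 100, (1, 1023), dtype=torch.long),),
    dynamic_shapes=({1: token_dim},),
)

# The exported program now accepts varying sequence lengths up to the bound.
ep.module()(torch.randint(0, 100, (1, 8), dtype=torch.long))
ep.module()(torch.randint(0, 100, (1, 512), dtype=torch.long))
```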
@@ -204,17 +214,21 @@ output token by token. Each generated token is passed as input for the next run.
```cpp
// main.cpp

+ #define ENDOFTEXT 50256
+
std::string generate(
    Module& llm_model,
    std::string& prompt,
    BasicTokenizer& tokenizer,
    BasicSampler& sampler,
+   size_t max_input_length,
    size_t max_output_length) {

  // Convert the input text into a list of integers (tokens) that represents
  // it, using the string-to-token mapping that the model was trained on.
  // Each token is an integer that represents a word or part of a word.
  std::vector<int64_t> input_tokens = tokenizer.encode(prompt);
  std::vector<int64_t> output_tokens;

  for (auto i = 0u; i < max_output_length; i++) {
@@ -223,6 +237,7 @@ std::string generate(
    ManagedTensor tensor_tokens(
        input_tokens.data(),
        {1, static_cast<int>(input_tokens.size())},
        ScalarType::Long);
    std::vector<EValue> inputs = {tensor_tokens.get_tensor()};
@@ -237,14 +252,23 @@ std::string generate(
    // Sample the next token from the logits.
    int64_t next_token = sampler.sample(logits);
+
+   // Break if we reached the end of the text.
+   if (next_token == ENDOFTEXT) {
+     break;
+   }
+
+   // Add the next token to the output.
    output_tokens.push_back(next_token);

    std::cout << tokenizer.decode({ next_token });
    std::cout.flush();

    // Update next input.
-   input_tokens.erase(input_tokens.begin());
    input_tokens.push_back(next_token);
+   if (input_tokens.size() > max_input_length) {
+     input_tokens.erase(input_tokens.begin());
+   }
  }

  std::cout << std::endl;
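
Taken together, the additions in these hunks do two things: generation now stops at GPT-2's end-of-text token (id 50256), and the context is kept within the exported model's maximum input length by dropping the oldest token once the window is full. A minimal Python sketch of the same control flow, where `sample_next` is a hypothetical stand-in for running the model and sampling the logits:

```python
ENDOFTEXT = 50256      # GPT-2's <|endoftext|> token id
MAX_INPUT_LENGTH = 1024

def sample_next(tokens: list[int]) -> int:
    # Hypothetical stand-in for "run the model on `tokens`, sample from logits".
    return (tokens[-1] * 31 + 7) % 50257

def generate(input_tokens: list[int], max_output_length: int) -> list[int]:
    output_tokens = []
    for _ in range(max_output_length):
        next_token = sample_next(input_tokens)
        if next_token == ENDOFTEXT:
            break  # stop at end-of-text
        output_tokens.append(next_token)
        input_tokens.append(next_token)
        # Sliding window: never exceed the length the model was exported for.
        if len(input_tokens) > MAX_INPUT_LENGTH:
            input_tokens.pop(0)
    return output_tokens
```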
@@ -278,7 +302,9 @@ penalties for repeated tokens, and biases to prioritize or de-prioritize specifi
int main() {
  // Set up the prompt. This provides the seed text for the model to elaborate.
- std::string prompt = "Once upon a time, there was a";
+ std::cout << "Prompt: ";
+ std::string prompt;
+ std::getline(std::cin, prompt);

  // The tokenizer is used to convert between tokens (used by the model) and
  // human-readable strings.
@@ -290,9 +316,10 @@ int main() {
  // Load the exported nanoGPT program, which was generated via the previous steps.
  Module model("nanogpt.pte", torch::executor::Module::MlockConfig::UseMlockIgnoreErrors);

+ const auto max_input_tokens = 1024;
  const auto max_output_tokens = 30;
  std::cout << prompt;
- generate(model, prompt, tokenizer, sampler, max_output_tokens);
+ generate(model, prompt, tokenizer, sampler, max_input_tokens, max_output_tokens);
}
```
@@ -363,10 +390,19 @@ cmake --build cmake-out -j10
./cmake-out/nanogpt_runner
```

- You should see something like the following:
+ You should see a message like the following, prompting you to enter the initial text:

```
- Once upon a time, there was a man who was a member of the military...
+ Prompt:
+ ```
+
+ Here we use "Hello world!" as the example prompt. After you type your prompt and press enter:
+
+ ```
+ Prompt: Hello world!
+ Hello world!
+
+ I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a very popular game in
```

At this point, it is likely to run very slowly. This is because ExecuTorch hasn't been told to optimize for
@@ -423,14 +459,24 @@ model = GPT.from_pretrained('gpt2')
# Create example inputs. This is used in the export process to provide
# hints on the expected shape of the model input.
example_inputs = (
-     torch.randint(0, 100, (1, 8), dtype=torch.long),
+     torch.randint(0, 100, (1, model.config.block_size - 1), dtype=torch.long),
)

+ # Set up dynamic shape configuration. This allows the sizes of the input
+ # tensors at runtime to differ from the sizes of the tensors in
+ # `example_inputs`, as long as they follow the rules the dynamic shape
+ # configuration specifies. Here we set the range of the 0th model input's
+ # 1st dimension to [0, model.config.block_size - 1].
+ # Details of dynamic shapes and how to create custom ones are covered in
+ # [ExecuTorch Concepts](https://pytorch.org/executorch/0.2/concepts.html#dynamic-shapes).
+ dynamic_shape = (
+     {1: torch.export.Dim("token_dim", max=model.config.block_size - 1)},
+ )
+
# Trace the model, converting it to a portable intermediate representation.
# The torch.no_grad() call tells PyTorch to exclude training-specific logic.
with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
-     m = capture_pre_autograd_graph(model, example_inputs)
-     traced_model = export(m, example_inputs)
+     m = capture_pre_autograd_graph(model, example_inputs, dynamic_shapes=dynamic_shape)
+     traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape)

# Convert the model into a runnable ExecuTorch program.
# To be further lowered to the XNNPACK backend, `traced_model` needs an XNNPACK-specific edge compile config
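
For context, the comment above refers to the XNNPACK lowering step that follows in the full tutorial. A hedged sketch of what that step looks like; the import paths are my assumption based on ExecuTorch 0.2 and may differ across versions:

```python
# Sketch of lowering `traced_model` to the XNNPACK backend.
# NOTE: import paths below are assumptions for ExecuTorch 0.2; verify against your install.
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config
from executorch.exir import to_edge

edge_manager = to_edge(traced_model, compile_config=get_xnnpack_edge_compile_config())
# Delegate every subgraph XNNPACK supports to the XNNPACK backend.
edge_manager = edge_manager.to_backend(XnnpackPartitioner())
et_program = edge_manager.to_executorch()

with open("nanogpt.pte", "wb") as f:
    f.write(et_program.buffer)
```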
@@ -512,12 +558,23 @@ cmake --build cmake-out -j10
./cmake-out/nanogpt_runner
```

- You should see something like the following:
+
+ You should see a message like the following, prompting you to enter the initial text:
+
+ ```
+ Prompt:
+ ```
+
+ Here we use "Hello world!" as the example prompt. After you type your prompt and press enter:

```
- Once upon a time, there was a man who was a member of the military...
+ Prompt: Hello world!
+ Hello world!
+
+ I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a very popular game in
```

+ You should notice that generation is now significantly faster than without delegation.

For more information regarding backend delegation, see the ExecuTorch guides
for the