
Commit dde7642

Make it possible to load both CPU and CUDA models using same runner (#815)
This is done by wrapping the model-load attempt in `try {} catch (std::runtime_error) {}` and attempting to create the model on the GPU first, as an attempt to load a CPU model on CUDA destroys the CUDA context (bugs/fixes against PyTorch are coming, tracked in pytorch/pytorch#126547).

Also fix two bugs in the repo:
- Initialize `Tokenizer::initialized_` to false
- Change the name of the tokenizer file in a workflow from `tokenizer.bin` to `tokenizer.model`

Fixes pytorch/torchchat#709

Test plan:
```
python3 torchchat.py export --checkpoint-path checkpoints/stories15M/model.pth --output-dso-path model_cpu.so --device cpu
python3 torchchat.py export --checkpoint-path checkpoints/stories15M/model.pth --output-dso-path model.so
./cmake-out/aoti_run ./model.so -z checkpoints/stories15M/tokenizer.model
./cmake-out/aoti_run ./model_cpu.so -z checkpoints/stories15M/tokenizer.model
```
1 parent ff4d281 commit dde7642
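For readability, here is the loading pattern this commit describes, pulled out of the diff below as a minimal standalone sketch. The function name `load_runner` is illustrative only; in the actual change this logic lives inside `build_transformer()` in `runner/run.cpp`.

```cpp
// Sketch of the GPU-first fallback introduced by this commit.
// USE_CUDA is defined only when the runner is built against a CUDA-enabled
// libtorch (see runner/aoti.cmake below).
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
#ifdef USE_CUDA
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
#endif

#include <stdexcept>
#include <string>

// Illustrative helper; the commit places this logic in build_transformer().
torch::inductor::AOTIModelContainerRunner* load_runner(
    const std::string& model_path,
    torch::Device& aoti_device) {
#ifdef USE_CUDA
  try {
    // Try the CUDA runner first; a DSO exported for CPU throws
    // std::runtime_error here and we fall back to the CPU runner below.
    // Per the commit message, loading a CPU model via the CUDA runner also
    // destroys the CUDA context (tracked in pytorch/pytorch#126547).
    auto* runner =
        new torch::inductor::AOTIModelContainerRunnerCuda(model_path);
    aoti_device = torch::Device(torch::kCUDA);
    return runner;
  } catch (std::runtime_error& e) {
    // Fall through to the CPU path.
  }
#endif
  aoti_device = torch::Device(torch::kCPU);
  return new torch::inductor::AOTIModelContainerRunnerCpu(model_path);
}
```

Because either runner may be created, the `Transformer` struct's member is widened from `AOTIModelContainerRunnerCpu*` to the base class `AOTIModelContainerRunner*`, and forward-pass inputs are moved to `aoti_device` before the call (see the `runner/run.cpp` diff below).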

File tree

4 files changed: +22 -10 lines changed

.github/workflows/runner-cuda-dtype.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -58,7 +58,7 @@ jobs:
 
         python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-dso-path /tmp/model.so
 
-        ./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.bin -i "${PROMPT}"
+        ./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
 
         echo "**********************************************"
         echo "******** INT4 HQQ group-wise quantized *******"
```

runner/aoti.cmake

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@ if(Torch_FOUND)
1313
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g ${TORCH_CXX_FLAGS} -fpermissive")
1414

1515
add_executable(aoti_run runner/run.cpp)
16-
1716
target_compile_options(aoti_run PUBLIC -D__AOTI_MODEL__)
17+
if(DEFINED TORCH_CUDA_LIBRARIES)
18+
target_compile_options(aoti_run PUBLIC -DUSE_CUDA)
19+
endif()
1820
target_include_directories(aoti_run PRIVATE ${TORCHCHAT_ROOT}/runner)
1921
target_link_libraries(aoti_run "${TORCH_LIBRARIES}" m)
2022
set_property(TARGET aoti_run PROPERTY CXX_STANDARD 17)

runner/run.cpp

Lines changed: 17 additions & 7 deletions
```diff
@@ -24,7 +24,10 @@
 
 #ifdef __AOTI_MODEL__
 #include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
-torch::Device cpu_device(torch::kCPU);
+#ifdef USE_CUDA
+#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
+#endif
+torch::Device aoti_device(torch::kCPU);
 
 #else // __ET_MODEL__
 #include <executorch/extension/module/module.h>
@@ -82,7 +85,7 @@ typedef struct {
   RunState state; // buffers for the "wave" of activations in the forward pass
 
 #ifdef __AOTI_MODEL__
-  torch::inductor::AOTIModelContainerRunnerCpu* runner;
+  torch::inductor::AOTIModelContainerRunner* runner;
 #else // __ET_MODEL__
   Module* runner;
 #endif
@@ -132,9 +135,16 @@ void build_transformer(
   malloc_run_state(&t->state, &t->config);
 
 #ifdef __AOTI_MODEL__
-  t->runner = new torch::inductor::AOTIModelContainerRunnerCpu(
-      /* path to model DSO */ model_path,
-      /* thread pool size */ 1);
+#ifdef USE_CUDA
+  try {
+    t->runner = new torch::inductor::AOTIModelContainerRunnerCuda(model_path);
+    aoti_device = torch::Device(torch::kCUDA);
+  } catch (std::runtime_error& e) {
+#else
+  {
+#endif
+    t->runner = new torch::inductor::AOTIModelContainerRunnerCpu(model_path);
+  }
 #else //__ET_MODEL__
   t->runner = new Module(
       /* path to PTE model */ model_path,
@@ -186,11 +196,11 @@ float* forward(Transformer* transformer, int token, int pos) {
   torch::Tensor token_tensor =
       torch::from_blob(token_buffer, {1, 1}, torch::kLong);
   torch::Tensor pos_tensor = torch::from_blob(pos_buffer, {1}, torch::kLong);
-  std::vector<torch::Tensor> inputs{token_tensor, pos_tensor};
+  std::vector<torch::Tensor> inputs{token_tensor.to(aoti_device), pos_tensor.to(aoti_device)};
 
   torch::Tensor result = transformer->runner->run(inputs)[0]
                              .to(torch::dtype(torch::kFloat32))
-                             .to(cpu_device);
+                             .to(torch::kCPU);
   auto logits = result[0].data_ptr();
 #else // __ET_MODEL__
   ManagedTensor pos_managed(pos_buffer, sizeof(int64_t), {1}, ScalarType::Long);
```

tokenizer/tokenizer.h

Lines changed: 1 addition & 1 deletion
```diff
@@ -50,7 +50,7 @@ class Tokenizer {
   }
 
  protected:
-  bool initialized_;
+  bool initialized_ = false;
   int32_t vocab_size_;
   uint64_t bos_tok_, eos_tok_;
 };
```
