
Commit 5f097e2

Direct I/O and Transparent HugePages
--direct-io for bypassing the page cache (and using THP on Linux). Up to 3-6x faster uncached loading, fewer pageouts, no page cache pollution.
1 parent 7d1a378 commit 5f097e2
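Direct I/O bypasses the kernel page cache: reads go straight from storage into application buffers, which must be aligned to the device's block size. The following is a minimal standalone sketch of that general pattern, not the loader code from this commit; the `model.gguf` path, the 4096-byte alignment, and the 16 MiB chunk size are assumptions for illustration, and the `madvise(MADV_HUGEPAGE)` call is the Linux hint for Transparent HugePage backing mentioned in the commit message.

```cpp
#ifndef _GNU_SOURCE
#define _GNU_SOURCE // needed for O_DIRECT with glibc
#endif
#include <cstdio>
#include <cstdlib>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char ** argv) {
    const char * path = argc > 1 ? argv[1] : "model.gguf";  // hypothetical input file
    const size_t alignment = 4096;                           // assumed logical block size of the device
    const size_t chunk     = 16u * 1024 * 1024;              // arbitrary read granularity (multiple of alignment)

    int fd = open(path, O_RDONLY | O_DIRECT);                // bypass the page cache entirely
    if (fd < 0) { perror("open"); return 1; }

    void * buf = nullptr;
    if (posix_memalign(&buf, alignment, chunk) != 0) {       // O_DIRECT requires an aligned buffer
        close(fd);
        return 1;
    }
#ifdef MADV_HUGEPAGE
    madvise(buf, chunk, MADV_HUGEPAGE);                      // hint: back this buffer with Transparent HugePages (Linux)
#endif

    size_t total = 0;
    for (;;) {
        ssize_t n = read(fd, buf, chunk);                    // aligned buffer + aligned length, as O_DIRECT requires
        if (n <= 0) { break; }                               // 0 = EOF, <0 = error (real code would check errno)
        total += (size_t) n;                                 // a real loader would parse/copy tensor data here
    }
    std::printf("read %zu bytes with O_DIRECT\n", total);

    std::free(buf);
    close(fd);
    return 0;
}
```

Because the data never enters the page cache, repeated loads do not evict other cached files, which is the "no page cache pollution" effect noted in the commit message.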

9 files changed: 361 additions and 57 deletions


common/common.cpp

Lines changed: 9 additions & 0 deletions
```diff
@@ -1068,6 +1068,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.use_mmap = false;
         return true;
     }
+    if (arg == "--direct-io") {
+        params.use_direct_io = true;
+        return true;
+    }
     if (arg == "--numa") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1751,6 +1755,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     if (llama_supports_mmap()) {
         options.push_back({ "*", "       --no-mmap",   "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
     }
+    if (llama_supports_direct_io()) {
+        options.push_back({ "*", "       --direct-io", "use direct I/O (potentially faster uncached loading, fewer pageouts, no page cache pollution)" });
+    }
     options.push_back({ "*", "       --numa TYPE",     "attempt optimizations that help on some NUMA systems\n"
                                                        " - distribute: spread execution evenly over all nodes\n"
                                                        " - isolate: only spawn threads on CPUs on the node that execution started on\n"
@@ -2289,6 +2296,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.split_mode    = params.split_mode;
     mparams.tensor_split  = params.tensor_split;
     mparams.use_mmap      = params.use_mmap;
+    mparams.use_direct_io = params.use_direct_io;
     mparams.use_mlock     = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     if (params.kv_overrides.empty()) {
@@ -3249,6 +3257,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
+    fprintf(stream, "direct-io: %s # default: false\n", params.use_direct_io ? "true" : "false");
     fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
```

common/common.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -162,6 +162,7 @@ struct gpt_params {
     bool ignore_eos = false; // ignore generated EOS tokens
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
+    bool use_direct_io = false; // use direct I/O
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose = false;
     bool verbose_prompt = false; // print prompt tokens before generation
```

examples/llama-bench/README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -38,6 +38,7 @@ options:
   -nkvo, --no-kv-offload <0|1>         (default: 0)
   -fa, --flash-attn <0|1>              (default: 0)
   -mmp, --mmap <0|1>                   (default: 1)
+  -dio, --direct-io <0|1>              (default: 0)
   --numa <distribute|isolate|numactl>  (default: disabled)
   -embd, --embeddings <0|1>            (default: 0)
   -ts, --tensor-split <ts0/ts1/..>     (default: 0)
```
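As with `-mmp`/`--mmap`, the value for `-dio`/`--direct-io` can be a comma-separated list of 0/1, e.g. `llama-bench -m model.gguf -dio 0,1` (hypothetical model path), so cached and direct-I/O loading can be compared in one run; the markdown output then adds a `direct_io` column whenever the setting differs from the default.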

examples/llama-bench/llama-bench.cpp

Lines changed: 29 additions & 3 deletions
```diff
@@ -189,6 +189,7 @@ struct cmd_params {
     std::vector<bool> flash_attn;
     std::vector<std::vector<float>> tensor_split;
     std::vector<bool> use_mmap;
+    std::vector<bool> use_direct_io;
     std::vector<bool> embeddings;
     ggml_numa_strategy numa;
     int reps;
@@ -215,6 +216,7 @@ static const cmd_params cmd_params_defaults = {
     /* flash_attn    */ {false},
     /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
     /* use_mmap      */ {true},
+    /* use_direct_io */ {false},
     /* embeddings    */ {false},
     /* numa          */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps          */ 5,
@@ -244,6 +246,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf("  -fa, --flash-attn <0|1>             (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
     printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf("  -dio, --direct-io <0|1>             (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str());
     printf("  --numa <distribute|isolate|numactl> (default: disabled)\n");
     printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
@@ -461,6 +464,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<bool>(argv[i], split_delim);
             params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
+        } else if (arg == "-dio" || arg == "--direct-io") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end());
         } else if (arg == "-embd" || arg == "--embeddings") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -538,6 +548,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.flash_attn.empty())    { params.flash_attn = cmd_params_defaults.flash_attn; }
     if (params.tensor_split.empty())  { params.tensor_split = cmd_params_defaults.tensor_split; }
     if (params.use_mmap.empty())      { params.use_mmap = cmd_params_defaults.use_mmap; }
+    if (params.use_direct_io.empty()) { params.use_direct_io = cmd_params_defaults.use_direct_io; }
     if (params.embeddings.empty())    { params.embeddings = cmd_params_defaults.embeddings; }
     if (params.n_threads.empty())     { params.n_threads = cmd_params_defaults.n_threads; }
 
@@ -561,6 +572,7 @@ struct cmd_params_instance {
     bool flash_attn;
     std::vector<float> tensor_split;
     bool use_mmap;
+    bool use_direct_io;
     bool embeddings;
 
     llama_model_params to_llama_mparams() const {
@@ -574,6 +586,7 @@ struct cmd_params_instance {
         mparams.main_gpu = main_gpu;
         mparams.tensor_split = tensor_split.data();
         mparams.use_mmap = use_mmap;
+        mparams.use_direct_io = use_direct_io;
 
         return mparams;
     }
@@ -585,6 +598,7 @@ struct cmd_params_instance {
                split_mode == other.split_mode &&
                main_gpu == other.main_gpu &&
                use_mmap == other.use_mmap &&
+               use_direct_io == other.use_direct_io &&
                tensor_split == other.tensor_split;
     }
 
@@ -615,6 +629,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)
     for (const auto & mmp : params.use_mmap)
+    for (const auto & dio : params.use_direct_io)
     for (const auto & embd : params.embeddings)
     for (const auto & nb : params.n_batch)
     for (const auto & nub : params.n_ubatch)
@@ -644,6 +659,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .flash_attn   = */ fa,
                 /* .tensor_split = */ ts,
                 /* .use_mmap     = */ mmp,
+                /* .use_direct_io= */ dio,
                 /* .embeddings   = */ embd,
             };
             instances.push_back(instance);
@@ -670,6 +686,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .flash_attn   = */ fa,
                 /* .tensor_split = */ ts,
                 /* .use_mmap     = */ mmp,
+                /* .use_direct_io= */ dio,
                 /* .embeddings   = */ embd,
             };
             instances.push_back(instance);
@@ -696,6 +713,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .flash_attn   = */ fa,
                 /* .tensor_split = */ ts,
                 /* .use_mmap     = */ mmp,
+                /* .use_direct_io= */ dio,
                 /* .embeddings   = */ embd,
             };
             instances.push_back(instance);
@@ -734,6 +752,7 @@ struct test {
     bool flash_attn;
     std::vector<float> tensor_split;
     bool use_mmap;
+    bool use_direct_io;
     bool embeddings;
     int n_prompt;
     int n_gen;
@@ -759,6 +778,7 @@ struct test {
         flash_attn = inst.flash_attn;
         tensor_split = inst.tensor_split;
         use_mmap = inst.use_mmap;
+        use_direct_io = inst.use_direct_io;
         embeddings = inst.embeddings;
         n_prompt = inst.n_prompt;
         n_gen = inst.n_gen;
@@ -832,7 +852,7 @@ struct test {
             "n_threads", "type_k", "type_v",
             "n_gpu_layers", "split_mode",
             "main_gpu", "no_kv_offload", "flash_attn",
-            "tensor_split", "use_mmap", "embeddings",
+            "tensor_split", "use_mmap", "use_direct_io", "embeddings",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
             "avg_ts", "stddev_ts"
@@ -853,7 +873,7 @@ struct test {
         }
         if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
+            field == "flash_attn" || field == "use_mmap" || field == "use_direct_io" || field == "embeddings") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -888,7 +908,7 @@ struct test {
             std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
-            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
+            tensor_split_str, std::to_string(use_mmap), std::to_string(use_direct_io), std::to_string(embeddings),
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
             std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -1064,6 +1084,9 @@ struct markdown_printer : public printer {
         if (field == "use_mmap") {
             return "mmap";
         }
+        if (field == "use_direct_io") {
+            return "direct_io";
+        }
         if (field == "embeddings") {
             return "embd";
         }
@@ -1116,6 +1139,9 @@ struct markdown_printer : public printer {
         if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
             fields.emplace_back("use_mmap");
         }
+        if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
+            fields.emplace_back("use_direct_io");
+        }
         if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
             fields.emplace_back("embeddings");
         }
```

examples/main/README.md

Lines changed: 4 additions & 0 deletions
```diff
@@ -269,6 +269,10 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.
 
+### Direct I/O
+
+- `--direct-io`: Use direct I/O. Potentially faster uncached loading, fewer pageouts, no page cache pollution. You may benefit from this option if you load a model for the first time (or after some time), load several different models consecutively, or simply want to keep the page cache clean. The faster your storage device is, the greater the gain you can expect. The effect may be greater on Linux due to Transparent HugePage support.
+
 ### NUMA support
 
 - `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
```
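The benefit described in the new Direct I/O section applies to uncached loads, i.e. when the model file is not already resident in the page cache; to observe it when benchmarking, start from a cold cache (on Linux, for example, `sync; echo 3 | sudo tee /proc/sys/vm/drop_caches`) or load a model that has not been read recently.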

examples/server/README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -34,6 +34,7 @@ The project is under active development, and we are [looking for feedback and co
 - `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512`
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
+- `--direct-io`: Use direct I/O. Potentially faster uncached loading, fewer pageouts, no page cache pollution.
 - `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
   - `--numa distribute`: Spread execution evenly over all nodes
   - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
```
