
Commit 5f097e2

Direct I/O and Transparent HugePages
--direct-io for bypassing the page cache (and using THP on Linux). Up to 3-6x faster uncached loading, fewer pageouts, no page cache pollution.
1 parent 7d1a378 commit 5f097e2
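Direct I/O bypasses the kernel page cache: reads go straight from storage into application buffers, which must be aligned to the device's block size. The following is a minimal standalone sketch of that general pattern, not the loader code from this commit; the `model.gguf` path, the 4096-byte alignment, and the 16 MiB chunk size are assumptions for illustration, and the `madvise(MADV_HUGEPAGE)` call is the Linux hint for Transparent HugePage backing mentioned in the commit message.

```cpp
#ifndef _GNU_SOURCE
#define _GNU_SOURCE // needed for O_DIRECT with glibc
#endif
#include <cstdio>
#include <cstdlib>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char ** argv) {
    const char * path = argc > 1 ? argv[1] : "model.gguf";  // hypothetical input file
    const size_t alignment = 4096;                           // assumed logical block size of the device
    const size_t chunk     = 16u * 1024 * 1024;              // arbitrary read granularity (multiple of alignment)

    int fd = open(path, O_RDONLY | O_DIRECT);                // bypass the page cache entirely
    if (fd < 0) { perror("open"); return 1; }

    void * buf = nullptr;
    if (posix_memalign(&buf, alignment, chunk) != 0) {       // O_DIRECT requires an aligned buffer
        close(fd);
        return 1;
    }
#ifdef MADV_HUGEPAGE
    madvise(buf, chunk, MADV_HUGEPAGE);                      // hint: back this buffer with Transparent HugePages (Linux)
#endif

    size_t total = 0;
    for (;;) {
        ssize_t n = read(fd, buf, chunk);                    // aligned buffer + aligned length, as O_DIRECT requires
        if (n <= 0) { break; }                               // 0 = EOF, <0 = error (real code would check errno)
        total += (size_t) n;                                 // a real loader would parse/copy tensor data here
    }
    std::printf("read %zu bytes with O_DIRECT\n", total);

    std::free(buf);
    close(fd);
    return 0;
}
```

Because the data never enters the page cache, repeated loads do not evict other cached files, which is the "no page cache pollution" effect noted in the commit message.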

9 files changed: 361 additions and 57 deletions


common/common.cpp

Lines changed: 9 additions & 0 deletions
```diff
@@ -1068,6 +1068,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.use_mmap = false;
         return true;
     }
+    if (arg == "--direct-io") {
+        params.use_direct_io = true;
+        return true;
+    }
     if (arg == "--numa") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1751,6 +1755,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     if (llama_supports_mmap()) {
         options.push_back({ "*", "       --no-mmap",   "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
     }
+    if (llama_supports_direct_io()) {
+        options.push_back({ "*", "       --direct-io", "use direct I/O (potentially faster uncached loading, fewer pageouts, no page cache pollution)" });
+    }
     options.push_back({ "*", "       --numa TYPE",     "attempt optimizations that help on some NUMA systems\n"
                                                        " - distribute: spread execution evenly over all nodes\n"
                                                        " - isolate: only spawn threads on CPUs on the node that execution started on\n"
@@ -2289,6 +2296,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.split_mode    = params.split_mode;
     mparams.tensor_split  = params.tensor_split;
     mparams.use_mmap      = params.use_mmap;
+    mparams.use_direct_io = params.use_direct_io;
     mparams.use_mlock     = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     if (params.kv_overrides.empty()) {
@@ -3249,6 +3257,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
+    fprintf(stream, "direct-io: %s # default: false\n", params.use_direct_io ? "true" : "false");
     fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
```

common/common.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -162,6 +162,7 @@ struct gpt_params {
     bool ignore_eos = false; // ignore generated EOS tokens
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
+    bool use_direct_io = false; // use direct I/O
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose = false;
     bool verbose_prompt = false; // print prompt tokens before generation
```

examples/llama-bench/README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -38,6 +38,7 @@ options:
   -nkvo, --no-kv-offload <0|1>         (default: 0)
   -fa, --flash-attn <0|1>              (default: 0)
   -mmp, --mmap <0|1>                   (default: 1)
+  -dio, --direct-io <0|1>              (default: 0)
   --numa <distribute|isolate|numactl>  (default: disabled)
   -embd, --embeddings <0|1>            (default: 0)
   -ts, --tensor-split <ts0/ts1/..>     (default: 0)
```
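As with `-mmp`/`--mmap`, the value for `-dio`/`--direct-io` can be a comma-separated list of 0/1, e.g. `llama-bench -m model.gguf -dio 0,1` (hypothetical model path), so cached and direct-I/O loading can be compared in one run; the markdown output then adds a `direct_io` column whenever the setting differs from the default.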

examples/llama-bench/llama-bench.cpp

Lines changed: 29 additions & 3 deletions
```diff
@@ -189,6 +189,7 @@ struct cmd_params {
     std::vector<bool> flash_attn;
     std::vector<std::vector<float>> tensor_split;
     std::vector<bool> use_mmap;
+    std::vector<bool> use_direct_io;
     std::vector<bool> embeddings;
     ggml_numa_strategy numa;
     int reps;
@@ -215,6 +216,7 @@ static const cmd_params cmd_params_defaults = {
     /* flash_attn    */ {false},
     /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
     /* use_mmap      */ {true},
+    /* use_direct_io */ {false},
     /* embeddings    */ {false},
     /* numa          */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps          */ 5,
@@ -244,6 +246,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf("  -fa, --flash-attn <0|1>             (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
     printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf("  -dio, --direct-io <0|1>             (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str());
     printf("  --numa <distribute|isolate|numactl> (default: disabled)\n");
     printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
@@ -461,6 +464,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<bool>(argv[i], split_delim);
             params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
+        } else if (arg == "-dio" || arg == "--direct-io") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end());
         } else if (arg == "-embd" || arg == "--embeddings") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -538,6 +548,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.flash_attn.empty())    { params.flash_attn = cmd_params_defaults.flash_attn; }
     if (params.tensor_split.empty())  { params.tensor_split = cmd_params_defaults.tensor_split; }
     if (params.use_mmap.empty())      { params.use_mmap = cmd_params_defaults.use_mmap; }
+    if (params.use_direct_io.empty()) { params.use_direct_io = cmd_params_defaults.use_direct_io; }
     if (params.embeddings.empty())    { params.embeddings = cmd_params_defaults.embeddings; }
     if (params.n_threads.empty())     { params.n_threads = cmd_params_defaults.n_threads; }
 
@@ -561,6 +572,7 @@ struct cmd_params_instance {
     bool flash_attn;
     std::vector<float> tensor_split;
     bool use_mmap;
+    bool use_direct_io;
     bool embeddings;
 
     llama_model_params to_llama_mparams() const {
@@ -574,6 +586,7 @@ struct cmd_params_instance {
         mparams.main_gpu = main_gpu;
         mparams.tensor_split = tensor_split.data();
         mparams.use_mmap = use_mmap;
+        mparams.use_direct_io = use_direct_io;
 
         return mparams;
     }
@@ -585,6 +598,7 @@ struct cmd_params_instance {
                split_mode == other.split_mode &&
                main_gpu == other.main_gpu &&
                use_mmap == other.use_mmap &&
+               use_direct_io == other.use_direct_io &&
                tensor_split == other.tensor_split;
     }
 
@@ -615,6 +629,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)
     for (const auto & mmp : params.use_mmap)
+    for (const auto & dio : params.use_direct_io)
     for (const auto & embd : params.embeddings)
     for (const auto & nb : params.n_batch)
     for (const auto & nub : params.n_ubatch)
@@ -644,6 +659,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .flash_attn   = */ fa,
                 /* .tensor_split = */ ts,
                 /* .use_mmap     = */ mmp,
+                /* .use_direct_io= */ dio,
                 /* .embeddings   = */ embd,
             };
             instances.push_back(instance);
@@ -670,6 +686,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .flash_attn   = */ fa,
                 /* .tensor_split = */ ts,
                 /* .use_mmap     = */ mmp,
+                /* .use_direct_io= */ dio,
                 /* .embeddings   = */ embd,
             };
             instances.push_back(instance);
@@ -696,6 +713,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .flash_attn   = */ fa,
                 /* .tensor_split = */ ts,
                 /* .use_mmap     = */ mmp,
+                /* .use_direct_io= */ dio,
                 /* .embeddings   = */ embd,
             };
             instances.push_back(instance);
@@ -734,6 +752,7 @@ struct test {
     bool flash_attn;
     std::vector<float> tensor_split;
     bool use_mmap;
+    bool use_direct_io;
     bool embeddings;
     int n_prompt;
     int n_gen;
@@ -759,6 +778,7 @@ struct test {
         flash_attn = inst.flash_attn;
         tensor_split = inst.tensor_split;
         use_mmap = inst.use_mmap;
+        use_direct_io = inst.use_direct_io;
         embeddings = inst.embeddings;
         n_prompt = inst.n_prompt;
         n_gen = inst.n_gen;
@@ -832,7 +852,7 @@ struct test {
             "n_threads", "type_k", "type_v",
             "n_gpu_layers", "split_mode",
             "main_gpu", "no_kv_offload", "flash_attn",
-            "tensor_split", "use_mmap", "embeddings",
+            "tensor_split", "use_mmap", "use_direct_io", "embeddings",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
             "avg_ts", "stddev_ts"
@@ -853,7 +873,7 @@ struct test {
         }
         if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
+            field == "flash_attn" || field == "use_mmap" || field == "use_direct_io" || field == "embeddings") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -888,7 +908,7 @@ struct test {
             std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
-            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
+            tensor_split_str, std::to_string(use_mmap), std::to_string(use_direct_io), std::to_string(embeddings),
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
             std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -1064,6 +1084,9 @@ struct markdown_printer : public printer {
         if (field == "use_mmap") {
             return "mmap";
         }
+        if (field == "use_direct_io") {
+            return "direct_io";
+        }
         if (field == "embeddings") {
             return "embd";
         }
@@ -1116,6 +1139,9 @@ struct markdown_printer : public printer {
         if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
             fields.emplace_back("use_mmap");
         }
+        if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
+            fields.emplace_back("use_direct_io");
+        }
         if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
             fields.emplace_back("embeddings");
         }
```

examples/main/README.md

Lines changed: 4 additions & 0 deletions
```diff
@@ -269,6 +269,10 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.
 
+### Direct I/O
+
+- `--direct-io`: Use direct I/O. Potentially faster uncached loading, fewer pageouts, no page cache pollution. You may benefit from this option if you load a model for the first time (or after some time), load several different models consecutively, or simply want to keep the page cache clean. The faster your storage device is, the greater the gain you can expect. The effect may be greater on Linux due to Transparent HugePage support.
+
 ### NUMA support
 
 - `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
```
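The benefit described in the new Direct I/O section applies to uncached loads, i.e. when the model file is not already resident in the page cache; to observe it when benchmarking, start from a cold cache (on Linux, for example, `sync; echo 3 | sudo tee /proc/sys/vm/drop_caches`) or load a model that has not been read recently.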

examples/server/README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -34,6 +34,7 @@ The project is under active development, and we are [looking for feedback and co
 - `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512`
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
+- `--direct-io`: Use direct I/O. Potentially faster uncached loading, fewer pageouts, no page cache pollution.
 - `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
   - `--numa distribute`: Spread execution evenly over all nodes
   - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
```
