ggml-org · JohannesGaessler · Aug 8, 2023 · Aug 8, 2023 · Aug 8, 2023
diff --git a/Makefile b/Makefile
@@ -47,7 +47,7 @@ OPT = -O3
 endif
 CFLAGS   = -I.              $(OPT) -std=c11   -fPIC
 CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
-LDFLAGS  =
+LDFLAGS  = -lsqlite3
 
 ifdef LLAMA_DEBUG
 	CFLAGS   += -O0 -g

diff --git a/benchmark.sh b/benchmark.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env sh
+
+# Run ./main once for every -ngl value from 0 to 35 (inclusive); all other
+# arguments are identical between runs.
+# A counting loop is used instead of {0..35}: brace ranges are a bash/zsh
+# extension, and a plain POSIX sh would run ./main once with the literal text.
+ngl=0
+while [ "$ngl" -le 35 ]
+do
+    ./main --model models/nvme/llama-7b-ggml-q4_0.bin --seed 1337 --ignore-eos --n-predict 128 --ctx-size 2048 --threads 8 -ngl "$ngl" -mmq
+    ngl=$((ngl + 1))
+done
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
@@ -17,8 +17,10 @@
 #include <ctime>
 #include <fstream>
 #include <iostream>
+#include <sstream>
 #include <string>
 #include <vector>
+#include <sqlite3.h>
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
@@ -163,6 +165,40 @@ int main(int argc, char ** argv) {
         return 0;
     }
 
+    // Open (creating it if necessary) the SQLite database "llama.sqlite" in the
+    // working directory; one row of timings is inserted into it at the end of main().
+    // Failures are only reported on stderr, execution continues either way.
+    sqlite3 * db = NULL;
+    int return_code;
+    const int flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE;
+    return_code = sqlite3_open_v2("llama.sqlite", &db, flags, NULL);
+    fprintf(stderr, "\nsqlite open: %d %s\n\n", return_code, sqlite3_errmsg(db));
+
+    const std::string sql_create_table ="CREATE TABLE IF NOT EXISTS llama_runs("
+        "id           INTEGER PRIMARY KEY AUTOINCREMENT,"
+        "build_number INTEGER NOT NULL,"
+        "build_commit TEXT NOT NULL,"
+
+        "n_gpu_layers BIGINT NOT NULL,"
+
+        "t_sample_us  BIGINT NOT NULL,"
+        "t_eval_us    BIGINT NOT NULL,"
+        "t_p_eval_us  BIGINT NOT NULL,"
+        "n_sample     BIGINT NOT NULL,"
+        "n_eval       BIGINT NOT NULL,"
+        "n_p_eval     BIGINT NOT NULL);";
+
+    // errmsg is printed below, so it must never be indeterminate: start from NULL in
+    // case sqlite3_exec() returns early without writing it (e.g. after a failed open),
+    // and fall back to sqlite3_errmsg() because a successful call sets it to NULL.
+    char * errmsg = NULL;
+    return_code = sqlite3_exec(db, sql_create_table.c_str(), NULL, NULL, &errmsg);
+    fprintf(stderr, "\nsqlite create table: %d %s\n\n", return_code, errmsg ? errmsg : sqlite3_errmsg(db));
+    // The message buffer is allocated by SQLite and must be released with sqlite3_free()
+    // (a no-op for NULL); reset the pointer because it is reused for the INSERT later on.
+    sqlite3_free(errmsg);
+    errmsg = NULL;
+
     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;
 
@@ -808,6 +834,26 @@ int main(int argc, char ** argv) {
     }
 
     llama_print_timings(ctx);
+
+    // Record this run in the llama_runs table: the build information and n_gpu_layers
+    // are written here, llama_sqlite_append_timings() appends the remaining timing
+    // columns and terminates the statement.
+    std::ostringstream sql_insert_values;
+    sql_insert_values << "INSERT INTO llama_runs(build_number, build_commit, n_gpu_layers, "
+        "t_sample_us, t_eval_us, t_p_eval_us, n_sample, n_eval, n_p_eval) VALUES (";
+    sql_insert_values << BUILD_NUMBER << ",";
+    sql_insert_values << "'" << BUILD_COMMIT << "',";
+    sql_insert_values << params.n_gpu_layers << ",";
+    llama_sqlite_append_timings(ctx, sql_insert_values);
+    // errmsg is printed and freed below, so make sure it is NULL if sqlite3_exec()
+    // returns without writing it (e.g. when the database could not be opened).
+    errmsg = NULL;
+    return_code = sqlite3_exec(db, sql_insert_values.str().c_str(), NULL, NULL, &errmsg);
+    fprintf(stderr, "\nsqlite insert data: %d %s\n\n", return_code, errmsg ? errmsg : sqlite3_errmsg(db));
+    // Release the SQLite-allocated message and the connection (both calls accept NULL).
+    // NOTE(review): assumes nothing after this point in main() uses db - confirm.
+    sqlite3_free(errmsg);
+    sqlite3_close(db);
+
     if (ctx_guidance) { llama_free(ctx_guidance); }
     llama_free(ctx);
     llama_free_model(model);

diff --git a/llama.cpp b/llama.cpp
@@ -4243,6 +4243,15 @@ void llama_print_timings(struct llama_context * ctx) {
     fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }
 
+// Streams the six timing fields of ctx (t_sample_us, t_eval_us, t_p_eval_us, n_sample,
+// n_eval, n_p_eval) into sql_insert_values, each followed by ",", except the last one,
+// which is followed by ");". Everything before these values must already be in the stream.
+void llama_sqlite_append_timings(struct llama_context * ctx, std::ostringstream & sql_insert_values) {
+    std::ostringstream & out = sql_insert_values;
+    out << ctx->t_sample_us << "," << ctx->t_eval_us << "," << ctx->t_p_eval_us << ",";
+    out << ctx->n_sample    << "," << ctx->n_eval    << "," << ctx->n_p_eval    << ");";
+}
+
 void llama_reset_timings(struct llama_context * ctx) {
     ctx->t_start_us = ggml_time_us();
     ctx->t_sample_us = ctx->n_sample = 0;

diff --git a/llama.h b/llama.h
@@ -8,6 +8,11 @@
 #else
 #define LLAMA_MAX_DEVICES 1
 #endif // GGML_USE_CUBLAS
+// <sstream> is only needed by the C++-only llama_sqlite_append_timings() declaration
+// further down; guarding it keeps this header includable from C.
+#ifdef __cplusplus
+#include <sstream>
+#endif
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
@@ -446,6 +451,11 @@ extern "C" {
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
+#ifdef __cplusplus
+    // C++ only (takes a std::ostringstream): appends the timing values of ctx to a
+    // partially built SQL INSERT statement and terminates it.
+    LLAMA_API void llama_sqlite_append_timings(struct llama_context * ctx, std::ostringstream & sql_insert_values);
+#endif
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
     // Print system information

diff --git a/plot_ts_per_ngl.py b/plot_ts_per_ngl.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+"""Plot generation speed (tokens per second) against -ngl from the runs stored in llama.sqlite."""
+
+import sqlite3
+import numpy as np
+import matplotlib.pyplot as plt
+
+con = sqlite3.connect("llama.sqlite")
+try:
+    cur = con.cursor()
+    # n_eval / t_eval_us is scaled by 1e6 to obtain the tokens per second shown on the y axis.
+    # Rows with t_eval_us = 0 are skipped: SQLite yields NULL for a division by zero,
+    # which cannot be plotted.
+    res = cur.execute(
+        "SELECT n_gpu_layers, 1000000.0*n_eval/t_eval_us FROM llama_runs "
+        "WHERE t_eval_us > 0 ORDER BY n_gpu_layers;")
+    rows = res.fetchall()
+finally:
+    con.close()
+
+if not rows:
+    # np.array([]) is one-dimensional, so the column indexing below would raise IndexError.
+    raise SystemExit("llama.sqlite contains no usable rows in llama_runs, nothing to plot")
+
+ts = np.array(rows)
+
+plt.plot(ts[:, 0], ts[:, 1])
+plt.xlim(0, 35)
+plt.ylim(0, 130)
+plt.title("7b q4_0, 3700X, 3200 MHz dual-channel RAM, RTX 3090")
+plt.xlabel("-ngl")
+plt.ylabel("Generated t/s")
+plt.savefig("benchmark.png", dpi=240)
+plt.show()