Commit c8b7620

mscheong01 authored and ggerganov committed

examples : add "retrieval" (ggml-org#6193)

* add `retrieval` example
* add README
* minor fixes
* cast filepos on print
* remove use of variable sized array
* store similarities in separate vector
* print error on insufficient batch size
* fix error message printing
* assign n_batch value to n_ubatch
* fix param definitions
* define retrieval-only parameters in retrieval.cpp
* fix `--context-file` option to be provided multiple times for multiple files
* use vector for `query_emb`
* add usage description in README
* fix merge conflict
* fix usage printing
* remove seed setting
* fix lint
* increase file read buffer size
* retrieval : minor

Co-authored-by: Georgi Gerganov <[email protected]>

1 parent 025e2a9 commit c8b7620

File tree

10 files changed: +435 -2 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions

```diff
@@ -77,6 +77,7 @@ models-mnt
 /batched-bench
 /export-lora
 /finetune
+/retrieval
 /speculative
 /parallel
 /train-text-from-scratch
```

Makefile

Lines changed: 5 additions & 1 deletion

```diff
@@ -2,7 +2,7 @@
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -810,6 +810,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
```

common/common.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -157,7 +157,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return result;
 }

-static bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
+bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
     llama_sampling_params & sparams = params.sparams;

     if (arg == "-s" || arg == "--seed") {
```

common/common.h

Lines changed: 2 additions & 0 deletions

```diff
@@ -171,6 +171,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

 void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

+bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+
 std::string get_system_info(const gpt_params & params);

 std::string gpt_random_prompt(std::mt19937 & rng);
```
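Dropping `static` and declaring `gpt_params_find_arg` in the header lets an example binary handle its own flags first and fall back to the shared parser for everything else (the commit message notes that retrieval-only parameters are defined in `retrieval.cpp` itself). A minimal self-contained sketch of that pattern follows; `gpt_params`, the `gpt_params_find_arg` body, and `retrieval_params` here are simplified stand-ins, not the real llama.cpp definitions:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Simplified stand-in for the real gpt_params in common/common.h.
struct gpt_params {
    int seed = -1;
};

// Simplified stand-in for the shared parser exported by this commit:
// returns true if it recognized `arg`, advancing `i` past any consumed value.
static bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg,
                                gpt_params & params, int & i, bool & invalid_param) {
    if (arg == "-s" || arg == "--seed") {
        if (++i >= argc) { invalid_param = true; return true; }
        params.seed = std::stoi(argv[i]);
        return true;
    }
    return false; // not a common flag
}

// Example-only parameters, defined in the example source itself.
struct retrieval_params {
    std::vector<std::string> context_files;
    int chunk_size = 64;
};

// Try example-specific flags first, then fall back to the common parser.
static bool parse_args(int argc, char ** argv, gpt_params & gparams, retrieval_params & rparams) {
    bool invalid_param = false;
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "--context-file") {
            if (++i >= argc) return false;
            rparams.context_files.push_back(argv[i]); // may be given multiple times
        } else if (arg == "--chunk-size") {
            if (++i >= argc) return false;
            rparams.chunk_size = std::stoi(argv[i]);
        } else if (!gpt_params_find_arg(argc, argv, arg, gparams, i, invalid_param)) {
            return false; // unknown option
        }
        if (invalid_param) return false;
    }
    return true;
}
```

The design choice is to keep `common/` free of per-example options while still reusing its parsing of the shared flags.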

common/log.h

Lines changed: 1 addition & 0 deletions

```diff
@@ -566,6 +566,7 @@ inline void log_print_usage()
     printf(" --log-new              Create a separate new log file on start. "
            "Each log file will have unique name: \"<name>.<ID>.log\"\n");
     printf(" --log-append           Don't truncate the old log file.\n");
+    printf("\n");
 }

 #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
```

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -34,6 +34,7 @@ else()
     add_subdirectory(perplexity)
     add_subdirectory(quantize)
     add_subdirectory(quantize-stats)
+    add_subdirectory(retrieval)
    add_subdirectory(save-load-state)
     add_subdirectory(simple)
     add_subdirectory(passkey)
```

examples/retrieval/CMakeLists.txt

Lines changed: 5 additions & 0 deletions

```diff
@@ -0,0 +1,5 @@
+set(TARGET retrieval)
+add_executable(${TARGET} retrieval.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
```

examples/retrieval/README.md

Lines changed: 69 additions & 0 deletions

````diff
@@ -0,0 +1,69 @@
+# llama.cpp/examples/retrieval
+
+Demonstration of a simple retrieval technique based on cosine similarity
+
+More info:
+https://github.com/ggerganov/llama.cpp/pull/6193
+
+### How to use
+
+`retrieval.cpp` has parameters of its own:
+- `--context-file`: file to be embedded - state this option multiple times to embed multiple files
+- `--chunk-size`: minimum size of each text chunk to be embedded
+- `--chunk-separator`: STRING to divide chunks by. newline by default
+
+The `retrieval` example can be tested as follows:
+
+```bash
+make -j && ./retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .
+```
+
````
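The `--chunk-size`/`--chunk-separator` behavior described in the README can be sketched as follows: split the text on the separator, then merge pieces until each chunk reaches the minimum size. This is an illustrative stand-in (`chunk_text` is a hypothetical name), not the actual code in `retrieval.cpp`:

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Split `text` on `separator`, then greedily merge the pieces until each
// chunk is at least `chunk_size` characters (the last chunk may be shorter).
static std::vector<std::string> chunk_text(const std::string & text,
                                           const std::string & separator,
                                           size_t chunk_size) {
    std::vector<std::string> chunks;
    std::string current;
    size_t pos = 0;
    while (pos < text.size()) {
        size_t next = text.find(separator, pos);
        if (next == std::string::npos) next = text.size();
        current += text.substr(pos, next - pos);
        pos = next + separator.size();
        if (current.size() >= chunk_size) {
            chunks.push_back(current);
            current.clear();
        } else if (pos < text.size()) {
            current += separator; // keep the separator inside an unfinished chunk
        }
    }
    if (!current.empty()) chunks.push_back(current);
    return chunks;
}
```

With `chunk_text("aaaa.bbbb.cc", ".", 5)` this yields two chunks, `"aaaa.bbbb"` and `"cc"`, matching the "minimum size" wording above.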
````diff
+This chunks and embeds all given files and starts a loop requesting query inputs:
+
+```
+Enter query:
+```
+
+On each query input, the top k chunks are shown along with the file name, the chunk position within the file, and the original text:
+
+```
+Enter query: describe the mit license
+batch_decode: n_tokens = 6, n_seq = 1
+Top 3 similar chunks:
+filename: README.md
+filepos: 119
+similarity: 0.762334
+textdata:
+png)
+
+[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+
+[Roadmap](https://github.
+--------------------
+filename: License
+filepos: 0
+similarity: 0.725146
+textdata:
+MIT License
+
+Copyright (c) 2023 Georgi Gerganov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+--------------------
+filename: README.md
+filepos: 9178
+similarity: 0.621722
+textdata:
+com/cztomsik/ava) (MIT)
+- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
+- [pythops/tenere](https://github.
+--------------------
+```
````
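The ranking in the output above is the cosine-similarity retrieval the README names: score every chunk embedding against the query embedding, sort, and keep the top k. A simplified self-contained sketch (not the example's actual code; real embeddings come from the model):

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <utility>
#include <vector>

// Cosine similarity between two embedding vectors of equal length.
static float cosine_similarity(const std::vector<float> & a, const std::vector<float> & b) {
    float dot = 0.0f, na = 0.0f, nb = 0.0f;
    for (size_t i = 0; i < a.size(); i++) {
        dot += a[i] * b[i];
        na  += a[i] * a[i];
        nb  += b[i] * b[i];
    }
    // Small epsilon guards against division by zero for all-zero vectors.
    return dot / (std::sqrt(na) * std::sqrt(nb) + 1e-6f);
}

// Score every chunk embedding against the query embedding and return the
// indices of the top-k most similar chunks, best first.
static std::vector<size_t> top_k_chunks(const std::vector<float> & query,
                                        const std::vector<std::vector<float>> & chunks,
                                        size_t k) {
    std::vector<std::pair<float, size_t>> scored;
    for (size_t i = 0; i < chunks.size(); i++) {
        scored.emplace_back(cosine_similarity(query, chunks[i]), i);
    }
    std::sort(scored.begin(), scored.end(),
              [](const std::pair<float, size_t> & x, const std::pair<float, size_t> & y) {
                  return x.first > y.first;
              });
    std::vector<size_t> idx;
    for (size_t i = 0; i < std::min(k, scored.size()); i++) {
        idx.push_back(scored[i].second);
    }
    return idx;
}
```

Sorting all scores is O(n log n); for large corpora a partial sort or a heap of size k would do, but for an example-sized corpus the simple sort is clearest.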
