#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>      // fixed-width integer types (uint64_t n-gram hashes)
#include <cstring>
#include <ctime>
#include <fstream>
#include <regex>
#include <sstream>
#include <string>
#include <system_error> // std::system_error thrown on cache-file open failure
#include <unordered_map>
#include <unordered_set>
#include <vector>
@@ -720,6 +722,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
720
722
break ;
721
723
}
722
724
params.lookup_cache_static = argv[i];
725
+ } else if (arg == " -lcd" || arg == " --lookup-cache-dynamic" ) {
726
+ if (++i >= argc) {
727
+ invalid_param = true ;
728
+ break ;
729
+ }
730
+ params.lookup_cache_dynamic = argv[i];
723
731
} else if (arg == " --save-all-logits" || arg == " --kl-divergence-base" ) {
724
732
if (++i >= argc) {
725
733
invalid_param = true ;
@@ -1100,7 +1108,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
1100
1108
printf (" -ld LOGDIR, --logdir LOGDIR\n " );
1101
1109
printf (" path under which to save YAML logs (no logging if unset)\n " );
1102
1110
printf (" -lcs FNAME, --lookup-cache-static FNAME\n " );
1103
- printf (" path to static lookup cache to use for lookup decoding\n " );
1111
+ printf (" path to static lookup cache to use for lookup decoding (not updated by generation)\n " );
1112
+ printf (" -lcd FNAME, --lookup-cache-dynamic FNAME\n " );
1113
+ printf (" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n " );
1104
1114
printf (" --override-kv KEY=TYPE:VALUE\n " );
1105
1115
printf (" advanced option to override model metadata by key. may be specified multiple times.\n " );
1106
1116
printf (" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n " );
@@ -1860,15 +1870,12 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
1860
1870
printf (" \n === Done dumping\n " );
1861
1871
}
1862
1872
1863
- void llama_ngram_cache_update (std::vector< llama_ngram_cache> & ncs , int ngram_min,
1873
+ void llama_ngram_cache_update (llama_ngram_cache & ngram_cache , int ngram_min, int ngram_max ,
1864
1874
std::vector<llama_token> & inp, int nnew, bool print_progress) {
1865
1875
const int64_t t_start_ms = ggml_time_ms ();
1866
- const int ngram_max = ngram_min + ncs.size ()-1 ;
1867
1876
const int inp_size = inp.size ();
1868
1877
1869
1878
for (int ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) {
1870
- llama_ngram_cache & nc = ncs[ngram_size - ngram_min];
1871
-
1872
1879
const int i_start = std::max (inp_size - nnew, ngram_size);
1873
1880
for (int i = i_start; i < inp_size; ++i) {
1874
1881
const int ngram_start = i - ngram_size;
@@ -1880,11 +1887,11 @@ void llama_ngram_cache_update(std::vector<llama_ngram_cache> & ncs, int ngram_mi
1880
1887
}
1881
1888
const llama_token token = inp[i];
1882
1889
1883
- llama_ngram_cache::iterator part_it = nc .find (ngram);
1884
- if (part_it == nc .end ()) {
1890
+ llama_ngram_cache::iterator part_it = ngram_cache .find (ngram);
1891
+ if (part_it == ngram_cache .end ()) {
1885
1892
llama_ngram_cache_part part;
1886
1893
part.emplace (token, 1 );
1887
- nc .emplace (ngram, part);
1894
+ ngram_cache .emplace (ngram, part);
1888
1895
} else {
1889
1896
llama_ngram_cache_part::iterator token_count_it = part_it->second .find (token);
1890
1897
if (token_count_it == part_it->second .end ()) {
@@ -1911,128 +1918,150 @@ static llama_token get_token(const std::vector<llama_token> & inp, const std::ve
1911
1918
};
1912
1919
1913
1920
// If sample size or percentage in context are below these thresholds the draft is aborted early:
1914
- constexpr int draft_min_sample_size[LLAMA_NGRAM_MAX] = { 2 , 2 , 1 , 1 };
1915
- constexpr int draft_min_percent[LLAMA_NGRAM_MAX] = {50 , 50 , 50 , 50 };
1921
+ constexpr int draft_min_sample_size_t1[LLAMA_NGRAM_MAX] = { 2 , 2 , 1 , 1 };
1922
+ constexpr int draft_min_percent_t1[LLAMA_NGRAM_MAX] = {66 , 50 , 50 , 50 };
1923
+ constexpr int draft_min_sample_size_t2[LLAMA_NGRAM_MAX] = { 4 , 3 , 2 , 2 };
1924
+ constexpr int draft_min_percent_t2[LLAMA_NGRAM_MAX] = {75 , 66 , 66 , 66 };
1916
1925
1917
- void llama_ngram_cache_draft (
1918
- std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft,
1919
- std::vector<llama_ngram_cache> & ncs_t1, int ngram_min, llama_ngram_cache & nc_t2
1920
- ) {
1921
- const int inp_size = inp. size ();
1922
- const int ngram_max = ngram_min + ncs_t1. size ()- 1 ;
1926
+ static llama_token try_draft (llama_ngram_cache & nc_primary, const uint64_t ngram_primary) {
1927
+ llama_ngram_cache::iterator part_primary_it = nc_primary. find (ngram_primary);
1928
+ if (part_primary_it == nc_primary. end ()) {
1929
+ return - 1 ;
1930
+ }
1931
+ const llama_ngram_cache_part part_primary = part_primary_it-> second ;
1923
1932
1924
- while ((int ) draft.size ()-1 < n_draft) {
1925
- bool draft_success = false ;
1933
+ int max_count_primary = 0 ;
1934
+ int sum_count_primary = 0 ;
1935
+ llama_token max_token = -1 ;
1926
1936
1927
- const int ngram_start_t2 = inp_size-2 + draft.size ()-1 ;
1928
- uint64_t ngram_t2 = get_token (inp, draft, ngram_start_t2);
1929
- for (int j = ngram_start_t2+1 ; j < ngram_start_t2 + 2 ; ++j) {
1930
- const uint64_t token = get_token (inp, draft, j);
1931
- ngram_t2 <<= 16 ;
1932
- ngram_t2 |= token;
1933
- }
1934
- llama_ngram_cache::iterator part_t2_it = nc_t2.find (ngram_t2);
1935
- llama_ngram_cache_part part_t2;
1936
- if (part_t2_it != nc_t2.end ()) {
1937
- part_t2 = part_t2_it->second ;
1937
+ for (std::pair<llama_token, int > token_count_primary : part_primary) {
1938
+ const llama_token token = token_count_primary.first ;
1939
+ const int32_t count_primary = token_count_primary.second ;
1940
+
1941
+ if (count_primary > max_count_primary) {
1942
+ max_token = token;
1943
+ max_count_primary = count_primary;
1938
1944
}
1945
+ sum_count_primary += count_primary;
1946
+ }
1939
1947
1940
- for (int ngram_size = ngram_max; ngram_size >= ngram_min; --ngram_size) {
1941
- if (ngram_size > inp_size) {
1942
- continue ;
1943
- }
1948
+ if (sum_count_primary < draft_min_sample_size_t1[2 -1 ]) {
1949
+ return -1 ;
1950
+ }
1951
+ if (100 *max_count_primary < draft_min_percent_t1[2 -1 ]*sum_count_primary) {
1952
+ return -1 ;
1953
+ }
1954
+ return max_token;
1955
+ }
1944
1956
1945
- llama_ngram_cache & nc_t1 = ncs_t1[ngram_size - ngram_min];
1957
+ static llama_token try_draft (
1958
+ llama_ngram_cache & nc_primary, const std::vector<uint64_t > & ngrams_primary, llama_ngram_cache_part & part_validate,
1959
+ const int * min_sample_size, const int * min_percent) {
1946
1960
1947
- const int ngram_start_t1 = inp_size-ngram_size + draft.size ()-1 ;
1948
- uint64_t ngram_t1 = get_token (inp, draft, ngram_start_t1);
1949
- for (int j = ngram_start_t1+1 ; j < ngram_start_t1 + ngram_size; ++j) {
1950
- const uint64_t token = get_token (inp, draft, j);
1951
- ngram_t1 <<= 16 ;
1952
- ngram_t1 |= token;
1953
- }
1961
+ llama_token drafted_token = -1 ;
1954
1962
1955
- llama_ngram_cache::iterator part_t1_it = nc_t1.find (ngram_t1);
1956
- if (part_t1_it == nc_t1.end ()) {
1957
- continue ;
1958
- }
1959
- const llama_ngram_cache_part part_t1 = part_t1_it->second ;
1963
+ for (int i = ngrams_primary.size ()-1 ; i >= 0 && drafted_token == -1 ; --i) {
1964
+ const uint64_t ngram_primary = ngrams_primary[i];
1960
1965
1961
- int max_count_t1 = 0 ;
1962
- int max_count_t2 = 0 ;
1963
- int sum_count_t1 = 0 ;
1964
- llama_token max_token = -1 ;
1966
+ llama_ngram_cache::iterator part_primary_it = nc_primary.find (ngram_primary);
1967
+ if (part_primary_it == nc_primary.end ()) {
1968
+ continue ;
1969
+ }
1970
+ const llama_ngram_cache_part part_primary = part_primary_it->second ;
1965
1971
1966
- for (std::pair<llama_token, int > token_count_t1 : part_t1) {
1967
- const llama_token token = token_count_t1.first ;
1972
+ int max_count_primary = 0 ;
1973
+ int max_count_validate = 0 ;
1974
+ int sum_count_primary = 0 ;
1975
+ llama_token max_token = -1 ;
1968
1976
1969
- llama_ngram_cache_part::iterator token_count_t2_it = part_t2.find (token);
1970
- const int32_t count_t1 = token_count_t1.second ;
1971
- const int32_t count_t2 = token_count_t2_it != part_t2.end () ? 100 *token_count_t2_it->second : 1 ;
1977
+ for (std::pair<llama_token, int > token_count_primary : part_primary) {
1978
+ const llama_token token = token_count_primary.first ;
1972
1979
1973
- if (count_t1*count_t2 > max_count_t1*max_count_t2) {
1974
- max_token = token;
1975
- max_count_t1 = count_t1;
1976
- max_count_t2 = count_t2;
1977
- }
1978
- sum_count_t1 += count_t1;
1979
- }
1980
- // Skip this candidate if the sample size is too low:
1981
- if (sum_count_t1 < draft_min_sample_size[ngram_size-1 ]) {
1982
- continue ;
1983
- }
1984
- // skip this candidate if the empirically most likely token following this token is not likely enough:
1985
- if (100 *max_count_t1 < draft_min_percent[ngram_size-1 ]*sum_count_t1) {
1986
- continue ;
1980
+ llama_ngram_cache_part::iterator token_count_validate_it = part_validate.find (token);
1981
+
1982
+ const int32_t count_primary = token_count_primary.second ;
1983
+ const int32_t count_validate = token_count_validate_it != part_validate.end () ? 100 *token_count_validate_it->second : 1 ;
1984
+
1985
+ if (count_primary*count_validate > max_count_primary*max_count_validate) {
1986
+ max_token = token;
1987
+ max_count_primary = count_primary;
1988
+ max_count_validate = count_validate;
1987
1989
}
1990
+ sum_count_primary += count_primary;
1991
+ }
1988
1992
1989
- LOG (" - draft candidate: token=%d count=%d\n " , max_token, max_count_t1);
1990
- draft.push_back (max_token);
1991
- draft_success = true ;
1992
- break ;
1993
+ if (sum_count_primary < min_sample_size[i]) {
1994
+ continue ;
1993
1995
}
1996
+ if (100 *max_count_primary < min_percent[i]*sum_count_primary) {
1997
+ continue ;;
1998
+ }
1999
+ drafted_token = max_token;
2000
+ }
1994
2001
1995
- if (!draft_success) {
1996
- int max_count_t2 = 0 ;
1997
- int sum_count_t2 = 0 ;
1998
- llama_token max_token = -1 ;
2002
+ return drafted_token;
2003
+ }
1999
2004
2000
- for (std::pair<llama_token, int > token_count_t2 : part_t2) {
2001
- const llama_token token = token_count_t2.first ;
2002
- const int32_t count_t2 = token_count_t2.second ;
2005
+ void llama_ngram_cache_draft (
2006
+ std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
2007
+ llama_ngram_cache & nc_t1, llama_ngram_cache & nc_t2, llama_ngram_cache & nc_t3
2008
+ ) {
2009
+ const int inp_size = inp.size ();
2003
2010
2004
- if (count_t2 > max_count_t2) {
2005
- max_token = token;
2006
- max_count_t2 = count_t2;
2007
- }
2008
- sum_count_t2 += count_t2;
2009
- }
2011
+ if (inp_size < 2 ) {
2012
+ return ;
2013
+ }
2010
2014
2011
- // Skip this candidate if the sample size is too low:
2012
- if (sum_count_t2 < draft_min_sample_size[2 -1 ]) {
2013
- break ;
2014
- }
2015
- // skip this candidate if the empirically most likely token following this token is not likely enough:
2016
- if (100 *max_count_t2 < draft_min_percent[2 -1 ]*sum_count_t2) {
2017
- break ;
2018
- }
2015
+ while ((int ) draft.size ()-1 < n_draft) {
2016
+ llama_token drafted_token = -1 ;
2019
2017
2020
- LOG (" - draft candidate: token=%d count=%d\n " , max_token, max_count_t2);
2021
- draft.push_back (max_token);
2022
- draft_success = true ;
2023
- break ;
2018
+ const int ngram_start_t23 = inp_size-2 + draft.size ()-1 ;
2019
+ uint64_t ngram_t23 = get_token (inp, draft, ngram_start_t23);
2020
+ for (int j = ngram_start_t23+1 ; j < ngram_start_t23 + 2 ; ++j) {
2021
+ const uint64_t token = get_token (inp, draft, j);
2022
+ ngram_t23 <<= 16 ;
2023
+ ngram_t23 |= token;
2024
+ }
2025
+ llama_ngram_cache::iterator part_t3_it = nc_t3.find (ngram_t23);
2026
+ llama_ngram_cache_part part_t3;
2027
+ if (part_t3_it != nc_t3.end ()) {
2028
+ part_t3 = part_t3_it->second ;
2024
2029
}
2025
2030
2026
- if (!draft_success) {
2031
+ std::vector<uint64_t > ngrams_t12;
2032
+ for (int ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) {
2033
+ const int ngram_start_t12 = inp_size-ngram_size + draft.size ()-1 ;
2034
+ uint64_t ngram_t12 = get_token (inp, draft, ngram_start_t12);
2035
+ for (int j = ngram_start_t12+1 ; j < ngram_start_t12 + ngram_size; ++j) {
2036
+ const uint64_t token = get_token (inp, draft, j);
2037
+ ngram_t12 <<= 16 ;
2038
+ ngram_t12 |= token;
2039
+ }
2040
+ ngrams_t12.push_back (ngram_t12);
2041
+ }
2042
+ if (drafted_token == -1 ) {
2043
+ drafted_token = try_draft (nc_t1, ngrams_t12, part_t3, draft_min_sample_size_t1, draft_min_percent_t1);
2044
+ }
2045
+ if (drafted_token == -1 ) {
2046
+ drafted_token = try_draft (nc_t2, ngrams_t12, part_t3, draft_min_sample_size_t2, draft_min_percent_t2);
2047
+ }
2048
+ if (drafted_token == -1 ) {
2049
+ drafted_token = try_draft (nc_t3, ngram_t23);
2050
+ }
2051
+
2052
+ if (drafted_token == -1 ) {
2027
2053
break ;
2028
2054
}
2055
+
2056
+ LOG (" - draft candidate: token=%d\n " , drafted_token);
2057
+ draft.push_back (drafted_token);
2029
2058
}
2030
2059
};
2031
2060
2032
- void llama_ngram_cache_save (std::vector< llama_ngram_cache> & ngram_cache, std::string & filename) {
2061
+ void llama_ngram_cache_save (llama_ngram_cache & ngram_cache, std::string & filename) {
2033
2062
GGML_ASSERT (ngram_cache.size () == 1 );
2034
2063
std::ofstream file_out (filename, std::ios::binary);
2035
- for (std::pair<uint64_t , llama_ngram_cache_part> item : ngram_cache[ 0 ] ) {
2064
+ for (std::pair<uint64_t , llama_ngram_cache_part> item : ngram_cache) {
2036
2065
const uint64_t ngram = item.first ;
2037
2066
llama_ngram_cache_part token_counts = item.second ;
2038
2067
GGML_ASSERT (!token_counts.empty ());
@@ -2054,8 +2083,7 @@ void llama_ngram_cache_save(std::vector<llama_ngram_cache> & ngram_cache, std::s
2054
2083
llama_ngram_cache llama_ngram_cache_load (std::string & filename) {
2055
2084
std::ifstream hashmap_file (filename, std::ios::binary);
2056
2085
if (!hashmap_file) {
2057
- fprintf (stderr, " error: failed to open file '%s'\n " , filename.c_str ());
2058
- exit (1 );
2086
+ throw std::system_error ();
2059
2087
}
2060
2088
llama_ngram_cache ngram_cache;
2061
2089
@@ -2084,3 +2112,29 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
2084
2112
2085
2113
return ngram_cache;
2086
2114
}
2115
+
2116
+ void llama_ngram_cache_merge (llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
2117
+ for (std::pair<uint64_t , llama_ngram_cache_part> ngram_part : ngram_cache_add) {
2118
+ const uint64_t ngram = ngram_part.first ;
2119
+ llama_ngram_cache_part part = ngram_part.second ;
2120
+
2121
+ llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find (ngram);
2122
+ if (part_merged_it == ngram_cache_target.end ()) {
2123
+ ngram_cache_target.emplace (ngram, part);
2124
+ continue ;
2125
+ }
2126
+
2127
+ for (std::pair<llama_token, int32_t > token_count : part) {
2128
+ const llama_token token = token_count.first ;
2129
+ const int32_t count = token_count.second ;
2130
+
2131
+ llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second .find (token);
2132
+ if (token_count_merged_it == part_merged_it->second .end ()) {
2133
+ part_merged_it->second .emplace (token, count);
2134
+ continue ;
2135
+ }
2136
+
2137
+ token_count_merged_it->second += count;
2138
+ }
2139
+ }
2140
+ }
0 commit comments