
Commit 5624a29

Merge branch 'ggerganov:master' into master

2 parents 0480362 + f5bfea0

File tree

15 files changed: +881 -298 lines changed

Makefile

Lines changed: 23 additions & 23 deletions
@@ -142,6 +142,28 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
     #CXXFLAGS += -mssse3
 endif
 
+ifneq ($(filter aarch64%,$(UNAME_M)),)
+    # Apple M1, M2, etc.
+    # Raspberry Pi 3, 4, Zero 2 (64-bit)
+    CFLAGS += -mcpu=native
+    CXXFLAGS += -mcpu=native
+endif
+
+ifneq ($(filter armv6%,$(UNAME_M)),)
+    # Raspberry Pi 1, Zero
+    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+endif
+
+ifneq ($(filter armv7%,$(UNAME_M)),)
+    # Raspberry Pi 2
+    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+endif
+
+ifneq ($(filter armv8%,$(UNAME_M)),)
+    # Raspberry Pi 3, 4, Zero 2 (32-bit)
+    CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+endif
+
 ifneq ($(filter ppc64%,$(UNAME_M)),)
     POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
     ifneq (,$(findstring POWER9,$(POWER9_M)))
@@ -270,28 +292,6 @@ ifdef LLAMA_METAL
     OBJS += ggml-metal.o
 endif # LLAMA_METAL
 
-ifneq ($(filter aarch64%,$(UNAME_M)),)
-    # Apple M1, M2, etc.
-    # Raspberry Pi 3, 4, Zero 2 (64-bit)
-    CFLAGS += -mcpu=native
-    CXXFLAGS += -mcpu=native
-endif
-
-ifneq ($(filter armv6%,$(UNAME_M)),)
-    # Raspberry Pi 1, Zero
-    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
-endif
-
-ifneq ($(filter armv7%,$(UNAME_M)),)
-    # Raspberry Pi 2
-    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
-endif
-
-ifneq ($(filter armv8%,$(UNAME_M)),)
-    # Raspberry Pi 3, 4, Zero 2 (32-bit)
-    CFLAGS += -mfp16-format=ieee -mno-unaligned-access
-endif
-
 ifdef LLAMA_METAL
 ggml-metal.o: ggml-metal.m ggml-metal.h
     $(CC) $(CFLAGS) -c $< -o $@
@@ -380,7 +380,7 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
     $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
 $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)

build.zig

Lines changed: 77 additions & 58 deletions
@@ -1,68 +1,87 @@
+// Compatible with Zig Version 0.11.0
 const std = @import("std");
-const commit_hash = @embedFile(".git/refs/heads/master");
+const Compile = std.Build.Step.Compile;
+const ConfigHeader = std.Build.Step.ConfigHeader;
+const Mode = std.builtin.Mode;
+const CrossTarget = std.zig.CrossTarget;
 
-// Zig Version: 0.11.0-dev.3986+e05c242cd
-pub fn build(b: *std.build.Builder) void {
-    const target = b.standardTargetOptions(.{});
-    const optimize = b.standardOptimizeOption(.{});
+const Maker = struct {
+    builder: *std.build.Builder,
+    target: CrossTarget,
+    optimize: Mode,
+    config_header: *ConfigHeader,
+
+    const cflags = .{"-std=c11"};
+    const cxxflags = .{"-std=c++11"};
+
+    fn init(builder: *std.build.Builder) Maker {
+        const commit_hash = @embedFile(".git/refs/heads/master");
+        const config_header = builder.addConfigHeader(
+            .{ .style = .blank, .include_path = "build-info.h" },
+            .{
+                .BUILD_NUMBER = 0,
+                .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
+            },
+        );
+        return Maker{
+            .builder = builder,
+            .target = builder.standardTargetOptions(.{}),
+            .optimize = builder.standardOptimizeOption(.{}),
+            .config_header = config_header,
+        };
+    }
 
-    const config_header = b.addConfigHeader(
-        .{ .style = .blank, .include_path = "build-info.h" },
-        .{
-            .BUILD_NUMBER = 0,
-            .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
-        },
-    );
+    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
+        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        if (std.mem.endsWith(u8, src, ".c")) {
+            o.addCSourceFiles(&.{src}, &cflags);
+            o.linkLibC();
+        } else {
+            o.addCSourceFiles(&.{src}, &cxxflags);
+            o.linkLibCpp();
+        }
+        o.addIncludePath(.{ .path = "." });
+        o.addIncludePath(.{ .path = "./examples" });
+        return o;
+    }
+
+    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
+        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        e.addIncludePath(.{ .path = "." });
+        e.addIncludePath(.{ .path = "./examples" });
+        e.addCSourceFiles(&.{src}, &cxxflags);
+        for (deps) |d| e.addObject(d);
+        e.linkLibC();
+        e.linkLibCpp();
+        e.addConfigHeader(m.config_header);
+        m.builder.installArtifact(e);
 
-    const lib = b.addStaticLibrary(.{
-        .name = "llama",
-        .target = target,
-        .optimize = optimize,
-    });
-    lib.linkLibC();
-    lib.linkLibCpp();
-    lib.addIncludePath(".");
-    lib.addIncludePath("./examples");
-    lib.addConfigHeader(config_header);
-    lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"});
-    lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"});
-    b.installArtifact(lib);
+        // Currently a bug is preventing correct linking for optimized builds for Windows:
+        // https://github.com/ziglang/zig/issues/15958
+        if (e.target.isWindows()) {
+            e.want_lto = false;
+        }
+        return e;
+    }
+};
 
-    const examples = .{
-        "main",
-        "baby-llama",
-        "embedding",
-        "metal",
-        "perplexity",
-        "quantize",
-        "quantize-stats",
-        "save-load-state",
-        "server",
-        "simple",
-        "train-text-from-scratch",
-    };
+pub fn build(b: *std.build.Builder) void {
+    const make = Maker.init(b);
 
-    inline for (examples) |example_name| {
-        const exe = b.addExecutable(.{
-            .name = example_name,
-            .target = target,
-            .optimize = optimize,
-        });
-        exe.addIncludePath(".");
-        exe.addIncludePath("./examples");
-        exe.addConfigHeader(config_header);
-        exe.addCSourceFiles(&.{
-            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }),
-            "examples/common.cpp",
-        }, &.{"-std=c++11"});
-        exe.linkLibrary(lib);
-        b.installArtifact(exe);
+    const ggml = make.obj("ggml", "ggml.c");
+    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
+    const llama = make.obj("llama", "llama.cpp");
+    const common = make.obj("common", "examples/common.cpp");
+    const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
 
-        const run_cmd = b.addRunArtifact(exe);
-        run_cmd.step.dependOn(b.getInstallStep());
-        if (b.args) |args| run_cmd.addArgs(args);
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama });
 
-        const run_step = b.step("run-" ++ example_name, "Run the app");
-        run_step.dependOn(&run_cmd.step);
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    if (server.target.isWindows()) {
+        server.linkSystemLibrary("ws2_32");
     }
 }

convert.py

Lines changed: 7 additions & 0 deletions
@@ -465,6 +465,13 @@ def to_ggml(self) -> 'GGMLQuantizedTensor':
     def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor':
         return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type)
 
+    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
+
+    def part(self, n_part: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
 
 GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]

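The two new helpers slice a fused tensor into equal thirds along its first axis; permute_part additionally applies the head permutation to the selected slice. Below is a minimal NumPy sketch of the same slicing arithmetic; the (3 * n_embd, n_embd) fused layout and the Q/K/V ordering are assumptions for illustration, not something convert.py guarantees:

    import numpy as np

    n_embd = 4  # toy embedding size
    fused = np.arange(3 * n_embd * n_embd, dtype=np.float32).reshape(3 * n_embd, n_embd)

    def part(ndarray: np.ndarray, n_part: int) -> np.ndarray:
        # Same arithmetic as the new part() method: take the n_part-th third along axis 0.
        r = ndarray.shape[0] // 3
        return ndarray[r * n_part : r * n_part + r, ...]

    q, k, v = (part(fused, i) for i in range(3))  # assumed Q/K/V order, illustration only
    print(q.shape, k.shape, v.shape)              # (4, 4) (4, 4) (4, 4)
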
examples/common.cpp

Lines changed: 9 additions & 2 deletions
@@ -203,6 +203,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.rope_freq_scale = std::stof(argv[i]);
+        } else if (arg == "--rope-scale") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_scale = 1.0f/std::stof(argv[i]);
         } else if (arg == "--memory-f32") {
             params.memory_f16 = false;
         } else if (arg == "--top-p") {
@@ -575,8 +581,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  --cfg-negative-prompt PROMPT \n");
    fprintf(stdout, "                        negative prompt to use for guidance. (default: empty)\n");
     fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
     fprintf(stdout, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stdout, "  --no-penalize-nl      do not penalize newline token\n");
     fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");

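The new --rope-scale flag is a convenience spelling of the existing --rope-freq-scale: a linear context-scaling factor N is stored internally as a frequency scale of 1/N. A small Python sketch of that relationship; the 2048-token base context used for the worked number is an assumption for illustration only:

    def rope_freq_scale_from_rope_scale(rope_scale: float) -> float:
        # Mirrors the new branch in gpt_params_parse(): "--rope-scale N" stores 1/N.
        return 1.0 / rope_scale

    base_ctx   = 2048   # assumed training context of the model, illustration only
    rope_scale = 4.0    # e.g. passing --rope-scale 4 on the command line

    print(rope_freq_scale_from_rope_scale(rope_scale))  # 0.25, same as --rope-freq-scale 0.25
    print(int(base_ctx * rope_scale))                   # 8192 tokens of stretched context
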
examples/console.cpp

Lines changed: 4 additions & 2 deletions
@@ -80,8 +80,10 @@ namespace console {
         // Set console input codepage to UTF16
         _setmode(_fileno(stdin), _O_WTEXT);
 
-        if (!simple_io) {
-            // Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
+        // Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
+        if (simple_io) {
+            dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
+        } else {
             dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
         }
         if (!SetConsoleMode(hConIn, dwMode)) {

examples/llama.vim

Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
+" Requires an already running llama.cpp server
+" To install either copy or symlink to ~/.vim/autoload/llama.vim
+" Then start with either :call llama#doLlamaGen(),
+" or add a keybind to your vimrc such as
+" nnoremap Z :call llama#doLlamaGen()<CR>
+" Similarly, you could add an insert mode keybind with
+" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
+"
+" g:llama_api_url and g:llama_overrides can be configured in your .vimrc
+" let g:llama_api_url = "192.168.1.10:8080"
+" llama_overrides can also be set through buffer/window scopes. For instance
+" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
+" could be added to your .vimrc to automatically set a lower temperature when
+" editing a python script
+" Additionally, an override dict can be stored at the top of a file
+" !*{"stop": ["User:"]}
+" could be added to the start of your chatlog.txt to set the stopping token
+" These parameter dicts are merged together from lowest to highest priority:
+" server default -> g:llama_overrides -> w:llama_overrides ->
+" b:llama_overrides -> in-file (!*) overrides
+"
+" Sublists (like logit_bias and stop) are overridden, not merged
+" Example override:
+" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
+if !exists("g:llama_api_url")
+  let g:llama_api_url = "127.0.0.1:8080"
+endif
+if !exists("g:llama_overrides")
+  let g:llama_overrides = {}
+endif
+const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
+const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
+let s:linedict = {}
+
+func s:callbackHandler(bufn, channel, msg)
+  if len(a:msg) < 3
+    return
+  elseif a:msg[0] == "d"
+    let l:msg = a:msg[6:-1]
+  else
+    let l:msg = a:msg
+  endif
+  let l:decoded_msg = json_decode(l:msg)
+  let l:newtext = split(l:decoded_msg['content'], "\n", 1)
+  if len(l:newtext) > 0
+    call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
+  else
+    echo "nothing genned"
+  endif
+  if len(newtext) > 1
+    let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
+    let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
+  endif
+  if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
+    echo "Finished generation"
+  endif
+endfunction
+
+func llama#doLlamaGen()
+  if exists("b:job")
+    if job_status(b:job) == "run"
+      call job_stop(b:job)
+      return
+    endif
+  endif
+
+  let l:cbuffer = bufnr("%")
+  let s:linedict[l:cbuffer] = line('$')
+  let l:buflines = getbufline(l:cbuffer, 1, 1000)
+  let l:querydata = copy(s:querydata)
+  call extend(l:querydata, g:llama_overrides)
+  if exists("w:llama_overrides")
+    call extend(l:querydata, w:llama_overrides)
+  endif
+  if exists("b:llama_overrides")
+    call extend(l:querydata, b:llama_overrides)
+  endif
+  if l:buflines[0][0:1] == '!*'
+    let l:userdata = json_decode(l:buflines[0][2:-1])
+    call extend(l:querydata, l:userdata)
+    let l:buflines = l:buflines[1:-1]
+  endif
+  let l:querydata.prompt = join(l:buflines, "\n")
+  let l:curlcommand = copy(s:curlcommand)
+  let l:curlcommand[2] = json_encode(l:querydata)
+  let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
+endfunction
+
+" Echoes the tokenization of the provided string, or cursor to end of word
+" Onus is placed on the user to include the preceding space
+func llama#tokenizeWord(...)
+  if (a:0 > 0)
+    let l:input = a:1
+  else
+    exe "normal \"*ye"
+    let l:input = @*
+  endif
+  let l:querydata = {"content": l:input}
+  let l:curlcommand = copy(s:curlcommand)
+  let l:curlcommand[2] = json_encode(l:querydata)
+  let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
+  let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
+endfunction
+
+func s:tokenizeWordCallback(plaintext, channel, msg)
+  echo '"' .. a:plaintext .. '" - ' .. string(json_decode(a:msg).tokens)
+endfunction
+
+
+" Echoes the token count of the entire buffer (or provided string)
+" Example usage :echo llama#tokenCount()
+func llama#tokenCount(...)
+  if (a:0 > 0)
+    let l:buflines = a:1
+  else
+    let l:buflines = getline(1,1000)
+    if l:buflines[0][0:1] == '!*'
+      let l:buflines = l:buflines[1:-1]
+    endif
+    let l:buflines = join(l:buflines, "\n")
+  endif
+  let l:querydata = {"content": l:buflines}
+  let l:curlcommand = copy(s:curlcommand)
+  let l:curlcommand[2] = json_encode(l:querydata)
+  let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
+  let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
+endfunction
+
+func s:tokenCountCallback(channel, msg)
+  let resp = json_decode(a:msg)
+  echo len(resp.tokens)
+endfunction

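For reference, the generation flow the plugin implements is: POST the merged parameter dict plus the buffer text as "prompt" to the server's /completion endpoint with "stream": true, then strip the leading "data: " prefix from each streamed line before JSON-decoding it (which is what s:callbackHandler's a:msg[6:-1] does). A rough Python sketch of the same client behaviour, assuming the default server address; the requests dependency is an assumption of this sketch, the plugin itself shells out to curl:

    import json
    import requests  # assumed convenience library; the plugin uses curl via job_start()

    def stream_completion(prompt: str, url: str = "http://127.0.0.1:8080/completion") -> str:
        payload = {"prompt": prompt, "n_predict": 256, "stream": True}
        pieces = []
        with requests.post(url, json=payload, stream=True) as resp:
            for line in resp.iter_lines(decode_unicode=True):
                if not line:
                    continue
                # Streamed lines look like: data: {"content": "...", "stop": false, ...}
                msg = json.loads(line[6:] if line.startswith("data: ") else line)
                pieces.append(msg.get("content", ""))
                if msg.get("stop"):
                    break
        return "".join(pieces)

    print(stream_completion("### System: You are helpful.\n### User: Hello\n"))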