Skip to content

Commit 2a9c4cd

Browse files
author
ochafik
committed
Merge remote-tracking branch 'origin/master' into bins
2 parents 8cf8c12 + b61eb96 commit 2a9c4cd

File tree

8 files changed

+124
-87
lines changed

8 files changed

+124
-87
lines changed

CMakeLists.txt

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -402,12 +402,26 @@ if (LLAMA_CUBLAS)
402402
endif()
403403

404404
if (LLAMA_CUDA)
405-
cmake_minimum_required(VERSION 3.17)
405+
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
406406

407407
find_package(CUDAToolkit)
408408
if (CUDAToolkit_FOUND)
409409
message(STATUS "CUDA found")
410410

411+
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
412+
# 52 == lowest CUDA 12 standard
413+
# 60 == f16 CUDA intrinsics
414+
# 61 == integer CUDA intrinsics
415+
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
416+
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
417+
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
418+
else()
419+
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
420+
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
421+
endif()
422+
endif()
423+
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
424+
411425
enable_language(CUDA)
412426

413427
set(GGML_HEADERS_CUDA ggml-cuda.h)
@@ -472,21 +486,6 @@ if (LLAMA_CUDA)
472486
else()
473487
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
474488
endif()
475-
476-
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
477-
# 52 == lowest CUDA 12 standard
478-
# 60 == f16 CUDA intrinsics
479-
# 61 == integer CUDA intrinsics
480-
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
481-
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
482-
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
483-
else()
484-
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
485-
#set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
486-
endif()
487-
endif()
488-
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
489-
490489
else()
491490
message(WARNING "CUDA not found")
492491
endif()

common/json-schema-to-grammar.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ static std::string build_repetition(const std::string & item_rule, int min_items
4040
return result;
4141
}
4242

43-
const std::string SPACE_RULE = "\" \"?";
43+
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
4444

4545
struct BuiltinRule {
4646
std::string content;
@@ -57,7 +57,7 @@ std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
5757
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
5858
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
5959
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
60-
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
60+
{"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
6161
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
6262
{"null", {"\"null\" space", {}}},
6363
};

examples/json_schema_to_grammar.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,8 @@ def __init__(self, content: str, deps: list = None):
2929
self.content = content
3030
self.deps = deps or []
3131

32-
# whitespace is constrained to a single space char to prevent model "running away" in
33-
# whitespace. Also maybe improves generation quality?
34-
SPACE_RULE = '" "?'
32+
# Constraining spaces to prevent model "running away".
33+
SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
3534

3635
PRIMITIVE_RULES = {
3736
'boolean' : BuiltinRule('("true" | "false") space', []),
@@ -43,7 +42,7 @@ def __init__(self, content: str, deps: list = None):
4342
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
4443
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
4544
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
46-
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []),
45+
'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
4746
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
4847
'null' : BuiltinRule('"null" space', []),
4948
}

examples/server/public/json-schema-to-grammar.mjs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
2-
const SPACE_RULE = '" "?';
2+
const SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}';
33

44
function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
55
if (minItems === 0 && maxItems === 1) {
@@ -41,7 +41,7 @@ const PRIMITIVE_RULES = {
4141
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
4242
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
4343
uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []),
44-
char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []),
44+
char : new BuiltinRule(`[^"\\\\\\x7F\\x00-\\x1F] | [\\\\] (["\\\\bfnrt] | "u" [0-9a-fA-F]{4})`, []),
4545
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
4646
null : new BuiltinRule('"null" space', []),
4747
};

grammars/README.md

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ This guide provides a brief overview. Check out the GBNF files in this directory
9494
./llama-cli -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
9595
```
9696

97+
`llama.cpp` can also convert JSON schemas to grammars either ahead of time or at each request, see below.
98+
9799
## Troubleshooting
98100

99101
Grammars currently have performance gotchas (see https://github.com/ggerganov/llama.cpp/issues/4218).
@@ -103,3 +105,40 @@ Grammars currently have performance gotchas (see https://github.com/ggerganov/ll
103105
A common pattern is to allow repetitions of a pattern `x` up to N times.
104106

105107
While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) may result in extremely slow sampling. Instead, you can write `x{0,N}` (or `(x (x (x ... (x)?...)?)?)?` w/ N-deep nesting in earlier llama.cpp versions).
108+
109+
## Using GBNF grammars
110+
111+
You can use GBNF grammars:
112+
113+
- In the [server](../examples/server)'s completion endpoints, passed as the `grammar` body field
114+
- In the [main](../examples/main) CLI, passed as the `--grammar` & `--grammar-file` flags
115+
- With the [gbnf-validator](../examples/gbnf-validator) tool, to test them against strings.
116+
117+
## JSON Schemas → GBNF
118+
119+
`llama.cpp` supports converting a subset of https://json-schema.org/ to GBNF grammars:
120+
121+
- In the [server](../examples/server):
122+
- For any completion endpoints, passed as the `json_schema` body field
123+
- For the `/chat/completions` endpoint, passed inside the `response_format` body field (e.g. `{"type": "json_object", "schema": {"items": {}}}`)
124+
- In the [main](../examples/main) CLI, passed as the `--json` / `-j` flag
125+
- To convert to a grammar ahead of time:
126+
- in CLI, with [json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
127+
- in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
128+
129+
Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
130+
131+
Here is also a non-exhaustive list of **unsupported** features:
132+
133+
- `additionalProperties`: to be fixed in https://github.com/ggerganov/llama.cpp/pull/7840
134+
- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`
135+
- `integer` constraints to be implemented in https://github.com/ggerganov/llama.cpp/pull/7797
136+
- Remote `$ref`s in the C++ version (Python & JavaScript versions fetch https refs)
137+
- Mixing `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
138+
- `string` formats `uri`, `email`
139+
- [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains`
140+
- `uniqueItems`
141+
- `$anchor` (cf. [dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing))
142+
- [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not)
143+
- [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas`
144+
- [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)

grammars/json.gbnf

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@ array ::=
1616
string ::=
1717
"\"" (
1818
[^"\\\x7F\x00-\x1F] |
19-
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
19+
"\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes
2020
)* "\"" ws
2121

22-
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
22+
number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [1-9] [0-9]{0,15})? ws
2323

2424
# Optional space: by convention, applied in this grammar after literal chars when allowed
25-
ws ::= ([ \t\n] ws)?
25+
ws ::= | " " | "\n" [ \t]{0,20}

grammars/json_arr.gbnf

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@ array ::=
2525
string ::=
2626
"\"" (
2727
[^"\\\x7F\x00-\x1F] |
28-
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
28+
"\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes
2929
)* "\"" ws
3030

31-
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
31+
number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [1-9] [0-9]{0,15})? ws
3232

3333
# Optional space: by convention, applied in this grammar after literal chars when allowed
34-
ws ::= ([ \t\n] ws)?
34+
ws ::= | " " | "\n" [ \t]{0,20}

0 commit comments

Comments
 (0)