
Commit 05bfb76

Merge branch 'master' into ci-build-cross
2 parents ef7376e + af04481 commit 05bfb76


79 files changed: +5903 −1312 lines

.github/workflows/build.yml

Lines changed: 29 additions & 0 deletions
@@ -679,6 +679,35 @@ jobs:
             -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
 
+  macOS-latest-cmake-visionos:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=visionOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+
   macOS-latest-swift:
     runs-on: macos-latest

build-xcframework.sh

Lines changed: 4 additions & 4 deletions
@@ -432,8 +432,8 @@ cmake -B build-visionos -G Xcode \
     -DCMAKE_SYSTEM_NAME=visionOS \
     -DCMAKE_OSX_SYSROOT=xros \
     -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
     -S .
 cmake --build build-visionos --config Release -- -quiet
 
@@ -445,8 +445,8 @@ cmake -B build-visionos-sim -G Xcode \
     -DCMAKE_SYSTEM_NAME=visionOS \
     -DCMAKE_OSX_SYSROOT=xrsimulator \
     -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
     -S .
 cmake --build build-visionos-sim --config Release -- -quiet
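With the `-Du_int`/`-Du_char`/`-Du_short` workarounds dropped, the visionOS slices now build with plain `_XOPEN_SOURCE=700`; a minimal sketch of rebuilding the XCFramework to check this locally (assumes a Mac with Xcode and the visionOS SDKs installed):

```bash
# assumption: run from the llama.cpp repository root on macOS with the visionOS SDKs available
./build-xcframework.sh
```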

convert_hf_to_gguf.py

Lines changed: 236 additions & 55 deletions
Large diffs are not rendered by default.

docs/backend/SYCL.md

Lines changed: 14 additions & 3 deletions
@@ -237,6 +237,15 @@ cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENAB
 cmake --build buildWithCublas --config Release
 ```
 
+**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
+
+```sh
+git clone https://github.com/oneapi-src/oneDNN.git
+cd oneDNN
+cmake -GNinja -Bbuild-nvidia -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP -DDNNL_GPU_VENDOR=NVIDIA -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake --build build-nvidia --config Release
+```
+
 - **Adding support to AMD GPUs**
 
   **oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
@@ -327,10 +336,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
 GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
 
 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
 
 # Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
 
 # build all binary
 cmake --build build --config Release -j -v
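The `-DDNNL_DIR=...` value added above points into an install tree under the oneDNN build directory; a minimal sketch of producing that tree, assuming the prefix location (the doc itself does not spell out the install step):

```sh
# assumption: install the freshly built oneDNN into build-nvidia/install so that
# DNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl resolves
cmake --install build-nvidia --prefix "$(pwd)/build-nvidia/install"
```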
@@ -660,8 +669,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 |--------------------|---------------------------------------|---------------------------------------------|
 | GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
 | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
-| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
+| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
 | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
+| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
 | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
 
@@ -671,6 +681,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
 | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
+| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
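`GGML_SYCL_DISABLE_GRAPH` is a runtime environment variable, so the new SYCL Graphs path can be A/B tested without rebuilding; a minimal sketch (binary and model paths are assumptions):

```sh
# default run: SYCL Graphs enabled (GGML_SYCL_GRAPH=ON build)
./build/bin/llama-bench -m models/model.gguf

# assumption: identical run with SYCL Graphs disabled, for comparison
GGML_SYCL_DISABLE_GRAPH=1 ./build/bin/llama-bench -m models/model.gguf
```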

examples/main/README.md

Lines changed: 36 additions & 5 deletions
@@ -27,12 +27,24 @@ Once downloaded, place your model in the models folder in llama.cpp.
 ##### Input prompt (One-and-done)
 
 ```bash
-./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
+./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time"
 ```
 ##### Conversation mode (Allow for continuous interaction with the model)
 
 ```bash
-./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
+./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma
+```
+
+##### Conversation mode using built-in jinja chat template
+
+```bash
+./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja
+```
+
+##### One-and-done query using jinja with custom system prompt and a starting prompt
+
+```bash
+./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello"
 ```
 
 ##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
@@ -44,12 +56,24 @@ Once downloaded, place your model in the models folder in llama.cpp.
 
 ##### Input prompt (One-and-done)
 ```powershell
-./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
+./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time"
 ```
 ##### Conversation mode (Allow for continuous interaction with the model)
 
 ```powershell
-./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
+./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma
+```
+
+##### Conversation mode using built-in jinja chat template
+
+```powershell
+./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja
+```
+
+##### One-and-done query using jinja with custom system prompt and a starting prompt
+
+```powershell
+./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello"
 ```
 
 #### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
@@ -77,6 +101,8 @@ The `llama-cli` program provides several ways to interact with the LLaMA models
 
 - `--prompt PROMPT`: Provide a prompt directly as a command-line option.
 - `--file FNAME`: Provide a file containing a prompt or multiple prompts.
+- `--system-prompt PROMPT`: Provide a system prompt (will otherwise use the default one in the chat template (if provided)).
+- `--system-prompt-file FNAME`: Provide a file containing a system prompt.
 - `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
 
 ## Interaction
@@ -89,7 +115,10 @@ In interactive mode, users can participate in text generation by injecting their
 
 - `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
 - `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
-- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: false)
+- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default or provided chat template) (default: true if chat template found)
+- `-no-cnv`: Disable conversation mode (default: false)
+- `-st, --single-turn`: Only process a single conversation turn (user input) and then exit.
+- `--jinja`: Enable jinja chat template parser, will use the model's built-in template or a user-provided one (default: false)
 - `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
 
 By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
@@ -125,6 +154,8 @@ When --in-prefix or --in-suffix options are enabled the chat template ( --chat-t
 
 Example usage: `--chat-template gemma`
 
+`--chat-template-file FNAME`: Load a custom jinja chat template from an external file, useful if the model contains outdated or incompatible template, some examples can be found in models/templates. Up-to-date chat templates can be downloaded from Hugging Face using scripts/get_chat_template.py
+
 ## Context Management
 
 During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.
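A minimal sketch combining `--jinja` with the new `--chat-template-file` option (the template filename under `models/templates/` is an assumption for illustration; any compatible `.jinja` file works):

```bash
# assumption: a Gemma-style jinja template stored in models/templates/
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja --chat-template-file models/templates/google-gemma-2-9b-it.jinja
```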

examples/server/public/index.html.gz

90 Bytes
Binary file not shown.

examples/server/server.cpp

Lines changed: 4 additions & 4 deletions
@@ -1872,6 +1872,10 @@ struct server_context {
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel   = 1;
 
+            // force F16 KV cache for the draft model for extra performance
+            params_dft.cache_type_k = GGML_TYPE_F16;
+            params_dft.cache_type_v = GGML_TYPE_F16;
+
             llama_init_dft = common_init_from_params(params_dft);
 
             model_dft = llama_init_dft.model.get();
@@ -1892,10 +1896,6 @@ struct server_context {
             cparams_dft = common_context_params_to_llama(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
 
-            // force F16 KV cache for the draft model for extra performance
-            cparams_dft.type_k = GGML_TYPE_F16;
-            cparams_dft.type_v = GGML_TYPE_F16;
-
             // the context is not needed - we will create one for each slot
             llama_init_dft.context.reset();
         }
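The F16 override now lives on the draft model's load parameters rather than on its context parameters, so it applies to every context later created for that model. A minimal launch sketch exercising this path (model paths are assumptions; `-md` selects the draft model used for speculative decoding):

```bash
# assumption: a target/draft model pair for speculative decoding;
# the draft model's KV cache is forced to F16 regardless of -ctk/-ctv
./llama-server -m models/target-model.Q4_K_M.gguf -md models/draft-model.Q4_K_M.gguf
```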

examples/server/webui/src/components/ChatScreen.tsx

Lines changed: 53 additions & 25 deletions
@@ -99,13 +99,9 @@ export default function ChatScreen() {
     canvasData,
     replaceMessageAndGenerate,
   } = useAppContext();
-  const [inputMsg, setInputMsg] = useState(prefilledMsg.content());
-  const inputRef = useRef<HTMLTextAreaElement>(null);
+  const textarea = useOptimizedTextarea(prefilledMsg.content());
 
-  const { extraContext, clearExtraContext } = useVSCodeContext(
-    inputRef,
-    setInputMsg
-  );
+  const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
   // TODO: improve this when we have "upload file" feature
   const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined;
 
@@ -135,9 +131,10 @@ export default function ChatScreen() {
   };
 
   const sendNewMessage = async () => {
-    if (inputMsg.trim().length === 0 || isGenerating(currConvId ?? '')) return;
-    const lastInpMsg = inputMsg;
-    setInputMsg('');
+    const lastInpMsg = textarea.value();
+    if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? ''))
+      return;
+    textarea.setValue('');
     scrollToBottom(false);
     setCurrNodeId(-1);
     // get the last message node
@@ -146,13 +143,13 @@ export default function ChatScreen() {
       !(await sendMessage(
         currConvId,
         lastMsgNodeId,
-        inputMsg,
+        lastInpMsg,
         currExtra,
         onChunk
       ))
     ) {
       // restore the input message if failed
-      setInputMsg(lastInpMsg);
+      textarea.setValue(lastInpMsg);
     }
     // OK
     clearExtraContext();
@@ -195,16 +192,13 @@ export default function ChatScreen() {
       // send the prefilled message if needed
      sendNewMessage();
     } else {
-      // otherwise, focus on the input and move the cursor to the end
-      if (inputRef.current) {
-        inputRef.current.focus();
-        inputRef.current.selectionStart = inputRef.current.value.length;
-      }
+      // otherwise, focus on the input
+      textarea.focus();
     }
     prefilledMsg.clear();
     // no need to keep track of sendNewMessage
     // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [inputRef]);
+  }, [textarea.ref]);
 
   // due to some timing issues of StorageUtils.appendMsg(), we need to make sure the pendingMsg is not duplicated upon rendering (i.e. appears once in the saved conversation and once in the pendingMsg)
   const pendingMsgDisplay: MessageDisplay[] =
@@ -258,9 +252,7 @@ export default function ChatScreen() {
           <textarea
             className="textarea textarea-bordered w-full"
             placeholder="Type a message (Shift+Enter to add a new line)"
-            ref={inputRef}
-            value={inputMsg}
-            onChange={(e) => setInputMsg(e.target.value)}
+            ref={textarea.ref}
             onKeyDown={(e) => {
              if (e.nativeEvent.isComposing || e.keyCode === 229) return;
              if (e.key === 'Enter' && e.shiftKey) return;
@@ -280,11 +272,7 @@ export default function ChatScreen() {
              Stop
            </button>
          ) : (
-            <button
-              className="btn btn-primary ml-2"
-              onClick={sendNewMessage}
-              disabled={inputMsg.trim().length === 0}
-            >
+            <button className="btn btn-primary ml-2" onClick={sendNewMessage}>
              Send
            </button>
          )}
@@ -298,3 +286,43 @@ export default function ChatScreen() {
     </div>
   );
 }
+
+export interface OptimizedTextareaValue {
+  value: () => string;
+  setValue: (value: string) => void;
+  focus: () => void;
+  ref: React.RefObject<HTMLTextAreaElement>;
+}
+
+// This is a workaround to prevent the textarea from re-rendering when the inner content changes
+// See https://github.com/ggml-org/llama.cpp/pull/12299
+function useOptimizedTextarea(initValue: string): OptimizedTextareaValue {
+  const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
+  const textareaRef = useRef<HTMLTextAreaElement>(null);
+
+  useEffect(() => {
+    if (textareaRef.current && savedInitValue) {
+      textareaRef.current.value = savedInitValue;
+      setSavedInitValue('');
+    }
+  }, [textareaRef, savedInitValue, setSavedInitValue]);
+
+  return {
+    value: () => {
+      return textareaRef.current?.value ?? savedInitValue;
+    },
+    setValue: (value: string) => {
+      if (textareaRef.current) {
+        textareaRef.current.value = value;
+      }
+    },
+    focus: () => {
+      if (textareaRef.current) {
+        // focus and move the cursor to the end
+        textareaRef.current.focus();
+        textareaRef.current.selectionStart = textareaRef.current.value.length;
+      }
+    },
+    ref: textareaRef,
+  };
+}

examples/server/webui/src/utils/llama-vscode.ts

Lines changed: 5 additions & 7 deletions
@@ -1,5 +1,6 @@
 import { useEffect, useState } from 'react';
 import { MessageExtraContext } from './types';
+import { OptimizedTextareaValue } from '../components/ChatScreen';
 
 // Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
 // Ref: https://github.com/ggml-org/llama.cpp/pull/11940
@@ -14,10 +15,7 @@ interface SetTextEvData {
  * window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n return 123' }, '*');
  */
 
-export const useVSCodeContext = (
-  inputRef: React.RefObject<HTMLTextAreaElement>,
-  setInputMsg: (text: string) => void
-) => {
+export const useVSCodeContext = (textarea: OptimizedTextareaValue) => {
   const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
     null
   );
@@ -27,20 +25,20 @@ export const useVSCodeContext = (
     const handleMessage = (event: MessageEvent) => {
       if (event.data?.command === 'setText') {
         const data: SetTextEvData = event.data;
-        setInputMsg(data?.text);
+        textarea.setValue(data?.text);
         if (data?.context && data.context.length > 0) {
           setExtraContext({
             type: 'context',
            content: data.context,
           });
         }
-        inputRef.current?.focus();
+        textarea.focus();
       }
     };
 
     window.addEventListener('message', handleMessage);
     return () => window.removeEventListener('message', handleMessage);
-  }, [inputRef, setInputMsg]);
+  }, [textarea]);
 
   // Add a keydown listener that sends the "escapePressed" message to the parent window
   useEffect(() => {
