Skip to content

Commit 35e9ed4

Browse files
authored
Merge pull request #18 from apicalshark/temp
Temp
2 parents 91a01ce + 89329f7 commit 35e9ed4

File tree

21 files changed

+1396
-481
lines changed

21 files changed

+1396
-481
lines changed

.github/workflows/build.yml

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -62,16 +62,6 @@ env:
6262
# LLAMA_LOG_TIMESTAMPS: 1
6363

6464
jobs:
65-
66-
# TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
67-
# how to debug it.
68-
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
69-
70-
# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
71-
# how to debug it.
72-
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
73-
# would be great if we fix these
74-
7565
# CUDA Release
7666

7767
ubuntu-latest-cmake:
@@ -120,7 +110,7 @@ jobs:
120110
export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64 ${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
121111
mkdir build
122112
cd build
123-
cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON
113+
cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
124114
cmake --build . --config Release -j $(nproc)
125115
126116
- name: Determine tag name
@@ -202,7 +192,7 @@ jobs:
202192
export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64 ${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
203193
mkdir build
204194
cd build
205-
cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON -DGGML_VULKAN=ON
195+
cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON -DGGML_VULKAN=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
206196
cmake --build . --config Release -j $(nproc)
207197
208198
- name: Determine tag name
@@ -241,6 +231,7 @@ jobs:
241231
release:
242232
permissions: write-all
243233

234+
244235
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
245236

246237
runs-on: ubuntu-latest

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -878,6 +878,10 @@ ifdef GGML_METAL
878878
MK_CPPFLAGS += -DGGML_USE_METAL
879879
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
880880
OBJ_GGML += ggml/src/ggml-metal.o
881+
882+
ifdef GGML_METAL_USE_BF16
883+
MK_CPPFLAGS += -DGGML_METAL_USE_BF16
884+
endif # GGML_METAL_USE_BF16
881885
ifdef GGML_METAL_NDEBUG
882886
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
883887
endif

Package.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,13 +61,15 @@ let package = Package(
6161
name: "llama",
6262
path: ".",
6363
exclude: [
64+
"build",
6465
"cmake",
6566
"examples",
6667
"scripts",
6768
"models",
6869
"tests",
6970
"CMakeLists.txt",
70-
"Makefile"
71+
"Makefile",
72+
"ggml/src/ggml-metal-embed.metal"
7173
],
7274
sources: sources,
7375
resources: resources,

ci/run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ SRC=`pwd`
3939
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
4040

4141
if [ ! -z ${GG_BUILD_METAL} ]; then
42-
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
42+
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
4343
fi
4444

4545
if [ ! -z ${GG_BUILD_CUDA} ]; then

examples/chat-persistent.sh

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
2323
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
2424
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
2525

26-
SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
27-
SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
26+
SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
27+
'|'\
28+
'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
2829
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
2930

3031
CTX_SIZE=2048
@@ -129,15 +130,12 @@ while read -e line; do
129130

130131
printf ' '
131132

132-
# HACK get num tokens from debug message
133-
# TODO get both messages in one go
134-
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
135-
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
133+
if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
136134
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
137135
exit 1
138136
fi
139137

140-
n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
138+
n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")
141139

142140
if ((n_tokens > CTX_ROTATE_POINT)); then
143141
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"

examples/llama-bench/llama-bench.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
256256
if (s == "f16") {
257257
return GGML_TYPE_F16;
258258
}
259+
if (s == "bf16") {
260+
return GGML_TYPE_BF16;
261+
}
259262
if (s == "q8_0") {
260263
return GGML_TYPE_Q8_0;
261264
}

examples/server/public/index.html

Lines changed: 97 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -200,23 +200,38 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
200200
<div class="label">System Message</div>
201201
<textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
202202
</label>
203-
<template v-for="key in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
204-
<label class="input input-bordered flex items-center gap-2 mb-2">
205-
<b>{{ key }}</b>
206-
<input type="text" class="grow" :placeholder="'Default: ' + (configDefault[key] || 'none')" v-model="config[key]" />
207-
</label>
203+
<template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
204+
<settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
208205
</template>
209206
<!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
210-
<div class="collapse collapse-arrow bg-base-200 mb-2">
211-
<input type="checkbox" />
212-
<div class="collapse-title font-bold">Advanced config</div>
207+
<!-- Section: Other sampler settings -->
208+
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
209+
<summary class="collapse-title font-bold">Other sampler settings</summary>
210+
<div class="collapse-content">
211+
<template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
212+
<settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
213+
</template>
214+
</div>
215+
</details>
216+
<!-- Section: Penalties settings -->
217+
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
218+
<summary class="collapse-title font-bold">Penalties settings</summary>
219+
<div class="collapse-content">
220+
<template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
221+
<settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
222+
</template>
223+
</div>
224+
</details>
225+
<!-- Section: Advanced config -->
226+
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
227+
<summary class="collapse-title font-bold">Advanced config</summary>
213228
<div class="collapse-content">
214229
<label class="form-control mb-2">
215230
<div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
216231
<textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
217232
</label>
218233
</div>
219-
</div>
234+
</details>
220235
</div>
221236

222237
<!-- action buttons -->
@@ -229,6 +244,21 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
229244
</dialog>
230245
</div>
231246

247+
<!-- Template to be used by settings modal -->
248+
<template id="settings-modal-numeric-input">
249+
<label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
250+
<!-- Show help message on hovering on the input label -->
251+
<div class="dropdown dropdown-hover">
252+
<div tabindex="0" role="button" class="font-bold">{{ configKey }}</div>
253+
<div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
254+
{{ configInfo[configKey] || '(no help message available)' }}
255+
</div>
256+
</div>
257+
<!-- Here we forward v-model from parent to child component, see: https://stackoverflow.com/questions/47311936/v-model-and-child-components -->
258+
<input type="text" class="grow" :placeholder="'Default: ' + (configDefault[configKey] || 'none')" :value="modelValue" @input="$emit('update:modelValue', $event.target.value)" />
259+
</label>
260+
</template>
261+
232262
<script src="./deps_markdown-it.js"></script>
233263
<script type="module">
234264
import { createApp, defineComponent, shallowRef, computed, h } from './deps_vue.esm-browser.js';
@@ -245,12 +275,48 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
245275
systemMessage: 'You are a helpful assistant.',
246276
// make sure these default values are in sync with `common.h`
247277
temperature: 0.8,
278+
dynatemp_range: 0.0,
279+
dynatemp_exponent: 1.0,
248280
top_k: 40,
249281
top_p: 0.95,
250282
min_p: 0.05,
283+
xtc_probability: 0.0,
284+
xtc_threshold: 0.1,
285+
typical_p: 1.0,
286+
repeat_last_n: 64,
287+
repeat_penalty: 1.0,
288+
presence_penalty: 0.0,
289+
frequency_penalty: 0.0,
290+
dry_multiplier: 0.0,
291+
dry_base: 1.75,
292+
dry_allowed_length: 2,
293+
dry_penalty_last_n: -1,
251294
max_tokens: -1,
252295
custom: '', // custom json-stringified object
253296
};
297+
const CONFIG_INFO = {
298+
apiKey: '',
299+
systemMessage: 'The starting message that defines how model should behave.',
300+
temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
301+
dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
302+
dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
303+
top_k: 'Keeps only k top tokens.',
304+
top_p: 'Limits tokens to those that together have a cumulative probability of at least p',
305+
min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
306+
xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
307+
xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
308+
typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
309+
repeat_last_n: 'Last n tokens to consider for penalizing repetition',
310+
repeat_penalty: 'Controls the repetition of token sequences in the generated text',
311+
presence_penalty: 'Limits tokens based on whether they appear in the output or not.',
312+
frequency_penalty: 'Limits tokens based on how often they appear in the output.',
313+
dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
314+
dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
315+
dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
316+
dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
317+
max_tokens: 'The maximum number of tokens per output.',
318+
custom: '', // custom json-stringified object
319+
};
254320
// config keys having numeric value (i.e. temperature, top_k, top_p, etc)
255321
const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
256322
// list of themes supported by daisyui
@@ -269,6 +335,12 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
269335
{ props: ["source", "options", "plugins"] }
270336
);
271337

338+
// input field to be used by settings modal
339+
const SettingsModalNumericInput = defineComponent({
340+
template: document.getElementById('settings-modal-numeric-input').innerHTML,
341+
props: ['configKey', 'configDefault', 'configInfo', 'modelValue'],
342+
});
343+
272344
// conversations are stored in localStorage
273345
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
274346
// convId is a string prefixed with 'conv-'
@@ -359,6 +431,7 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
359431
const mainApp = createApp({
360432
components: {
361433
VueMarkdown,
434+
SettingsModalNumericInput,
362435
},
363436
data() {
364437
return {
@@ -376,6 +449,7 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
376449
// const
377450
themes: THEMES,
378451
configDefault: {...CONFIG_DEFAULT},
452+
configInfo: {...CONFIG_INFO},
379453
}
380454
},
381455
computed: {},
@@ -452,8 +526,22 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
452526
stream: true,
453527
cache_prompt: true,
454528
temperature: this.config.temperature,
529+
dynatemp_range: this.config.dynatemp_range,
530+
dynatemp_exponent: this.config.dynatemp_exponent,
455531
top_k: this.config.top_k,
456532
top_p: this.config.top_p,
533+
min_p: this.config.min_p,
534+
typical_p: this.config.typical_p,
535+
xtc_probability: this.config.xtc_probability,
536+
xtc_threshold: this.config.xtc_threshold,
537+
repeat_last_n: this.config.repeat_last_n,
538+
repeat_penalty: this.config.repeat_penalty,
539+
presence_penalty: this.config.presence_penalty,
540+
frequency_penalty: this.config.frequency_penalty,
541+
dry_multiplier: this.config.dry_multiplier,
542+
dry_base: this.config.dry_base,
543+
dry_allowed_length: this.config.dry_allowed_length,
544+
dry_penalty_last_n: this.config.dry_penalty_last_n,
457545
max_tokens: this.config.max_tokens,
458546
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
459547
...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),

flake.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
163163
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
164164
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
165165
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
166+
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
166167
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
167168
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
168169
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})

ggml/include/ggml.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1746,6 +1746,9 @@ extern "C" {
17461746
struct ggml_tensor * a,
17471747
enum ggml_prec prec);
17481748

1749+
GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
1750+
const struct ggml_tensor * a);
1751+
17491752
// TODO: needs to be adapted to ggml_flash_attn_ext
17501753
GGML_API struct ggml_tensor * ggml_flash_attn_back(
17511754
struct ggml_context * ctx,

ggml/src/CMakeLists.txt

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ if (GGML_METAL)
6868
add_compile_definitions(GGML_METAL_NDEBUG)
6969
endif()
7070

71+
if (GGML_METAL_USE_BF16)
72+
add_compile_definitions(GGML_METAL_USE_BF16)
73+
endif()
74+
7175
# copy ggml-common.h and ggml-metal.metal to bin directory
7276
configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
7377
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
@@ -1271,8 +1275,13 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
12711275
endif()
12721276
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
12731277
message(STATUS "PowerPC detected")
1274-
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
1275-
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
1278+
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
1279+
OUTPUT_VARIABLE POWER10_M)
1280+
string(FIND ${POWER10_M} "POWER10" substring_index)
1281+
if(${substring_index} GREATER_EQUAL 0)
1282+
list(APPEND ARCH_FLAGS -mcpu=power10)
1283+
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
1284+
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
12761285
else()
12771286
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
12781287
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)

ggml/src/ggml-cuda.cu

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3159,6 +3159,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
31593159
#ifndef FLASH_ATTN_AVAILABLE
31603160
return false;
31613161
#endif
3162+
if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
3163+
return false;
3164+
}
31623165
if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
31633166
return true;
31643167
}

ggml/src/ggml-cuda/count-equal.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
4444

4545
const int64_t ne = ggml_nelements(src0);
4646
GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int");
47-
const int64_t dne = GGML_PAD(ne / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
47+
const int64_t dne = GGML_PAD((ne + 4*nsm - 1) / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
4848

4949
CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream));
5050

0 commit comments

Comments
 (0)