Temp #18

Merged
merged 16 commits on Nov 11, 2024

15 changes: 3 additions & 12 deletions .github/workflows/build.yml
@@ -62,16 +62,6 @@ env:
# LLAMA_LOG_TIMESTAMPS: 1

jobs:

# TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124

# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
# would be great if we fix these

# CUDA Release

ubuntu-latest-cmake:
@@ -120,7 +110,7 @@ jobs:
export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64 ${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON
cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
cmake --build . --config Release -j $(nproc)

- name: Determine tag name
@@ -202,7 +192,7 @@ jobs:
export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64 ${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON -DGGML_VULKAN=ON
cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON -DGGML_VULKAN=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
cmake --build . --config Release -j $(nproc)

- name: Determine tag name
@@ -241,6 +231,7 @@ jobs:
release:
permissions: write-all


if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

runs-on: ubuntu-latest
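
The CUDA and Vulkan release jobs now also enable libcurl and the RPC backend. A minimal sketch for reproducing the CUDA variant locally — the CUDA toolkit path and parallelism are assumptions about the build host:

```bash
# Sketch: reproduce the CI CUDA release build locally (toolkit path is an assumption).
export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
mkdir -p build && cd build
cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON \
         -DGGML_CUDA=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
cmake --build . --config Release -j "$(nproc)"
```
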
4 changes: 4 additions & 0 deletions Makefile
@@ -878,6 +878,10 @@ ifdef GGML_METAL
MK_CPPFLAGS += -DGGML_USE_METAL
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
OBJ_GGML += ggml/src/ggml-metal.o

ifdef GGML_METAL_USE_BF16
MK_CPPFLAGS += -DGGML_METAL_USE_BF16
endif # GGML_METAL_USE_BF16
ifdef GGML_METAL_NDEBUG
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
endif
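
A short sketch of exercising the new Makefile switch — it assumes a macOS host where the Metal backend is enabled by default and that llama-cli is the target of interest:

```bash
# Sketch: build with the new bfloat16 Metal path enabled.
# GGML_METAL_USE_BF16=1 triggers the -DGGML_METAL_USE_BF16 define added above.
make clean
make GGML_METAL_USE_BF16=1 llama-cli -j "$(sysctl -n hw.ncpu)"
```
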
4 changes: 3 additions & 1 deletion Package.swift
@@ -61,13 +61,15 @@ let package = Package(
name: "llama",
path: ".",
exclude: [
"build",
"cmake",
"examples",
"scripts",
"models",
"tests",
"CMakeLists.txt",
"Makefile"
"Makefile",
"ggml/src/ggml-metal-embed.metal"
],
sources: sources,
resources: resources,
2 changes: 1 addition & 1 deletion ci/run.sh
@@ -39,7 +39,7 @@ SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"

if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
fi

if [ ! -z ${GG_BUILD_CUDA} ]; then
12 changes: 5 additions & 7 deletions examples/chat-persistent.sh
@@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"

SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
'|'\
'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"

CTX_SIZE=2048
@@ -129,15 +130,12 @@ while read -e line; do

printf ' '

# HACK get num tokens from debug message
# TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
exit 1
fi

n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")

if ((n_tokens > CTX_ROTATE_POINT)); then
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
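
To illustrate how the single combined pattern replaces the two separate greps, here is a standalone sketch; the log lines are fabricated stand-ins for llama-cli output, not the real format:

```bash
# Sketch: one grep pass matches both messages, then awk sums the second
# "/"-separated field of each match (fabricated log lines for illustration).
log_sample='main: session file matches 120 / 2048
llama_perf: sampling time =     12.34 ms /   256 runs'

pattern='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
'|'\
'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'

msgs=$(grep -oE "$pattern" <<< "$log_sample")
n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$msgs")")
echo "$n_tokens"   # 2048 + 256 = 2304
```
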
3 changes: 3 additions & 0 deletions examples/llama-bench/llama-bench.cpp
@@ -256,6 +256,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
if (s == "f16") {
return GGML_TYPE_F16;
}
if (s == "bf16") {
return GGML_TYPE_BF16;
}
if (s == "q8_0") {
return GGML_TYPE_Q8_0;
}
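
With the new mapping, "bf16" is accepted wherever llama-bench parses a ggml type name. The flags below (-ctk/-ctv for the KV-cache types) and the model path are assumptions about the CLI, shown only as a sketch:

```bash
# Sketch (assumed flags and model path): benchmark with bf16 KV-cache types.
./llama-bench -m models/model.gguf -ctk bf16 -ctv bf16
```
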
106 changes: 97 additions & 9 deletions examples/server/public/index.html
@@ -200,23 +200,38 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
<div class="label">System Message</div>
<textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
</label>
<template v-for="key in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
<label class="input input-bordered flex items-center gap-2 mb-2">
<b>{{ key }}</b>
<input type="text" class="grow" :placeholder="'Default: ' + (configDefault[key] || 'none')" v-model="config[key]" />
</label>
<template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
<settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
</template>
<!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
<div class="collapse collapse-arrow bg-base-200 mb-2">
<input type="checkbox" />
<div class="collapse-title font-bold">Advanced config</div>
<!-- Section: Other sampler settings -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Other sampler settings</summary>
<div class="collapse-content">
<template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
<settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
</template>
</div>
</details>
<!-- Section: Penalties settings -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Penalties settings</summary>
<div class="collapse-content">
<template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
<settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
</template>
</div>
</details>
<!-- Section: Advanced config -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Advanced config</summary>
<div class="collapse-content">
<label class="form-control mb-2">
<div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
<textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
</label>
</div>
</div>
</details>
</div>

<!-- action buttons -->
@@ -229,6 +244,21 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
</dialog>
</div>

<!-- Template to be used by settings modal -->
<template id="settings-modal-numeric-input">
<label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
<!-- Show help message on hovering on the input label -->
<div class="dropdown dropdown-hover">
<div tabindex="0" role="button" class="font-bold">{{ configKey }}</div>
<div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
{{ configInfo[configKey] || '(no help message available)' }}
</div>
</div>
<!-- Here we forward v-model from parent to child component, see: https://stackoverflow.com/questions/47311936/v-model-and-child-components -->
<input type="text" class="grow" :placeholder="'Default: ' + (configDefault[configKey] || 'none')" :value="modelValue" @input="$emit('update:modelValue', $event.target.value)" />
</label>
</template>

<script src="./deps_markdown-it.js"></script>
<script type="module">
import { createApp, defineComponent, shallowRef, computed, h } from './deps_vue.esm-browser.js';
Expand All @@ -245,12 +275,48 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
systemMessage: 'You are a helpful assistant.',
// make sure these default values are in sync with `common.h`
temperature: 0.8,
dynatemp_range: 0.0,
dynatemp_exponent: 1.0,
top_k: 40,
top_p: 0.95,
min_p: 0.05,
xtc_probability: 0.0,
xtc_threshold: 0.1,
typical_p: 1.0,
repeat_last_n: 64,
repeat_penalty: 1.0,
presence_penalty: 0.0,
frequency_penalty: 0.0,
dry_multiplier: 0.0,
dry_base: 1.75,
dry_allowed_length: 2,
dry_penalty_last_n: -1,
max_tokens: -1,
custom: '', // custom json-stringified object
};
const CONFIG_INFO = {
apiKey: '',
systemMessage: 'The starting message that defines how the model should behave.',
temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
top_k: 'Keeps only k top tokens.',
top_p: 'Limits tokens to those that together have a cumulative probability of at least p',
min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
repeat_last_n: 'Last n tokens to consider for penalizing repetition',
repeat_penalty: 'Controls the repetition of token sequences in the generated text',
presence_penalty: 'Limits tokens based on whether they appear in the output or not.',
frequency_penalty: 'Limits tokens based on how often they appear in the output.',
dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
max_tokens: 'The maximum number of tokens per output.',
custom: '', // custom json-stringified object
};
// config keys having numeric value (i.e. temperature, top_k, top_p, etc)
const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
// list of themes supported by daisyui
@@ -269,6 +335,12 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
{ props: ["source", "options", "plugins"] }
);

// input field to be used by settings modal
const SettingsModalNumericInput = defineComponent({
template: document.getElementById('settings-modal-numeric-input').innerHTML,
props: ['configKey', 'configDefault', 'configInfo', 'modelValue'],
});

// conversations are stored in localStorage
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
// convId is a string prefixed with 'conv-'
@@ -359,6 +431,7 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
const mainApp = createApp({
components: {
VueMarkdown,
SettingsModalNumericInput,
},
data() {
return {
@@ -376,6 +449,7 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
// const
themes: THEMES,
configDefault: {...CONFIG_DEFAULT},
configInfo: {...CONFIG_INFO},
}
},
computed: {},
@@ -452,8 +526,22 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
stream: true,
cache_prompt: true,
temperature: this.config.temperature,
dynatemp_range: this.config.dynatemp_range,
dynatemp_exponent: this.config.dynatemp_exponent,
top_k: this.config.top_k,
top_p: this.config.top_p,
min_p: this.config.min_p,
typical_p: this.config.typical_p,
xtc_probability: this.config.xtc_probability,
xtc_threshold: this.config.xtc_threshold,
repeat_last_n: this.config.repeat_last_n,
repeat_penalty: this.config.repeat_penalty,
presence_penalty: this.config.presence_penalty,
frequency_penalty: this.config.frequency_penalty,
dry_multiplier: this.config.dry_multiplier,
dry_base: this.config.dry_base,
dry_allowed_length: this.config.dry_allowed_length,
dry_penalty_last_n: this.config.dry_penalty_last_n,
max_tokens: this.config.max_tokens,
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
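
The same newly exposed sampling fields can also be sent to the server without the web UI. A hedged curl sketch — endpoint, port, and values are illustrative assumptions, mirroring the defaults listed above:

```bash
# Sketch: POST the newly exposed sampling parameters directly to llama-server.
# Endpoint, port, and parameter values are illustrative assumptions.
curl -s http://localhost:8080/completion -H 'Content-Type: application/json' -d '{
  "prompt": "Hello",
  "n_predict": 64,
  "temperature": 0.8,
  "dynatemp_range": 0.0, "dynatemp_exponent": 1.0,
  "typical_p": 1.0,
  "xtc_probability": 0.0, "xtc_threshold": 0.1,
  "repeat_last_n": 64, "repeat_penalty": 1.0,
  "presence_penalty": 0.0, "frequency_penalty": 0.0,
  "dry_multiplier": 0.0, "dry_base": 1.75,
  "dry_allowed_length": 2, "dry_penalty_last_n": -1
}'
```
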
6 changes: 3 additions & 3 deletions flake.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
@@ -163,6 +163,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
3 changes: 3 additions & 0 deletions ggml/include/ggml.h
@@ -1746,6 +1746,9 @@ extern "C" {
struct ggml_tensor * a,
enum ggml_prec prec);

GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
const struct ggml_tensor * a);

// TODO: needs to be adapted to ggml_flash_attn_ext
GGML_API struct ggml_tensor * ggml_flash_attn_back(
struct ggml_context * ctx,
13 changes: 11 additions & 2 deletions ggml/src/CMakeLists.txt
@@ -68,6 +68,10 @@ if (GGML_METAL)
add_compile_definitions(GGML_METAL_NDEBUG)
endif()

if (GGML_METAL_USE_BF16)
add_compile_definitions(GGML_METAL_USE_BF16)
endif()

# copy ggml-common.h and ggml-metal.metal to bin directory
configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
@@ -1271,8 +1275,13 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
OUTPUT_VARIABLE POWER10_M)
string(FIND ${POWER10_M} "POWER10" substring_index)
if(${substring_index} GREATER_EQUAL 0)
list(APPEND ARCH_FLAGS -mcpu=power10)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
else()
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
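
The new branch simply greps /proc/cpuinfo for a POWER10 marker before falling back to the previous flags. A small sketch that roughly mirrors the selection logic on the build host:

```bash
# Sketch: roughly mirror the new CMake logic to see which -mcpu flag it would pick.
if grep -q POWER10 /proc/cpuinfo; then
    echo "-mcpu=power10"
elif [ "$(uname -m)" = "ppc64le" ]; then
    echo "-mcpu=powerpc64le"
else
    echo "-mcpu=native -mtune=native"
fi
```
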
3 changes: 3 additions & 0 deletions ggml/src/ggml-cuda.cu
@@ -3159,6 +3159,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
#ifndef FLASH_ATTN_AVAILABLE
return false;
#endif
if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
return false;
}
if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
return true;
}
2 changes: 1 addition & 1 deletion ggml/src/ggml-cuda/count-equal.cu
@@ -44,7 +44,7 @@ void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int");
const int64_t dne = GGML_PAD(ne / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
const int64_t dne = GGML_PAD((ne + 4*nsm - 1) / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);

CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream));

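
The only change here is switching a floor division to a ceiling division before padding: with the old form, the per-SM chunk length could be rounded down (to zero when ne < 4*nsm), leaving elements unprocessed, while the new form always covers all ne elements. A quick sketch with made-up sizes:

```bash
# Sketch (made-up sizes): floor vs ceiling division for the per-SM chunk length.
ne=100   # number of elements (assumed)
nsm=56   # number of streaming multiprocessors (assumed)
echo "floor:   $(( ne / (4 * nsm) ))"                  # 0 -> padded length stays 0, elements missed
echo "ceiling: $(( (ne + 4 * nsm - 1) / (4 * nsm) ))"  # 1 -> padded up, all elements covered
```
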