
Commit d88d66b

Merge branch 'master' into hp/split/load-model-from-url
2 parents 0741014 + 1943c01

7 files changed, +548 −28 lines changed

.github/workflows/build.yml

Lines changed: 114 additions & 0 deletions
@@ -21,6 +21,118 @@ env:
   GGML_N_THREADS: 1
 
 jobs:
+  macOS-latest-cmake-arm64:
+    runs-on: macos-14
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          mkdir build
+          cd build
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON ..
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp LICENSE ./build/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
+
+  macOS-latest-cmake-x64:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          mkdir build
+          cd build
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON ..
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp LICENSE ./build/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
+
   ubuntu-focal-make:
     runs-on: ubuntu-20.04

@@ -748,6 +860,8 @@ jobs:
       - macOS-latest-cmake
       - windows-latest-cmake
       - windows-latest-cmake-cublas
+      - macOS-latest-cmake-arm64
+      - macOS-latest-cmake-x64
 
     steps:
       - name: Clone

common/common.cpp

Lines changed: 3 additions & 0 deletions
@@ -1591,6 +1591,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
     if (s == "q4_1") {
         return GGML_TYPE_Q4_1;
     }
+    if (s == "iq4_nl") {
+        return GGML_TYPE_IQ4_NL;
+    }
     if (s == "q5_0") {
         return GGML_TYPE_Q5_0;
     }

examples/llama-bench/llama-bench.cpp

Lines changed: 3 additions & 0 deletions
@@ -249,6 +249,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
     if (s == "q5_1") {
         return GGML_TYPE_Q5_1;
     }
+    if (s == "iq4_nl") {
+        return GGML_TYPE_IQ4_NL;
+    }
 
     return GGML_TYPE_COUNT;
 }
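
A side note, not part of the diff: both hunks above extend the same kind of plain string-to-ggml_type lookup, so the spelling "iq4_nl" is now accepted wherever those parsers are used (the KV-cache type string in common.cpp, and the type names that llama-bench parses). The stand-in below is a minimal, hypothetical sketch of that pattern, assuming ggml.h is on the include path (it defines GGML_TYPE_IQ4_NL); type_from_name_sketch is an illustrative helper, not a function from the repository.

    // Illustrative only: mirrors the lookup pattern that the two hunks above extend.
    // kv_cache_type_from_str (common.cpp) and ggml_type_from_name (llama-bench) are
    // both simple name -> ggml_type chains; this commit teaches them the "iq4_nl" name.
    #include <cstdio>
    #include <string>
    #include "ggml.h"

    static ggml_type type_from_name_sketch(const std::string & s) {
        if (s == "q4_1")   return GGML_TYPE_Q4_1;
        if (s == "iq4_nl") return GGML_TYPE_IQ4_NL;   // newly accepted spelling
        if (s == "q5_0")   return GGML_TYPE_Q5_0;
        return GGML_TYPE_COUNT;                       // "unknown" sentinel, as in llama-bench
    }

    int main() {
        std::printf("iq4_nl -> %d\n", (int) type_from_name_sketch("iq4_nl"));
        return 0;
    }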

ggml-cuda.cu

Lines changed: 171 additions & 11 deletions
@@ -294,8 +294,9 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     ggml_cuda_device_info info = {};
 
-    if (cudaGetDeviceCount(&info.device_count) != cudaSuccess) {
-        fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
+    cudaError_t err = cudaGetDeviceCount(&info.device_count);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }

@@ -369,12 +370,6 @@ struct ggml_cuda_pool {
 
     virtual void * alloc(size_t size, size_t * actual_size) = 0;
     virtual void free(void * ptr, size_t size) = 0;
-
-    ggml_cuda_pool() = default;
-    ggml_cuda_pool(const ggml_cuda_pool &) = delete;
-    ggml_cuda_pool(ggml_cuda_pool &&) = delete;
-    ggml_cuda_pool& operator=(const ggml_cuda_pool &) = delete;
-    ggml_cuda_pool& operator=(ggml_cuda_pool &&) = delete;
 };
 
 struct ggml_cuda_pool_leg : public ggml_cuda_pool {

@@ -6762,6 +6757,123 @@ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
     }
 }
 
+static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q5_0 * dsti = (block_q5_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK5_0; ++j) {
+        const float v = xi[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -16;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    uint32_t qh = 0;
+    for (int j = 0; j < QK5_0/2; ++j) {
+        const float x0 = xi[0 + j]*id;
+        const float x1 = xi[QK5_0/2 + j]*id;
+
+        const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f));
+        const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f));
+
+        dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
+        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
+    }
+    memcpy(dsti->qh, &qh, sizeof(qh));
+}
+
+static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q5_1 * dsti = (block_q5_1 *) cdsti;
+
+    float min = xi[0];
+    float max = xi[0];
+
+    for (int j = 1; j < QK5_1; ++j) {
+        const float v = xi[j];
+        min = v < min ? v : min;
+        max = v > max ? v : max;
+    }
+
+    const float d  = (max - min) / 31;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->dm.x = d;
+    dsti->dm.y = min;
+
+    uint32_t qh = 0;
+    for (int j = 0; j < QK5_1/2; ++j) {
+        const float x0 = (xi[0 + j] - min)*id;
+        const float x1 = (xi[QK5_1/2 + j] - min)*id;
+
+        const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
+        const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
+
+        dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
+        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
+    }
+    memcpy(dsti->qh, &qh, sizeof(qh));
+}
+
+static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
+    if (x <= val[0]) return 0;
+    if (x >= val[n-1]) return n-1;
+    int ml = 0, mu = n-1;
+    while (mu-ml > 1) {
+        int mav = (ml+mu)/2;
+        if (x < val[mav]) mu = mav; else ml = mav;
+    }
+    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
+}
+
+static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_NL; ++j) {
+        const float v = xi[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    float d = vmax / kvalues_iq4nl[0];
+    const float id = d ? 1.0f/d : 0.0f;
+
+    float sumqx = 0, sumq2 = 0;
+    for (int j = 0; j < QK4_NL/2; ++j) {
+        const float x0 = xi[0 + j]*id;
+        const float x1 = xi[QK4_NL/2 + j]*id;
+        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
+        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
+        dsti->qs[j] = xi0 | (xi1 << 4);
+        const float v0 = kvalues_iq4nl[xi0];
+        const float v1 = kvalues_iq4nl[xi1];
+        const float w0 = xi[0 + j]*xi[0 + j];
+        const float w1 = xi[QK4_NL/2 + j]*xi[QK4_NL/2 + j];
+        sumqx += w0*v0*xi[j] + w1*v1*xi[QK4_NL/2 + j];
+        sumq2 += w0*v0*v0 + w1*v1*v1;
+    }
+
+    dsti->d = sumq2 > 0 ? sumqx/sumq2 : d;
+}
+
+
 template <cpy_kernel_t cpy_blck, int qk>
 static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
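
An aside to make the new packing code easier to follow (commentary, not part of the diff): cpy_blck_f32_q5_0 stores, for each block of QK5_0 = 32 floats, one scale d, the 4 low bits of every quantized value in a nibble of qs[], and the 5th bit of every value collected into the 32-bit qh mask; the stored 5-bit index q decodes as d * (q - 16). cpy_blck_f32_q5_1 is the asymmetric variant that also stores the block minimum, so values decode as d * q + min. Below is a host-side sketch of the q5_0 inverse mapping, with a simplified struct for readability (the real block_q5_0 stores d as fp16).

    // Host-side sketch (illustration only, not from the diff) of how a q5_0 block
    // written by cpy_blck_f32_q5_0 decodes back to floats.
    #include <cstdint>
    #include <cstring>

    struct q5_0_block_sketch {
        float   d;       // per-block scale (stored as fp16 in ggml's block_q5_0)
        uint8_t qh[4];   // 5th (high) bit of each of the 32 quantized values
        uint8_t qs[16];  // two 4-bit low parts per byte: element j and element j+16
    };

    static void dequantize_q5_0_sketch(const q5_0_block_sketch & b, float out[32]) {
        uint32_t qh;
        std::memcpy(&qh, b.qh, sizeof(qh));
        for (int j = 0; j < 16; ++j) {
            const int hi0 = (qh >> (j +  0)) & 1;           // high bit, first half
            const int hi1 = (qh >> (j + 16)) & 1;           // high bit, second half
            const int q0  = (b.qs[j] & 0x0F) | (hi0 << 4);  // 5-bit index in [0, 31]
            const int q1  = (b.qs[j] >>   4) | (hi1 << 4);
            out[j +  0] = b.d * (q0 - 16);                  // inverse of xi0 = x0/d + 16.5 above
            out[j + 16] = b.d * (q1 - 16);
        }
    }
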
@@ -6968,7 +7080,7 @@ static __global__ void k_sum_rows_f32(const float * x, float * dst, const int nc
 }
 
 template<typename T>
-static inline __device__ void swap(T & a, T & b) {
+static inline __device__ void ggml_cuda_swap(T & a, T & b) {
     T tmp = a;
     a = b;
     b = tmp;
@@ -6997,11 +7109,11 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n
     if (ixj > col) {
         if ((col & k) == 0) {
             if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
-                swap(dst_row[col], dst_row[ixj]);
+                ggml_cuda_swap(dst_row[col], dst_row[ixj]);
             }
         } else {
             if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
-                swap(dst_row[col], dst_row[ixj]);
+                ggml_cuda_swap(dst_row[col], dst_row[ixj]);
            }
        }
    }

@@ -8495,6 +8607,39 @@ static void ggml_cpy_f32_q4_1_cuda(
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
+static void ggml_cpy_f32_q5_0_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK5_0 == 0);
+    const int num_blocks = ne / QK5_0;
+    cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_f32_q5_1_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK5_1 == 0);
+    const int num_blocks = ne / QK5_1;
+    cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_f32_iq4_nl_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK4_NL == 0);
+    const int num_blocks = ne / QK4_NL;
+    cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
 static void ggml_cpy_f16_f16_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,

@@ -10893,6 +11038,12 @@ static void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * s
         ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
         ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
+        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
+        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
+        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {

@@ -11309,6 +11460,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
             if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
                 return true;
             }
+            if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
+                return true;
+            }
+            if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
+                return true;
+            }
+            if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
+                return true;
+            }
             if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
                 return true;
             }
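
One more aside on the IQ4_NL path (commentary, not part of the diff): unlike q5_0/q5_1, cpy_blck_f32_iq4_nl maps each value to the nearest entry of the fixed 16-entry kvalues_iq4nl codebook using the binary search in best_index_int8, then refits the block scale by weighted least squares with weights w_i = x_i^2, i.e. d = sum(w_i*v_i*x_i) / sum(w_i*v_i*v_i), which is what the sumqx/sumq2 accumulation computes. A small self-contained sketch of that refit follows; the codebook values in it are placeholders, not the real kvalues_iq4nl table.

    // Illustration only (not from the diff): the nearest-codebook-entry search and
    // the weighted least-squares scale refit used by cpy_blck_f32_iq4_nl, run on the
    // host over a small example block. The codebook below is a made-up placeholder.
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    static int best_index_sketch(int n, const int8_t * val, float x) {
        // same nearest-entry binary search as best_index_int8 in the diff
        if (x <= val[0])   return 0;
        if (x >= val[n-1]) return n - 1;
        int ml = 0, mu = n - 1;
        while (mu - ml > 1) {
            const int mav = (ml + mu) / 2;
            if (x < val[mav]) mu = mav; else ml = mav;
        }
        return x - val[mu-1] < val[mu] - x ? mu - 1 : mu;
    }

    int main() {
        const int8_t codebook[16] = {-120, -98, -79, -62, -47, -34, -22, -10,
                                        1,  12,  24,  37,  51,  67,  86, 110}; // placeholder values
        const float  x[8] = {0.9f, -0.4f, 0.1f, -1.2f, 0.3f, 0.7f, -0.8f, 0.05f};

        // initial scale: map the largest-magnitude value onto the codebook minimum
        float amax = 0.0f, vmax = 0.0f;
        for (float v : x) if (std::fabs(v) > amax) { amax = std::fabs(v); vmax = v; }
        const float d0 = vmax / codebook[0];
        const float id = d0 ? 1.0f / d0 : 0.0f;

        // snap each value to the nearest codebook entry, then refit the scale:
        // d = sum(w*v*x) / sum(w*v*v) minimizes sum(w * (x - d*v)^2) with w = x^2
        float sumqx = 0.0f, sumq2 = 0.0f;
        for (float xi : x) {
            const int   q = best_index_sketch(16, codebook, xi * id);
            const float v = codebook[q];
            const float w = xi * xi;
            sumqx += w * v * xi;
            sumq2 += w * v * v;
        }
        const float d = sumq2 > 0.0f ? sumqx / sumq2 : d0;
        std::printf("initial d = %g, refitted d = %g\n", d0, d);
        return 0;
    }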
