
Commit 26e8f23

S committed
Reverted blocked multiplication code as it still has issues and could affect other Llama arches
1 parent 6745ea7 commit 26e8f23

File tree

1 file changed: +1 −335 lines

llama.cpp

Lines changed: 1 addition & 335 deletions
@@ -5734,340 +5734,6 @@ static void llm_build_kv_store(
     ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
 }
 
-/*
- * Does (almost) the same thing as ggml_mul_mat, mathematically speaking,
- * but splits the computation into chunks.
- *
- * Why would you want to do this? As part of the Command-R+ work, we
- * discovered that quite a bit of the GPU code is not prepared for
- * matrices with more than 2**31-1 elements (~2 billion).
- *
- * Some context:
- * https://github.com/ggerganov/llama.cpp/pull/6491
- *
- * This function has a limit (set to 2B) so that if any constituent part
- * of the computation (input, output, result) would go over that limit
- * byte-wise, it uses the split computation. The idea is that this
- * minimizes the chance that somewhere downstream in the GPU code, be it
- * MPS or CUDA, there is something like: int x = y * z; where the values
- * of y and z overflow the multiplication and then silently (or not so
- * silently) do something weird. At the time of writing (2024-04-05), the
- * CUDA code outright crashes and MPS silently gives bad results.
- *
- * This is a band-aid workaround. The ideal state of the world is that
- * this function does nothing but "return ggml_mul_mat(ctx, a, b)".
- *
- * The last argument (forced_block_size) is for debugging. You can force
- * a certain block size to be used in the computation. If zero (the
- * default), the block size is determined on the fly. Production code
- * should always pass zero; only set a non-zero value for debugging and
- * testing.
- */
-static struct ggml_tensor * llama_build_mat_mul_blocked_computation(
-        struct ggml_context * ctx,
-         struct ggml_tensor * a,
-         struct ggml_tensor * b,
-          const llama_model & model,
-         const llm_build_cb & cb,
-                    int64_t   il,
-                     size_t   forced_block_size)
-{
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    if (forced_block_size != 0) {
-        //fprintf(stderr, "warning: llama_build_mat_mul_blocked_computation() forced block size: %zu\n", forced_block_size);
-    }
-
-    const size_t MAX_BYTES_BEFORE_SPLIT = 2000000000;
-
-    // the actual ggml_mul_mat supports batching, but this one does not.
-    GGML_ASSERT(a->ne[2] == 1 && b->ne[2] == 1);
-    GGML_ASSERT(a->ne[3] == 1 && b->ne[3] == 1);
-
-    // bail out if the number of elements would be zero;
-    // nicer than getting a segfault.
-    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
-        GGML_ASSERT(a->ne[i] > 0 && "matrix multiplication with a zero-length dimension ('a')");
-        GGML_ASSERT(b->ne[i] > 0 && "matrix multiplication with a zero-length dimension ('b')");
-    }
-
-    // use the max size of: a, b, result
-    const size_t a_rows = a->ne[1];
-    const size_t a_cols = a->ne[0];
-
-    // b is transposed
-    const size_t b_rows = b->ne[0];
-    const size_t b_cols = b->ne[1];
-
-    const size_t c_rows = a_rows;
-    const size_t c_cols = b_cols;
-
-    // determine a block size that is as big as possible.
-    // we start with a block size of the maximum size; if that passes,
-    // then we just use ggml_mul_mat().
-    //
-    // the block is square.
-    size_t cand_block_size = a_rows;
-    if (a_cols > cand_block_size) { cand_block_size = a_cols; }
-    if (b_rows > cand_block_size) { cand_block_size = b_rows; }
-    if (b_cols > cand_block_size) { cand_block_size = b_cols; }
-    if (c_rows > cand_block_size) { cand_block_size = c_rows; }
-    if (c_cols > cand_block_size) { cand_block_size = c_cols; }
-
-    size_t block_size = 1;
-    while (block_size < cand_block_size) {
-        block_size <<= 1;
-    }
-
-    if (forced_block_size != 0) {
-        block_size = forced_block_size;
-    } else {
-        // figure out the largest block_size we can use that will never
-        // have an intermediate result bigger than MAX_BYTES_BEFORE_SPLIT.
-        bool ok = true;
-        while (block_size > 0) {
-            ok = true;
-
-            // keep the byte calculations in sync with the blocked code in
-            // the computation part.
-
-            // criterion 1: result block size
-            {
-                const size_t i_min = 0;
-                const size_t j_min = 0;
-                size_t i_max = i_min + block_size;
-                size_t j_max = j_min + block_size;
-                if (i_max > a_rows) { i_max = a_rows; }
-                if (j_max > b_cols) { j_max = b_cols; }
-
-                const size_t bytes_size = sizeof(float) * (i_max - i_min) * (j_max - j_min);
-                if (bytes_size > MAX_BYTES_BEFORE_SPLIT) {
-                    ok = false;
-                }
-            }
-            // criteria 2 and 3: block sizes from 'a' and 'b'
-            {
-                const size_t i_min = 0;
-                const size_t j_min = 0;
-                const size_t k_min = 0;
-
-                size_t i_max = i_min + block_size;
-                size_t j_max = j_min + block_size;
-                size_t k_max = k_min + block_size;
-
-                if (i_max > a_rows) { i_max = a_rows; }
-                if (j_max > b_cols) { j_max = b_cols; }
-                if (k_max > a_cols) { k_max = a_cols; }
-
-                const size_t bytes_size_a = sizeof(float) * (k_max - k_min) * (i_max - i_min);
-                const size_t bytes_size_b = sizeof(float) * (k_max - k_min) * (j_max - j_min);
-
-                if (bytes_size_a > MAX_BYTES_BEFORE_SPLIT || bytes_size_b > MAX_BYTES_BEFORE_SPLIT) {
-                    ok = false;
-                }
-            }
-
-            if (!ok) {
-                block_size /= 2;
-                continue;
-            }
-            break;
-        }
-        GGML_ASSERT(block_size > 0);
-    }
-
-    //fprintf(stderr, "block_size=%zu a shape: %d %d b shape: %d %d\n", block_size, a_rows, a_cols, b_rows, b_cols);
-
-    // O(N^3) nested loop, where N is the number of blocks along one
-    // dimension.
-    size_t nb_A  = (a_rows + block_size - 1) / block_size;
-    size_t nb_B  = (b_cols + block_size - 1) / block_size;
-    size_t nb_A2 = (a_cols + block_size - 1) / block_size;
-
-    // make placeholder tensors for each block result.
-    // 2D: (row, col) -> offset is: (x, y) -> x * nb_B + y
-    struct ggml_tensor ** result_blocks = (struct ggml_tensor **) malloc(nb_A * nb_B * sizeof(struct ggml_tensor *));
-
-    for (size_t i = 0; i < nb_A; ++i) {
-        for (size_t j = 0; j < nb_B; ++j) {
-            const size_t i_min = i * block_size;
-            const size_t j_min = j * block_size;
-            size_t i_max = i_min + block_size;
-            size_t j_max = j_min + block_size;
-
-            if (i_max > a_rows) { i_max = a_rows; }
-            if (j_max > b_cols) { j_max = b_cols; }
-
-            struct ggml_tensor * result_block = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, i_max - i_min, j_max - j_min);
-            result_block = ggml_scale(ctx, result_block, 0.0f);
-
-            cb(result_block, "result_block-fresh", il);
-            result_blocks[i * nb_B + j] = result_block;
-        }
-    }
-
-    size_t num_blocks = 0;
-    for (size_t i = 0; i < nb_A; ++i) {
-        for (size_t j = 0; j < nb_B; ++j) {
-            for (size_t k = 0; k < nb_A2; ++k) {
-                num_blocks++;
-
-                const size_t i_min = i * block_size;
-                const size_t j_min = j * block_size;
-                const size_t k_min = k * block_size;
-
-                size_t i_max = i_min + block_size;
-                size_t j_max = j_min + block_size;
-                size_t k_max = k_min + block_size;
-                if (i_max > a_rows) { i_max = a_rows; }
-                if (j_max > b_cols) { j_max = b_cols; }
-                if (k_max > a_cols) { k_max = a_cols; }
-
-                const size_t blck_size_a = (size_t) ggml_blck_size(a->type);
-                const size_t blck_size_b = (size_t) ggml_blck_size(b->type);
-                const size_t type_size_a = ggml_type_size(a->type);
-                const size_t type_size_b = ggml_type_size(b->type);
-
-                GGML_ASSERT(k_min * type_size_a % blck_size_a == 0);
-                GGML_ASSERT(k_min * type_size_b % blck_size_b == 0);
-
-                // e.g. with blck_size=32, type_size_a=19, k_min=4:
-                // byte_offset = type_size_a * k_min / blck_size
-                //             = 19 * 4 / 32 = 2
-
-                struct ggml_tensor * a_slice = ggml_view_2d(
-                    ctx, a,
-                    k_max - k_min, // k:k_max size
-                    i_max - i_min, // i:i_max size
-                    ggml_row_size(a->type, a->ne[0]),
-                    ggml_row_size(a->type, a->ne[0]) * i_min + k_min * type_size_a / blck_size_a);
-
-                cb(a_slice, "a_slice", il);
-
-                struct ggml_tensor * b_slice = ggml_view_2d(
-                    ctx, b,
-                    k_max - k_min, // k:k_max size
-                    j_max - j_min, // j:j_max size
-                    ggml_row_size(b->type, b->ne[0]),
-                    ggml_row_size(b->type, b->ne[0]) * j_min + k_min * type_size_b / blck_size_b);
-
-                cb(b_slice, "b_slice", il);
-
-                struct ggml_tensor * result_slice = result_blocks[i * nb_B + j];
-
-                struct ggml_tensor * mm_result = ggml_mul_mat(ctx, a_slice, b_slice);
-                cb(mm_result, "mm_result", il);
-
-                result_blocks[i * nb_B + j] = ggml_add(ctx, result_slice, mm_result);
-                cb(result_blocks[i * nb_B + j], "result_slice", il);
-            }
-        }
-    }
-
-    // concatenate the results into one chonky tensor.
-    // ggml_concat goes mad if the first two dimensions are not the same.
-    //
-    // the strategy: find the largest power of two that divides the size
-    // of all the block tensors. A power of two keeps it friendly to GPU
-    // code (TODO: the GCD might be better? but not sure it won't break code).
-    //
-    // flatten all the tensors to (X, 1, N, 1).
-    size_t split_size = 1;
-    while (1) {
-        size_t candidate_split_size = split_size << 1;
-        bool bad = false;
-
-        for (size_t i = 0; i < nb_A * nb_B; ++i) {
-            size_t rows = result_blocks[i]->ne[0];
-            size_t cols = result_blocks[i]->ne[1];
-
-            if (candidate_split_size > rows * cols) {
-                bad = true;
-                break;
-            }
-
-            if ((rows * cols) % candidate_split_size != 0) {
-                bad = true;
-                break;
-            }
-        }
-
-        if (bad) {
-            break;
-        }
-
-        split_size = candidate_split_size;
-    }
-
-    struct ggml_tensor * result_final = nullptr;
-    const ggml_type wanted_final_type = a->type;
-
-    // TODO: it looks like concat also wants f32, so everything is cast to
-    // f32 here. A datatype-agnostic concat would be nice; or the ability
-    // to do the tensor equivalent of an unsafe type cast.
-    //
-    // the Command-R+ tensor this code was written for was 6GB, so this is
-    // going to handle 12GB I guess. Oof.
-    //
-    // I believe you could be smarter and combine hierarchically instead
-    // of one by one, i.e. we are doing a concatenation like this:
-    //   for x in range(100):
-    //       accum = accum + [x]  (copies accum every time? maybe. didn't read the concat code)
-    //
-    // you could instead divide and conquer to make it a bit smarter.
-    for (size_t i = 0; i < nb_A; ++i) {
-        for (size_t j = 0; j < nb_B; ++j) {
-            struct ggml_tensor * src_block = result_blocks[i * nb_B + j];
-
-            const size_t rows = src_block->ne[0];
-            const size_t cols = src_block->ne[1];
-            GGML_ASSERT(rows * cols % split_size == 0);
-
-            const size_t nflattened_rows = split_size;
-            const size_t n3 = (rows * cols) / split_size;
-
-            src_block = ggml_view_3d(ctx, src_block,
-                nflattened_rows,
-                1,
-                n3,
-                nflattened_rows * ggml_element_size(src_block),
-                nflattened_rows * ggml_element_size(src_block),
-                0);
-
-            if (result_final == nullptr) {
-                if (src_block->type != GGML_TYPE_F32) {
-                    result_final = ggml_cast(ctx, src_block, GGML_TYPE_F32);
-                    cb(result_final, "result-upcast", il);
-                } else {
-                    result_final = src_block;
-                }
-                continue;
-            }
-
-            if (src_block->type != GGML_TYPE_F32) {
-                src_block = ggml_cast(ctx, src_block, GGML_TYPE_F32);
-            }
-            result_final = ggml_concat(ctx, result_final, src_block);
-            cb(result_final, "result_final-accumulator", il);
-        }
-    }
-
-    result_final = ggml_reshape_2d(ctx, result_final, c_rows, c_cols);
-    cb(result_final, "result_final", il);
-
-    free(result_blocks);
-
-    return result_final;
-}
-
 static struct ggml_tensor * llm_build_norm(
         struct ggml_context * ctx,
          struct ggml_tensor * cur,
@@ -6813,7 +6479,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);
 
         // lm_head
-        cur = llama_build_mat_mul_blocked_computation(ctx0, model.output, cur, model, cb, -1, 0);
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
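
Editor's note: for context on what was reverted, here is a minimal standalone sketch of the two ideas the removed comment describes: 32-bit index arithmetic overflowing on large tensors, and choosing the largest power-of-two block size whose f32 blocks stay under a byte budget. This is an illustration only, not code from this commit; pick_block_size is a hypothetical helper, not a llama.cpp or ggml API, and it uses the square block as a worst case where the reverted code also measured the edge-clamped blocks.

// block_size_demo.cpp -- editorial sketch, not part of this commit
#include <algorithm>
#include <cstdint>
#include <cstdio>

// pick the largest power-of-two block size whose f32 block of
// block_size x block_size elements fits within max_bytes
static size_t pick_block_size(size_t a_rows, size_t a_cols, size_t b_cols,
                              size_t max_bytes) {
    // smallest power of two covering the largest dimension involved
    size_t block_size = 1;
    const size_t largest = std::max({a_rows, a_cols, b_cols});
    while (block_size < largest) {
        block_size <<= 1;
    }
    // halve until the square block fits the budget; clamping a block to
    // the real matrix edges only shrinks it, so this is the worst case
    while (block_size > 0) {
        const size_t bytes = sizeof(float) * block_size * block_size;
        if (bytes <= max_bytes) {
            break;
        }
        block_size /= 2;
    }
    return block_size;
}

int main() {
    // the hazard the reverted code worked around: 32-bit index math.
    // 256000 * 8192 still fits in int32, but 256000 * 16384 does not,
    // so a downstream `int x = y * z;` would overflow.
    const int64_t rows = 256000, cols = 16384;
    printf("elements = %lld (INT32_MAX = %d)\n",
           (long long)(rows * cols), INT32_MAX);

    const size_t budget = 2000000000; // same 2B-byte cap as the reverted code
    printf("block_size = %zu\n", pick_block_size(rows, cols, 8192, budget));
    return 0;
}

Built with g++ -std=c++11, this should report about 4.19 billion elements (well past INT32_MAX) and settle on a block size of 16384, since a 16384 x 16384 f32 block is roughly 1 GB while the next power of two would exceed the 2 GB cap.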

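A sketch of the block decomposition itself, again illustrative only: the reverted function expressed the same triple loop with ggml_view_2d slices, one ggml_mul_mat per block, and ggml_add accumulation into per-block result tensors. matmul_blocked below is a hypothetical helper on plain float arrays, with all index math in size_t so no intermediate product needs to fit in 32 bits.

// blocked_matmul_demo.cpp -- editorial sketch, not part of this commit
#include <algorithm>
#include <cstdio>
#include <vector>

// C (m x n) += A (m x k) * B (k x n), processed block by block
static void matmul_blocked(const std::vector<float> & A,
                           const std::vector<float> & B,
                           std::vector<float> & C,
                           size_t m, size_t k, size_t n, size_t block_size) {
    for (size_t i0 = 0; i0 < m; i0 += block_size) {
        for (size_t j0 = 0; j0 < n; j0 += block_size) {
            for (size_t k0 = 0; k0 < k; k0 += block_size) {
                // clamp the block to the matrix edges, as the reverted
                // code did with i_max/j_max/k_max
                const size_t i1 = std::min(i0 + block_size, m);
                const size_t j1 = std::min(j0 + block_size, n);
                const size_t k1 = std::min(k0 + block_size, k);
                for (size_t i = i0; i < i1; ++i) {
                    for (size_t j = j0; j < j1; ++j) {
                        float acc = 0.0f;
                        for (size_t kk = k0; kk < k1; ++kk) {
                            acc += A[i * k + kk] * B[kk * n + j];
                        }
                        // accumulate the partial result of this k-block,
                        // like the ggml_add over result_blocks[i * nb_B + j]
                        C[i * n + j] += acc;
                    }
                }
            }
        }
    }
}

int main() {
    // tiny shapes; the structure, not the size, is the point
    const size_t m = 5, k = 7, n = 4;
    std::vector<float> A(m * k, 1.0f), B(k * n, 2.0f), C(m * n, 0.0f);
    matmul_blocked(A, B, C, m, k, n, /*block_size=*/3);
    printf("C[0] = %.1f (expected %.1f)\n", C[0], 2.0f * k);
    return 0;
}

The design point carried over from the reverted code is that every operand handed to the per-block multiply is at most block_size x block_size, so no single kernel invocation sees a tensor anywhere near the overflow threshold.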
0 commit comments