@@ -5734,340 +5734,6 @@ static void llm_build_kv_store(
     ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
 }
 
-static struct ggml_tensor * llama_build_mat_mul_blocked_computation(
-    /*
-     * Does (almost) the same thing as ggml_mul_mat mathematically speaking,
-     * but splits the computation into chunks.
-     *
-     * Why would you want to do this? As part of Command-R+ coding, we
-     * discovered that quite a bit of the GPU code is not prepared for
-     * matrices with more than 2**31-1 elements (~2 billion).
-     *
-     * Some context:
-     * https://github.com/ggerganov/llama.cpp/pull/6491
-     *
-     * This function has a limit (set to 2B): if any constituent part
-     * (input, output, result) would go over that limit byte-wise,
-     * it'll use the split computation. This is based on the idea that
-     * this minimizes the chance that somewhere downstream in GPU code,
-     * be it MPS or CUDA, something like: int x = y * z; overflows the
-     * multiplication and then silently (or not so silently) does
-     * something weird. At the time of writing (2024-04-05), it seems
-     * that CUDA code outright crashes and MPS silently gives bad
-     * results.
-     *
-     * This is a band-aid workaround. The ideal state of the world is that
-     * this function does nothing but "return ggml_mul_mat(ctx, a, b)".
-     *
-     * The last argument (forced_block_size) is for debugging. You can
-     * force a certain block size to use for the computation. If zero
-     * (default), the block size is determined on the fly. Production
-     * code should always pass zero; set it to a non-zero value only
-     * for debugging and testing.
-     */
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        const llama_model & model,
-        const llm_build_cb & cb,
-        int64_t il,
-        size_t forced_block_size)
-{
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    if (forced_block_size != 0) {
-        //fprintf(stderr, "warning: llama_build_mat_mul_blocked_computation() forced block size: %zu\n", forced_block_size);
-    }
-
-    const size_t MAX_BYTES_BEFORE_SPLIT = 2000000000;
-
-    // the actual ggml_mul_mat supports batching, but this one doesn't.
-    GGML_ASSERT(a->ne[2] == 1 && b->ne[2] == 1);
-    GGML_ASSERT(a->ne[3] == 1 && b->ne[3] == 1);
-
-    // bail out if the number of elements would be zero.
-    // nicer than getting a segfault.
-    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
-        GGML_ASSERT(a->ne[i] > 0 && "Matrix multiplication with a zero-sized dimension in 'a'.");
-        GGML_ASSERT(b->ne[i] > 0 && "Matrix multiplication with a zero-sized dimension in 'b'.");
-    }
-
-    // Use the max size of: a, b, and the result
-    const size_t a_rows = a->ne[1];
-    const size_t a_cols = a->ne[0];
-
-    // b is transposed
-    const size_t b_rows = b->ne[0];
-    const size_t b_cols = b->ne[1];
-
-    const size_t c_rows = a_rows;
-    const size_t c_cols = b_cols;
-
-    // determine a block size that's as big as possible.
-    // we start with a block size of the maximum size; if that passes,
-    // then we just use ggml_mul_mat()
-    //
-    // the block is square.
-    size_t cand_block_size = a_rows;
-    if (a_cols > cand_block_size) { cand_block_size = a_cols; }
-    if (b_rows > cand_block_size) { cand_block_size = b_rows; }
-    if (b_cols > cand_block_size) { cand_block_size = b_cols; }
-    if (c_rows > cand_block_size) { cand_block_size = c_rows; }
-    if (c_cols > cand_block_size) { cand_block_size = c_cols; }
-
-    size_t block_size = 1;
-    while (block_size < cand_block_size) {
-        block_size <<= 1;
-    }
-
-    if (forced_block_size != 0) {
-        block_size = forced_block_size;
-    } else {
-        // figure out the largest block_size we can use that will never
-        // have an intermediate result bigger than
-        // MAX_BYTES_BEFORE_SPLIT
-        bool ok = true;
-        while (block_size > 0) {
-            ok = true;
-
-            // keep the byte calculations in sync with the blocked code in
-            // the computation part.
-
-            // Criteria:
-            // 1. result block size
-            {
-                const size_t i_min = 0;
-                const size_t j_min = 0;
-                size_t i_max = i_min + block_size;
-                size_t j_max = j_min + block_size;
-                if (i_max > a_rows) { i_max = a_rows; }
-                if (j_max > b_cols) { j_max = b_cols; }
-
-                const size_t bytes_size = sizeof(float) * (i_max - i_min) * (j_max - j_min);
-                if (bytes_size > MAX_BYTES_BEFORE_SPLIT) {
-                    ok = false;
-                }
-            }
-            // 2. and 3.
-            // Block size from 'a' and 'b'
-            {
-                const size_t i_min = 0;
-                const size_t j_min = 0;
-                const size_t k_min = 0;
-
-                size_t i_max = i_min + block_size;
-                size_t j_max = j_min + block_size;
-                size_t k_max = k_min + block_size;
-
-                if (i_max > a_rows) { i_max = a_rows; }
-                if (j_max > b_cols) { j_max = b_cols; }
-                if (k_max > a_cols) { k_max = a_cols; }
-
-                const size_t bytes_size_a = sizeof(float) * (k_max - k_min) * (i_max - i_min);
-                const size_t bytes_size_b = sizeof(float) * (k_max - k_min) * (j_max - j_min);
-
-                if (bytes_size_a > MAX_BYTES_BEFORE_SPLIT || bytes_size_b > MAX_BYTES_BEFORE_SPLIT) {
-                    ok = false;
-                }
-            }
-
-            if (!ok) {
-                block_size /= 2;
-                continue;
-            }
-            break;
-        }
-        GGML_ASSERT(block_size > 0);
-    }
-
-    //fprintf(stderr, "block_size=%zu a shape: %d %d b shape: %d %d\n", block_size, a_rows, a_cols, b_rows, b_cols);
-
-    // O(N^3) nested loop, where N is the number of blocks along one of
-    // the constituent dimensions.
-    size_t nb_A = (a_rows + block_size - 1) / block_size;
-    size_t nb_B = (b_cols + block_size - 1) / block_size;
-    size_t nb_A2 = (a_cols + block_size - 1) / block_size;
-
-    // make placeholder tensors for each block's result.
-    // 2D: (row, col) -> offset is: (x, y) -> x * nb_B + y
-    struct ggml_tensor ** result_blocks = (struct ggml_tensor **) malloc(nb_A * nb_B * sizeof(struct ggml_tensor *));
-
-    for (size_t i = 0; i < nb_A; ++i) {
-        for (size_t j = 0; j < nb_B; ++j) {
-            const size_t i_min = i * block_size;
-            const size_t j_min = j * block_size;
-            size_t i_max = i_min + block_size;
-            size_t j_max = j_min + block_size;
-
-            if (i_max > a_rows) { i_max = a_rows; }
-            if (j_max > b_cols) { j_max = b_cols; }
-
-            struct ggml_tensor * result_block = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, i_max - i_min, j_max - j_min);
-            result_block = ggml_scale(ctx, result_block, 0.0f);
-
-            cb(result_block, "result_block-fresh", il);
-            result_blocks[i * nb_B + j] = result_block;
-        }
-    }
-
-    size_t num_blocks = 0;
-    for (size_t i = 0; i < nb_A; ++i) {
-        for (size_t j = 0; j < nb_B; ++j) {
-            for (size_t k = 0; k < nb_A2; ++k) {
-                num_blocks++;
-
-                const size_t i_min = i * block_size;
-                const size_t j_min = j * block_size;
-                const size_t k_min = k * block_size;
-
-                size_t i_max = i_min + block_size;
-                size_t j_max = j_min + block_size;
-                size_t k_max = k_min + block_size;
-                if (i_max > a_rows) { i_max = a_rows; }
-                if (j_max > b_cols) { j_max = b_cols; }
-                if (k_max > a_cols) { k_max = a_cols; }
-
-                const size_t blck_size_a = (const size_t) ggml_blck_size(a->type);
-                const size_t blck_size_b = (const size_t) ggml_blck_size(b->type);
-                const size_t type_size_a = ggml_type_size(a->type);
-                const size_t type_size_b = ggml_type_size(b->type);
-
-                GGML_ASSERT(k_min * type_size_a % blck_size_a == 0);
-                GGML_ASSERT(k_min * type_size_b % blck_size_b == 0);
-
-                // example:
-                //   blck_size_a = 32
-                //   type_size_a = 19
-                //   k_min       = 4
-                //
-                // byte_offset = k_min * type_size_a / blck_size_a
-                //             = 4 * 19 / 32 = 2 (integer division)
-
-                struct ggml_tensor * a_slice = ggml_view_2d(
-                    ctx, a,
-                    k_max - k_min, // k:k_max size
-                    i_max - i_min, // i:i_max size
-                    ggml_row_size(a->type, a->ne[0]),
-                    ggml_row_size(a->type, a->ne[0]) * i_min + k_min * type_size_a / blck_size_a);
-
-                cb(a_slice, "a_slice", il);
-
-                struct ggml_tensor * b_slice = ggml_view_2d(
-                    ctx, b,
-                    k_max - k_min, // k:k_max size
-                    j_max - j_min, // j:j_max size
-                    ggml_row_size(b->type, b->ne[0]),
-                    ggml_row_size(b->type, b->ne[0]) * j_min + k_min * type_size_b / blck_size_b);
-
-                cb(b_slice, "b_slice", il);
-
-                struct ggml_tensor * result_slice = result_blocks[i * nb_B + j];
-
-                struct ggml_tensor * mm_result = ggml_mul_mat(ctx, a_slice, b_slice);
-                cb(mm_result, "mm_result", il);
-
-                result_blocks[i * nb_B + j] = ggml_add(ctx, result_slice, mm_result);
-                cb(result_blocks[i * nb_B + j], "result_slice", il);
-            }
-        }
-    }
-
-    // concatenate the results into one big tensor.
-    // ggml_concat requires the first two dimensions to be the same.
-    //
-    // We use this strategy: find the largest power of two that divides the
-    // size of all the tensors. Power of two to make it friendly to GPU
-    // code (TODO: the GCD might be better? but not sure it won't break code).
-    //
-    // Flatten all the tensors to (X, 1, N, 1).
-    size_t split_size = 1;
-    while (1) {
-        size_t candidate_split_size = split_size << 1;
-        bool bad = false;
-
-        for (size_t i = 0; i < nb_A * nb_B; ++i) {
-            size_t rows = result_blocks[i]->ne[0];
-            size_t cols = result_blocks[i]->ne[1];
-
-            if (candidate_split_size > rows * cols) {
-                bad = true;
-                break;
-            }
-
-            if ((rows * cols) % candidate_split_size != 0) {
-                bad = true;
-                break;
-            }
-        }
-
-        if (bad) {
-            break;
-        }
-
-        split_size = candidate_split_size;
-    }
-
-    struct ggml_tensor * result_final = nullptr;
-    const ggml_type wanted_final_type = a->type;
-
-    // TODO: looks like concat also wants f32, so everything is cast to
-    // f32 here. A datatype-agnostic concat would be nice; or the ability
-    // to do the tensor equivalent of an unsafe type cast.
-    //
-    // The Command-R+ tensor this code was written for was 6GB. So this is
-    // going to handle 12GB I guess. Oof.
-    //
-    // I believe you could be smarter and combine hierarchically instead of
-    // one by one. I.e. we are doing a concatenation like this:
-    //   for x in range(100):
-    //       accum = accum + [x] (copies accum every time? maybe. didn't read concat code)
-    //
-    // You could instead divide and conquer to make it a bit smarter.
-    for (size_t i = 0; i < nb_A; ++i) {
-        for (size_t j = 0; j < nb_B; ++j) {
-            struct ggml_tensor * src_block = result_blocks[i * nb_B + j];
-
-            const size_t rows = src_block->ne[0];
-            const size_t cols = src_block->ne[1];
-            GGML_ASSERT(rows * cols % split_size == 0);
-
-            const size_t nflattened_rows = split_size;
-            const size_t n3 = (rows * cols) / split_size;
-
-            src_block = ggml_view_3d(ctx, src_block,
-                nflattened_rows,
-                1,
-                n3,
-                nflattened_rows * ggml_element_size(src_block),
-                nflattened_rows * ggml_element_size(src_block),
-                0);
-
-            if (result_final == nullptr) {
-                if (src_block->type != GGML_TYPE_F32) {
-                    result_final = ggml_cast(ctx, src_block, GGML_TYPE_F32);
-                    cb(result_final, "result-upcast", il);
-                } else {
-                    result_final = src_block;
-                }
-                continue;
-            }
-
-            if (src_block->type != GGML_TYPE_F32) {
-                src_block = ggml_cast(ctx, src_block, GGML_TYPE_F32);
-            }
-            result_final = ggml_concat(ctx, result_final, src_block);
-            cb(result_final, "result_final-accumulator", il);
-        }
-    }
-
-    result_final = ggml_reshape_2d(ctx, result_final, c_rows, c_cols);
-    cb(result_final, "result_final", il);
-
-    free(result_blocks);
-
-    return result_final;
-}
-
 static struct ggml_tensor * llm_build_norm(
         struct ggml_context * ctx,
         struct ggml_tensor * cur,
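
To make the removed helper's structure easier to follow: it tiles one large matrix product into square blocks and accumulates per-block partial products, so that no single operand or result handed to the backend exceeds the ~2 GB threshold. Below is a minimal standalone sketch of that decomposition in plain C++ (illustrative names, row-major float arrays, no ggml types, and ignoring ggml's transposed-'b' and quantized-block conventions); it is not the removed code itself.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Plain-float sketch of the tiling the removed helper expressed as a ggml
// graph: C (n x m) = A (n x k) * B (k x m), computed in square blocks so no
// single sub-product grows past a chosen limit. Names and sizes are made up.
static void mat_mul_blocked(const std::vector<float> & A,
                            const std::vector<float> & B,
                            std::vector<float> & C,
                            size_t n, size_t k, size_t m, size_t block_size) {
    const size_t nb_i = (n + block_size - 1) / block_size;
    const size_t nb_j = (m + block_size - 1) / block_size;
    const size_t nb_k = (k + block_size - 1) / block_size;

    // same O(N^3) loop over blocks as the removed function: result block
    // (bi, bj) accumulates A-block (bi, bk) times B-block (bk, bj).
    for (size_t bi = 0; bi < nb_i; ++bi) {
        for (size_t bj = 0; bj < nb_j; ++bj) {
            for (size_t bk = 0; bk < nb_k; ++bk) {
                const size_t i_max = std::min(n, (bi + 1) * block_size);
                const size_t j_max = std::min(m, (bj + 1) * block_size);
                const size_t k_max = std::min(k, (bk + 1) * block_size);
                for (size_t i = bi * block_size; i < i_max; ++i) {
                    for (size_t j = bj * block_size; j < j_max; ++j) {
                        float acc = 0.0f;
                        for (size_t kk = bk * block_size; kk < k_max; ++kk) {
                            acc += A[i * k + kk] * B[kk * m + j];
                        }
                        C[i * m + j] += acc;
                    }
                }
            }
        }
    }
}

int main() {
    const size_t n = 5, k = 7, m = 3;
    std::vector<float> A(n * k, 1.0f), B(k * m, 2.0f), C(n * m, 0.0f);
    mat_mul_blocked(A, B, C, n, k, m, /*block_size=*/4);
    printf("C[0] = %.1f (expected %.1f)\n", C[0], 2.0f * k);
    return 0;
}
```

In the graph-building version removed above, each inner block product is a ggml_mul_mat node over 2D views and the accumulation is a ggml_add, followed by the concat/reshape step that reassembles the full result.
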
@@ -6813,7 +6479,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);
 
         // lm_head
-        cur = llama_build_mat_mul_blocked_computation(ctx0, model.output, cur, model, cb, -1, 0);
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
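
For context on the overflow the removed doc comment describes (int x = y * z on element counts): at Command-R+ scale the element count of a single tensor no longer fits in a 32-bit int. A small illustrative program, using assumed lm_head-like dimensions (256000 vocab x 12288 hidden, not taken from this diff), shows the magnitudes involved.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Assumed, Command-R+-like lm_head shape: ~256000 vocab x 12288 hidden.
    const int64_t ne0 = 12288;
    const int64_t ne1 = 256000;

    const int64_t n_elems     = ne0 * ne1;   // 3,145,728,000 elements
    const int64_t n_bytes_f16 = n_elems * 2; // ~6 GB at 16 bits per element

    printf("elements       : %lld\n", (long long) n_elems);
    printf("bytes (f16)    : %lld\n", (long long) n_bytes_f16);
    printf("INT32_MAX      : %d\n",   INT32_MAX);
    printf("fits in int32? : %s\n",   n_elems <= INT32_MAX ? "yes" : "no");

    // What a backend computing `int idx = ne0 * ne1;` ends up with: the value
    // wraps modulo 2^32 on mainstream targets instead of being ~3.1 billion.
    printf("as int32       : %d\n", (int32_t) n_elems);
    return 0;
}
```

At fp16, roughly 3.1 billion elements comes to about 6 GB, which lines up with the 6GB figure mentioned in the removed TODO comment.
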