@@ -5734,340 +5734,6 @@ static void llm_build_kv_store(
     ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
 }
 
-static struct ggml_tensor * llama_build_mat_mul_blocked_computation(
-    /*
-     * Does (almost) the same thing as ggml_mul_mat mathematically speaking,
-     * but splits the computation into chunks.
-     *
-     * Why would you want to do this? As part of Command-R+ coding, we
-     * discovered that quite a bit of the GPU code is not prepared for
-     * matrices with more than 2**31-1 elements (~2 billion).
-     *
-     * Some context:
-     * https://github.com/ggerganov/llama.cpp/pull/6491
-     *
-     * This function has a limit (set to 2B): if any constituent part
-     * (input, output, result) would go over that limit byte-wise,
-     * it'll use the split computation. This is based on the idea that
-     * this minimizes the chance that somewhere downstream in GPU code,
-     * be it MPS or CUDA, something like: int x = y * z; overflows the
-     * multiplication and then silently (or not so silently) does
-     * something weird. At the time of writing (2024-04-05), it seems
-     * that CUDA code outright crashes and MPS silently gives bad
-     * results.
-     *
-     * This is a band-aid workaround. The ideal state of the world is that
-     * this function does nothing but "return ggml_mul_mat(ctx, a, b)".
-     *
-     * The last argument (forced_block_size) is for debugging. You can
-     * force a certain block size to use for the computation. If zero
-     * (default), the block size is determined on the fly. Production
-     * code should always pass zero; set it to a non-zero value only
-     * for debugging and testing.
-     */
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        const llama_model & model,
-        const llm_build_cb & cb,
-        int64_t il,
-        size_t forced_block_size)
-{
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    if (forced_block_size != 0) {
-        //fprintf(stderr, "warning: llama_build_mat_mul_blocked_computation() forced block size: %zu\n", forced_block_size);
-    }
-
-    const size_t MAX_BYTES_BEFORE_SPLIT = 2000000000;
-
-    // the actual ggml_mul_mat supports batching, but this one doesn't.
-    GGML_ASSERT(a->ne[2] == 1 && b->ne[2] == 1);
-    GGML_ASSERT(a->ne[3] == 1 && b->ne[3] == 1);
-
-    // bail out if the number of elements would be zero.
-    // nicer than getting a segfault.
-    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
-        GGML_ASSERT(a->ne[i] > 0 && "Matrix multiplication with a zero-sized dimension in 'a'.");
-        GGML_ASSERT(b->ne[i] > 0 && "Matrix multiplication with a zero-sized dimension in 'b'.");
-    }
-
-    // Use the max size of: a, b, and the result
-    const size_t a_rows = a->ne[1];
-    const size_t a_cols = a->ne[0];
-
-    // b is transposed
-    const size_t b_rows = b->ne[0];
-    const size_t b_cols = b->ne[1];
-
-    const size_t c_rows = a_rows;
-    const size_t c_cols = b_cols;
-
-    // determine a block size that's as big as possible.
-    // we start with a block size of the maximum size; if that passes,
-    // then we just use ggml_mul_mat()
-    //
-    // the block is square.
-    size_t cand_block_size = a_rows;
-    if (a_cols > cand_block_size) { cand_block_size = a_cols; }
-    if (b_rows > cand_block_size) { cand_block_size = b_rows; }
-    if (b_cols > cand_block_size) { cand_block_size = b_cols; }
-    if (c_rows > cand_block_size) { cand_block_size = c_rows; }
-    if (c_cols > cand_block_size) { cand_block_size = c_cols; }
-
-    size_t block_size = 1;
-    while (block_size < cand_block_size) {
-        block_size <<= 1;
-    }
-
-    if (forced_block_size != 0) {
-        block_size = forced_block_size;
-    } else {
-        // figure out the largest block_size we can use that will never
-        // have an intermediate result bigger than
-        // MAX_BYTES_BEFORE_SPLIT
-        bool ok = true;
-        while (block_size > 0) {
-            ok = true;
-
-            // keep the byte calculations in sync with the blocked code in
-            // the computation part.
-
-            // Criteria:
-            // 1. result block size
-            {
-                const size_t i_min = 0;
-                const size_t j_min = 0;
-                size_t i_max = i_min + block_size;
-                size_t j_max = j_min + block_size;
-                if (i_max > a_rows) { i_max = a_rows; }
-                if (j_max > b_cols) { j_max = b_cols; }
-
-                const size_t bytes_size = sizeof(float) * (i_max - i_min) * (j_max - j_min);
-                if (bytes_size > MAX_BYTES_BEFORE_SPLIT) {
-                    ok = false;
-                }
-            }
-            // 2. and 3.
-            // Block size from 'a' and 'b'
-            {
-                const size_t i_min = 0;
-                const size_t j_min = 0;
-                const size_t k_min = 0;
-
-                size_t i_max = i_min + block_size;
-                size_t j_max = j_min + block_size;
-                size_t k_max = k_min + block_size;
-
-                if (i_max > a_rows) { i_max = a_rows; }
-                if (j_max > b_cols) { j_max = b_cols; }
-                if (k_max > a_cols) { k_max = a_cols; }
-
-                const size_t bytes_size_a = sizeof(float) * (k_max - k_min) * (i_max - i_min);
-                const size_t bytes_size_b = sizeof(float) * (k_max - k_min) * (j_max - j_min);
-
-                if (bytes_size_a > MAX_BYTES_BEFORE_SPLIT || bytes_size_b > MAX_BYTES_BEFORE_SPLIT) {
-                    ok = false;
-                }
-            }
-
-            if (!ok) {
-                block_size /= 2;
-                continue;
-            }
-            break;
-        }
-        GGML_ASSERT(block_size > 0);
-    }
-
-    //fprintf(stderr, "block_size=%zu a shape: %d %d b shape: %d %d\n", block_size, a_rows, a_cols, b_rows, b_cols);
-
-    // O(N^3) nested loop, where N is the number of blocks along one of
-    // the constituent dimensions.
-    size_t nb_A = (a_rows + block_size - 1) / block_size;
-    size_t nb_B = (b_cols + block_size - 1) / block_size;
-    size_t nb_A2 = (a_cols + block_size - 1) / block_size;
-
-    // make placeholder tensors for each block's result.
-    // 2D: (row, col) -> offset is: (x, y) -> x * nb_B + y
-    struct ggml_tensor ** result_blocks = (struct ggml_tensor **) malloc(nb_A * nb_B * sizeof(struct ggml_tensor *));
-
-    for (size_t i = 0; i < nb_A; ++i) {
-        for (size_t j = 0; j < nb_B; ++j) {
-            const size_t i_min = i * block_size;
-            const size_t j_min = j * block_size;
-            size_t i_max = i_min + block_size;
-            size_t j_max = j_min + block_size;
-
-            if (i_max > a_rows) { i_max = a_rows; }
-            if (j_max > b_cols) { j_max = b_cols; }
-
-            struct ggml_tensor * result_block = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, i_max - i_min, j_max - j_min);
-            result_block = ggml_scale(ctx, result_block, 0.0f);
-
-            cb(result_block, "result_block-fresh", il);
-            result_blocks[i * nb_B + j] = result_block;
-        }
-    }
-
-    size_t num_blocks = 0;
-    for (size_t i = 0; i < nb_A; ++i) {
-        for (size_t j = 0; j < nb_B; ++j) {
-            for (size_t k = 0; k < nb_A2; ++k) {
-                num_blocks++;
-
-                const size_t i_min = i * block_size;
-                const size_t j_min = j * block_size;
-                const size_t k_min = k * block_size;
-
-                size_t i_max = i_min + block_size;
-                size_t j_max = j_min + block_size;
-                size_t k_max = k_min + block_size;
-                if (i_max > a_rows) { i_max = a_rows; }
-                if (j_max > b_cols) { j_max = b_cols; }
-                if (k_max > a_cols) { k_max = a_cols; }
-
-                const size_t blck_size_a = (const size_t) ggml_blck_size(a->type);
-                const size_t blck_size_b = (const size_t) ggml_blck_size(b->type);
-                const size_t type_size_a = ggml_type_size(a->type);
-                const size_t type_size_b = ggml_type_size(b->type);
-
-                GGML_ASSERT(k_min * type_size_a % blck_size_a == 0);
-                GGML_ASSERT(k_min * type_size_b % blck_size_b == 0);
-
-                // example:
-                //   blck_size_a = 32
-                //   type_size_a = 19
-                //   k_min       = 4
-                //
-                // byte_offset = k_min * type_size_a / blck_size_a
-                //             = 4 * 19 / 32 = 2 (integer division)
-
-                struct ggml_tensor * a_slice = ggml_view_2d(
-                    ctx, a,
-                    k_max - k_min, // k:k_max size
-                    i_max - i_min, // i:i_max size
-                    ggml_row_size(a->type, a->ne[0]),
-                    ggml_row_size(a->type, a->ne[0]) * i_min + k_min * type_size_a / blck_size_a);
-
-                cb(a_slice, "a_slice", il);
-
-                struct ggml_tensor * b_slice = ggml_view_2d(
-                    ctx, b,
-                    k_max - k_min, // k:k_max size
-                    j_max - j_min, // j:j_max size
-                    ggml_row_size(b->type, b->ne[0]),
-                    ggml_row_size(b->type, b->ne[0]) * j_min + k_min * type_size_b / blck_size_b);
-
-                cb(b_slice, "b_slice", il);
-
-                struct ggml_tensor * result_slice = result_blocks[i * nb_B + j];
-
-                struct ggml_tensor * mm_result = ggml_mul_mat(ctx, a_slice, b_slice);
-                cb(mm_result, "mm_result", il);
-
-                result_blocks[i * nb_B + j] = ggml_add(ctx, result_slice, mm_result);
-                cb(result_blocks[i * nb_B + j], "result_slice", il);
-            }
-        }
-    }
-
-    // concatenate the results into one big tensor.
-    // ggml_concat requires the first two dimensions to be the same.
-    //
-    // We use this strategy: find the largest power of two that divides the
-    // size of all the tensors. Power of two to make it friendly to GPU
-    // code (TODO: the GCD might be better? but not sure it won't break code).
-    //
-    // Flatten all the tensors to (X, 1, N, 1).
-    size_t split_size = 1;
-    while (1) {
-        size_t candidate_split_size = split_size << 1;
-        bool bad = false;
-
-        for (size_t i = 0; i < nb_A * nb_B; ++i) {
-            size_t rows = result_blocks[i]->ne[0];
-            size_t cols = result_blocks[i]->ne[1];
-
-            if (candidate_split_size > rows * cols) {
-                bad = true;
-                break;
-            }
-
-            if ((rows * cols) % candidate_split_size != 0) {
-                bad = true;
-                break;
-            }
-        }
-
-        if (bad) {
-            break;
-        }
-
-        split_size = candidate_split_size;
-    }
-
-    struct ggml_tensor * result_final = nullptr;
-    const ggml_type wanted_final_type = a->type;
-
-    // TODO: looks like concat also wants f32, so everything is cast to
-    // f32 here. A datatype-agnostic concat would be nice; or the ability
-    // to do the tensor equivalent of an unsafe type cast.
-    //
-    // The Command-R+ tensor this code was written for was 6GB. So this is
-    // going to handle 12GB I guess. Oof.
-    //
-    // I believe you could be smarter and combine hierarchically instead of
-    // one by one. I.e. we are doing a concatenation like this:
-    //   for x in range(100):
-    //       accum = accum + [x] (copies accum every time? maybe. didn't read concat code)
-    //
-    // You could instead divide and conquer to make it a bit smarter.
-    for (size_t i = 0; i < nb_A; ++i) {
-        for (size_t j = 0; j < nb_B; ++j) {
-            struct ggml_tensor * src_block = result_blocks[i * nb_B + j];
-
-            const size_t rows = src_block->ne[0];
-            const size_t cols = src_block->ne[1];
-            GGML_ASSERT(rows * cols % split_size == 0);
-
-            const size_t nflattened_rows = split_size;
-            const size_t n3 = (rows * cols) / split_size;
-
-            src_block = ggml_view_3d(ctx, src_block,
-                nflattened_rows,
-                1,
-                n3,
-                nflattened_rows * ggml_element_size(src_block),
-                nflattened_rows * ggml_element_size(src_block),
-                0);
-
-            if (result_final == nullptr) {
-                if (src_block->type != GGML_TYPE_F32) {
-                    result_final = ggml_cast(ctx, src_block, GGML_TYPE_F32);
-                    cb(result_final, "result-upcast", il);
-                } else {
-                    result_final = src_block;
-                }
-                continue;
-            }
-
-            if (src_block->type != GGML_TYPE_F32) {
-                src_block = ggml_cast(ctx, src_block, GGML_TYPE_F32);
-            }
-            result_final = ggml_concat(ctx, result_final, src_block);
-            cb(result_final, "result_final-accumulator", il);
-        }
-    }
-
-    result_final = ggml_reshape_2d(ctx, result_final, c_rows, c_cols);
-    cb(result_final, "result_final", il);
-
-    free(result_blocks);
-
-    return result_final;
-}
-
 static struct ggml_tensor * llm_build_norm(
         struct ggml_context * ctx,
         struct ggml_tensor * cur,
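
To make the removed helper's structure easier to follow: it tiles one large matrix product into square blocks and accumulates per-block partial products, so that no single operand or result handed to the backend exceeds the ~2 GB threshold. Below is a minimal standalone sketch of that decomposition in plain C++ (illustrative names, row-major float arrays, no ggml types, and ignoring ggml's transposed-'b' and quantized-block conventions); it is not the removed code itself.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Plain-float sketch of the tiling the removed helper expressed as a ggml
// graph: C (n x m) = A (n x k) * B (k x m), computed in square blocks so no
// single sub-product grows past a chosen limit. Names and sizes are made up.
static void mat_mul_blocked(const std::vector<float> & A,
                            const std::vector<float> & B,
                            std::vector<float> & C,
                            size_t n, size_t k, size_t m, size_t block_size) {
    const size_t nb_i = (n + block_size - 1) / block_size;
    const size_t nb_j = (m + block_size - 1) / block_size;
    const size_t nb_k = (k + block_size - 1) / block_size;

    // same O(N^3) loop over blocks as the removed function: result block
    // (bi, bj) accumulates A-block (bi, bk) times B-block (bk, bj).
    for (size_t bi = 0; bi < nb_i; ++bi) {
        for (size_t bj = 0; bj < nb_j; ++bj) {
            for (size_t bk = 0; bk < nb_k; ++bk) {
                const size_t i_max = std::min(n, (bi + 1) * block_size);
                const size_t j_max = std::min(m, (bj + 1) * block_size);
                const size_t k_max = std::min(k, (bk + 1) * block_size);
                for (size_t i = bi * block_size; i < i_max; ++i) {
                    for (size_t j = bj * block_size; j < j_max; ++j) {
                        float acc = 0.0f;
                        for (size_t kk = bk * block_size; kk < k_max; ++kk) {
                            acc += A[i * k + kk] * B[kk * m + j];
                        }
                        C[i * m + j] += acc;
                    }
                }
            }
        }
    }
}

int main() {
    const size_t n = 5, k = 7, m = 3;
    std::vector<float> A(n * k, 1.0f), B(k * m, 2.0f), C(n * m, 0.0f);
    mat_mul_blocked(A, B, C, n, k, m, /*block_size=*/4);
    printf("C[0] = %.1f (expected %.1f)\n", C[0], 2.0f * k);
    return 0;
}
```

In the graph-building version removed above, each inner block product is a ggml_mul_mat node over 2D views and the accumulation is a ggml_add, followed by the concat/reshape step that reassembles the full result.
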
@@ -6813,7 +6479,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);
 
         // lm_head
-        cur = llama_build_mat_mul_blocked_computation(ctx0, model.output, cur, model, cb, -1, 0);
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
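
For context on the overflow the removed doc comment describes (int x = y * z on element counts): at Command-R+ scale the element count of a single tensor no longer fits in a 32-bit int. A small illustrative program, using assumed lm_head-like dimensions (256000 vocab x 12288 hidden, not taken from this diff), shows the magnitudes involved.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Assumed, Command-R+-like lm_head shape: ~256000 vocab x 12288 hidden.
    const int64_t ne0 = 12288;
    const int64_t ne1 = 256000;

    const int64_t n_elems     = ne0 * ne1;   // 3,145,728,000 elements
    const int64_t n_bytes_f16 = n_elems * 2; // ~6 GB at 16 bits per element

    printf("elements       : %lld\n", (long long) n_elems);
    printf("bytes (f16)    : %lld\n", (long long) n_bytes_f16);
    printf("INT32_MAX      : %d\n",   INT32_MAX);
    printf("fits in int32? : %s\n",   n_elems <= INT32_MAX ? "yes" : "no");

    // What a backend computing `int idx = ne0 * ne1;` ends up with: the value
    // wraps modulo 2^32 on mainstream targets instead of being ~3.1 billion.
    printf("as int32       : %d\n", (int32_t) n_elems);
    return 0;
}
```

At fp16, roughly 3.1 billion elements comes to about 6 GB, which lines up with the 6GB figure mentioned in the removed TODO comment.
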