@@ -4168,10 +4168,9 @@ static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

- return
- (t0->ne[0] == t1->ne[0]) &&
- (t0->ne[2] == t1->ne[2]) &&
- (t0->ne[3] == t1->ne[3]);
+ return (t0->ne[0] == t1->ne[0]) &&
+ (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
+ (t1->ne[3]%t0->ne[3] == 0);
}

static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
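This relaxed predicate is the heart of the patch: src0 (t0) no longer has to match src1 (t1) exactly in dims 2 and 3; its extents only have to divide t1's, i.e. t0 must be broadcastable into t1. A minimal standalone sketch of the same test, using hypothetical Falcon-like head counts (plain C, not the ggml API):

    #include <stdbool.h>
    #include <stdio.h>

    // Same test as ggml_can_mul_mat above: t0 broadcasts into t1 along dims 2 and 3.
    static bool can_mul_mat(const long t0_ne[4], const long t1_ne[4]) {
        return (t0_ne[0] == t1_ne[0]) &&      // shared dot-product dimension
               (t1_ne[2] % t0_ne[2] == 0) &&  // t0 repeats evenly along dim 2
               (t1_ne[3] % t0_ne[3] == 0);    // t0 repeats evenly along dim 3
    }

    int main(void) {
        const long k[4] = {64, 128,  8, 1};   // e.g. keys with 8 KV heads
        const long q[4] = {64,  32, 64, 1};   // queries with 64 heads
        printf("%d\n", can_mul_mat(k, q));    // 1: 64 is a multiple of 8
        const long r[4] = {64,  32, 60, 1};
        printf("%d\n", can_mul_mat(k, r));    // 0: 60 % 8 != 0
        return 0;
    }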
@@ -6036,8 +6035,8 @@ struct ggml_tensor * ggml_mul_mat(
is_node = true;
}

- const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne);
+ const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);

result->op = GGML_OP_MUL_MAT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
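Two shape changes here: dims 2 and 3 of the result now come from b (the side being broadcast into), and the rank is the MAX rather than the MIN of the operands' ranks. Worked example with the Falcon-like shapes above: a = {64, 128, 8, 1} and b = {64, 32, 64, 1} now yield ne = {128, 32, 64, 1}, one output slice per src1 slice; the old code would have taken ne[2] from a (giving 8), and ggml_can_mul_mat would have rejected the pair outright anyway.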
@@ -7173,7 +7172,6 @@ struct ggml_tensor* ggml_conv_2d(
int d0,
int d1) {

- GGML_ASSERT(b->ne[3] == 1);
GGML_ASSERT(a->ne[2] == b->ne[2]);
bool is_node = false;
@@ -7185,7 +7183,7 @@ struct ggml_tensor* ggml_conv_2d(
const int64_t ne[4] = {
ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
- a->ne[3], 1,
+ a->ne[3], b->ne[3],
};
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
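With the assert removed in the previous hunk, src1 may now carry a batch dimension, and it propagates into the result: ne = {output width, output height, out-channels from a, batch from b}, where previously the last dim was pinned to 1. For reference, a standalone sketch of the usual convolution output-size arithmetic (conv_output_size is a hypothetical helper; it assumes ggml_calc_conv_output_size follows the standard floor-division definition):

    #include <stdio.h>

    // out = (in + 2*pad - dilation*(kernel - 1) - 1)/stride + 1
    static long conv_output_size(long in, long kernel, int s, int p, int d) {
        return (in + 2*p - d*(kernel - 1) - 1)/s + 1;
    }

    int main(void) {
        // e.g. 32-wide input, 3-wide kernel, stride 1, padding 1, dilation 1 -> 32
        printf("%ld\n", conv_output_size(32, 3, 1, 1, 1));
        return 0;
    }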
@@ -10641,7 +10639,6 @@ static void ggml_compute_forward_rms_norm_back(
}
}

-
// ggml_compute_forward_mul_mat

#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -10685,17 +10682,17 @@ static void ggml_compute_forward_mul_mat(
const int ith = params->ith;
const int nth = params->nth;

- GGML_ASSERT(ne02 == ne12);
- GGML_ASSERT(ne03 == ne13);
- GGML_ASSERT(ne2 == ne12);
- GGML_ASSERT(ne3 == ne13);
-
const enum ggml_type type = src0->type;

ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;

+ GGML_ASSERT(ne0 == ne01);
+ GGML_ASSERT(ne1 == ne11);
+ GGML_ASSERT(ne2 == ne12);
+ GGML_ASSERT(ne3 == ne13);
+
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
GGML_ASSERT(nb10 == sizeof(float));
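Note what the reshuffled asserts check: the old ne02 == ne12 / ne03 == ne13 equalities would reject exactly the broadcast cases ggml_can_mul_mat now accepts, so they are dropped here (and reinstated only in the CLBlast/BLAS branches below, which do not handle broadcasting yet). The new group instead validates dst against the broadcast result shape that ggml_mul_mat now produces: ne2/ne3 must match src1's ne12/ne13.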
@@ -10706,16 +10703,16 @@ static void ggml_compute_forward_mul_mat(
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);

- GGML_ASSERT(ne0 == ne01);
- GGML_ASSERT(ne1 == ne11);
- GGML_ASSERT(ne2 == ne02);
- GGML_ASSERT(ne3 == ne03);
-
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows

#if defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+ // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
+ // ref: https://github.com/ggerganov/ggml/pull/224
+ GGML_ASSERT(ne02 == ne12);
+ GGML_ASSERT(ne03 == ne13);
+
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
}
@@ -10725,6 +10722,11 @@ static void ggml_compute_forward_mul_mat(

#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+ // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
+ // ref: https://github.com/ggerganov/ggml/pull/224
+ GGML_ASSERT(ne02 == ne12);
+ GGML_ASSERT(ne03 == ne13);
+
if (params->ith != 0) {
return;
}
@@ -10794,41 +10796,44 @@ static void ggml_compute_forward_mul_mat(
return;
}

- // parallelize by src0 rows using ggml_vec_dot_q
+ // parallelize by src0 rows
+ const int64_t dr = (ne01 + nth - 1)/nth;

- // total rows in src0
- const int nr = ne01*ne02*ne03;
+ const int64_t ir10 = dr*ith;
+ const int64_t ir11 = MIN(ir10 + dr, ne01);

- // rows per thread
- const int dr = (nr + nth - 1)/nth;
-
- // row range for this thread
- const int ir0 = dr*ith;
- const int ir1 = MIN(ir0 + dr, nr);
+ // src1 rows
+ const int64_t nr1 = ne11*ne12*ne13;

void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
- const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
-
- for (int ir = ir0; ir < ir1; ++ir) {
- // src0 indices
- const int i03 = ir/(ne02*ne01);
- const int i02 = (ir - i03*ne02*ne01)/ne01;
- const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
- const int i13 = i03;
- const int i12 = i02;
-
- const int i0 = i01;
- const int i2 = i02;
- const int i3 = i03;
-
- void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
- char * src1_col = ((char *) wdata + ((0 + i12*ne11 + i13*ne12*ne11)*row_size));
-
- float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
-
- for (int64_t ic = 0; ic < ne11; ++ic) {
- vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
+ const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+
+ for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
+ const int64_t i13 = (ir1/(ne12*ne11));
+ const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
+ const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+
+ const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
+ const int64_t i03 = (ir0/(ne02));
+ // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
+ // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
+ // GG: this is likely the correct way to broadcast, though need some more thought
+ // therefore leaving the comments to remind us for now
+ const int64_t i02 = (i12 / (ne12 / ne02));
+ // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
+ // const int64_t i02 = (ir0 - i03*ne02);
+
+ const int64_t i1 = i11;
+ const int64_t i2 = i12;
+ const int64_t i3 = i13;
+
+ const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
+ const char * src1_col = (const char *) wdata + (i11 + i12*ne11 + i13*ne12*ne11)*row_size;
+
+ float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+ for (int64_t ir = ir10; ir < ir11; ++ir) {
+ vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
}
}
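Two things change in the generic path. First, the parallelization: threads now split the ne01 rows of a single src0 matrix (range ir10..ir11) and each thread walks all nr1 = ne11*ne12*ne13 src1 rows, instead of splitting all ne01*ne02*ne03 src0 rows across threads. Second, the broadcast mapping i02 = i12/(ne12/ne02) assigns each consecutive block of ne12/ne02 src1 slices to one src0 slice. A standalone sketch of that mapping with hypothetical Falcon-like head counts:

    #include <stdio.h>

    int main(void) {
        // src0 (K) has ne02 = 8 KV heads, src1 (Q) has ne12 = 64 query heads:
        // query heads 0..7 read K head 0, heads 8..15 read K head 1, and so on.
        const long ne02 = 8, ne12 = 64;
        for (long i12 = 0; i12 < ne12; i12 += 8) {
            printf("i12=%2ld -> i02=%ld\n", i12, i12/(ne12/ne02));
        }
        return 0;
    }

The commented-out alternative from PR 224, i02 = ir0 - i03*ne02, is a plain row-major decode of the src0 slice index; the block mapping above is what makes the multi-query broadcast line up.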
@@ -13013,16 +13018,18 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
{
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;

- for (int i12 = 0; i12 < ne12; i12++) {
- const float * const src = (float *)((char *) src1->data + i12*nb12);
- ggml_fp16_t * dst_data = wdata;
-
- for (int i1 = 0; i1 < ne1; i1++) {
- for (int i0 = 0; i0 < ne0; i0++) {
- for (int ik1 = 0; ik1 < nk1; ik1++) {
- for (int ik0 = 0; ik0 < nk0; ik0++) {
- dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
- GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
+ for (int i13 = 0; i13 < ne13; i13++) {
+ for (int i12 = 0; i12 < ne12; i12++) {
+ const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
+ ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
+
+ for (int i1 = 0; i1 < ne1; i1++) {
+ for (int i0 = 0; i0 < ne0; i0++) {
+ for (int ik1 = 0; ik1 < nk1; ik1++) {
+ for (int ik0 = 0; ik0 < nk0; ik0++) {
+ dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
+ GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
+ }
}
}
}
@@ -13049,14 +13056,16 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(

ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;

- for (int i2 = ip0; i2 < ip1; i2++) {
- float * dst_data = (float *)((char *) dst->data + i2*nb2);
-
- for (int i1 = 0; i1 < ne1; ++i1) {
- for (int i0 = 0; i0 < ne0; ++i0) {
- ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
- (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
- (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
+ for (int i3 = 0; i3 < ne3; i3++) {
+ for (int i2 = ip0; i2 < ip1; i2++) {
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2);
+
+ for (int i1 = 0; i1 < ne1; ++i1) {
+ for (int i0 = 0; i0 < ne0; ++i0) {
+ ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
+ (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
+ (ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0);
+ }
}
}
}
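The batch support threads through both halves of the function: the staging loop in the previous hunk writes one block of ne1*ne0*ew0 half-precision values per batch element (dst_data = wdata + i13*(ne1*ne0*ew0)), and this compute loop gains the matching outer i3 iteration so each batch element is convolved against the same kernel tensor src0 (note src0's offset depends only on i2, not i3).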
@@ -13105,10 +13114,9 @@ static void ggml_compute_forward_conv_2d(

if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
- }
- else {
+ } else {
GGML_ASSERT(false); // only stride equal to kernel size is supported
- };
+ }
}

// ggml_compute_forward_pool_1d_sk_p0
@@ -16558,8 +16566,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
{
n_tasks = n_threads;

- GGML_ASSERT(node->src[1]->ne[3] == 1);
-
const int64_t ne00 = node->src[0]->ne[0]; // W
const int64_t ne01 = node->src[0]->ne[1]; // H
const int64_t ne02 = node->src[0]->ne[2]; // C