@@ -10684,6 +10684,8 @@ static void ggml_compute_forward_mul_mat(
10684
10684
10685
10685
const enum ggml_type type = src0->type;
10686
10686
10687
+ const bool src1_cont = ggml_is_contiguous(src1);
10688
+
10687
10689
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
10688
10690
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
10689
10691
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
@@ -10747,7 +10749,7 @@ static void ggml_compute_forward_mul_mat(
10747
10749
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
10748
10750
10749
10751
if (type != GGML_TYPE_F32) {
10750
- float * const wdata = params->wdata;
10752
+ float * const wdata = params->wdata;
10751
10753
ggml_to_float_t const to_float = type_traits[type].to_float;
10752
10754
10753
10755
size_t id = 0;
@@ -10805,7 +10807,7 @@ static void ggml_compute_forward_mul_mat(
10805
10807
// src1 rows
10806
10808
const int64_t nr1 = ne11*ne12*ne13;
10807
10809
10808
- void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
10810
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
10809
10811
const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
10810
10812
10811
10813
for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
@@ -10828,7 +10830,15 @@ static void ggml_compute_forward_mul_mat(
10828
10830
const int64_t i3 = i13;
10829
10831
10830
10832
const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
10831
- const char * src1_col = (const char *) wdata + (i11 + i12*ne11 + i13*ne12*ne11)*row_size;
10833
+
10834
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
10835
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
10836
+ // the original src1 data pointer, so we should index using the indices directly
10837
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
10838
+ const char * src1_col = (const char *) wdata +
10839
+ (src1_cont || src1->type != vec_dot_type
10840
+ ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
10841
+ : (i11*nb11 + i12*nb12 + i13*nb13));
10832
10842
10833
10843
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
10834
10844
@@ -12982,12 +12992,13 @@ static void ggml_compute_forward_conv_1d(
12982
12992
};
12983
12993
}
12984
12994
12985
- // ggml_compute_forward_conv_2d_sk_p0
12995
+ // ggml_compute_forward_conv_2d
12986
12996
12987
- static void ggml_compute_forward_conv_2d_sk_p0_f16_f32 (
12997
+ static void ggml_compute_forward_conv_2d_f16_f32 (
12988
12998
const struct ggml_compute_params * params,
12989
12999
const struct ggml_tensor * src0,
12990
13000
const struct ggml_tensor * src1,
13001
+ const struct ggml_tensor * opt0,
12991
13002
struct ggml_tensor * dst) {
12992
13003
GGML_ASSERT(src0->type == GGML_TYPE_F16);
12993
13004
GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13007,28 +13018,37 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
13007
13018
// size of the convolution row - the kernel size unrolled across all channels
13008
13019
const int ew0 = nk0*nk1*ne02;
13009
13020
13021
+ const int32_t s0 = ((const int32_t*)(opt0->data))[0];
13022
+ const int32_t s1 = ((const int32_t*)(opt0->data))[1];
13023
+ const int32_t p0 = ((const int32_t*)(opt0->data))[2];
13024
+ const int32_t p1 = ((const int32_t*)(opt0->data))[3];
13025
+ const int32_t d0 = ((const int32_t*)(opt0->data))[4];
13026
+ const int32_t d1 = ((const int32_t*)(opt0->data))[5];
13027
+
13010
13028
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13011
13029
GGML_ASSERT(nb10 == sizeof(float));
13012
13030
13013
13031
if (params->type == GGML_TASK_INIT) {
13014
- // TODO: fix this memset (wsize is overestimated)
13015
13032
memset(params->wdata, 0, params->wsize);
13016
13033
13017
13034
// prepare source data (src1)
13018
13035
{
13019
13036
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13020
13037
13021
- for (int i13 = 0; i13 < ne13; i13++) {
13022
- for (int i12 = 0; i12 < ne12; i12++) {
13023
- const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
13024
- ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
13038
+ for (int i12 = 0; i12 < ne12; i12++) {
13039
+ const float * const src = (float *)((char *) src1->data + i12*nb12);
13040
+ ggml_fp16_t * dst_data = wdata;
13025
13041
13026
- for (int i1 = 0; i1 < ne1; i1++) {
13027
- for (int i0 = 0; i0 < ne0; i0++) {
13028
- for (int ik1 = 0; ik1 < nk1; ik1++) {
13029
- for (int ik0 = 0; ik0 < nk0; ik0++) {
13042
+ for (int i1 = 0; i1 < ne1; i1++) {
13043
+ for (int i0 = 0; i0 < ne0; i0++) {
13044
+ for (int ik1 = 0; ik1 < nk1; ik1++) {
13045
+ for (int ik0 = 0; ik0 < nk0; ik0++) {
13046
+ const int idx0 = i0*s0 + ik0*d0 - p0;
13047
+ const int idx1 = i1*s1 + ik1*d1 - p1;
13048
+
13049
+ if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
13030
13050
dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
13031
- GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)* ne10 + (i0*nk0 + ik0) ]);
13051
+ GGML_FP32_TO_FP16(src[idx1* ne10 + idx0 ]);
13032
13052
}
13033
13053
}
13034
13054
}
@@ -13071,19 +13091,21 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
13071
13091
}
13072
13092
}
13073
13093
13074
- static void ggml_compute_forward_conv_2d_sk_p0 (
13094
+ static void ggml_compute_forward_conv_2d (
13075
13095
const struct ggml_compute_params * params,
13076
13096
const struct ggml_tensor * src0,
13077
13097
const struct ggml_tensor * src1,
13078
- struct ggml_tensor * dst) {
13098
+ const struct ggml_tensor * opt0,
13099
+ struct ggml_tensor * dst
13100
+ ) {
13079
13101
switch (src0->type) {
13080
13102
case GGML_TYPE_F16:
13081
13103
{
13082
- ggml_compute_forward_conv_2d_sk_p0_f16_f32 (params, src0, src1, dst);
13104
+ ggml_compute_forward_conv_2d_f16_f32 (params, src0, src1, opt0 , dst);
13083
13105
} break;
13084
13106
case GGML_TYPE_F32:
13085
13107
{
13086
- //ggml_compute_forward_conv_2d_sk_p0_f32 (params, src0, src1, dst);
13108
+ //ggml_compute_forward_conv_2d_f32 (params, src0, src1, opt0 , dst);
13087
13109
GGML_ASSERT(false);
13088
13110
} break;
13089
13111
default:
@@ -13093,32 +13115,6 @@ static void ggml_compute_forward_conv_2d_sk_p0(
13093
13115
}
13094
13116
}
13095
13117
13096
- // ggml_compute_forward_conv_2d
13097
-
13098
- static void ggml_compute_forward_conv_2d(
13099
- const struct ggml_compute_params* params,
13100
- const struct ggml_tensor* src0,
13101
- const struct ggml_tensor* src1,
13102
- const struct ggml_tensor* opt0,
13103
- struct ggml_tensor* dst) {
13104
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
13105
- const int32_t s1 = ((const int32_t*)(opt0->data))[1];
13106
- const int32_t p0 = ((const int32_t*)(opt0->data))[2];
13107
- const int32_t p1 = ((const int32_t*)(opt0->data))[3];
13108
- const int32_t d0 = ((const int32_t*)(opt0->data))[4];
13109
- const int32_t d1 = ((const int32_t*)(opt0->data))[5];
13110
- GGML_ASSERT(d0 == 1); // dilation not supported
13111
- GGML_ASSERT(d1 == 1);
13112
- GGML_ASSERT(p0 == 0); // padding not supported
13113
- GGML_ASSERT(p1 == 0);
13114
-
13115
- if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
13116
- ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
13117
- } else {
13118
- GGML_ASSERT(false); // only stride equal to kernel size is supported
13119
- }
13120
- }
13121
-
13122
13118
// ggml_compute_forward_pool_1d_sk_p0
13123
13119
13124
13120
static void ggml_compute_forward_pool_1d_sk_p0(
@@ -16575,19 +16571,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16575
16571
const int64_t ne11 = node->src[1]->ne[1]; // H
16576
16572
const int64_t ne12 = node->src[1]->ne[2]; // C
16577
16573
16574
+ const int64_t ne0 = node->ne[0];
16575
+ const int64_t ne1 = node->ne[1];
16576
+ const int64_t ne2 = node->ne[2];
16578
16577
const int64_t nk = ne00*ne01;
16578
+ const int64_t ew0 = nk * ne02;
16579
16579
16580
- UNUSED(ne02);
16581
16580
UNUSED(ne03);
16582
- UNUSED(nk );
16581
+ UNUSED(ne2 );
16583
16582
16584
16583
size_t cur = 0;
16585
16584
16586
16585
if (node->src[0]->type == GGML_TYPE_F16 &&
16587
- node->src[1]->type == GGML_TYPE_F32) {
16588
- cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12 );
16586
+ node->src[1]->type == GGML_TYPE_F32) {
16587
+ cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0 );
16589
16588
} else if (node->src[0]->type == GGML_TYPE_F32 &&
16590
- node->src[1]->type == GGML_TYPE_F32) {
16589
+ node->src[1]->type == GGML_TYPE_F32) {
16591
16590
cur = sizeof(float)* (ne10*ne11*ne12);
16592
16591
} else {
16593
16592
GGML_ASSERT(false);
0 commit comments