Skip to content

Commit 6979666

Browse files
committed
ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)
1 parent 27ad57a commit 6979666

File tree

2 files changed

+101
-52
lines changed

2 files changed

+101
-52
lines changed

ggml-cuda.cu

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1667,6 +1667,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
16671667
dst[i + 1] = x0*sin_theta + x1*cos_theta;
16681668
}
16691669

1670+
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
1671+
const int col = blockDim.x*blockIdx.x + threadIdx.x;
1672+
const int half_n_dims = ncols/4;
1673+
1674+
if (col >= half_n_dims) {
1675+
return;
1676+
}
1677+
1678+
const int row = blockDim.y*blockIdx.y + threadIdx.y;
1679+
const int i = row*ncols + col;
1680+
1681+
const float col_theta_scale = powf(theta_scale, col);
1682+
1683+
const float theta = p*col_theta_scale;
1684+
const float sin_theta = sinf(theta);
1685+
const float cos_theta = cosf(theta);
1686+
1687+
const float x0 = x[i + 0];
1688+
const float x1 = x[i + half_n_dims];
1689+
1690+
dst[i + 0] = x0*cos_theta - x1*sin_theta;
1691+
dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
1692+
1693+
const float block_theta = block_p*col_theta_scale;
1694+
const float sin_block_theta = sinf(block_theta);
1695+
const float cos_block_theta = cosf(block_theta);
1696+
1697+
const float x2 = x[i + half_n_dims * 2];
1698+
const float x3 = x[i + half_n_dims * 3];
1699+
1700+
dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
1701+
dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
1702+
}
1703+
16701704
static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
16711705
const int col = blockDim.x*blockIdx.x + threadIdx.x;
16721706
const int row = blockDim.y*blockIdx.y + threadIdx.y;
@@ -2064,6 +2098,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
20642098
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
20652099
}
20662100

2101+
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
2102+
GGML_ASSERT(nrows % 4 == 0);
2103+
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
2104+
const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
2105+
const dim3 block_nums(num_blocks_x, nrows, 1);
2106+
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
2107+
}
2108+
20672109
static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
20682110
const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
20692111
const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2618,13 +2660,21 @@ inline void ggml_cuda_op_rope(
26182660
const int n_past = ((int32_t *) src1->data)[0];
26192661
const int n_dims = ((int32_t *) src1->data)[1];
26202662
const int mode = ((int32_t *) src1->data)[2];
2621-
GGML_ASSERT(mode == 0);
2663+
const int n_ctx = ((int32_t *) src1->data)[3];
26222664

26232665
const float theta_scale = powf(10000.0, -2.0f/n_dims);
26242666
const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
26252667

2668+
bool is_glm = mode & 4;
2669+
26262670
// compute
2627-
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
2671+
if (is_glm) {
2672+
const float id_p = min(p, n_ctx - 2.f);
2673+
const float block_p = max(p - (n_ctx - 2.f), 0.f);
2674+
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
2675+
} else {
2676+
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
2677+
}
26282678

26292679
(void) dst;
26302680
(void) src0_ddq_i;

ggml.c

Lines changed: 49 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -10684,6 +10684,8 @@ static void ggml_compute_forward_mul_mat(
1068410684

1068510685
const enum ggml_type type = src0->type;
1068610686

10687+
const bool src1_cont = ggml_is_contiguous(src1);
10688+
1068710689
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
1068810690
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
1068910691
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
@@ -10747,7 +10749,7 @@ static void ggml_compute_forward_mul_mat(
1074710749
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
1074810750

1074910751
if (type != GGML_TYPE_F32) {
10750-
float * const wdata = params->wdata;
10752+
float * const wdata = params->wdata;
1075110753
ggml_to_float_t const to_float = type_traits[type].to_float;
1075210754

1075310755
size_t id = 0;
@@ -10805,7 +10807,7 @@ static void ggml_compute_forward_mul_mat(
1080510807
// src1 rows
1080610808
const int64_t nr1 = ne11*ne12*ne13;
1080710809

10808-
void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
10810+
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
1080910811
const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
1081010812

1081110813
for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
@@ -10828,7 +10830,15 @@ static void ggml_compute_forward_mul_mat(
1082810830
const int64_t i3 = i13;
1082910831

1083010832
const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
10831-
const char * src1_col = (const char *) wdata + (i11 + i12*ne11 + i13*ne12*ne11)*row_size;
10833+
10834+
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
10835+
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
10836+
// the original src1 data pointer, so we should index using the indices directly
10837+
// TODO: this is a bit of a hack, we should probably have a better way to handle this
10838+
const char * src1_col = (const char *) wdata +
10839+
(src1_cont || src1->type != vec_dot_type
10840+
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
10841+
: (i11*nb11 + i12*nb12 + i13*nb13));
1083210842

1083310843
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
1083410844

@@ -12982,12 +12992,13 @@ static void ggml_compute_forward_conv_1d(
1298212992
};
1298312993
}
1298412994

12985-
// ggml_compute_forward_conv_2d_sk_p0
12995+
// ggml_compute_forward_conv_2d
1298612996

12987-
static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
12997+
static void ggml_compute_forward_conv_2d_f16_f32(
1298812998
const struct ggml_compute_params * params,
1298912999
const struct ggml_tensor * src0,
1299013000
const struct ggml_tensor * src1,
13001+
const struct ggml_tensor * opt0,
1299113002
struct ggml_tensor * dst) {
1299213003
GGML_ASSERT(src0->type == GGML_TYPE_F16);
1299313004
GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13007,28 +13018,37 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
1300713018
// size of the convolution row - the kernel size unrolled across all channels
1300813019
const int ew0 = nk0*nk1*ne02;
1300913020

13021+
const int32_t s0 = ((const int32_t*)(opt0->data))[0];
13022+
const int32_t s1 = ((const int32_t*)(opt0->data))[1];
13023+
const int32_t p0 = ((const int32_t*)(opt0->data))[2];
13024+
const int32_t p1 = ((const int32_t*)(opt0->data))[3];
13025+
const int32_t d0 = ((const int32_t*)(opt0->data))[4];
13026+
const int32_t d1 = ((const int32_t*)(opt0->data))[5];
13027+
1301013028
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
1301113029
GGML_ASSERT(nb10 == sizeof(float));
1301213030

1301313031
if (params->type == GGML_TASK_INIT) {
13014-
// TODO: fix this memset (wsize is overestimated)
1301513032
memset(params->wdata, 0, params->wsize);
1301613033

1301713034
// prepare source data (src1)
1301813035
{
1301913036
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
1302013037

13021-
for (int i13 = 0; i13 < ne13; i13++) {
13022-
for (int i12 = 0; i12 < ne12; i12++) {
13023-
const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
13024-
ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
13038+
for (int i12 = 0; i12 < ne12; i12++) {
13039+
const float * const src = (float *)((char *) src1->data + i12*nb12);
13040+
ggml_fp16_t * dst_data = wdata;
1302513041

13026-
for (int i1 = 0; i1 < ne1; i1++) {
13027-
for (int i0 = 0; i0 < ne0; i0++) {
13028-
for (int ik1 = 0; ik1 < nk1; ik1++) {
13029-
for (int ik0 = 0; ik0 < nk0; ik0++) {
13042+
for (int i1 = 0; i1 < ne1; i1++) {
13043+
for (int i0 = 0; i0 < ne0; i0++) {
13044+
for (int ik1 = 0; ik1 < nk1; ik1++) {
13045+
for (int ik0 = 0; ik0 < nk0; ik0++) {
13046+
const int idx0 = i0*s0 + ik0*d0 - p0;
13047+
const int idx1 = i1*s1 + ik1*d1 - p1;
13048+
13049+
if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
1303013050
dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
13031-
GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
13051+
GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
1303213052
}
1303313053
}
1303413054
}
@@ -13071,19 +13091,21 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
1307113091
}
1307213092
}
1307313093

13074-
static void ggml_compute_forward_conv_2d_sk_p0(
13094+
static void ggml_compute_forward_conv_2d(
1307513095
const struct ggml_compute_params * params,
1307613096
const struct ggml_tensor * src0,
1307713097
const struct ggml_tensor * src1,
13078-
struct ggml_tensor * dst) {
13098+
const struct ggml_tensor * opt0,
13099+
struct ggml_tensor * dst
13100+
) {
1307913101
switch (src0->type) {
1308013102
case GGML_TYPE_F16:
1308113103
{
13082-
ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
13104+
ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
1308313105
} break;
1308413106
case GGML_TYPE_F32:
1308513107
{
13086-
//ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
13108+
//ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
1308713109
GGML_ASSERT(false);
1308813110
} break;
1308913111
default:
@@ -13093,32 +13115,6 @@ static void ggml_compute_forward_conv_2d_sk_p0(
1309313115
}
1309413116
}
1309513117

13096-
// ggml_compute_forward_conv_2d
13097-
13098-
static void ggml_compute_forward_conv_2d(
13099-
const struct ggml_compute_params* params,
13100-
const struct ggml_tensor* src0,
13101-
const struct ggml_tensor* src1,
13102-
const struct ggml_tensor* opt0,
13103-
struct ggml_tensor* dst) {
13104-
const int32_t s0 = ((const int32_t*)(opt0->data))[0];
13105-
const int32_t s1 = ((const int32_t*)(opt0->data))[1];
13106-
const int32_t p0 = ((const int32_t*)(opt0->data))[2];
13107-
const int32_t p1 = ((const int32_t*)(opt0->data))[3];
13108-
const int32_t d0 = ((const int32_t*)(opt0->data))[4];
13109-
const int32_t d1 = ((const int32_t*)(opt0->data))[5];
13110-
GGML_ASSERT(d0 == 1); // dilation not supported
13111-
GGML_ASSERT(d1 == 1);
13112-
GGML_ASSERT(p0 == 0); // padding not supported
13113-
GGML_ASSERT(p1 == 0);
13114-
13115-
if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
13116-
ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
13117-
} else {
13118-
GGML_ASSERT(false); // only stride equal to kernel size is supported
13119-
}
13120-
}
13121-
1312213118
// ggml_compute_forward_pool_1d_sk_p0
1312313119

1312413120
static void ggml_compute_forward_pool_1d_sk_p0(
@@ -16575,19 +16571,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
1657516571
const int64_t ne11 = node->src[1]->ne[1]; // H
1657616572
const int64_t ne12 = node->src[1]->ne[2]; // C
1657716573

16574+
const int64_t ne0 = node->ne[0];
16575+
const int64_t ne1 = node->ne[1];
16576+
const int64_t ne2 = node->ne[2];
1657816577
const int64_t nk = ne00*ne01;
16578+
const int64_t ew0 = nk * ne02;
1657916579

16580-
UNUSED(ne02);
1658116580
UNUSED(ne03);
16582-
UNUSED(nk);
16581+
UNUSED(ne2);
1658316582

1658416583
size_t cur = 0;
1658516584

1658616585
if (node->src[0]->type == GGML_TYPE_F16 &&
16587-
node->src[1]->type == GGML_TYPE_F32) {
16588-
cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
16586+
node->src[1]->type == GGML_TYPE_F32) {
16587+
cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
1658916588
} else if (node->src[0]->type == GGML_TYPE_F32 &&
16590-
node->src[1]->type == GGML_TYPE_F32) {
16589+
node->src[1]->type == GGML_TYPE_F32) {
1659116590
cur = sizeof(float)* (ne10*ne11*ne12);
1659216591
} else {
1659316592
GGML_ASSERT(false);

0 commit comments

Comments
 (0)