Skip to content

ggml: move op parameters from tensors to ggml_tensor::op_params #2333

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 8 commits merged on Jul 23, 2023 (source and target branch names were not captured)
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 12 additions & 9 deletions ggml-cuda.cu
Original file line number | Diff line number | Diff line change
Expand Up @@ -2688,6 +2688,7 @@ inline void ggml_cuda_op_mul(
(void) dst;
(void) src0_ddq_i;
(void) i02;
(void) i1;
}

inline void ggml_cuda_op_gelu(
Expand Down Expand Up @@ -2983,15 +2984,15 @@ inline void ggml_cuda_op_rope(
const int64_t ne00 = src0->ne[0];
const int64_t i01_diff = i01_high - i01_low;

const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];

const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
const int n_ctx = ((int32_t *) dst->op_params)[3];
// RoPE alteration for extended context

float freq_base, freq_scale;
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
Expand All @@ -3007,6 +3008,7 @@ inline void ggml_cuda_op_rope(
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
}

(void) src1;
(void) dst;
(void) src0_ddq_i;
(void) src1_ddf_i;
Expand All @@ -3025,11 +3027,12 @@ inline void ggml_cuda_op_diag_mask_inf(
const int64_t ne01 = src0->ne[1];
const int64_t i01_diff = i01_high - i01_low;

const int n_past = ((int32_t *) src1->data)[0];
const int n_past = ((int32_t *) dst->op_params)[0];

// compute
diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);

(void) src1;
(void) dst;
(void) src0_ddq_i;
(void) src1_ddf_i;
Expand Down Expand Up @@ -3738,7 +3741,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
size_t offset = 0;
if (tensor->op == GGML_OP_VIEW) {
memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
memcpy(&offset, tensor->op_params, sizeof(size_t));
}
extra = ggml_cuda_alloc_temp_tensor_extra();
extra->data_device[g_main_device] = src0_ddc + offset;
Expand Down
20 changes: 10 additions & 10 deletions ggml-metal.m
Original file line number | Diff line number | Diff line change
Expand Up @@ -577,7 +577,7 @@ void ggml_metal_graph_compute(
encoder = [command_buffer computeCommandEncoder];
}

const int n_past = ((int32_t *)(src1->data))[0];
const int n_past = ((int32_t *)(dst->op_params))[0];

[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
Expand Down Expand Up @@ -842,9 +842,10 @@ void ggml_metal_graph_compute(

GGML_ASSERT((src0t == GGML_TYPE_F32));

const int n_past = ((int32_t *) src1->data)[0]; UNUSED(n_past);
const int n_head = ((int32_t *) src1->data)[1];
const float max_bias = ((float *) src1->data)[2];
const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

if (__builtin_popcount(n_head) != 1) {
GGML_ASSERT(false && "only power-of-two n_head implemented");
Expand Down Expand Up @@ -882,15 +883,14 @@ void ggml_metal_graph_compute(
encoder = [command_buffer computeCommandEncoder];
}

const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];

const int n_past = ((int32_t *)(src1->data))[0];
const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];

float freq_base;
float freq_scale;
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

[encoder setComputePipelineState:ctx->pipeline_rope];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
Expand Down
Loading