Skip to content

Commit 95a6c59

Browse files
authored
ggml: move op parameters from tensors to ggml_tensor::op_params (#2333)
* ggml: move op parameters from tensors to ggml_tensor::op_params
* alibi: use memcpy for float params
* remove `src[1] = NULL` in ops
1 parent e76d630 commit 95a6c59

File tree

4 files changed

+226
-486
lines changed

4 files changed

+226
-486
lines changed

ggml-cuda.cu

Lines changed: 12 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -2742,6 +2742,7 @@ inline void ggml_cuda_op_mul(
27422742
(void) dst;
27432743
(void) src0_ddq_i;
27442744
(void) i02;
2745+
(void) i1;
27452746
}
27462747

27472748
inline void ggml_cuda_op_gelu(
@@ -3037,15 +3038,15 @@ inline void ggml_cuda_op_rope(
30373038
const int64_t ne00 = src0->ne[0];
30383039
const int64_t i01_diff = i01_high - i01_low;
30393040

3040-
const int n_past = ((int32_t *) src1->data)[0];
3041-
const int n_dims = ((int32_t *) src1->data)[1];
3042-
const int mode = ((int32_t *) src1->data)[2];
3043-
const int n_ctx = ((int32_t *) src1->data)[3];
3044-
3041+
const int n_past = ((int32_t *) dst->op_params)[0];
3042+
const int n_dims = ((int32_t *) dst->op_params)[1];
3043+
const int mode = ((int32_t *) dst->op_params)[2];
3044+
const int n_ctx = ((int32_t *) dst->op_params)[3];
30453045
// RoPE alteration for extended context
3046+
30463047
float freq_base, freq_scale;
3047-
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
3048-
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
3048+
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
3049+
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
30493050

30503051
const float theta_scale = powf(freq_base, -2.0f/n_dims);
30513052
const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
@@ -3061,6 +3062,7 @@ inline void ggml_cuda_op_rope(
30613062
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
30623063
}
30633064

3065+
(void) src1;
30643066
(void) dst;
30653067
(void) src0_ddq_i;
30663068
(void) src1_ddf_i;
@@ -3079,11 +3081,12 @@ inline void ggml_cuda_op_diag_mask_inf(
30793081
const int64_t ne01 = src0->ne[1];
30803082
const int64_t i01_diff = i01_high - i01_low;
30813083

3082-
const int n_past = ((int32_t *) src1->data)[0];
3084+
const int n_past = ((int32_t *) dst->op_params)[0];
30833085

30843086
// compute
30853087
diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
30863088

3089+
(void) src1;
30873090
(void) dst;
30883091
(void) src0_ddq_i;
30893092
(void) src1_ddf_i;
@@ -3803,7 +3806,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
38033806
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
38043807
size_t offset = 0;
38053808
if (tensor->op == GGML_OP_VIEW) {
3806-
memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
3809+
memcpy(&offset, tensor->op_params, sizeof(size_t));
38073810
}
38083811
extra = ggml_cuda_alloc_temp_tensor_extra();
38093812
extra->data_device[g_main_device] = src0_ddc + offset;

ggml-metal.m

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -585,7 +585,7 @@ void ggml_metal_graph_compute(
585585
encoder = [command_buffer computeCommandEncoder];
586586
}
587587

588-
const int n_past = ((int32_t *)(src1->data))[0];
588+
const int n_past = ((int32_t *)(dst->op_params))[0];
589589

590590
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
591591
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -850,9 +850,10 @@ void ggml_metal_graph_compute(
850850

851851
GGML_ASSERT((src0t == GGML_TYPE_F32));
852852

853-
const int n_past = ((int32_t *) src1->data)[0]; UNUSED(n_past);
854-
const int n_head = ((int32_t *) src1->data)[1];
855-
const float max_bias = ((float *) src1->data)[2];
853+
const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
854+
const int n_head = ((int32_t *) dst->op_params)[1];
855+
float max_bias;
856+
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
856857

857858
if (__builtin_popcount(n_head) != 1) {
858859
GGML_ASSERT(false && "only power-of-two n_head implemented");
@@ -890,15 +891,14 @@ void ggml_metal_graph_compute(
890891
encoder = [command_buffer computeCommandEncoder];
891892
}
892893

893-
const int n_dims = ((int32_t *) src1->data)[1];
894-
const int mode = ((int32_t *) src1->data)[2];
895-
896-
const int n_past = ((int32_t *)(src1->data))[0];
894+
const int n_past = ((int32_t *) dst->op_params)[0];
895+
const int n_dims = ((int32_t *) dst->op_params)[1];
896+
const int mode = ((int32_t *) dst->op_params)[2];
897897

898898
float freq_base;
899899
float freq_scale;
900-
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
901-
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
900+
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
901+
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
902902

903903
[encoder setComputePipelineState:ctx->pipeline_rope];
904904
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];

0 commit comments

Comments
 (0)