Skip to content

Commit e2d23be

Browse files
committed
falcon : minor changes (still chasing the Metal problem)
1 parent a0dc47a commit e2d23be

File tree

3 files changed: 51 additions and 52 deletions.

ggml-metal.m

Lines changed: 38 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -757,17 +757,17 @@ void ggml_metal_graph_compute(
757757
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
758758
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
759759
}
760-
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
761-
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
762-
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
763-
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
764-
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
765-
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
766-
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
767-
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
768-
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
769-
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
770-
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
760+
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
761+
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
762+
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
763+
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
764+
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
765+
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
766+
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
767+
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
768+
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
769+
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
770+
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
771771
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
772772
[encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
773773
} else {
@@ -945,11 +945,11 @@ void ggml_metal_graph_compute(
945945
const int nth = 256;
946946

947947
[encoder setComputePipelineState:ctx->pipeline_norm];
948-
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
949-
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
950-
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
951-
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
952-
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
948+
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
949+
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
950+
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
951+
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
952+
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
953953
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
954954

955955
const int64_t nrows = ggml_nrows(src0);
@@ -992,7 +992,9 @@ void ggml_metal_graph_compute(
992992
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
993993
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
994994
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
995+
995996
const int nth = 32;
997+
996998
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
997999
} break;
9981000
case GGML_OP_ROPE:
@@ -1007,8 +1009,8 @@ void ggml_metal_graph_compute(
10071009
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
10081010

10091011
[encoder setComputePipelineState:ctx->pipeline_rope];
1010-
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
1011-
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
1012+
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
1013+
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
10121014
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
10131015
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
10141016
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
@@ -1059,24 +1061,24 @@ void ggml_metal_graph_compute(
10591061
default: GGML_ASSERT(false && "not implemented");
10601062
}
10611063

1062-
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
1063-
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
1064-
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
1065-
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
1066-
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
1067-
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
1068-
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
1069-
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
1070-
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
1071-
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
1072-
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
1073-
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
1074-
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
1075-
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
1076-
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
1077-
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
1078-
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
1079-
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
1064+
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
1065+
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
1066+
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
1067+
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
1068+
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
1069+
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
1070+
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
1071+
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
1072+
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
1073+
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
1074+
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
1075+
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
1076+
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
1077+
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
1078+
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
1079+
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
1080+
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
1081+
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
10801082

10811083
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
10821084
} break;

ggml.c

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3554,9 +3554,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
35543554
// Element-wise ELU over n floats: y[i] = x[i] when x[i] > 0, otherwise expf(x[i]) - 1.
// Reads x[0..n-1] and writes y[0..n-1]; nothing is written when n <= 0.
// NOTE(review): expf(x)-1 can lose precision for x very close to zero; expm1f may be
// a better fit here -- confirm against upstream before changing.
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
35553555
// Element-wise ReLU over n floats: y[i] = x[i] when x[i] > 0, otherwise 0.
// Reads x[0..n-1] and writes y[0..n-1]; nothing is written when n <= 0.
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
35563556

3557-
static const float GELU_COEF_A = 0.044715f;
3558-
static const float GELU_QUICK_COEF = -1.702f;
3559-
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
3557+
static const float GELU_COEF_A = 0.044715f;
3558+
static const float GELU_QUICK_COEF = -1.702f;
3559+
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
35603560

35613561
inline static float ggml_gelu_f32(float x) {
35623562
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));

llama.cpp

Lines changed: 10 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -2545,26 +2545,23 @@ static struct ggml_cgraph * llm_build_falcon(
25452545
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
25462546
ggml_set_name(KQV_merged, "KQV_merged");
25472547

2548-
cur = ggml_cpy(ctx0,
2549-
KQV_merged,
2550-
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
2548+
cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
25512549
ggml_set_name(cur, "KQV_merged_contiguous");
25522550

2553-
cur = ggml_cpy(ctx0,
2554-
KQV_merged,
2555-
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
2556-
25572551
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
25582552
ggml_set_name(cur, "result_wo");
25592553
}
25602554

2561-
struct ggml_tensor * inpFF = attn_norm;
2562-
struct ggml_tensor * attn_out = ggml_cpy(
2563-
ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
2555+
struct ggml_tensor * attn_out = cur;
25642556

2565-
cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
2566-
cur = ggml_gelu(ctx0, cur);
2567-
cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
2557+
// feed forward
2558+
{
2559+
struct ggml_tensor * inpFF = attn_norm;
2560+
2561+
cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
2562+
cur = ggml_gelu(ctx0, cur);
2563+
cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
2564+
}
25682565

25692566
cur = ggml_add(ctx0, cur, attn_out);
25702567
cur = ggml_add(ctx0, cur, inpL);

0 commit comments

Comments
 (0)