64
64
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,
65
65
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S,
66
66
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
67
+ GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M,
67
68
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
68
69
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS,
69
70
GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
91
92
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,
92
93
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32,
93
94
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
95
+ GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32,
94
96
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
95
97
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32,
96
98
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
114
116
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,
115
117
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32,
116
118
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
119
+ GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,
117
120
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
118
121
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,
119
122
GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
134
137
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,
135
138
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32,
136
139
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
140
+ GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,
137
141
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
138
142
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,
139
143
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
154
158
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
155
159
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,
156
160
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
161
+ GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32,
157
162
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
158
163
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,
159
164
GGML_METAL_KERNEL_TYPE_ROPE_F32,
@@ -490,6 +495,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
490
495
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true );
491
496
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S, get_rows_iq2_s, true );
492
497
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true );
498
+ GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M, get_rows_iq1_m, true );
493
499
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true );
494
500
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, get_rows_iq4_xs, true );
495
501
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true );
@@ -517,6 +523,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
517
523
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction );
518
524
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, mul_mv_iq2_s_f32, ctx->support_simdgroup_reduction );
519
525
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction );
526
+ GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32, mul_mv_iq1_m_f32, ctx->support_simdgroup_reduction );
520
527
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction );
521
528
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, mul_mv_iq4_xs_f32, ctx->support_simdgroup_reduction );
522
529
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction );
@@ -540,6 +547,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
540
547
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction );
541
548
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, ctx->support_simdgroup_reduction );
542
549
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction );
550
+ GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, mul_mv_id_iq1_m_f32, ctx->support_simdgroup_reduction );
543
551
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction );
544
552
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, mul_mv_id_iq4_xs_f32, ctx->support_simdgroup_reduction );
545
553
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm );
@@ -560,6 +568,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
560
568
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm );
561
569
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, ctx->support_simdgroup_mm );
562
570
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm );
571
+ GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, ctx->support_simdgroup_mm );
563
572
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm );
564
573
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, ctx->support_simdgroup_mm );
565
574
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm );
@@ -580,6 +589,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
580
589
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm );
581
590
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, ctx->support_simdgroup_mm );
582
591
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm );
592
+ GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, ctx->support_simdgroup_mm );
583
593
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm );
584
594
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm );
585
595
GGML_METAL_ADD_KERNEL (GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true );
@@ -1421,6 +1431,7 @@ static enum ggml_status ggml_metal_graph_compute(
1421
1431
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline ; break ;
1422
1432
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32 ].pipeline ; break ;
1423
1433
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline ; break ;
1434
+ case GGML_TYPE_IQ1_M: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline ; break ;
1424
1435
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline ; break ;
1425
1436
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline ; break ;
1426
1437
default : GGML_ASSERT (false && " MUL MAT-MAT not implemented" );
@@ -1575,6 +1586,12 @@ static enum ggml_status ggml_metal_graph_compute(
1575
1586
nth1 = 16 ;
1576
1587
pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline ;
1577
1588
} break ;
1589
+ case GGML_TYPE_IQ1_M:
1590
+ {
1591
+ nth0 = 4 ;
1592
+ nth1 = 16 ;
1593
+ pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline ;
1594
+ } break ;
1578
1595
case GGML_TYPE_IQ4_NL:
1579
1596
{
1580
1597
nth0 = 4 ;
@@ -1619,9 +1636,9 @@ static enum ggml_status ggml_metal_graph_compute(
1619
1636
[encoder setBytes: &r2 length: sizeof (r2) atIndex: 17 ];
1620
1637
[encoder setBytes: &r3 length: sizeof (r3) atIndex: 18 ];
1621
1638
1622
- if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
1623
- src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
1624
- src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ2_S) {
1639
+ if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 ||
1640
+ src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K ||
1641
+ src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
1625
1642
[encoder dispatchThreadgroups: MTLSizeMake ((ne01 + 7 )/8 , ne11, ne12*ne13) threadsPerThreadgroup: MTLSizeMake (nth0, nth1, 1 )];
1626
1643
}
1627
1644
else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
@@ -1743,6 +1760,7 @@ static enum ggml_status ggml_metal_graph_compute(
1743
1760
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline ; break ;
1744
1761
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline ; break ;
1745
1762
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline ; break ;
1763
+ case GGML_TYPE_IQ1_M: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline ; break ;
1746
1764
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline ; break ;
1747
1765
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline ; break ;
1748
1766
default : GGML_ASSERT (false && " MUL_MAT_ID not implemented" );
@@ -1900,6 +1918,12 @@ static enum ggml_status ggml_metal_graph_compute(
1900
1918
nth1 = 16 ;
1901
1919
pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline ;
1902
1920
} break ;
1921
+ case GGML_TYPE_IQ1_M:
1922
+ {
1923
+ nth0 = 4 ;
1924
+ nth1 = 16 ;
1925
+ pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32].pipeline ;
1926
+ } break ;
1903
1927
case GGML_TYPE_IQ4_NL:
1904
1928
{
1905
1929
nth0 = 4 ;
@@ -1960,9 +1984,9 @@ static enum ggml_status ggml_metal_graph_compute(
1960
1984
[encoder setBuffer: id_src_cur offset: offs_src_cur atIndex: 23 + j];
1961
1985
}
1962
1986
1963
- if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
1964
- src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
1965
- src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ2_S) {
1987
+ if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 || src2t == GGML_TYPE_Q5_0 ||
1988
+ src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 || src2t == GGML_TYPE_Q2_K ||
1989
+ src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ1_M || src2t == GGML_TYPE_IQ2_S) {
1966
1990
[encoder dispatchThreadgroups: MTLSizeMake ((ne21 + 7 )/8 , _ne1, ne01*ne12*ne13) threadsPerThreadgroup: MTLSizeMake (nth0, nth1, 1 )];
1967
1991
}
1968
1992
else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
@@ -2024,6 +2048,7 @@ static enum ggml_status ggml_metal_graph_compute(
2024
2048
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline ; break ;
2025
2049
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S ].pipeline ; break ;
2026
2050
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline ; break ;
2051
+ case GGML_TYPE_IQ1_M: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M ].pipeline ; break ;
2027
2052
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline ; break ;
2028
2053
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline ; break ;
2029
2054
case GGML_TYPE_I32: pipeline = ctx->kernels [GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline ; break ;
0 commit comments