ggml-org
diff --git a/‎ggml/src/ggml-quants.c
Lines changed: 7 additions & 0 deletions b/‎ggml/src/ggml-quants.c
Lines changed: 7 additions & 0 deletions
diff --git a/‎ggml/src/ggml-quants.h
Lines changed: 1 addition & 0 deletions b/‎ggml/src/ggml-quants.h
Lines changed: 1 addition & 0 deletions
diff --git a/‎ggml/src/ggml-vulkan/ggml-vulkan.cpp
Lines changed: 289 additions & 35 deletions b/‎ggml/src/ggml-vulkan/ggml-vulkan.cpp
Lines changed: 289 additions & 35 deletions
diff --git a/‎ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
Lines changed: 3 additions & 5 deletions b/‎ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
Lines changed: 3 additions & 5 deletions
@@ -2020,6 +2020,13 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
     return nrow * row_size;
 }
 
+size_t quantize_q8_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    (void)quant_weights; // not used
+    const size_t row_size = ggml_row_size(GGML_TYPE_Q8_1, n_per_row);
+    quantize_row_q8_1_ref(src, dst, (int64_t)nrow*n_per_row);
+    return nrow * row_size;
+}
+
 // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
 
 void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) {
 
@@ -89,6 +89,7 @@ GGML_API size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTR
 GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q8_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
 GGML_API void iq2xs_init_impl(enum ggml_type type);
 GGML_API void iq2xs_free_impl(enum ggml_type type);
 
@@ -212,7 +212,7 @@ void main() {
 #else
     ACC_TYPE sums[WMITER * TM * WNITER * TN];
     FLOAT_TYPE cache_a[WMITER * TM];
-    FLOAT_TYPE cache_b[WNITER * TN];
+    FLOAT_TYPE cache_b[TN];
 
     [[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN; i++) {
         sums[i] = ACC_TYPE(0.0f);
@@ -744,16 +744,14 @@ void main() {
             }
             [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
                 [[unroll]] for (uint j = 0; j < TN; j++) {
-                    cache_b[wsic * TN + j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * SHMEM_STRIDE + i];
+                    cache_b[j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * SHMEM_STRIDE + i];
                 }
-            }
 
-            [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
                 [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
                     [[unroll]] for (uint cc = 0; cc < TN; cc++) {
                         [[unroll]] for (uint cr = 0; cr < TM; cr++) {
                             const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;
-                            sums[sums_idx] = fma(ACC_TYPE(cache_a[wsir * TM + cr]), ACC_TYPE(cache_b[wsic * TN + cc]), sums[sums_idx]);
+                            sums[sums_idx] = fma(ACC_TYPE(cache_a[wsir * TM + cr]), ACC_TYPE(cache_b[cc]), sums[sums_idx]);
                         }
                     }
                 }
Original file line number	Diff line number	Diff line change
`@@ -212,7 +212,7 @@ void main() {`
`212`	`212`	`#else`
`213`	`213`	`ACC_TYPE sums[WMITER * TM * WNITER * TN];`
`214`	`214`	`FLOAT_TYPE cache_a[WMITER * TM];`
`215`		`- FLOAT_TYPE cache_b[WNITER * TN];`
	`215`	`+ FLOAT_TYPE cache_b[TN];`
`216`	`216`
`217`	`217`	`[[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN; i++) {`
`218`	`218`	`sums[i] = ACC_TYPE(0.0f);`
`@@ -744,16 +744,14 @@ void main() {`
`744`	`744`	`}`
`745`	`745`	`[[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {`
`746`	`746`	`[[unroll]] for (uint j = 0; j < TN; j++) {`
`747`		`- cache_b[wsic * TN + j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * SHMEM_STRIDE + i];`
	`747`	`+ cache_b[j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * SHMEM_STRIDE + i];`
`748`	`748`	`}`
`749`		`- }`
`750`	`749`
`751`		`- [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {`
`752`	`750`	`[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {`
`753`	`751`	`[[unroll]] for (uint cc = 0; cc < TN; cc++) {`
`754`	`752`	`[[unroll]] for (uint cr = 0; cr < TM; cr++) {`
`755`	`753`	`const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;`
`756`		`- sums[sums_idx] = fma(ACC_TYPE(cache_a[wsir * TM + cr]), ACC_TYPE(cache_b[wsic * TN + cc]), sums[sums_idx]);`
	`754`	`+ sums[sums_idx] = fma(ACC_TYPE(cache_a[wsir * TM + cr]), ACC_TYPE(cache_b[cc]), sums[sums_idx]);`
`757`	`755`	`}`
`758`	`756`	`}`
`759`	`757`	`}`