ggml-org
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
Lines changed: 1 addition & 0 deletions
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
Lines changed: 28 additions & 0 deletions
diff --git a/ggml-cuda.h b/ggml-cuda.h
Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
         fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
         fprintf(stderr, "  type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
+        fprintf(stderr, "  type = %d - q8_0\n", LLAMA_FTYPE_MOSTLY_Q8_0);
         return 1;
     }
 
 
@@ -37,6 +37,13 @@ typedef struct {
 } block_q4_3;
 static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
 
+#define QK8_0 32 // number of int8 quants held by one q8_0 block
+typedef struct {
+    float   d;              // delta: factor each quant is multiplied by when dequantizing
+    int8_t  qs[QK8_0];      // quants: QK8_0 signed 8-bit values
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); // build fails if the size is not exactly sizeof(float) + QK8_0 bytes
+
 static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
     const block_q4_0 * x = (const block_q4_0 *) vx;
 
@@ -131,6 +138,22 @@ static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
     }
 }
 
+// Dequantizes the q8_0 block selected by blockIdx.x: y[ib*QK8_0 + j] = qs[j] * d.
+static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
+    // blockIdx.x picks both the input block and the matching QK8_0-wide output slice
+    const int ib = blockIdx.x;
+
+    const block_q8_0 * blk = (const block_q8_0 *) vx + ib;
+    const float scale = blk->d;
+
+    // threadIdx is never consulted, so every thread of a CUDA block repeats this loop
+    for (int j = 0; j < QK8_0; ++j) {
+        const int8_t q = blk->qs[j];
+
+        y[ib*QK8_0 + j] = q*scale;
+    }
+}
+
 void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK4_0;
     dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
@@ -151,6 +174,11 @@ void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t st
     dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
 }
 
+void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    // k / QK8_0 single-thread CUDA blocks on `stream`; a k % QK8_0 remainder is not processed
+    dequantize_block_q8_0<<<k / QK8_0, 1, 0, stream>>>(vx, y);
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 16
 
 
@@ -35,6 +35,7 @@ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t st
 void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 
 #ifdef  __cplusplus
 }
Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@ int main(int argc, char ** argv) {`
`16`	`16`	`fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);`
`17`	`17`	`fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);`
`18`	`18`	`fprintf(stderr, " type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);`
	`19`	`+ fprintf(stderr, " type = %d - q8_0\n", LLAMA_FTYPE_MOSTLY_Q8_0);`
`19`	`20`	`return 1;`
`20`	`21`	`}`
`21`	`22`
Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,7 @@ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t st`
`35`	`35`	`void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);`
`36`	`36`	`void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);`
`37`	`37`	`void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);`
	`38`	`+void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);`
`38`	`39`
`39`	`40`	`#ifdef __cplusplus`
`40`	`41`	`}`