import torch
import gguf
+ from quantize import group_dequantize_tensor_from_qparams
+
+ def to_float(t: gguf.gguf_reader.ReaderTensor):
+     """
+     Unpack and dequantize a GGUF tensor to a torch tensor of type torch.float32.
+     """
+
+     # Quantized weights are dequantized to float; float weights are upcast.
+     if t.tensor_type == gguf.GGMLQuantizationType.Q4_0:
+         return group_dequantize_tensor_from_qparams(*Q4_0.unpack(t), Q4_0.n_bit, Q4_0.groupsize).to(torch.float32)
+     elif t.tensor_type == gguf.GGMLQuantizationType.Q6_K:
+         return group_dequantize_tensor_from_qparams(*Q6_K.unpack(t), Q6_K.n_bit, Q6_K.groupsize).to(torch.float32)
+     elif t.tensor_type == gguf.GGMLQuantizationType.F16:
+         return F16.unpack(t).to(torch.float32)
+     elif t.tensor_type == gguf.GGMLQuantizationType.F32:
+         return F32.unpack(t).to(torch.float32)
+     else:
+         raise ValueError(f"Unsupported tensor type {t.tensor_type}")
+
+
+ def test_by_to_float(source_file: str, target_file: str) -> None:
+     """
+     Tests the methods in this file by converting tensors with to_float and
+     comparing them against a known-correct reference. Raises an error on any
+     mismatch.
+
+     In more detail, a GGUF source_file containing various GGUF tensor types is
+     parsed, and its tensors are converted with to_float. The results are then
+     compared against a GGUF target_file. The target file must contain only F32
+     tensors and should be generated by a method that is known to be correct.
+     """
+
+     gguf_sources = {t.name: t for t in gguf.GGUFReader(source_file, "r").tensors}
+     gguf_targets = {t.name: t for t in gguf.GGUFReader(target_file, "r").tensors}
+
+     for t in gguf_targets.values():
+         assert t.tensor_type == gguf.GGMLQuantizationType.F32, f"target_file must only contain F32 tensors, but found tensor {t.name} with type {repr(t.tensor_type)}."
+     assert gguf_sources.keys() == gguf_targets.keys(), "source_file and target_file should have the same tensors (by name)"
+
+     for k in gguf_sources:
+         source = to_float(gguf_sources[k])
+         target = to_float(gguf_targets[k])
+
+         if not torch.allclose(source, target):
+             print(f"After calling to_float on source tensor {k} of type {repr(gguf_sources[k].tensor_type)}, it does not match its target.")
+             print("First 5 elements of converted source: ", source.reshape(-1)[0:5])
+             print("First 5 elements of target: ", target.reshape(-1)[0:5])
+             assert False, "found mismatch"
+
+     print("All tensors match.")


class F16:
    @staticmethod
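
For context, the new helpers are meant to be driven roughly like this (a minimal sketch; the file names are hypothetical, and the F32 reference would come from a converter known to be correct, e.g. llama.cpp's conversion script):

    import gguf

    # Dequantize every tensor in a (hypothetical) quantized model.
    reader = gguf.GGUFReader("model-q4_0.gguf", "r")
    for tensor in reader.tensors:
        dequantized = to_float(tensor)  # always torch.float32
        print(tensor.name, tuple(dequantized.shape), dequantized.dtype)

    # End-to-end check against a known-good F32 export of the same model.
    test_by_to_float("model-q4_0.gguf", "model-f32.gguf")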
@@ -14,7 +64,7 @@ def unpack(gguf_tensor: gguf.gguf_reader.ReaderTensor):
        Unpacks GGUF F16 tensor.
        """
        assert gguf_tensor.tensor_type == gguf.GGMLQuantizationType.F16
-       reversed_shape = gguf_tensor.shape[::-1]  # TODO: GGUF tensors are reversed
+       reversed_shape = gguf_tensor.shape[::-1]
        new_tensor = gguf_tensor.data.reshape(reversed_shape)
        return torch.from_numpy(new_tensor).to(torch.float16)
@@ -25,7 +75,7 @@ def unpack(gguf_tensor: gguf.gguf_reader.ReaderTensor):
        Unpacks GGUF F32 tensor.
        """
        assert gguf_tensor.tensor_type == gguf.GGMLQuantizationType.F32
-       reversed_shape = gguf_tensor.shape[::-1]  # TODO: GGUF tensors are reversed
+       reversed_shape = gguf_tensor.shape[::-1]
        new_tensor = gguf_tensor.data.reshape(reversed_shape)
        return torch.from_numpy(new_tensor).to(torch.float32)
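
The shape reversal in both unpack methods exists because GGUF records dimensions in the opposite order from the row-major convention that numpy and torch use, so the reader's shape must be flipped before reshaping the flat buffer. A minimal numpy illustration (the shapes are made up):

    import numpy as np

    data = np.arange(6, dtype=np.float32)  # flat buffer as stored in the file
    gguf_shape = (3, 2)                    # shape as reported by the GGUF reader
    t = data.reshape(gguf_shape[::-1])     # row-major consumers want (2, 3)
    assert t.shape == (2, 3)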
@@ -61,7 +111,7 @@ def unpack(gguf_tensor: gguf.gguf_reader.ReaderTensor):

        assert gguf_tensor.tensor_type == gguf.GGMLQuantizationType.Q4_0
        assert len(gguf_tensor.shape) == 2
-       nc, nr = gguf_tensor.shape  # TODO: CHECK THIS. GGUF TENSOR REVERSED?
+       nc, nr = gguf_tensor.shape  # GGUF tensor has reversed shape

        QK4_0 = 32  # groupsize
@@ -84,7 +134,7 @@ def unpack(gguf_tensor: gguf.gguf_reader.ReaderTensor):
        # Check we finished parsing
        assert curr == block_q4_0_size

-       # Unpack quantized values. Unlike the code in ggml-quants.c, we do not subtract 16
+       # Unpack quantized values. Unlike the code in ggml-quants.c, we do not subtract 8
        x0 = qs & 0x0F
        x1 = qs >> 4
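
The comment above points at a convention difference worth spelling out: ggml-quants.c dequantizes Q4_0 as s * (q - 8), while this file leaves the 4-bit codes unshifted and instead reports 8 as the group zero point, so group_dequantize_tensor_from_qparams reconstructs the same values. A small sketch of the equivalence (the numbers are made up):

    # One packed byte holds two 4-bit codes.
    qs = 0x2A
    x0 = qs & 0x0F        # low nibble  -> 10
    x1 = qs >> 4          # high nibble -> 2
    s, z = 0.5, 8         # per-group scale; zero point stands in for the -8 shift
    assert s * (x0 - 8) == s * (x0 - z) == 1.0   # ggml-style == zero-point style
    assert s * (x1 - 8) == s * (x1 - z) == -3.0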
@@ -117,8 +167,6 @@ def unpack(gguf_tensor: gguf.gguf_reader.ReaderTensor):
        * s is a torch.float32 tensor of shape (nr, -1) with one scale per group
        * z is a torch.float32 tensor of shape (nr, -1) with one zero per group

-       There is one element of s/z per group of 32 elements of 4.
-
        Note that z is always zero because Q6_K is a scale-only scheme.

        See https://github.com/ggerganov/llama.cpp/blob/master/ggml-common.h for definition of block_q6_K:
@@ -142,7 +190,7 @@ def unpack(gguf_tensor: gguf.gguf_reader.ReaderTensor):
        """
        assert gguf_tensor.tensor_type == gguf.GGMLQuantizationType.Q6_K
        assert len(gguf_tensor.shape) == 2
-       nc, nr = gguf_tensor.shape  # TODO: CHECK THIS. GGUF TENSOR REVERSED?
+       nc, nr = gguf_tensor.shape  # GGUF tensor has reversed shape

        QK_K = 256

        # Parse block_q6_K
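
Because Q6_K is scale-only, the z tensor returned by unpack is all zeros, and the shared group dequantization effectively reduces to w = s * q per group. The sketch below shows the reconstruction this file appears to rely on; it is a hypothetical stand-in for group_dequantize_tensor_from_qparams (whose actual signature and internals live in quantize.py), assuming one scale per group of 16 elements:

    import torch

    def group_dequantize_sketch(q, s, z, groupsize):
        # Reshape rows into groups, apply w = (q - z) * s, and flatten back.
        nr = q.shape[0]
        qg = q.reshape(nr, -1, groupsize).to(torch.float32)
        return ((qg - z.unsqueeze(-1)) * s.unsqueeze(-1)).reshape(nr, -1)

    q = torch.randint(0, 64, (2, 256))  # 6-bit codes, one row per output channel
    s = torch.rand(2, 256 // 16)        # one scale per group
    z = torch.zeros_like(s)             # Q6_K: zero point is always 0
    w = group_dequantize_sketch(q, s, z, groupsize=16)
    assert w.shape == (2, 256)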