
Commit a3cda44

mikekgfb authored and malfet committed
MPS CI runs (#162)

* MPS quantization
* mps dtypes
* updates
* fix names
* typo
* no bfloat16 for older macOS
* fix typo
* remove failing embedding quantization from MPS runs
* bfloat -> current model precision
* typo
* missed bfloat16 to switch to default precision
* remove int8 quantization on mps
* enable cpu fallback for mps on int4
* hack int4pack_mm for torch.float
* typo
* disable int4 because fp16 int4pack_mm not working for float16
1 parent 76c330e commit a3cda44

File tree

5 files changed: +104 −17 lines changed

.github/workflows/compile-bf16.yml

Lines changed: 3 additions & 3 deletions
@@ -44,9 +44,9 @@ jobs:
   export MODEL_NAME=stories15M
   export MODEL_DIR=/tmp
   for DTYPE in bfloat16 float16 float32; do
-    if [ $(uname -s) == Darwin ]; then
-      export DTYPE=float16
-    fi
+    # if [ $(uname -s) == Darwin ]; then
+    #   export DTYPE=float16
+    # fi
     python generate.py --dtype ${DTYPE} --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
     cat ./output_eager
     python generate.py --dtype ${DTYPE} --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
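Commenting out the Darwin override means the macOS job now iterates over bfloat16, float16 and float32 instead of being pinned to float16. As a quick local sanity check (a sketch, not part of the commit), eager-mode bfloat16 math does run on CPU:

import torch

# Sketch only: confirm eager bfloat16 matmul works on CPU, the path the
# workflow takes when DTYPE=bfloat16 is no longer overridden on macOS.
a = torch.randn(4, 8, dtype=torch.bfloat16)
b = torch.randn(8, 2, dtype=torch.bfloat16)
print((a @ b).dtype)  # torch.bfloat16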

.github/workflows/test_mps-dtype.yml

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+name: Run eager tests on MPS with dtypes
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+  workflow_dispatch:
+
+jobs:
+  test-mps:
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m1-stable
+      script: |
+        set -eou pipefail
+
+        echo "::group::Print machine info"
+        uname -a
+        if [ $(uname -s) == Darwin ]; then
+          sysctl machdep.cpu.brand_string
+          sysctl machdep.cpu.core_count
+        fi
+        echo "::endgroup::"
+
+        echo "::group::Install requirements"
+        # Install requirements
+        pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+        ls -la
+        pwd
+        pip install -r requirements.txt
+        echo "::endgroup::"
+
+        echo "::group::Download checkpoints"
+        (
+          mkdir -p checkpoints/stories15M
+          pushd checkpoints/stories15M
+          curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+          curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+          popd
+        )
+        echo "::endgroup::"
+
+        echo "::group::Run inference"
+        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+        export MODEL_NAME=stories15M
+        export MODEL_DIR=/tmp
+        for DTYPE in float16 float32; do
+          # if [ $(uname -s) == Darwin ]; then
+          #   export DTYPE=float16
+          # fi
+
+          python generate.py --dtype ${DTYPE} --device mps --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          # python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          # cat ./output_eager
+          # python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          # cat ./output_eager
+          # python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          # cat ./output_eager
+          # python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          # cat ./output_eager
+          # PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"group_size": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          # cat ./output_eager
+        done
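The dtype loop above covers only float16 and float32; per the commit message, bfloat16 is skipped because it is not available on older macOS builds of the MPS backend. A minimal probe (a sketch, not part of the commit) for checking which dtypes a local MPS setup accepts:

import torch

# Sketch: verify the MPS backend is present and try to allocate a tensor in
# each dtype; bfloat16 raises on older macOS / PyTorch combinations.
if not torch.backends.mps.is_available():
    print("MPS backend not available")
else:
    for dtype in (torch.float16, torch.float32, torch.bfloat16):
        try:
            x = torch.ones(2, 2, dtype=dtype, device="mps")
            print(f"{dtype}: ok, sum={(x + x).sum().item()}")
        except (TypeError, RuntimeError) as err:
            print(f"{dtype}: unsupported here ({err})")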

.github/workflows/test_mps.yml

Lines changed: 13 additions & 1 deletion
@@ -1,4 +1,4 @@
-name: Run compile tests on MPS
+name: Run eager tests on MPS
 
 on:
   pull_request:
@@ -45,5 +45,17 @@ jobs:
   export MODEL_PATH=checkpoints/stories15M/stories15M.pt
   export MODEL_NAME=stories15M
   export MODEL_DIR=/tmp
+
   python generate.py --device mps --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
   cat ./output_eager
+  # python generate.py --device mps --quant '{"embedding" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+  # cat ./output_eager
+  # python generate.py --device mps --quant '{"embedding" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+  # cat ./output_eager
+  # python generate.py --device mps --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+  # cat ./output_eager
+  # python generate.py --device mps --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+  # cat ./output_eager
+  # PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --device mps --quant '{"linear:int4" : {"group_size": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+  # cat ./output_eager
+
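The commented-out int4 invocation above is prefixed with PYTORCH_ENABLE_MPS_FALLBACK=1, which tells PyTorch to run ops that have no MPS kernel on the CPU instead of raising ("enable cpu fallback for mps on int4" in the commit message). The same effect from inside a script, as a sketch (the variable has to be set before torch is imported):

import os

# Sketch: opt in to the CPU fallback for unsupported MPS ops. Without it,
# such ops raise NotImplementedError on the mps device.
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")

import torch  # imported after setting the env var on purpose

device = "mps" if torch.backends.mps.is_available() else "cpu"
x = torch.randn(4, 4, device=device)
print(x.device)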

quantize.py

Lines changed: 20 additions & 10 deletions
@@ -465,11 +465,12 @@ def __init__(
         self.register_buffer(
             "weight", torch.empty((out_features, in_features), dtype=torch.int8)
         )
-        if groupsize is None or (groupsize == 0):
-            self.register_buffer("scales", torch.ones(out_features, dtype=torch.bfloat16))
+        dtype=get_precision()
+        if group_size is None or (group_size == 0):
+            self.register_buffer("scales", torch.ones(out_features, dtype=dtype))
         else:
-            groups = (in_features + groupsize - 1) // groupsize
-            self.register_buffer("scales", torch.ones(out_features, groups, dtype=torch.bfloat16))
+            groups = (in_features + group_size - 1) // group_size
+            self.register_buffer("scales", torch.ones(out_features, groups, dtype=dtype))
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
         scales = self.scales
@@ -683,12 +684,21 @@ def _int4_calc_padded_size(k, groupsize=1, innner_k_tiles=1):
 def linear_forward_int4(x, weight_int4pack, scales_and_zeros, out_features, groupsize):
     origin_x_size = x.size()
     x = x.reshape(-1, origin_x_size[-1])
-    c = torch.ops.aten._weight_int4pack_mm(
-        x.to(dtype=torch.bfloat16),
-        weight_int4pack,
-        groupsize,
-        scales_and_zeros.to(dtype=torch.bfloat16)
-    ).to(dtype=x.dtype)
+    if x.dtype == torch.float:
+        # work around missing int4pack_mm for torch.float
+        c = torch.ops.aten._weight_int4pack_mm(
+            x.to(torch.float16),
+            weight_int4pack,
+            groupsize,
+            scales_and_zeros.to(torch.float16),
+        ).to(torch.float)
+    else:
+        c = torch.ops.aten._weight_int4pack_mm(
+            x,
+            weight_int4pack,
+            groupsize,
+            scales_and_zeros,
+        )
     new_shape = origin_x_size[:-1] + (out_features,)
     c = c.reshape(new_shape)
     return c
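The first quantize.py hunk makes the int8 scales buffer follow the model's active precision through get_precision() rather than hard-coding bfloat16 ("bfloat -> current model precision" in the commit message). A toy sketch of that pattern; get_precision matches the name used in the diff, while set_precision and the module-level default are stand-ins for this illustration:

import torch

_precision = torch.float32  # stand-in for the repo's global default precision

def set_precision(dtype):   # hypothetical setter, for this sketch only
    global _precision
    _precision = dtype

def get_precision():        # same name as the helper called in the diff
    return _precision

set_precision(torch.float16)                     # e.g. what --dtype float16 selects
scales = torch.ones(256, dtype=get_precision())
print(scales.dtype)                              # torch.float16, not hard-coded bfloat16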

quantized_ops.py

Lines changed: 3 additions & 3 deletions
@@ -120,11 +120,11 @@ def linear_int4(
     origin_input_size = input.size()
     input = input.reshape(-1, origin_input_size[-1])
     c = torch.ops.aten._weight_int4pack_mm(
-        input.to(dtype=torch.bfloat16),
+        input,
         weight_int4pack,
         groupsize,
-        scales_and_zeros.to(dtype=torch.bfloat16)
-    ).to(dtype=input.dtype)
+        scales_and_zeros,
+    )
     new_shape = origin_input_size[:-1] + (out_features,)
     c = c.reshape(new_shape)
     return c
