pytorch · mikekgfb · Apr 27, 2024 · Apr 27, 2024 · Apr 27, 2024
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -266,6 +266,12 @@ jobs:
         bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-float32"
         echo "::endgroup::"
 
+        echo "::group::Run inference with quantize file"
+        if [ $(uname -s) != Darwin ]; then  # skip on macOS: cuda.json selects the cuda accelerator
+          python3 generate.py --quantize config/data/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+        fi
+        echo "::endgroup::"
+
   test-gpu-aoti-float16:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     name: test-gpu-aoti-float16 (${{ matrix.platform }}, ${{ matrix.model_name }})
@@ -307,6 +313,13 @@ jobs:
         bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-float16"
         echo "::endgroup::"
 
+        echo "::group::Run inference with quantize file"
+        if [ $(uname -s) != Darwin ]; then  # skip on macOS: cuda.json selects the cuda accelerator
+          python3 export.py --output-dso-path /tmp/model.so --quantize config/data/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+          python3 generate.py --dso-path /tmp/model.so --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+        fi
+        echo "::endgroup::"
+
   test-gpu-eval-sanity-check:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     name: test-gpu-eval-sanity-check (${{ matrix.platform }}, ${{ matrix.model_name }})
@@ -428,9 +441,20 @@ jobs:
           export MODEL_PATH=checkpoints/stories15M/stories15M.pt
           export MODEL_NAME=stories15M
           export MODEL_DIR=/tmp
+
+          echo "******************************************"
+          echo "***               vanilla              ***"
+          echo "******************************************"
           python export.py --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
           python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
 
+          echo "******************************************"
+          echo "*** --quantize config/data/mobile.json ***"
+          echo "******************************************"
+          # python export.py --quantize config/data/mobile.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
+          # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
+
+
           echo "******************************************"
           echo "******* Emb: channel-wise quantized ******"
           echo "******************************************"

diff --git a/config/data/cuda.json b/config/data/cuda.json
@@ -1,3 +1,5 @@
 {
+    "executor": {"accelerator": "cuda"},
+    "precision": {"dtype": "bf16"},
     "linear:int4": {"groupsize" : 256}
 }
diff --git a/quantize.py b/quantize.py
@@ -17,6 +17,7 @@
 import torch.nn.functional as F
 from build.utils import (
     find_multiple,
+    get_device_str,
     get_precision,
     name_to_dtype,
     state_dict_device,
@@ -124,6 +125,8 @@ def quantized_model(self) -> nn.Module:
 
 #########################################################################
 ###            wrapper for setting device as a QuantHandler           ###
+###    for now select device for PyTorch eager and AOTI, in future    ###
+###    also use this for selecting delegate when exporting with ET    ###
 
 
 class ExecutorHandler(QuantHandler):