
Add eval script for pull.yml #429

Merged: 1 commit, Apr 25, 2024

66 changes: 60 additions & 6 deletions .ci/scripts/validate.sh
@@ -216,8 +216,8 @@ function eval_model() {
python -W ignore eval.py --compile --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --device "$TARGET_DEVICE" > "$MODEL_DIR/eval" || exit 1
cat "$MODEL_DIR/eval"
# extract perplexity number and compare with a constant
local REF_PERPLEXITY=100000
PERPLEXITY=cat "$MODEL_DIR/eval" | tail -n 1 log | awk -F '[, ]' '{print $4}'
export REF_PERPLEXITY=100000
export PERPLEXITY=cat "$MODEL_DIR/eval" | tail -n 1 log | awk -F '[, ]' '{print $4}'
# == 1 meaning the check succeeded
if [ "$(echo "$PERPLEXITY >= $REF_PERPLEXITY" | bc)" == 1]; then
echo "perplexity checking failed for non-quantized model $MODEL_NAME with $DTYPE $TARGET_DEVICE"
@@ -229,17 +229,64 @@ function eval_model() {
echo "******** INT4 group-wise quantized *******"
echo "******************************************"

QUANT_OPTIONS='{"linear:int4" : {"groupsize": 32}}'
python -W ignore eval.py --compile --dtype ${DTYPE} --quant $QUANT_OPTIONS --checkpoint-path "$CHECKPOINT_PATH" --device "$TARGET_DEVICE" > "$MODEL_DIR/eval" || exit 1
export QUANT_OPTIONS='{"linear:int4" : {"groupsize": 32}}'
python -W ignore eval.py --compile --dtype ${DTYPE} --quant "$QUANT_OPTIONS" --checkpoint-path "$CHECKPOINT_PATH" --device "$TARGET_DEVICE" > "$MODEL_DIR/eval" || exit 1
cat "$MODEL_DIR/eval"
local REF_PERPLEXITY=100000
PERPLEXITY=cat "$MODEL_DIR/eval" | tail -n 1 log | awk -F '[, ]' '{print $4}'
export REF_PERPLEXITY=100000
export PERPLEXITY=cat "$MODEL_DIR/eval" | tail -n 1 log | awk -F '[, ]' '{print $4}'
# == 1 meaning the check succeeded
if [ "$(echo "$PERPLEXITY >= $REF_PERPLEXITY" | bc)" == 1]; then
echo "perplexity checking failed for int4-quantized model $MODEL_NAME with $DTYPE $TARGET_DEVICE $QUANT_OPTIONS"
else
echo "perplexity checking succeeded for int4-quantized model $MODEL_NAME with $DTYPE $TARGET_DEVICE $QUANT_OPTIONS"
fi;

done
}

function eval_model_sanity_check() {
local CHECKPOINT_PATH="$1"
local TARGET_DEVICE="${2:-cpu}"
local MODEL_DIR="${CHECKPOINT_PATH%/*}"
local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')

for DTYPE in float32 bfloat16 float16; do
echo ""############### Run eval with torch.compile for dtype $DTYPE "###############"
echo ""
echo "******************************************"
echo "************** non-quantized *************"
echo "******************************************"
python -W ignore eval.py --compile --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --device "$TARGET_DEVICE" --limit 5 > "$MODEL_DIR/eval" || exit 1
cat "$MODEL_DIR/eval"

echo "******************************************"
echo "******** INT4 group-wise quantized *******"
echo "******************************************"

export QUANT_OPTIONS='{"linear:int4" : {"groupsize": 32}}'
python -W ignore eval.py --compile --dtype ${DTYPE} --quant "$QUANT_OPTIONS" --checkpoint-path "$CHECKPOINT_PATH" --device "$TARGET_DEVICE" --limit 5 > "$MODEL_DIR/eval" || exit 1
cat "$MODEL_DIR/eval"

echo "**************************************************"
echo "******** INT4 group-wise quantized (eager) *******"
echo "**************************************************"

if [ "$TARGET_DEVICE" == "cuda" ] && [ "$DTYPE" != "float16" ]; then
python -W ignore eval.py --dtype ${DTYPE} --quant "$QUANT_OPTIONS" --checkpoint-path "$CHECKPOINT_PATH" --device "$TARGET_DEVICE" --limit 5 > "$MODEL_DIR/eval_eager" || exit 1
cat "$MODEL_DIR/eval_eager"
fi;


# there is some issues with AOTI cpu and cuda, need to fix and enable the test for cuda as well
echo "*************************************************"
echo "******** INT4 group-wise quantized (AOTI) *******"
echo "*************************************************"
if [ "$DTYPE" != "float16" ]; then
python3 -W ignore export.py --dtype ${DTYPE} --quant "$QUANT_OPTIONS" --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
python3 -W ignore eval.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" --limit 5 > "$MODEL_DIR/output_eval_aoti" || exit 1
cat "$MODEL_DIR/output_eval_aoti"
fi;

done
}

@@ -263,6 +310,10 @@ function run_eval(){
eval_model "$CHECKPOINT_PATH" "$TARGET_DEVICE" || exit 1
}

function run_eval_sanity_check(){
eval_model_sanity_check "$CHECKPOINT_PATH" "$TARGET_DEVICE" || exit 1
}

CHECKPOINT_PATH="$1"
TARGET_DEVICE="${2:-cpu}"
PROMPT="Hello, my name is"
@@ -284,6 +335,9 @@ if [ "$#" -gt 2 ]; then
"eval")
run_eval || exit 1
;;
"eval_sanity_check")
run_eval_sanity_check || exit 1
;;
*)
echo "Unknown argument: $arg" >&2
exit 1
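
Editorial note on the perplexity gate in eval_model: the check reads the last line of the captured eval output, pulls out a field with awk, and compares it against REF_PERPLEXITY using bc. For reference, a minimal standalone sketch of that gate follows; the helper name is ours, and it assumes (as the awk field selection in the diff suggests) that the perplexity value is the fourth comma/space-separated field of the eval output's last line. Unlike the diff's `export PERPLEXITY=cat "$MODEL_DIR/eval" | tail -n 1 log | ...` line, the sketch uses command substitution and drops the stray `log` argument to tail.

# Editorial sketch only, not part of this PR. Assumes the eval output's last
# line carries the perplexity value as its fourth comma/space-separated field.
check_perplexity() {
  local eval_log="$1"                  # path to the captured eval output
  local ref_perplexity="${2:-100000}"  # upper bound; same default as the script

  # Command substitution so the variable holds the extracted number.
  local perplexity
  perplexity=$(tail -n 1 "$eval_log" | awk -F '[, ]' '{print $4}')

  # bc prints 1 when the comparison holds; note the space before the closing ].
  if [ "$(echo "$perplexity >= $ref_perplexity" | bc)" == 1 ]; then
    echo "perplexity check failed: $perplexity >= $ref_perplexity"
    return 1
  fi
  echo "perplexity check passed: $perplexity < $ref_perplexity"
}

Called as `check_perplexity "$MODEL_DIR/eval" 100000`, the sketch returns non-zero when the extracted perplexity meets or exceeds the bound.
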
76 changes: 76 additions & 0 deletions .github/workflows/pull.yml
@@ -59,6 +59,7 @@ jobs:
pushd ${TORCHCHAT_ROOT}
bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "compile"

test-cpu-aoti:
name: test-cpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
needs: gather-models-cpu
@@ -93,6 +94,43 @@ jobs:
pushd ${TORCHCHAT_ROOT}
bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "aoti"

test-cpu-eval-sanity-check:
name: test-cpu-eval-sanity-check (${{ matrix.platform }}, ${{ matrix.model_name }})
needs: gather-models-cpu
strategy:
matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
env:
TORCHCHAT_ROOT: ${{ github.workspace }}
REPO_NAME: ${{ matrix.repo_name }}
steps:
- name: Checkout repo
uses: actions/checkout@v3
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: '3.11'
- name: Print machine info
run: |
echo "$(uname -a)"
- name: Install dependencies
run: |
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
pip install -r requirements.txt
pip list
python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
- name: Download checkpoints
run: |
bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
- name: Run validation
run: |
python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
pushd ${TORCHCHAT_ROOT}
bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check"

gather-models-gpu:
runs-on: ubuntu-22.04
outputs:
@@ -144,6 +182,7 @@ jobs:
echo "::group::Run inference"
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "compile"
echo "::endgroup::"

test-gpu-aoti:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
name: test-gpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
@@ -179,6 +218,43 @@ jobs:
echo "::group::Run inference"
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti"
echo "::endgroup::"

test-gpu-eval-sanity-check:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
name: test-gpu-eval-sanity-check (${{ matrix.platform }}, ${{ matrix.model_name }})
needs: gather-models-gpu
strategy:
matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
fail-fast: false
with:
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: "12.1"
script: |
echo "::group::Print machine info"
nvidia-smi
echo "::endgroup::"

echo "::group::Install required packages"
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
pip install -r ./requirements.txt
pip list
python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
echo "::endgroup::"

echo "::group::Download checkpoint"
export REPO_NAME=${{ matrix.repo_name }}
bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
echo "::endgroup::"

echo "::group::Convert checkpoint"
bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
echo "::endgroup::"

echo "::group::Run eval"
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "eval_sanity_check"
echo "::endgroup::"

test-tinystories-executorch:
strategy:
matrix:
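
The new test-cpu-eval-sanity-check and test-gpu-eval-sanity-check jobs follow the same download, convert, and validate sequence as the existing compile/AOTI jobs, only passing eval_sanity_check as the third argument to validate.sh. A rough sketch of reproducing the CPU variant locally is below; the repo name and resource URL are placeholders, not values taken from the workflow (the gather-models-cpu matrix supplies the real ones in CI):

# Local reproduction sketch of the new CPU eval sanity-check job (placeholders, not CI values).
export TORCHCHAT_ROOT="$(pwd)"
export REPO_NAME="<repo_name>"   # placeholder; CI takes this from the gather-models matrix
RESOURCES="<checkpoint URLs>"    # placeholder; CI takes this from the matrix resources field

pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
pip install -r requirements.txt

bash .ci/scripts/wget_checkpoint.sh "${REPO_NAME}" "${RESOURCES}"
bash .ci/scripts/convert_checkpoint.sh "${REPO_NAME}"
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check"
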
3 changes: 2 additions & 1 deletion build/builder.py
@@ -377,7 +377,8 @@ def _initialize_model(
print(f"Time to quantize model: {time.time() - t0q:.02f} seconds")

if builder_args.setup_caches:
max_seq_length = 350
# TODO: get this from args?
max_seq_length = 2048
with torch.device(builder_args.device):
model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)
