Commit a9f134c

guangy10 authored and malfet committed
Limit what could run for 7b model on cuda (#311)
1 parent e5363e4 commit a9f134c

File tree

3 files changed: +53 -31 lines changed

.ci/scripts/convert_checkpoint.sh

Lines changed: 3 additions & 1 deletion
@@ -22,12 +22,14 @@ function convert_checkpoint() {
     return 0
   fi
 
+  [ -f "build/convert_hf_checkpoint.py" ] || exit 1
+
   if [ -f "checkpoints/$MODEL_REPO/model.pth" ]; then
     echo "Converted checkpoint already exists. Skipping conversion for $MODEL_REPO."
     return 0
   fi
   echo "Convert Huggingface checkpoint for $MODEL_REPO"
-  python3 scripts/convert_hf_checkpoint.py --checkpoint-dir "checkpoints/$MODEL_REPO"
+  python3 build/convert_hf_checkpoint.py --checkpoint-dir "checkpoints/$MODEL_REPO"
 }
 
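For context, this hunk does two things: it fails fast when the converter script is missing from its expected build/ location, and it switches the conversion call from scripts/ to build/. A minimal standalone sketch of the same fail-fast pattern, assuming MODEL_REPO is set by the caller as in the CI script (the error message is illustrative, not from the diff):

#!/bin/bash
# Sketch only: reproduce the fail-fast guard added above.
set -eu

if [ ! -f "build/convert_hf_checkpoint.py" ]; then
  # Hypothetical message; the CI script itself simply exits with status 1.
  echo "build/convert_hf_checkpoint.py not found" >&2
  exit 1
fi

python3 build/convert_hf_checkpoint.py --checkpoint-dir "checkpoints/${MODEL_REPO}"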
.ci/scripts/validate.sh

Lines changed: 49 additions & 29 deletions
@@ -25,7 +25,15 @@ function generate_compiled_model_output() {
   local MODEL_DIR="${CHECKPOINT_PATH%/*}"
   local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
 
-  for DTYPE in bfloat16 float16 float32; do
+  if [[ $CHECKPOINT_PATH != *"stories"* && $TARGET_DEVICE == "cuda" ]]; then
+    DTYPES="bfloat16"
+    EXCLUDE_INT8_QUANT=true
+  else
+    DTYPES="float32 bfloat16 float16"
+    EXCLUDE_INT8_QUANT=false
+  fi
+
+  for DTYPE in $DTYPES; do
     echo ""############### Run inference with torch.compile for dtype $DTYPE "###############"
     echo ""
     echo "******************************************"
@@ -66,21 +74,23 @@ function generate_compiled_model_output() {
     python3 -W ignore generate.py --dtype ${DTYPE} --compile --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
     cat "$MODEL_DIR/output_compiled"
 
-    echo "******************************************"
-    echo "******* INT8 channel-wise quantized ******"
-    echo "******************************************"
-    python3 -W ignore generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
-    cat "$MODEL_DIR/output_eager"
-    python3 -W ignore generate.py --dtype ${DTYPE} --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
-    cat "$MODEL_DIR/output_compiled"
-
-    echo "******************************************"
-    echo "******** INT8 group-wise quantized *******"
-    echo "******************************************"
-    python3 -W ignore generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
-    cat "$MODEL_DIR/output_eager"
-    python3 -W ignore generate.py --dtype ${DTYPE} --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
-    cat "$MODEL_DIR/output_compiled"
+    if [ "$EXCLUDE_INT8_QUANT" = false ]; then
+      echo "******************************************"
+      echo "******* INT8 channel-wise quantized ******"
+      echo "******************************************"
+      python3 -W ignore generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+      cat "$MODEL_DIR/output_eager"
+      python3 -W ignore generate.py --dtype ${DTYPE} --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+      cat "$MODEL_DIR/output_compiled"
+
+      echo "******************************************"
+      echo "******** INT8 group-wise quantized *******"
+      echo "******************************************"
+      python3 -W ignore generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+      cat "$MODEL_DIR/output_eager"
+      python3 -W ignore generate.py --dtype ${DTYPE} --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+      cat "$MODEL_DIR/output_compiled"
+    fi
 
     echo "******************************************"
     echo "******** INT4 group-wise quantized *******"
@@ -98,7 +108,15 @@ function generate_aoti_model_output() {
   local MODEL_DIR="${CHECKPOINT_PATH%/*}"
   local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
 
-  for DTYPE in bfloat16 float16 float32; do
+  if [[ $CHECKPOINT_PATH != *"stories"* && $TARGET_DEVICE == "cuda" ]]; then
+    DTYPES="bfloat16"
+    EXCLUDE_INT8_QUANT=true
+  else
+    DTYPES="float32 bfloat16 float16"
+    EXCLUDE_INT8_QUANT=false
+  fi
+
+  for DTYPE in $DTYPES; do
     echo ""############### Run inference with AOT Inductor for dtype $DTYPE "###############"
     echo ""
     echo "******************************************"
@@ -136,19 +154,21 @@ function generate_aoti_model_output() {
     python3 -W ignore generate.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
     cat "$MODEL_DIR/output_aoti"
 
-    echo "******************************************"
-    echo "******* INT8 channel-wise quantized ******"
-    echo "******************************************"
-    python3 -W ignore export.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-    python3 -W ignore generate.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
-    cat "$MODEL_DIR/output_aoti"
+    if [ "$EXCLUDE_INT8_QUANT" = false ]; then
+      echo "******************************************"
+      echo "******* INT8 channel-wise quantized ******"
+      echo "******************************************"
+      python3 -W ignore export.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
+      python3 -W ignore generate.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+      cat "$MODEL_DIR/output_aoti"
 
-    echo "******************************************"
-    echo "******** INT8 group-wise quantized *******"
-    echo "******************************************"
-    python3 -W ignore export.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-    python3 -W ignore generate.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
-    cat "$MODEL_DIR/output_aoti"
+      echo "******************************************"
+      echo "******** INT8 group-wise quantized *******"
+      echo "******************************************"
+      python3 -W ignore export.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
+      python3 -W ignore generate.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+      cat "$MODEL_DIR/output_aoti"
+    fi
 
     echo "******************************************"
     echo "******** INT4 group-wise quantized *******"

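Taken together, the two validate.sh hunks apply one gate in both generate_compiled_model_output and generate_aoti_model_output: a non-"stories" (i.e. 7B-class) checkpoint on CUDA runs only the bfloat16 dtype and skips the INT8 quantization paths, while everything else keeps the previous coverage. A minimal sketch of that gate in isolation; the pick_dtypes helper and the sample checkpoint path are illustrative, not part of the diff:

#!/bin/bash
# Sketch only: the dtype / INT8 gating added to validate.sh, wrapped in a helper.
pick_dtypes() {
  local checkpoint_path="$1"
  local target_device="$2"
  if [[ $checkpoint_path != *"stories"* && $target_device == "cuda" ]]; then
    # Large checkpoint on CUDA: keep the CI job small.
    DTYPES="bfloat16"
    EXCLUDE_INT8_QUANT=true
  else
    # Small "stories" checkpoints, or any non-CUDA device: full coverage.
    DTYPES="float32 bfloat16 float16"
    EXCLUDE_INT8_QUANT=false
  fi
}

# Hypothetical 7B checkpoint validated on a CUDA runner:
pick_dtypes "checkpoints/some-org/some-7b-model/model.pth" "cuda"
echo "$DTYPES"              # bfloat16
echo "$EXCLUDE_INT8_QUANT"  # true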
.github/workflows/periodic.yml

Lines changed: 1 addition & 1 deletion
@@ -117,7 +117,7 @@ jobs:
       matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
       fail-fast: false
     with:
-      runner: linux.g5.12xlarge.nvidia.gpu
+      runner: ${{ matrix.runner }}
       gpu-arch-type: cuda
       gpu-arch-version: "12.1"
       script: |

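With this change the GPU runner is no longer hardcoded to linux.g5.12xlarge.nvidia.gpu; each entry of the matrix produced by gather-models-gpu supplies its own runner through matrix.runner. A hedged sketch of the JSON shape such a matrix could take; only the runner key is implied by the diff, while the include wrapper, model names, and the smaller runner label are assumptions for illustration:

# Sketch only: print a matrix of the shape fromJSON(needs.gather-models-gpu.outputs.models) might consume.
cat <<'EOF'
{
  "include": [
    { "repo_name": "a-small-stories-model", "runner": "linux.g5.4xlarge.nvidia.gpu" },
    { "repo_name": "a-7b-model",            "runner": "linux.g5.12xlarge.nvidia.gpu" }
  ]
}
EOF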