Skip to content

Commit b4ac339

Browse files
mikekgfbmalfet
authored and committed
add aoti c/c++ runner to hqq tests; check output for gibberish using spell (#824)
* add runner to hqq tests * replace cat with a gibberish check * typo * create script to check for gibberish * update gibberish check * update gibberish check * use variable for tokenizer path * aspell dictionaries for english * exclude device name from gibberish check * handle JIT time line * handle Warning: * grep update * fix line exclusion * remove warning which causes gibberish check fail * add sequence extraction for principled handling of perf info and user messages * typo * change output to pass spell check * updates * handle runner which does not have sequence delimiters b/c does not need sequence extraction * add updated workflow yml * typo * native runner weirdness * remove secrets * don't log in for GGUF open_orca model
1 parent 9f72257 commit b4ac339

File tree

7 files changed

+97
-20
lines changed

7 files changed

+97
-20
lines changed

.ci/scripts/check_gibberish

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#! /bin/bash
2+
3+
#!/bin/bash
4+
# Check the spelling of the specified file
5+
cat "$1"
6+
7+
TMPFILE=/tmp/`basename "$1"`-sequence
8+
9+
if [ "X$2" == "X--no-extract" ]; then
10+
cp "$1" $TMPFILE
11+
else
12+
# We extract only the sequence output and don't spell check status and performance stats
13+
python3 .ci/scripts/extract-sequence.py "$1" >$TMPFILE
14+
15+
if [ $? -ne 0 ]; then
16+
echo "Sequence extraction failed. Exiting."
17+
exit 1
18+
fi
19+
fi
20+
21+
cat ${TMPFILE} | aspell -a -c | grep '^[\&#]' >/tmp/out.$$
22+
# Exit with a non-zero status code if there were any spelling errors because:
23+
# * Finding one or more lines with & or # means we found a spelling error, might be gibberish
24+
if [ $? -ne 0 ]; then
25+
echo "No spelling errors found; likely correct operation. Success."
26+
exit 0
27+
fi
28+
cat /tmp/out.$$
29+
echo "Spelling errors found; might indicate garbage output. Failing."
30+
exit 1

.ci/scripts/extract-sequence.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
import sys


def print_until_equals(filename):
    """Print only the generated-sequence portion of *filename* to stdout.

    The sequence is delimited by a line beginning with eight dashes
    (start marker, itself printed) and a line beginning with eight
    equals signs (end marker, not printed).  Everything outside the
    markers — status messages, performance stats — is skipped so that
    only the model's generated text gets spell-checked downstream.

    Exits with status 1 if a second end-of-sequence marker is seen or
    if no complete sequence is found at all.
    """
    output = False       # True while we are inside the delimited sequence
    past_output = False  # True once a complete sequence has been emitted
    with open(filename, "r") as f:
        for line in f:
            if line.startswith("-" * 8):
                output = True
            if output and line.startswith("=" * 8):
                if past_output:
                    print("Double end-of-sequence line")
                    sys.exit(1)
                past_output = True
                output = False
            if output:
                # `line` already ends with "\n"; suppress print's own
                # newline so the output is not double-spaced.
                print(line, end="")

    if not past_output:
        # Fixed message: previously read "Did find sequence to output".
        print("Did not find sequence to output")
        sys.exit(1)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python scriptname.py filename")
        sys.exit(1)
    filename = sys.argv[1]
    print_until_equals(filename)

.github/workflows/hqq-dtype.yml

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ on:
88
workflow_dispatch:
99

1010
jobs:
11-
test-cuda:
11+
test-hqq:
1212
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
1313
with:
1414
runner: linux.g5.4xlarge.nvidia.gpu
@@ -28,8 +28,11 @@ jobs:
2828
echo "::group::Download checkpoints"
2929
# Install requirements
3030
./install_requirements.sh cuda
31+
bash scripts/build_native.sh aoti
3132
pip3 list
3233
python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
34+
# needed to check for gibberish
35+
yum install -y aspell aspell-en
3336
echo "::endgroup::"
3437
3538
echo "::group::Download checkpoints"
@@ -42,30 +45,43 @@ jobs:
4245
4346
echo "::group::Run inference"
4447
export MODEL_PATH=checkpoints/stories15M/stories15M.pt
48+
export TOKENIZER_PATH=checkpoints/stories15M/tokenizer.model
4549
export MODEL_NAME=stories15M
4650
export MODEL_DIR=/tmp
4751
48-
for DTYPE in bfloat16 float16 float32; do
52+
export PROMPT="Once upon a time in a land far away"
53+
54+
for DEVICE in cpu cuda; do
55+
for DTYPE in bfloat16 float16 float32; do
4956
50-
python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
51-
cat ./output_eager
52-
python generate.py --dtype ${DTYPE} --device cuda --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
53-
cat ./output_compiled
54-
python export.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
55-
python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
56-
cat ./output_aoti
57+
python generate.py --dtype ${DTYPE} --device ${DEVICE} --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
58+
.ci/scripts/check_gibberish ./output_eager
59+
python generate.py --dtype ${DTYPE} --device ${DEVICE} --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
60+
.ci/scripts/check_gibberish ./output_compiled
61+
python export.py --dtype ${DTYPE} --device ${DEVICE} --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
62+
python generate.py --dtype ${DTYPE} --device ${DEVICE} --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
63+
.ci/scripts/check_gibberish ./output_aoti
64+
65+
./cmake-out/aoti_run ${MODEL_DIR}/${MODEL_NAME}.so -z ${TOKENIZER_PATH} -i "${PROMPT}" > ./output_runner_aoti
66+
cat ./output_runner_aoti
67+
# .ci/scripts/check_gibberish ./output_runner_aoti --no-extract
5768
5869
echo "**********************************************"
5970
echo "******** INT4 HQQ group-wise quantized *******"
6071
echo "**********************************************"
61-
python generate.py --dtype ${DTYPE} --device cuda --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
62-
cat ./output_eager
63-
python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
64-
cat ./output_compiled
65-
python export.py --dtype ${DTYPE} --device cuda --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
66-
python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
67-
cat ./output_aoti
72+
python generate.py --dtype ${DTYPE} --device ${DEVICE} --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
73+
.ci/scripts/check_gibberish ./output_eager
74+
python generate.py --dtype ${DTYPE} --device ${DEVICE} --compile --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
75+
.ci/scripts/check_gibberish ./output_compiled
76+
python export.py --dtype ${DTYPE} --device ${DEVICE} --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
77+
python generate.py --dtype ${DTYPE} --device ${DEVICE} --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
78+
.ci/scripts/check_gibberish ./output_aoti
79+
80+
./cmake-out/aoti_run ${MODEL_DIR}/${MODEL_NAME}.so -z ${TOKENIZER_PATH} -i "${PROMPT}" > ./output_runner_aoti
81+
cat ./output_runner_aoti
82+
# .ci/scripts/check_gibberish ./output_runner_aoti --no-extract
6883
84+
done
6985
done
7086
7187
echo "tests complete"

.github/workflows/run-readme-pr.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,3 +244,4 @@ jobs:
244244
echo "tests complete"
245245
echo "*******************************************"
246246
echo "::endgroup::"
247+

build/builder.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,7 @@ def _initialize_model(
441441

442442
model.to(dtype=builder_args.precision)
443443

444+
print("-----------------------------------------------------------")
444445
return model
445446

446447

generate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -748,7 +748,7 @@ def callback(x):
748748
aggregate_metrics["tokens_per_sec"].append(tokens_sec)
749749

750750
if jit_compile:
751-
print(f"JIT compilation time (incl runtime): {compilation_time:.2} seconds")
751+
print(f"just-in-time compilation time (incl run time): {compilation_time:.2} seconds")
752752
# Don't continue here.... because we need to report and reset
753753
# continue
754754

quantize.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -550,9 +550,9 @@ def quantize(self, module):
550550
inner_k_tiles=self.inner_k_tiles,
551551
):
552552
if self.padding_allowed:
553-
print(
554-
f"warning: {name} is padded to satisfy in_features % 1024 == 0"
555-
)
553+
# print(
554+
# f"warning: {name} is padded to satisfy in_features % 1024 == 0"
555+
# )
556556
padded_in_features = find_multiple(in_features, 1024)
557557
weight = F.pad(
558558
weight, pad=(0, padded_in_features - in_features)

0 commit comments

Comments
 (0)