add aoti c/c++ runner to hqq tests; check output for gibberish using spell #824

Merged
merged 26 commits on May 20, 2024
30 changes: 30 additions & 0 deletions .ci/scripts/check_gibberish
@@ -0,0 +1,30 @@
#! /bin/bash

#!/bin/bash
Comment on lines +1 to +3
Contributor
Why do it twice?

# Check the spelling of the specified file
cat "$1"

TMPFILE=/tmp/`basename "$1"`-sequence

if [ "X$2" == "X--no-extract" ]; then
cp "$1" $TMPFILE
else
# We extract only the sequence output and don't spell check status and performance stats
python3 .ci/scripts/extract-sequence.py "$1" >$TMPFILE

if [ $? -ne 0 ]; then
echo "Sequence extraction failed. Exiting."
exit 1
fi
fi

cat ${TMPFILE} | aspell -a -c | grep '^[\&#]' >/tmp/out.$$
# Exit with a non-zero status code if there were any spelling errors because:
# * Finding one or more lines with & or # means we found a spelling error, might be gibberish
if [ $? -ne 0 ]; then
  echo "No spelling errors found; likely correct operation. Success."
  exit 0
fi
cat /tmp/out.$$
echo "Spelling errors found; might indicate garbage output. Failing."
exit 1
Comment on lines +21 to +30
Contributor

Should we introduce some sort of tolerance criterion here (e.g., a maximum total number of unknown words)?

Contributor Author

Great idea -- I've had one instance where generate produced an invented non-dictionary word, as in 'he calls it "whateversomething".' That being said, it only occurred once, so I decided not to overindex on that.

Allowing for some rate of unknown words is a good solution to this. Should we proactively do that, or wait until we see it in actual test runs?
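
For reference, a minimal sketch of such a tolerance check (hypothetical, not part of this PR; MAX_UNKNOWN is an invented knob). In aspell pipe mode (-a), every flagged word produces a line starting with & (misspelled, suggestions found) or # (misspelled, no suggestions), so counting those lines approximates the number of unknown words:

  # Hypothetical tolerance variant: fail only when aspell flags more than
  # MAX_UNKNOWN words, absorbing the occasional invented word while still
  # catching wholesale gibberish.
  MAX_UNKNOWN=2
  NUM_UNKNOWN=$(aspell -a < "${TMPFILE}" | grep -c '^[&#]')
  if [ "${NUM_UNKNOWN}" -gt "${MAX_UNKNOWN}" ]; then
    echo "aspell flagged ${NUM_UNKNOWN} unknown words (limit ${MAX_UNKNOWN}); may be gibberish."
    exit 1
  fi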

29 changes: 29 additions & 0 deletions .ci/scripts/extract-sequence.py
@@ -0,0 +1,29 @@
import sys

def print_until_equals(filename):
    output = False
    past_output = False
    with open(filename, "r") as f:
        for line in f:
            if line.startswith("-" * 8):
                output = True
            if output and line.startswith("=" * 8):
                if past_output:
                    print("Double end-of-sequence line")
                    exit(1)
                past_output = True
                output = False
            if output:
                print(line)

    if not past_output:
        print("Did not find sequence to output")
        exit(1)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python scriptname.py filename")
Contributor

It should have been something like `print(f"Usage:\n {sys.executable} {sys.argv[0]} filename")`.

Contributor Author

Thank you!

        sys.exit(1)
    filename = sys.argv[1]
    print_until_equals(filename)
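
For illustration, a tiny end-to-end run of the extractor on a fabricated transcript (the sample file and its contents are made up; only the marker convention -- a line starting with eight dashes opens the sequence, a line starting with eight equals signs closes it -- comes from the script above):

  # Build a fake generate.py transcript, then extract the sequence from it.
  printf 'loading model\n--------\nOnce upon a time\n========\ntokens/sec: 42\n' > /tmp/sample
  python3 .ci/scripts/extract-sequence.py /tmp/sample
  # Prints the "--------" marker line and "Once upon a time", double-spaced
  # because print(line) adds a newline on top of the line's own; the status
  # and stats lines are dropped.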
48 changes: 32 additions & 16 deletions .github/workflows/hqq-dtype.yml
@@ -8,7 +8,7 @@ on:
   workflow_dispatch:

 jobs:
-  test-cuda:
+  test-hqq:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
       runner: linux.g5.4xlarge.nvidia.gpu
@@ -28,8 +28,11 @@ jobs:
         echo "::group::Download checkpoints"
         # Install requirements
         ./install_requirements.sh cuda
+        bash scripts/build_native.sh aoti
         pip3 list
         python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
+        # needed to check for gibberish
+        yum install -y aspell aspell-en
         echo "::endgroup::"

         echo "::group::Download checkpoints"
@@ -42,30 +45,43 @@ jobs:

         echo "::group::Run inference"
         export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+        export TOKENIZER_PATH=checkpoints/stories15M/tokenizer.model
         export MODEL_NAME=stories15M
         export MODEL_DIR=/tmp

-        for DTYPE in bfloat16 float16 float32; do
+        export PROMPT="Once upon a time in a land far away"
+
+        for DEVICE in cpu cuda; do
+          for DTYPE in bfloat16 float16 float32; do

-          python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-          cat ./output_eager
-          python generate.py --dtype ${DTYPE} --device cuda --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-          cat ./output_compiled
-          python export.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
-          cat ./output_aoti
+            python generate.py --dtype ${DTYPE} --device ${DEVICE} --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+            .ci/scripts/check_gibberish ./output_eager
+            python generate.py --dtype ${DTYPE} --device ${DEVICE} --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+            .ci/scripts/check_gibberish ./output_compiled
+            python export.py --dtype ${DTYPE} --device ${DEVICE} --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+            python generate.py --dtype ${DTYPE} --device ${DEVICE} --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+            .ci/scripts/check_gibberish ./output_aoti
+
+            ./cmake-out/aoti_run ${MODEL_DIR}/${MODEL_NAME}.so -z ${TOKENIZER_PATH} -i "${PROMPT}" > ./output_runner_aoti
+            cat ./output_runner_aoti
+            # .ci/scripts/check_gibberish ./output_runner_aoti --no-extract
Contributor

Why is the check skipped there?

Contributor Author

Because the AOTI runner does not work properly yet, I can't run proper tests on its output.

I would love to add a test here, and to add the AOTI runner as a second test everywhere we call generate --dso-path today. This is pending resolution of being able to load a CPU model when the AOTI runner was built on a host with CUDA.


echo "**********************************************"
echo "******** INT4 HQQ group-wise quantized *******"
echo "**********************************************"
python generate.py --dtype ${DTYPE} --device cuda --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
cat ./output_eager
python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
cat ./output_compiled
python export.py --dtype ${DTYPE} --device cuda --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
cat ./output_aoti
python generate.py --dtype ${DTYPE} --device ${DEVICE} --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
.ci/scripts/check_gibberish ./output_eager
python generate.py --dtype ${DTYPE} --device ${DEVICE} --compile --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
.ci/scripts/check_gibberish ./output_compiled
python export.py --dtype ${DTYPE} --device ${DEVICE} --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
python generate.py --dtype ${DTYPE} --device ${DEVICE} --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
.ci/scripts/check_gibberish ./output_aoti

./cmake-out/aoti_run ${MODEL_DIR}/${MODEL_NAME}.so -z ${TOKENIZER_PATH} -i "${PROMPT}" > ./output_runner_aoti
cat ./output_runner_aoti
# .ci/scripts/check_gibberish ./output_runner_aoti --no-extract

done
done

echo "tests complete"
1 change: 1 addition & 0 deletions .github/workflows/run-readme-pr.yml
@@ -244,3 +244,4 @@ jobs:
         echo "tests complete"
         echo "*******************************************"
         echo "::endgroup::"
+
Contributor

Why?

1 change: 1 addition & 0 deletions build/builder.py
@@ -441,6 +441,7 @@ def _initialize_model(

     model.to(dtype=builder_args.precision)

+    print("-----------------------------------------------------------")
Contributor

Why is this needed?

Contributor Author

We're looking for a delimiter that separates our "gibberish" (load time, quantization time, and other status output) from the model output. As far as the spell checker is concerned, that status output is as much gibberish as a hallucinating model (in terms of utility and connectedness to the model).

That being said, I strongly believe in dumping performance data -- we have missed regressions when we did not.
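
To make the contract concrete, a sketch of the transcript layout the extractor assumes (marker widths are illustrative; the extractor only requires each marker line to start with eight of the respective characters, and the equals-sign stats separator is assumed to be emitted later by generate.py):

  # <load / quantization status output>   -- ignored by extract-sequence.py
  # -----------------------------------   -- delimiter printed here in builder.py
  # <generated sequence, spell checked>
  # ===================================   -- end-of-sequence marker
  # <tokens/sec and other perf stats>     -- ignored again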

     return model

2 changes: 1 addition & 1 deletion generate.py
@@ -748,7 +748,7 @@ def callback(x):
         aggregate_metrics["tokens_per_sec"].append(tokens_sec)

         if jit_compile:
-            print(f"JIT compilation time (incl runtime): {compilation_time:.2} seconds")
+            print(f"just-in-time compilation time (incl run time): {compilation_time:.2} seconds")
Contributor

This is fine, but it shouldn't be part of this PR, should it?

             # Don't continue here.... because we need to report and reset
             # continue

6 changes: 3 additions & 3 deletions quantize.py
@@ -550,9 +550,9 @@ def quantize(self, module):
                 inner_k_tiles=self.inner_k_tiles,
             ):
                 if self.padding_allowed:
-                    print(
-                        f"warning: {name} is padded to satisfy in_features % 1024 == 0"
-                    )
+                    # print(
+                    #     f"warning: {name} is padded to satisfy in_features % 1024 == 0"
+                    # )
Comment on lines +553 to +555
Contributor

Sorry, how is this change related to the PR in question?

Contributor Author

When you output that, it enters the spell-checked sequence and fails! We are too wordy with debug messages; that is primarily my fault in the quantization logic.

                     padded_in_features = find_multiple(in_features, 1024)
                     weight = F.pad(
                         weight, pad=(0, padded_in_features - in_features)