
Commit 99ebd39

mikekgfb authored and malfet committed
Run gguf (#686)
* improve updown parser, and use in README.md execution
* cut/paste errors
* typo: true -> false
* we scan each partial line, so need to suppress at partial line level :(
* make it twice as nice
* improved updown parsing
* special handling for lines w/o option
* enable run on quantization doc
* handle white space before trip backtick
* updates
* run gguf
* updates
* add gguf to periodic
* build et for gguf
* update updown options to handle llama3-8b on macos
* secrets
* updates
1 parent 392f9e9 commit 99ebd39

File tree

6 files changed: +157 -17 lines changed


.github/workflows/run-readme-periodic.yml

Lines changed: 33 additions & 4 deletions

````diff
@@ -42,10 +42,6 @@ jobs:
           bash -x ./run-readme.sh
           echo "::endgroup::"
 
-          echo "::group::Completion"
-          echo "tests complete"
-          echo "*******************************************"
-          echo "::endgroup::"
 
   test-quantization-any:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
@@ -79,6 +75,39 @@ jobs:
           bash -x ./run-quantization.sh
           echo "::endgroup::"
 
+  test-gguf-any:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    secrets: inherit
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      secrets-env: "HF_TOKEN_PERIODIC"
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        echo "::group::Create script to run gguf"
+        python3 scripts/updown.py --file docs/GGUF.md > ./run-gguf.sh
+        # for good measure, if something happened to updown processor,
+        # and it did not error out, fail with an exit 1
+        echo "exit 1" >> ./run-gguf.sh
+        echo "::endgroup::"
+
+        echo "::group::Run gguf"
+        echo "*******************************************"
+        cat ./run-gguf.sh
+        echo "*******************************************"
+        bash -x ./run-gguf.sh
+        echo "::endgroup::"
+
           echo "::group::Completion"
           echo "tests complete"
           echo "*******************************************"
````
.github/workflows/run-readme-pr-macos.yml

Lines changed: 58 additions & 9 deletions

````diff
@@ -7,7 +7,7 @@ on:
   workflow_dispatch:
 jobs:
   test-readme-macos:
-    runs-on: macos-14-xlarge
+    runs-on: macos-14-xlarge
     steps:
       - name: Checkout code
         uses: actions/checkout@v2
@@ -34,7 +34,7 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Create script to run README"
-        python3 scripts/updown.py --file README.md --replace 'llama3:stories15M,-l 3:-l 2,meta-llama/Meta-Llama-3-8B-Instruct:stories15M' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
+        python3 scripts/updown.py --file README.md --replace 'llama3:stories15M,-l 3:-l 2,meta-llama/Meta-Llama-3-8B-Instruct:stories15M' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
         # for good measure, if something happened to updown processor,
         # and it did not error out, fail with an exit 1
         echo "exit 1" >> ./run-readme.sh
@@ -47,12 +47,7 @@ jobs:
         bash -x ./run-readme.sh
         echo "::endgroup::"
 
-        echo "::group::Completion"
-        echo "tests complete"
-        echo "*******************************************"
-        echo "::endgroup::"
-
-
+
   test-quantization-macos:
     runs-on: macos-14-xlarge
     steps:
@@ -81,7 +76,7 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Create script to run quantization"
-        python3 scripts/updown.py --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
+        python3 scripts/updown.py --file docs/quantization.md --replace 'llama3:stories15M,-l 3:-l 2,meta-llama/Meta-Llama-3-8B-Instruct:stories15M' --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
         # for good measure, if something happened to updown processor,
         # and it did not error out, fail with an exit 1
         echo "exit 1" >> ./run-quantization.sh
@@ -98,3 +93,57 @@ jobs:
         echo "tests complete"
         echo "*******************************************"
         echo "::endgroup::"
+
+
+  test-gguf-macos:
+    runs-on: macos-14-xlarge
+    secrets: inherit
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10.11'
+      - name: Setup Xcode
+        if: runner.os == 'macOS'
+        uses: maxim-lobanov/setup-xcode@v1
+        with:
+          xcode-version: '15.3'
+      - name: Run script
+        secrets-env: "HF_TOKEN_PERIODIC"
+        run: |
+          set -x
+          # NS: Remove previous installation of torch first
+          # as this script does not isntall anything into conda env but rather as system dep
+          pip3 uninstall -y torch || true
+          set -eou pipefail
+
+          echo "::group::Print machine info"
+          uname -a
+          sysctl machdep.cpu.brand_string
+          sysctl machdep.cpu.core_count
+          echo "::endgroup::"
+
+          # echo "::group::Install newer objcopy that supports --set-section-alignment"
+          # yum install -y devtoolset-10-binutils
+          # export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+          # echo "::endgroup::"
+
+          echo "::group::Create script to run gguf"
+          python3 scripts/updown.py --file docs/GGUF.md > ./run-gguf.sh
+          # for good measure, if something happened to updown processor,
+          # and it did not error out, fail with an exit 1
+          echo "exit 1" >> ./run-gguf.sh
+          echo "::endgroup::"
+
+          echo "::group::Run gguf"
+          echo "*******************************************"
+          cat ./run-gguf.sh
+          echo "*******************************************"
+          bash -x ./run-gguf.sh
+          echo "::endgroup::"
+
+          echo "::group::Completion"
+          echo "tests complete"
+          echo "*******************************************"
+          echo "::endgroup::"
````
.github/workflows/run-readme-pr-mps.yml

Lines changed: 2 additions & 0 deletions

````diff
@@ -8,8 +8,10 @@ on:
 jobs:
   test-readme-mps-macos:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    secrets: inherit
     with:
       runner: macos-m1-14
+      secrets-env: "HF_TOKEN_PERIODIC"
       script: |
         conda create -y -n test-readme-mps-macos python=3.10.11
         conda activate test-readme-mps-macos
````

.github/workflows/run-readme-pr.yml

Lines changed: 34 additions & 0 deletions

````diff
@@ -10,6 +10,7 @@ on:
 jobs:
   test-readme-any:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    secrets: inherit
     with:
       runner: linux.g5.4xlarge.nvidia.gpu
       secrets-env: "HF_TOKEN_PERIODIC"
@@ -76,6 +77,39 @@ jobs:
           bash -x ./run-quantization.sh
           echo "::endgroup::"
 
+  test-gguf-any:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    secrets: inherit
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      secrets-env: "HF_TOKEN_PERIODIC"
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        echo "::group::Create script to run gguf"
+        python3 scripts/updown.py --file docs/GGUF.md --replace 'llama3:stories15M,-l 3:-l 2,meta-llama/Meta-Llama-3-8B-Instruct:stories15M' --suppress huggingface-cli,HF_TOKEN > ./run-gguf.sh
+        # for good measure, if something happened to updown processor,
+        # and it did not error out, fail with an exit 1
+        echo "exit 1" >> ./run-gguf.sh
+        echo "::endgroup::"
+
+        echo "::group::Run gguf"
+        echo "*******************************************"
+        cat ./run-gguf.sh
+        echo "*******************************************"
+        bash -x ./run-gguf.sh
+        echo "::endgroup::"
+
           echo "::group::Completion"
           echo "tests complete"
           echo "*******************************************"
````
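The `--replace` and `--suppress` flags used above rewrite the extracted commands so PR CI runs a small model (`stories15M`) instead of llama3. A sketch of the assumed semantics (comma-separated `old:new` substitution pairs, plus substrings whose lines are dropped; hypothetical `rewrite_line` helper, not the actual `scripts/updown.py` implementation):

```python
# Hypothetical sketch of the --replace/--suppress rewriting seen above
# (assumed semantics; not the actual scripts/updown.py implementation).
def rewrite_line(line, replace_spec="", suppress_spec=""):
    # --suppress: drop any line containing one of the listed substrings
    for token in filter(None, suppress_spec.split(",")):
        if token in line:
            return None
    # --replace: apply comma-separated old:new substitutions in order
    for pair in filter(None, replace_spec.split(",")):
        old, new = pair.split(":", 1)
        line = line.replace(old, new)
    return line

cmd = rewrite_line("python3 torchchat.py generate llama3 -l 3",
                   replace_spec="llama3:stories15M,-l 3:-l 2")
```

With the suppress list from the workflow, a `huggingface-cli login` line would be dropped entirely rather than rewritten.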

docs/GGUF.md

Lines changed: 16 additions & 3 deletions

````diff
@@ -1,16 +1,27 @@
 # Using GGUF Models
-We support parsing [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) files with the following tensor types:
+
+[shell default]: HF_TOKEN="${SECRET_HF_TOKEN_PERIODIC}" huggingface-cli login
+
+[shell default]: TORCHCHAT_ROOT=${PWD} ./scripts/install_et.sh
+
+We support parsing [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) files with
+the following tensor types:
 - F16
 - F32
 - Q4_0
 - Q6_K
 
-If an unsupported type is encountered while parsing a GGUF file, an exception is raised.
+If an unsupported type is encountered while parsing a GGUF file, an
+exception is raised.
 
 We now go over an example of using GGUF files in the torchchat flow.
 
 ### Download resources
-First download a GGUF model and tokenizer. In this example, we use a Q4_0 GGUF file. (Note that Q4_0 is only the dominant tensor type in the file, but the file also contains GGUF tensors of types Q6_K, F16, and F32.)
+
+First download a GGUF model and tokenizer. In this example, we use a
+Q4_0 GGUF file. (Note that Q4_0 is only the dominant tensor type in
+the file, but the file also contains GGUF tensors of types Q6_K, F16,
+and F32.)
 
 ```
 # Download resources
@@ -55,3 +66,5 @@ python3 torchchat.py export --gguf-path ${GGUF_MODEL_PATH} --output-pte-path ${G
 # Generate using the PTE model that was created by the export command
 python3 torchchat.py generate --gguf-path ${GGUF_MODEL_PATH} --pte-path ${GGUF_PTE_PATH} --tokenizer-path ${GGUF_TOKENIZER_PATH} --temperature 0 --prompt "Once upon a time" --max-new-tokens 15
 ```
+
+[end default]: end
````
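The doc's contract is simple: only F16, F32, Q4_0, and Q6_K tensors are parsed, and anything else raises. A minimal sketch of that rule (hypothetical `check_tensor_types` helper, not torchchat code):

```python
# Hypothetical validator (not torchchat code) mirroring the documented rule:
# GGUF parsing raises as soon as a tensor type outside the supported set
# is encountered.
SUPPORTED_GGUF_TENSOR_TYPES = {"F16", "F32", "Q4_0", "Q6_K"}

def check_tensor_types(tensor_types):
    for t in tensor_types:
        if t not in SUPPORTED_GGUF_TENSOR_TYPES:
            raise ValueError(f"unsupported GGUF tensor type: {t}")

# The example file mixes Q4_0 (dominant) with Q6_K, F16, and F32 tensors:
check_tensor_types(["Q4_0", "Q6_K", "F16", "F32"])
```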

scripts/updown.py

Lines changed: 14 additions & 1 deletion

````diff
@@ -140,7 +140,7 @@ def process_command(
         )
     elif keyword == "prefix":
         output(
-            trailing_command[:-1],
+            trailing_command,
             end="",
             replace_list=replace_list,
             suppress_list=suppress_list,
@@ -178,6 +178,19 @@ def process_command(
             suppress_list=suppress_list,
         )
         exit(0)
+    elif keyword == "comment":
+        output(
+            "# " + trailing_command,
+            suppress_list=None,
+            replace_list=None,
+        )
+    else:
+        output(
+            "echo 'unknown updown command'\nexit 1",
+            suppress_list=None,
+            replace_list=None,
+        )
+        exit(1)
 
     # We have processed this line as a command
     return True
````
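The `updown.py` change above turns previously ignored commands into hard failures. A reduced sketch of the hardened dispatch (simplified to return strings instead of calling `output`; the real function also handles `end` and the replace/suppress plumbing):

```python
# Reduced sketch of the dispatch this patch hardens: "comment" becomes a
# shell comment in the generated script, "prefix" is emitted verbatim
# (the patch stops trimming its last character), and an unknown keyword
# now yields a script that fails loudly instead of being dropped silently.
def emit_for_keyword(keyword, trailing_command):
    if keyword == "prefix":
        return trailing_command
    elif keyword == "comment":
        return "# " + trailing_command
    else:
        return "echo 'unknown updown command'\nexit 1"
```

This matches the patch's intent: a typo in an updown directive in the docs now fails CI rather than silently skipping a step.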
