*Key:* ✅ works correctly; 🚧 work in progress; ❌ not supported; ❹ requires 4bit groupwise quantization; 📵 not on mobile (may fit some high-end devices such as tablets);
## Get Started
Torchchat lets you access LLMs through an interactive interface, prompted single-use generation, model export (for use by AOT Inductor and ExecuTorch), and standalone C++ runtimes.
| Function | Torchchat Command | Direct Command | Tested |
|---|----|----|-----|
| Mobile C++ runtime | n/a | app + AOTI | 🚧 |
**Getting help:** Each command implements the --help option to give additional information about available options:
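For instance, to see the options accepted by the generation entry point (using the `generate.py` command that appears later in this README):

```
python3 generate.py --help
```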
tests against the exported model with the same interface, and support additional experiments to confirm model quality and speed.
```
python3 generate.py --device [ cuda | cpu ] --dso-path ${MODEL_NAME}.so --prompt "Once upon a time"
```
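For example, with a model exported to a hypothetical `stories15M.so`, a single-use generation on CUDA would be:

```
python3 generate.py --device cuda --dso-path stories15M.so --prompt "Once upon a time"
```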
linear operator (asymmetric) with GPTQ | n/a | 4b (group) | n/a |
linear operator (asymmetric) with HQQ | n/a | work in progress | n/a |
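These rows come from a larger table of quantization schemes. As a sketch of how such a scheme might be requested at generation time (the `--quantize` flag and the config keys shown here are assumptions, not confirmed by this section; check the quantization documentation for the exact spellings):

```
python3 generate.py --device cuda --quantize '{"linear:int4": {"groupsize": 256}}' --prompt "Once upon a time"
```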
## Model precision (dtype precision setting)
On top of quantizing models with the quantization schemes mentioned above, models can be converted to lower precision floating point representations to reduce the memory bandwidth requirement and take advantage of higher density compute. For example, many GPUs and some CPUs have good support for bfloat16 and float16. You can take advantage of this via the `--dtype` argument, as shown below.
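For instance, a generation run in bfloat16 might look like the following (a sketch: the `bf16` spelling is an assumption and the model-loading flags are omitted; run `--help` for the exact choices):

```
python3 generate.py --dtype bf16 --device cuda --prompt "Once upon a time"
```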