
Commit 9ff2e7d

Merge branch 'main' into angelayi/aoti_python
2 parents 5ca0943 + 76c1cd2

83 files changed: +3767 −652 lines

Note: this is a large commit, so some of the changed files are hidden by default and are not shown below.

.ci/scripts/run-docs

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ fi
 
 if [ "$1" == "readme" ]; then
 echo "::group::Create script to run README"
-python3 torchchat/utils/scripts/updown.py --create-sections --file README.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
+python3 torchchat/utils/scripts/updown.py --create-sections --file README.md --replace 'llama3.1:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
 # for good measure, if something happened to updown processor,
 # and it did not error out, fail with an exit 1
 echo "exit 1" >> ./run-readme.sh

.github/workflows/pull.yml

Lines changed: 6 additions & 27 deletions
@@ -1092,32 +1092,11 @@ jobs:
         id: install-torchao-ops
         run: |
           bash torchchat/utils/scripts/build_torchao_ops.sh
-      - name: Set git shas
-        id: setup-hash
-        run: |
-          export TORCHCHAT_ROOT=${PWD}
-          echo "et-git-hash=$(cat ${TORCHCHAT_ROOT}/install/.pins/et-pin.txt)" >> "$GITHUB_ENV"
-      - name: Load or install ET
-        id: install-et
-        uses: actions/cache@v4
-        with:
-          path: |
-            ./et-build
-            ./torchchat/utils/scripts/install_et.sh
-          key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }}
-      - if: ${{ steps.install-et.outputs.cache-hit != 'true' }}
-        continue-on-error: true
+      - name: Install ET
         run: |
           echo "Installing ExecuTorch"
+          export TORCHCHAT_ROOT=${PWD}
           bash torchchat/utils/scripts/install_et.sh
-      - name: Install ExecuTorch python
-        run: |
-          echo "Install ExecuTorch python"
-          export TORCHCHAT_ROOT=$PWD
-          export ET_BUILD_DIR="et-build"
-          ENABLE_ET_PYBIND="${1:-true}"
-          source "torchchat/utils/scripts/install_utils.sh"
-          install_executorch_python_libs $ENABLE_ET_PYBIND
       - name: Install runner
         run: |
           echo "Installing runner"
@@ -1132,14 +1111,14 @@ jobs:
           wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
           export PRMT="Once upon a time in a land far away"
           echo "Generate eager"
-          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
           echo "Generate compile"
-          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile
+          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --compile
           echo "Export and run ET (C++ runner)"
-          python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
           ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
           echo "Export and run AOTI (C++ runner)"
-          python torchchat.py export stories110M --output-dso-path ./model.so --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          python torchchat.py export stories110M --output-dso-path ./model.so --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
           ./cmake-out/aoti_run ./model.so -z ./tokenizer.model -t 0 -i "${PRMT}"
           echo "Generate AOTI"
           python torchchat.py generate stories110M --dso-path ./model.so --prompt "${PRMT}"

README.md

Lines changed: 9 additions & 7 deletions
@@ -171,7 +171,7 @@ python3 torchchat.py download llama3.1
 <summary>Additional Model Inventory Management Commands</summary>
 
 ### Where
-This subcommand shows location of a particular model.
+This subcommand shows the location of a particular model.
 ```bash
 python3 torchchat.py where llama3.1
 ```
@@ -216,7 +216,6 @@ This mode generates text based on an input prompt.
 python3 torchchat.py generate llama3.1 --prompt "write me a story about a boy and his bear"
 ```
 
-[skip default]: end
 
 ### Server
 This mode exposes a REST API for interacting with a model.
@@ -286,6 +285,8 @@ First, follow the steps in the Server section above to start a local server. The
 streamlit run torchchat/usages/browser.py
 ```
 
+[skip default]: end
+
 Use the "Max Response Tokens" slider to limit the maximum number of tokens generated by the model for each response. Click the "Reset Chat" button to remove the message history and start a fresh chat.
 
 
@@ -295,6 +296,7 @@ Use the "Max Response Tokens" slider to limit the maximum number of tokens gener
 [AOTI](https://pytorch.org/blog/pytorch2-2/) compiles models before execution for faster inference. The process creates a zipped PT2 file containing all the artifacts generated by AOTInductor, and a [.so](https://en.wikipedia.org/wiki/Shared_library) file with the runnable contents
 that is then loaded for inference. This can be done with both Python and C++ enviroments.
 
+
 The following example exports and executes the Llama3.1 8B Instruct
 model. The first command compiles and performs the actual export.
 
@@ -309,7 +311,7 @@ python3 torchchat.py export llama3.1 --output-aoti-package-path exportedModels/l
 For more details on quantization and what settings to use for your use
 case visit our [customization guide](docs/model_customization.md).
 
-### Run in a Python Enviroment
+### Run in a Python Environment
 
 To run in a python enviroment, use the generate subcommand like before, but include the pt2 file.
 
@@ -376,7 +378,7 @@ While ExecuTorch does not focus on desktop inference, it is capable
 of doing so. This is handy for testing out PTE
 models without sending them to a physical device.
 
-Specifically there are 2 ways of doing so: Pure Python and via a Runner
+Specifically, there are 2 ways of doing so: Pure Python and via a Runner
 
 <details>
 <summary>Deploying via Python</summary>
@@ -476,9 +478,9 @@ The following assumes you've completed the steps for [Setting up ExecuTorch](#se
 
 1. Download the AAR file, which contains the Java library and corresponding JNI library, to build and run the app.
 
-- [executorch-240919.aar](https://ossci-android.s3.amazonaws.com/executorch/main/executorch-240919.aar) (SHASUM: c8a5d38ead03bfa28ee8469f6355840ad0d182ba)
+- [executorch.aar](https://ossci-android.s3.amazonaws.com/executorch/release/executorch-241002/executorch.aar) ([sha256sums](https://ossci-android.s3.amazonaws.com/executorch/release/executorch-241002/executorch.aar.sha256sums))
 
-2. Rename the downloaded AAR file to `executorch.aar` and move the file to `torchchat/edge/android/torchchat/app/libs/`. You may need to create directory `torchchat/edge/android/torchchat/app/libs/` if it does not exist.
+2. Move the downloaded AAR file to `torchchat/edge/android/torchchat/app/libs/`. You may need to create directory `torchchat/edge/android/torchchat/app/libs/` if it does not exist.
 
 3. Push the model and tokenizer file to your device. You can find the model file called `llama3.1.pte` in the current `torchchat` directory and the tokenizer file at `$(python3 torchchat.py where llama3.1)/tokenizer.model` path.
 ```
@@ -500,7 +502,7 @@ The following assumes you've completed the steps for [Setting up ExecuTorch](#se
 and use [this script](https://github.com/pytorch/executorch/blob/main/build/build_android_llm_demo.sh) to build the AAR library.
 
 <p align="center">
-<img src="https://pytorch.org/executorch/main/_static/img/android_llama_app.png" width="600" alt="Android app running a LlaMA model">
+<img src="https://pytorch.org/executorch/main/_static/img/chat.png" width="600" alt="Android app running a LlaMA model">
 </p>
 
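
The README hunks above describe exporting Llama 3.1 to a zipped PT2 package with AOTInductor and then running it from Python. As a rough sketch of what loading that PT2 file for inference can look like outside of torchchat, recent PyTorch nightlies expose `torch._inductor.aoti_load_package`; the path `exportedModels/llama3_1.pt2` and the `(tokens, input_pos)` call signature below are illustrative placeholders, and in practice `python3 torchchat.py generate llama3.1 --aoti-package-path <file>.pt2` handles this loading step for you.

```python
# Rough sketch: loading an AOTInductor PT2 package directly in Python.
# Assumes a PyTorch nightly that provides torch._inductor.aoti_load_package;
# the package path and input signature are illustrative placeholders.
import torch
import torch._inductor  # ensure the submodule is imported

runner = torch._inductor.aoti_load_package("exportedModels/llama3_1.pt2")

tokens = torch.tensor([[128000, 9906]], dtype=torch.int)    # example token ids
input_pos = torch.arange(tokens.shape[1], dtype=torch.int)  # example positions
logits = runner(tokens, input_pos)                          # run the compiled graph
print(logits.shape)
```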

docs/quantization.md

Lines changed: 15 additions & 8 deletions
@@ -121,22 +121,29 @@ python3 torchchat.py generate llama3 --pte-path llama3.pte --prompt "Hello my n
 ## Experimental TorchAO lowbit kernels
 
 ### Use
-The quantization scheme a8wxdq dynamically quantizes activations to 8 bits, and quantizes the weights in a groupwise manner with a specified bitwidth and groupsize.
+
+#### linear:a8wxdq
+The quantization scheme linear:a8wxdq dynamically quantizes activations to 8 bits, and quantizes the weights in a groupwise manner with a specified bitwidth and groupsize.
 It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7), groupsize, and has_weight_zeros (true, false).
 The argument has_weight_zeros indicates whether the weights are quantized with scales only (has_weight_zeros: false) or with both scales and zeros (has_weight_zeros: true).
 Roughly speaking, {bitwidth: 4, groupsize: 32, has_weight_zeros: false} is similar to GGML's Q4_0 quantization scheme.
 
-You should expect high performance on ARM CPU if bitwidth is 1, 2, 3, 4, or 5 and groupsize is divisible by 16. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.
+You should expect high performance on ARM CPU if groupsize is divisible by 16. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.
+
+#### embedding:wx
+The quantization scheme embedding:wx quantizes embeddings in a groupwise manner with the specified bitwidth and groupsize. It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7) and groupsize. Unlike linear:a8wxdq, embedding:wx always quantizes with scales and zeros.
+
+You should expect high performance on ARM CPU if groupsize is divisible by 32. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.
 
 ### Setup
-To use a8wxdq, you must set up the torchao experimental kernels. These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.
+To use linear:a8wxdq and embedding:wx, you must set up the torchao experimental kernels. These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.
 
 From the torchchat root directory, run
 ```
 sh torchchat/utils/scripts/build_torchao_ops.sh
 ```
 
-This should take about 10 seconds to complete. Once finished, you can use a8wxdq in torchchat.
+This should take about 10 seconds to complete.
 
 Note: if you want to use the new kernels in the AOTI and C++ runners, you must pass the flag link_torchao_ops when running the scripts the build the runners.
 
@@ -156,17 +163,17 @@ Below we show how to use the new kernels. Except for ExecuTorch, you can specif
 
 #### Eager mode
 ```
-OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --prompt "Once upon a time," --num-samples 5
+OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --prompt "Once upon a time," --num-samples 5
 ```
 
 #### torch.compile
 ```
-OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile --prompt "Once upon a time," --num-samples 5
+OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --compile --prompt "Once upon a time," --num-samples 5
 ```
 
 #### AOTI
 ```
-OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-dso llama3_1.so
+OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --output-dso llama3_1.so
 OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so --prompt "Once upon a time," --num-samples 5
 ```
 
@@ -178,7 +185,7 @@ OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cac
 
 #### ExecuTorch
 ```
-python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-pte llama3_1.pte
+python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --output-pte llama3_1.pte
 ```
 
 Note: only the ExecuTorch C++ runner in torchchat when built using the instructions in the setup can run the exported *.pte file. It will not work with the `python torchchat.py generate` command.
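
To make the scheme described in the hunks above concrete: groupwise quantization with `has_weight_zeros: false` stores one scale per `groupsize` consecutive weights and maps each group symmetrically onto the signed range implied by `bitwidth`. The sketch below only illustrates what `bitwidth` and `groupsize` control; the actual torchao lowbit kernels use packed layouts and ARM-specific code, and `quantize_groupwise`/`dequantize_groupwise` are hypothetical helpers.

```python
# Illustrative groupwise, scale-only weight quantization (has_weight_zeros: false).
# Not the torchao kernel implementation; just shows the roles of bitwidth/groupsize.
import torch

def quantize_groupwise(weight: torch.Tensor, bitwidth: int, groupsize: int):
    qmax = 2 ** (bitwidth - 1) - 1                        # e.g. bitwidth=3 -> [-4, 3]
    qmin = -(2 ** (bitwidth - 1))
    w = weight.reshape(-1, groupsize)                     # one scale per group
    scales = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / qmax
    q = torch.clamp(torch.round(w / scales), qmin, qmax).to(torch.int8)
    return q.reshape(weight.shape), scales

def dequantize_groupwise(q: torch.Tensor, scales: torch.Tensor, groupsize: int):
    return (q.reshape(-1, groupsize).float() * scales).reshape(q.shape)

weight = torch.randn(64, 128)
q, scales = quantize_groupwise(weight, bitwidth=3, groupsize=128)
approx = dequantize_groupwise(q, scales, groupsize=128)
print((weight - approx).abs().max())                      # max quantization error
```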

install/.pins/torchao-pin.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-49b1fb61c8b8eceda755579a2fd92c756d822de2
+c8f1174a06dcc0102849c8348ca6573bde8847a9

install/install_requirements.sh

Lines changed: 2 additions & 2 deletions
@@ -54,7 +54,7 @@ PYTORCH_NIGHTLY_VERSION=dev20241002
 VISION_NIGHTLY_VERSION=dev20241002
 
 # Nightly version for torchtune
-TUNE_NIGHTLY_VERSION=dev20240928
+TUNE_NIGHTLY_VERSION=dev20241010
 
 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same
 (
@@ -79,7 +79,7 @@ fi
 REQUIREMENTS_TO_INSTALL=(
 torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}"
 torchvision=="0.20.0.${VISION_NIGHTLY_VERSION}"
-torchtune=="0.3.0.${TUNE_NIGHTLY_VERSION}"
+torchtune=="0.4.0.${TUNE_NIGHTLY_VERSION}"
 )
 
 # Install the requirements. --extra-index-url tells pip to look for package

install/requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -12,7 +12,8 @@ tiktoken
 # Miscellaneous
 snakeviz
 sentencepiece
-numpy < 2.0
+# numpy version range required by GGUF util
+numpy >= 1.17, < 2.0
 gguf
 blobfile
 tomli >= 1.1.0 ; python_version < "3.11"

torchchat/cli/builder.py

Lines changed: 12 additions & 15 deletions
@@ -81,19 +81,16 @@ def __post_init__(self):
         if self.aoti_package_path and self.pte_path:
             raise RuntimeError("specify either AOTI Package path or PTE path, but not more than one")
 
-        if self.checkpoint_path and (self.aoti_package_path or self.pte_path):
-            print(
-                "Warning: checkpoint path ignored because an exported AOTI or PTE path specified"
-            )
-        if self.checkpoint_dir and (self.aoti_package_path or self.pte_path):
-            print(
-                "Warning: checkpoint dir ignored because an exported AOTI or PTE path specified"
-            )
-        if self.gguf_path and (self.aoti_package_path or self.pte_path):
-            print(
-                "Warning: GGUF path ignored because an exported AOTI or PTE path specified"
-            )
-        if not (self.aoti_package_path) and not (self.pte_path):
+        if self.aoti_package_path or self.pte_path:
+            ignored_params = [
+                (self.checkpoint_path, "checkpoint path"),
+                (self.checkpoint_dir, "checkpoint dir"),
+                (self.gguf_path, "GGUF path"),
+            ]
+            for param, param_msg in ignored_params:
+                if param:
+                    print(f"Warning: {param_msg} ignored because an exported AOTI or PTE path was specified")
+        else:
             self.prefill_possible = True
 
     @classmethod
@@ -453,7 +450,7 @@ def _maybe_init_distributed(
     return world_mesh, parallel_dims
 
 
-def _maybe_parellelize_model(
+def _maybe_parallelize_model(
     model: nn.Module,
     builder_args: BuilderArgs,
     world_mesh: DeviceMesh,
@@ -493,7 +490,7 @@ def _load_model(builder_args: BuilderArgs) -> Model:
         model = _init_model_on_meta_device(builder_args)
     else:
         model = _load_model_default(builder_args)
-    model = _maybe_parellelize_model(model, builder_args, world_mesh, parallel_dims)
+    model = _maybe_parallelize_model(model, builder_args, world_mesh, parallel_dims)
 
     model = model.to(device=builder_args.device, dtype=builder_args.precision)
     return model.eval()

torchchat/cli/download.py

Lines changed: 11 additions & 4 deletions
@@ -10,7 +10,10 @@
 from pathlib import Path
 from typing import Optional
 
-from torchchat.cli.convert_hf_checkpoint import convert_hf_checkpoint, convert_hf_checkpoint_to_tune
+from torchchat.cli.convert_hf_checkpoint import (
+    convert_hf_checkpoint,
+    convert_hf_checkpoint_to_tune,
+)
 from torchchat.model_config.model_config import (
     load_model_configs,
     ModelConfig,
@@ -57,7 +60,6 @@ def _download_hf_snapshot(
         snapshot_download(
             model_config.distribution_path,
             local_dir=artifact_dir,
-            local_dir_use_symlinks=False,
             token=hf_token,
             ignore_patterns=ignore_patterns,
         )
@@ -77,9 +79,14 @@ def _download_hf_snapshot(
             raise e
 
     # Convert the Multimodal Llama model to the torchtune format.
-    if model_config.name in {"meta-llama/Llama-3.2-11B-Vision-Instruct", "meta-llama/Llama-3.2-11B-Vision"}:
+    if model_config.name in {
+        "meta-llama/Llama-3.2-11B-Vision-Instruct",
+        "meta-llama/Llama-3.2-11B-Vision",
+    }:
         print(f"Converting {model_config.name} to torchtune format...", file=sys.stderr)
-        convert_hf_checkpoint_to_tune( model_dir=artifact_dir, model_name=model_config.name)
+        convert_hf_checkpoint_to_tune(
+            model_dir=artifact_dir, model_name=model_config.name
+        )
 
     else:
         # Convert the model to the torchchat format.
(file name hidden in this view)

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-/build
+/build
