Commit 9d7e16f

Author: Guang Yang
Script to export HF models
1 parent cac2c05 commit 9d7e16f

File tree

2 files changed: +190 -0 lines changed


.github/workflows/trunk.yml

Lines changed: 90 additions & 0 deletions
@@ -351,3 +351,93 @@ jobs:
          PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}"
          echo "::endgroup::"
        done

  test-huggingface-transformers:
    name: test-huggingface-transformers
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    strategy:
      matrix:
        hf_model_repo: [google/gemma-2b]
      fail-fast: false
    with:
      secrets-env: "HF_TOKEN_PERIODIC"
      runner: linux.12xlarge
      docker-image: executorch-ubuntu-22.04-clang12
      submodules: 'true'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
    steps:
      - name: Set up ExecuTorch
        run: |
          # The generic Linux job chooses to use the base env, not the one set up by the image
          CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
          conda activate "${CONDA_ENV}"
          PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake

          pushd executorch
          echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
          rm -rf cmake-out
          retry cmake \
              -DCMAKE_INSTALL_PREFIX=cmake-out \
              -DCMAKE_BUILD_TYPE=Release \
              -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
              -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
              -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
              -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
              -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
              -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
              -DEXECUTORCH_BUILD_XNNPACK=ON \
              -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
              -Bcmake-out .
          cmake --build cmake-out -j9 --target install --config Release

          echo "Build llama runner"
          dir="examples/models/llama2"
          retry cmake \
              -DCMAKE_INSTALL_PREFIX=cmake-out \
              -DCMAKE_BUILD_TYPE=Release \
              -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
              -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
              -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
              -DEXECUTORCH_BUILD_XNNPACK=ON \
              -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
              -Bcmake-out/${dir} \
              ${dir}
          cmake --build cmake-out/${dir} -j9 --config Release
          popd

      - name: Set up HuggingFace Hub
        run: |
          pip install -U "huggingface_hub[cli]"
          HF_TOKEN="${SECRET_HF_TOKEN_PERIODIC}" huggingface-cli login

      - name: Set up HuggingFace Transformers
        run: |
          # TODO(guangyang): Switch to the released transformers library once all required patches are included
          git clone --branch main https://github.com/huggingface/transformers.git
          pushd transformers
          pip install .
          popd

      - name: Export to ExecuTorch
        run: |
          pushd executorch
          python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }}

          # Transform the Hugging Face model repo name into its cache directory name
          MODEL_NAME="${{ matrix.hf_model_repo }}"
          TRANSFORMED_MODEL_NAME="models--$(echo "$MODEL_NAME" | sed 's/\//--/g')"

          # Search for tokenizer.model within the transformed model directory
          TOKENIZER_PATH=$(find "$HOME/.cache/huggingface/hub" -type f -name "tokenizer.model" -path "*/$TRANSFORMED_MODEL_NAME/*" -print -quit)
          if [ -z "$TOKENIZER_PATH" ]; then
            echo "tokenizer.model not found for model ${{ matrix.hf_model_repo }}"
            exit 1
          else
            echo "Found tokenizer.model at: $TOKENIZER_PATH"
            cp "$TOKENIZER_PATH" ./
          fi
          python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin

          cmake-out/examples/models/llama2/llama_main --model_path=gemma.pte --tokenizer_path=tokenizer.bin --prompt="My name is"
          popd
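
Note: the cache-directory rewrite in the "Export to ExecuTorch" step (turning google/gemma-2b into models--google--gemma-2b and then finding tokenizer.model under ~/.cache/huggingface/hub) can also be done through the huggingface_hub library. The sketch below is not what the workflow runs; it is a minimal alternative assuming the repo ships a sentencepiece tokenizer.model (true for google/gemma-2b) and that the Hub login from the previous step has already happened:

    # Resolve tokenizer.model via huggingface_hub instead of globbing the HF cache.
    # hf_hub_download returns the local cached path, downloading the file if needed.
    from huggingface_hub import hf_hub_download

    repo_id = "google/gemma-2b"  # the single entry in the workflow's hf_model_repo matrix
    tokenizer_path = hf_hub_download(repo_id=repo_id, filename="tokenizer.model")
    print(f"Found tokenizer.model at: {tokenizer_path}")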
extension/export_util/export_hf_model.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
import argparse
import os

import torch
import torch.export._trace
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge
from torch.nn.attention import SDPBackend
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.configuration_utils import GenerationConfig
from transformers.integrations.executorch import convert_and_export_with_cache
from transformers.modeling_utils import PreTrainedModel


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-hfm",
        "--hf_model_repo",
        required=False,
        default=None,
        help="a valid huggingface model repo name",
    )

    args = parser.parse_args()

    # Configs for the HF model
    device = "cpu"
    dtype = torch.float32
    batch_size = 1
    max_length = 123
    cache_implementation = "static"
    attn_implementation = "sdpa"

    # Load and configure a HF model
    model = AutoModelForCausalLM.from_pretrained(
        args.hf_model_repo,
        attn_implementation=attn_implementation,
        device_map=device,
        torch_dtype=dtype,
        generation_config=GenerationConfig(
            use_cache=True,
            cache_implementation=cache_implementation,
            max_length=max_length,
            cache_config={
                "batch_size": batch_size,
                "max_cache_len": max_length,
            },
        ),
    )
    print(f"{model.config}")
    print(f"{model.generation_config}")

    tokenizer = AutoTokenizer.from_pretrained(args.hf_model_repo)
    input_ids = tokenizer([""], return_tensors="pt").to(device)["input_ids"]
    cache_position = torch.tensor([0], dtype=torch.long)

    def _get_constant_methods(model: PreTrainedModel):
        # Metadata baked into the .pte as constant methods so the runner can query it
        return {
            # 5 and 6 are the ScalarType enum values for half and float, respectively
            "get_dtype": 5 if model.config.torch_dtype == torch.float16 else 6,
            "get_bos_id": model.config.bos_token_id,
            "get_eos_id": model.config.eos_token_id,
            "get_head_dim": model.config.hidden_size / model.config.num_attention_heads,
            "get_max_batch_size": model.generation_config.cache_config.batch_size,
            "get_max_seq_len": model.generation_config.cache_config.max_cache_len,
            "get_n_bos": 1,
            "get_n_eos": 1,
            "get_n_kv_heads": model.config.num_key_value_heads,
            "get_n_layers": model.config.num_hidden_layers,
            "get_vocab_size": model.config.vocab_size,
            "use_kv_cache": model.generation_config.use_cache,
        }

    # Export with the math SDPA backend, then lower: to_edge -> XNNPACK partition -> ExecuTorch program
    with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
        exported_prog = convert_and_export_with_cache(model, input_ids, cache_position)
        prog = (
            to_edge(
                exported_prog,
                compile_config=EdgeCompileConfig(
                    _check_ir_validity=False,
                    _skip_dim_order=True,
                ),
                constant_methods=_get_constant_methods(model),
            )
            .to_backend(XnnpackPartitioner())
            .to_executorch(
                ExecutorchBackendConfig(
                    extract_delegate_segments=True,
                ),
            )
        )
        filename = os.path.join("./", f"{model.config.model_type}.pte")
        with open(filename, "wb") as f:
            prog.write_to_file(f)
            print(f"Saved exported program to {filename}")


if __name__ == "__main__":
    main()
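
As a usage note, the CI job above drives this script and the two follow-up commands from bash. A hypothetical Python driver for the same flow, run from the executorch checkout after the cmake build and after tokenizer.model has been copied into the working directory, would look roughly like this sketch; it simply mirrors the workflow commands rather than any packaged API:

    # Sketch of the end-to-end flow the CI job runs, mirroring the workflow commands.
    # Assumes: executorch checkout as CWD, llama_main already built under cmake-out,
    # and tokenizer.model already copied here from the HF cache.
    import subprocess

    repo = "google/gemma-2b"
    # 1) Export the HF model; the output name comes from model.config.model_type (gemma.pte here)
    subprocess.run(
        ["python", "-m", "extension.export_util.export_hf_model", f"-hfm={repo}"],
        check=True,
    )
    # 2) Convert the sentencepiece tokenizer.model into the runner's tokenizer.bin format
    subprocess.run(
        ["python", "-m", "extension.llm.tokenizer.tokenizer", "-t", "tokenizer.model", "-o", "tokenizer.bin"],
        check=True,
    )
    # 3) Run the exported program with the prebuilt llama runner
    subprocess.run(
        [
            "cmake-out/examples/models/llama2/llama_main",
            "--model_path=gemma.pte",
            "--tokenizer_path=tokenizer.bin",
            "--prompt=My name is",
        ],
        check=True,
    )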
