
Commit 2545c9f (parent 64840c3)

ci: Add llama3 workflow in periodic

Adds a llama3 testing workflow to the periodic job; the checkpoint is downloaded using huggingface-cli. This is somewhat of a working prototype; I left a couple of TODOs in places where things could be done better if given more time.

Signed-off-by: Eli Uriegas <[email protected]>

File tree: 3 files changed, +66 −3 lines
.ci/scripts/download_llama.sh

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
#!/usr/bin/env bash

set -euo pipefail

install_huggingface_cli() {
  pip install -U "huggingface_hub[cli]"
}

download_checkpoint() {
  # This function is "technically re-usable but ymmv"
  # includes org name, like <org>/<repo>
  local repo_name=$1
  local include=$2
  # basically just removes the org in <org>/<repo>
  local local_dir=${repo_name##*/}

  mkdir -p "${local_dir}"
  huggingface-cli download \
    "${repo_name}" \
    --include "${include}" \
    --local-dir "${local_dir}"
}

normalize_llama_checkpoint() {
  # normalizes the checkpoint layout into something that the rest of
  # the testing scripts understand
  local repo_name=$1
  local local_dir=${repo_name##*/}
  mkdir -p "${local_dir}"
  # the glob must sit outside the quotes so it expands
  mv "${local_dir}"/original/* "${local_dir}"
  mv "${local_dir}/consolidated.00.pth" "${local_dir}/model.pth"
  rmdir "${local_dir}/original"
}

# install huggingface-cli if not already installed
if ! command -v huggingface-cli > /dev/null; then
  install_huggingface_cli
fi

# TODO: Eventually you could extend this to download different models
# taking in some arguments similar to .ci/scripts/wget_checkpoint.sh
download_checkpoint "meta-llama/Meta-Llama-3-8B" "original/*"
normalize_llama_checkpoint "meta-llama/Meta-Llama-3-8B"
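For reference, the same download-and-flatten flow can be expressed through huggingface_hub's Python API. A minimal sketch, not part of the commit; it assumes huggingface_hub is installed and a token with access to the gated repo is configured:

# mirrors download_checkpoint + normalize_llama_checkpoint (illustrative)
from pathlib import Path
from huggingface_hub import snapshot_download

repo_id = "meta-llama/Meta-Llama-3-8B"
local_dir = Path(repo_id.split("/")[-1])  # same org-stripping as the shell script

snapshot_download(repo_id=repo_id, allow_patterns=["original/*"], local_dir=str(local_dir))

# flatten original/ and rename the weights file the way the test scripts expect
original = local_dir / "original"
for f in original.iterdir():
    f.rename(local_dir / f.name)
(local_dir / "consolidated.00.pth").rename(local_dir / "model.pth")
original.rmdir()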

.ci/scripts/gather_test_models.py

Lines changed: 17 additions & 3 deletions
@@ -19,6 +19,10 @@
     "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model-00001-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model-00002-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer_config.json",
     "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model-00001-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model-00002-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer_config.json",
     "mistralai/Mistral-7B-Instruct-v0.2": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00001-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00002-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00003-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer_config.json",
+    # huggingface-cli prefixed models will download using the huggingface-cli tool
+    # TODO: Convert all of the MODEL_REPOS with a NamedTuple that includes the install_method
+    "huggingface-cli/meta-llama/Meta-Llama-3-8B": "",
 }

 JOB_RUNNERS = {
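The TODO above hints at replacing the flat URL strings with a structured entry. One possible shape, purely illustrative and not part of the commit:

# hypothetical refactor sketched from the TODO; names are illustrative
from typing import NamedTuple

class ModelRepo(NamedTuple):
    resources: str       # comma-separated URLs for wget, empty for huggingface-cli
    install_method: str  # "wget" or "huggingface-cli"

MODEL_REPOS = {
    "meta-llama/Meta-Llama-3-8B": ModelRepo(resources="", install_method="huggingface-cli"),
}

Something like this would remove the need for the "huggingface-cli/" key prefix and the string stripping below in export_models_for_ci.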
@@ -67,7 +71,7 @@ def model_should_run_on_event(model: str, event: str) -> bool:
     elif event == "push":
         return model in []
     elif event == "periodic":
-        return model in ["openlm-research/open_llama_7b"]
+        return model in ["openlm-research/open_llama_7b", "huggingface-cli/meta-llama/Meta-Llama-3-8B"]
     else:
         return False

@@ -105,12 +109,22 @@ def export_models_for_ci() -> dict[str, dict]:
         if not model_should_run_on_event(repo_name, event):
             continue

+        # This is mostly temporary to get this finished quickly while
+        # doing minimal changes; see the TODO at the top of the file for
+        # how this should probably be done
+        install_method = "wget"
+        final_repo_name = repo_name
+        if repo_name.startswith("huggingface-cli"):
+            install_method = "huggingface-cli"
+            final_repo_name = repo_name.replace("huggingface-cli/", "")
+
         record = {
-            "repo_name": repo_name,
-            "model_name": repo_name.split("/")[-1],
+            "repo_name": final_repo_name,
+            "model_name": final_repo_name.split("/")[-1],
             "resources": MODEL_REPOS[repo_name],
             "runner": runner[0],
             "platform": runner[1],
+            "install_method": install_method,
             "timeout": 90,
         }
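For the new entry, the record that lands in the CI matrix would look roughly like this; a sketch, with runner and platform values illustrative rather than taken from the commit:

# illustrative output of export_models_for_ci for the llama3 entry
record = {
    "repo_name": "meta-llama/Meta-Llama-3-8B",  # "huggingface-cli/" prefix already stripped
    "model_name": "Meta-Llama-3-8B",
    "resources": "",                            # nothing for wget to fetch
    "runner": "linux.g5.4xlarge.nvidia.gpu",    # assumed runner label
    "platform": "linux",
    "install_method": "huggingface-cli",
    "timeout": 90,
}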

.github/workflows/periodic.yml

Lines changed: 6 additions & 0 deletions
@@ -113,10 +113,12 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     name: test-gpu (${{ matrix.platform }}, ${{ matrix.model_name }})
     needs: gather-models-gpu
+    secrets: inherit
     strategy:
       matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
       fail-fast: false
     with:
+      secrets-env: "HF_TOKEN_PERIODIC"
       runner: ${{ matrix.runner }}
       gpu-arch-type: cuda
       gpu-arch-version: "12.1"
@@ -134,6 +136,10 @@

         echo "::group::Download checkpoint"
         export REPO_NAME=${{ matrix.repo_name }}
+        case "${{ matrix.install_method }}" in
+          wget) bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }} ;;
+          huggingface-cli) bash .ci/scripts/download_llama.sh ;;
+        esac
         bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
         echo "::endgroup::"
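The case statement dispatches on the install_method field that gather_test_models.py now writes into each matrix entry. The matrix the fromJSON expression above consumes would look roughly like this; a sketch assuming the usual GitHub Actions include-list shape, with field values illustrative:

# assumed shape of the gather-models-gpu "models" output (illustrative)
models_output = {
    "include": [
        {"repo_name": "openlm-research/open_llama_7b",
         "install_method": "wget"},             # plus resources, runner, platform, timeout
        {"repo_name": "meta-llama/Meta-Llama-3-8B",
         "install_method": "huggingface-cli"},
    ]
}

As recorded, the pre-existing unconditional wget_checkpoint.sh call still follows the case statement, so the wget path fetches twice; that reads like one of the rough edges the commit message alludes to.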
