
ci: Add llama3 gpu workflow in periodic #399


Merged
12 commits merged on Apr 23, 2024
34 changes: 34 additions & 0 deletions .ci/scripts/download_llama.sh
@@ -0,0 +1,34 @@
#!/usr/bin/env bash

set -xeou pipefail

shopt -s globstar

install_huggingface_cli() {
pip install -U "huggingface_hub[cli]"
}

download_checkpoint() {
# This function is "technically re-usable but ymmv"
# includes org name, like <org>/<repo>
local repo_name=$1
local include=$2
# mirrors the full <org>/<repo> path under checkpoints/
local local_dir="checkpoints/${repo_name}"

mkdir -p "${local_dir}"
huggingface-cli download \
"${repo_name}" \
--quiet \
--include "${include}" \
--local-dir "${local_dir}"
}

# install huggingface-cli if not already installed
if ! command -v huggingface-cli; then
install_huggingface_cli
fi

# TODO: Eventually you could extend this to download different models
# taking in some arguments similar to .ci/scripts/wget_checkpoint.sh
download_checkpoint "meta-llama/Meta-Llama-3-8B" "original/*"
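
For reference, a minimal local invocation of the new script might look like the following sketch. It assumes you have been granted access to the gated meta-llama/Meta-Llama-3-8B repo and that a token is exported as HF_TOKEN, which huggingface-cli reads from the environment:

# Run from the repository root; huggingface-cli picks up HF_TOKEN from the environment.
export HF_TOKEN=<your Hugging Face access token>
bash .ci/scripts/download_llama.sh

# The files matched by "original/*" land under this directory.
ls checkpoints/meta-llama/Meta-Llama-3-8B/original/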
28 changes: 23 additions & 5 deletions .ci/scripts/gather_test_models.py
@@ -19,6 +19,10 @@
"mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model-00001-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model-00002-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer_config.json",
"mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model-00001-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model-00002-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer_config.json",
"mistralai/Mistral-7B-Instruct-v0.2": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00001-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00002-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00003-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer_config.json",

# Models prefixed with "huggingface-cli/" will be downloaded using the huggingface-cli tool
# TODO: Convert all of the MODEL_REPOS entries to a NamedTuple that includes the install_method
"huggingface-cli/meta-llama/Meta-Llama-3-8B": "",
}

JOB_RUNNERS = {
@@ -57,7 +61,7 @@ def parse_args() -> Any:
return parser.parse_args()


def model_should_run_on_event(model: str, event: str) -> bool:
def model_should_run_on_event(model: str, event: str, backend: str) -> bool:
"""
A helper function to decide whether a model should be tested on an event (pull_request/push)
We put higher priority and fast models to pull request and rest to push.
@@ -67,7 +71,11 @@ def model_should_run_on_event(model: str, event: str) -> bool:
elif event == "push":
return model in []
elif event == "periodic":
return model in ["openlm-research/open_llama_7b"]
# Test llama3 on GPU only; see the description in https://github.com/pytorch/torchchat/pull/399 for the reasoning
if backend == "gpu":
return model in ["openlm-research/open_llama_7b", "huggingface-cli/meta-llama/Meta-Llama-3-8B"]
else:
return model in ["openlm-research/open_llama_7b"]
else:
return False

@@ -102,15 +110,25 @@ def export_models_for_ci() -> dict[str, dict]:
MODEL_REPOS.keys(),
JOB_RUNNERS[backend].items(),
):
if not model_should_run_on_event(repo_name, event):
if not model_should_run_on_event(repo_name, event, backend):
continue

# This is mostly temporary, to get this finished quickly while
# making minimal changes; see the TODO at the top of the file
# for how this should probably be done
install_method = "wget"
final_repo_name = repo_name
if repo_name.startswith("huggingface-cli"):
install_method = "huggingface-cli"
final_repo_name = repo_name.replace("huggingface-cli/", "")

record = {
"repo_name": repo_name,
"model_name": repo_name.split("/")[-1],
"repo_name": final_repo_name,
"model_name": final_repo_name.split("/")[-1],
"resources": MODEL_REPOS[repo_name],
"runner": runner[0],
"platform": runner[1],
"install_method": install_method,
"timeout": 90,
}

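The TODO at the top of gather_test_models.py hints at replacing the string-prefix convention with a structured entry per model. A possible shape, purely as a hypothetical sketch (the ModelConfig name and its fields are not part of this PR), might be:

from typing import NamedTuple

class ModelConfig(NamedTuple):
    repo_name: str        # <org>/<repo> on Hugging Face
    resources: str        # comma-separated wget URLs; empty when huggingface-cli is used
    install_method: str   # "wget" or "huggingface-cli"

MODEL_CONFIGS = [
    ModelConfig("meta-llama/Meta-Llama-3-8B", "", "huggingface-cli"),
]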
18 changes: 15 additions & 3 deletions .github/workflows/periodic.yml
@@ -113,10 +113,12 @@ jobs:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
name: test-gpu (${{ matrix.platform }}, ${{ matrix.model_name }})
needs: gather-models-gpu
secrets: inherit
strategy:
matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
fail-fast: false
with:
secrets-env: "HF_TOKEN_PERIODIC"
runner: ${{ matrix.runner }}
gpu-arch-type: cuda
gpu-arch-version: "12.1"
@@ -126,15 +128,25 @@
echo "::endgroup::"

echo "::group::Install required packages"
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
pip install --progress-bar off --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
pip install -r ./requirements.txt
pip list
python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
echo "::endgroup::"

echo "::group::Download checkpoint"
export REPO_NAME=${{ matrix.repo_name }}
bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
export REPO_NAME="${{ matrix.repo_name }}"
case "${{ matrix.install_method }}" in
wget)
bash .ci/scripts/wget_checkpoint.sh "${REPO_NAME}" "${{ matrix.resources }}"
;;
huggingface-cli)
(
set +x
HF_TOKEN="${SECRET_HF_TOKEN_PERIODIC}" bash .ci/scripts/download_llama.sh
)
;;
esac
echo "::endgroup::"

echo "::group::Convert checkpoint"
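
Tying the two files together: for a periodic GPU run, gather_test_models.py now emits a matrix entry roughly like the one sketched below (the runner label and platform values are illustrative, not taken from this diff), and its install_method field is what routes the workflow's checkpoint step into the new huggingface-cli branch of the case statement:

record = {
    "repo_name": "meta-llama/Meta-Llama-3-8B",
    "model_name": "Meta-Llama-3-8B",
    "resources": "",
    "runner": "linux.g5.4xlarge.nvidia.gpu",  # illustrative GPU runner label
    "platform": "linux",                      # illustrative
    "install_method": "huggingface-cli",
    "timeout": 90,
}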