Skip to content

Commit a343884

Browse files
helunwencser authored and facebook-github-bot committed
add CI job for eval_llama with wikitext task (#6336)
Summary: Pull Request resolved: #6336 imported-using-ghimport Test Plan: Imported from OSS Reviewed By: cccclai Differential Revision: D64565304 Pulled By: helunwencser fbshipit-source-id: 58559df0a007830ddecea8e20297bbacb20dd773
1 parent 3ce5741 commit a343884

File tree

3 files changed

+95
-1
lines changed

3 files changed

+95
-1
lines changed
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/bin/bash
2+
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
set -exu
9+
10+
if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
11+
PYTHON_EXECUTABLE=python3
12+
fi
13+
14+
# Download and prepare stories model artifacts
15+
prepare_model_artifacts() {
16+
echo "Preparing stories model artifacts"
17+
wget -O stories110M.pt "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
18+
wget -O tokenizer.model "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
19+
echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
20+
}
21+
22+
run_and_verify() {
23+
NOW=$(date +"%H:%M:%S")
24+
echo "Starting to run eval_llama at ${NOW}"
25+
if [[ ! -f "stories110M.pt" ]]; then
26+
echo "stories110M.pt is missing."
27+
exit 1
28+
fi
29+
if [[ ! -f "tokenizer.model" ]]; then
30+
echo "tokenizer.model is missing."
31+
exit 1
32+
fi
33+
if [[ ! -f "params.json" ]]; then
34+
echo "params.json is missing."
35+
exit 1
36+
fi
37+
$PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \
38+
-c stories110M.pt \
39+
-p params.json \
40+
-t tokenizer.model \
41+
-kv \
42+
-d fp32 \
43+
--max_seq_length 2048 \
44+
--limit 5 > result.txt
45+
46+
# Verify result.txt
47+
RESULT=$(cat result.txt)
48+
EXPECTED_TASK="wikitext"
49+
EXPECTED_RESULT="word_perplexity"
50+
if [[ "${RESULT}" == "${EXPECTED_TASK}: {"*"${EXPECTED_RESULT}"* ]]; then
51+
echo "Actual result: ${RESULT}"
52+
echo "Success"
53+
exit 0
54+
else
55+
echo "Actual result: ${RESULT}"
56+
echo "Failure; results not the same"
57+
exit 1
58+
fi
59+
}
60+
61+
prepare_model_artifacts
62+
run_and_verify

.github/workflows/pull.yml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,3 +447,30 @@ jobs:
447447
448448
# run e2e (export, tokenizer and runner)
449449
PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh
450+
451+
test-eval_llama-wikitext-linux:
452+
name: test-eval_llama-wikitext-linux
453+
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
454+
strategy:
455+
fail-fast: false
456+
with:
457+
runner: linux.24xlarge
458+
docker-image: executorch-ubuntu-22.04-clang12
459+
submodules: 'true'
460+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
461+
timeout: 90
462+
script: |
463+
# The generic Linux job chooses to use base env, not the one setup by the image
464+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
465+
conda activate "${CONDA_ENV}"
466+
467+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
468+
469+
# install pybind
470+
bash install_requirements.sh --pybind xnnpack
471+
472+
# install llama requirements
473+
bash examples/models/llama/install_requirements.sh
474+
475+
# run eval_llama wikitext task
476+
PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_wikitext.sh

examples/models/llama/evaluate/eager_eval.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,12 @@ def __init__(
4040

4141
@property
4242
def eot_token_id(self):
43-
return self._tokenizer.eot_id
43+
"""
44+
The stories model does not have an EOT token, so we use the EOS token instead.
45+
"""
46+
if hasattr(self._tokenizer, "eot_id"):
47+
return self._tokenizer.eot_id
48+
return self._tokenizer.eos_id
4449

4550
@property
4651
def max_length(self):

0 commit comments

Comments
 (0)