
Commit 3936766

Merge branch 'main' into upd-nanotron
2 parents edd44a4 + 989f5f5

75 files changed: +2267 −2373 lines

.github/workflows/slow_tests.yaml

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+name: Slow end to end tests
+
+on:
+  push:
+    branches:
+      - main
+      - v*-release
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  run_tests:
+    name: Run tests
+    runs-on: 'aws-g4dn-2xlarge-use1-public-80'
+    steps:
+      - name: Install Git LFS
+        run: |
+          if ! command -v git-lfs &> /dev/null; then
+            echo "Installing Git LFS..."
+            sudo apt-get update && sudo apt-get install -y git-lfs
+            git lfs install
+          else
+            echo "Git LFS already installed."
+          fi
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Install the project
+        run: uv sync --extra dev
+
+      - name: Ensure cache directories exist
+        run: mkdir -p cache/models cache/datasets
+
+      - name: Run tests
+        env:
+          HF_HOME: "cache/models"
+          HF_DATASETS_CACHE: "cache/datasets"
+        run: uv run pytest --disable-pytest-warnings --runslow tests/slow_tests
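
For reference, a rough local equivalent of the new slow-test job, assuming uv and Git LFS are installed and the repository is checked out:

```bash
# Sketch: approximate the slow-test job locally (assumes uv and git-lfs are installed)
git lfs install
uv sync --extra dev
mkdir -p cache/models cache/datasets
HF_HOME="cache/models" HF_DATASETS_CACHE="cache/datasets" \
  uv run pytest --disable-pytest-warnings --runslow tests/slow_tests
```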

.github/workflows/tests.yaml

Lines changed: 46 additions & 33 deletions
@@ -11,36 +11,49 @@ on:
 
 jobs:
   run_tests:
-    name: Run tests
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-        with:
-          lfs: 'true'
-      - name: Setup Python environment
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
-          cache: 'pip'
-      - name: Install lighteval in editable mode
-        run: |
-          pip install -e .[dev,extended_tasks,multilingual,litellm]
-      - name: Get cached files
-        uses: actions/cache@v4
-        id: get-cache
-        with:
-          path: "cache"
-          key: test-cache-HF
-      - name: Test
-        env:
-          HF_TEST_TOKEN: ${{ secrets.HF_TEST_TOKEN }}
-          HF_HOME: "cache/models"
-          HF_DATASETS_CACHE: "cache/datasets"
-        run: | # PYTHONPATH="${PYTHONPATH}:src" HF_DATASETS_CACHE="cache/datasets" HF_HOME="cache/models"
-          python -m pytest -x --disable-pytest-warnings
-      - name: Write cache
-        uses: actions/cache@v4
-        with:
-          path: "cache"
-          key: test-cache-HF
+    name: Run tests
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Cache Hugging Face models
+        uses: actions/cache@v4
+        with:
+          path: cache/models
+          key: hf-models-${{ runner.os }}-${{ github.ref }}
+          restore-keys: hf-models-${{ runner.os }}-
+
+      - name: Cache Hugging Face datasets
+        uses: actions/cache@v4
+        with:
+          path: cache/datasets
+          key: hf-datasets-${{ runner.os }}-${{ github.ref }}
+          restore-keys: hf-datasets-${{ runner.os }}-
+
+      - name: Cache uv virtual environment
+        uses: actions/cache@v4
+        with:
+          path: .venv
+          key: uv-env-${{ runner.os }}-${{ hashFiles('pyproject.toml') }}
+          restore-keys: uv-env-${{ runner.os }}-
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Install the project
+        run: uv sync --extra dev
+
+      - name: Ensure cache directories exist
+        run: mkdir -p cache/models cache/datasets
+
+      - name: Run tests
+        env:
+          HF_TEST_TOKEN: ${{ secrets.HF_TEST_TOKEN }}
+          HF_HOME: "cache/models"
+          HF_DATASETS_CACHE: "cache/datasets"
+        run: uv run pytest -x --disable-pytest-warnings

README.md

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ Here’s a quick command to evaluate using the Accelerate backend:
 
 ```shell
 lighteval accelerate \
-    "pretrained=gpt2" \
+    "model_name=gpt2" \
     "leaderboard|truthfulqa:mc|0|0"
 ```

docs/source/_toctree.yml

Lines changed: 2 additions & 2 deletions
@@ -23,8 +23,8 @@
     title: Use vllm as backend
   - local: use-sglang-as-backend
     title: Use SGLang as backend
-  - local: evaluate-the-model-on-a-server-or-container
-    title: Evaluate on Server
+  - local: use-huggingface-inference-endpoints-or-tgi-as-backend
+    title: Use Hugging Face inference endpoints or TGI as backend
   - local: contributing-to-multilingual-evaluations
     title: Contributing to multilingual evaluations
   title: Guides

docs/source/adding-a-custom-task.mdx

Lines changed: 1 addition & 1 deletion
@@ -171,7 +171,7 @@ Once your file is created you can then run the evaluation with the following command:
 
 ```bash
 lighteval accelerate \
-    "pretrained=HuggingFaceH4/zephyr-7b-beta" \
+    "model_name=HuggingFaceH4/zephyr-7b-beta" \
     "community|{custom_task}|{fewshots}|{truncate_few_shot}" \
     --custom-tasks {path_to_your_custom_task_file}
 ```

docs/source/package_reference/models.mdx

Lines changed: 0 additions & 4 deletions
@@ -31,10 +31,6 @@
 ### Open AI Models
 [[autodoc]] models.endpoints.openai_model.OpenAIClient
 
-## Nanotron Model
-### NanotronLightevalModel
-[[autodoc]] models.nanotron.nanotron_model.NanotronLightevalModel
-
 ## VLLM Model
 ### VLLMModel
 [[autodoc]] models.vllm.vllm_model.VLLMModelConfig

docs/source/quicktour.mdx

Lines changed: 5 additions & 5 deletions
@@ -27,7 +27,7 @@ To evaluate `GPT-2` on the Truthful QA benchmark with [🤗
 
 ```bash
 lighteval accelerate \
-    "pretrained=gpt2" \
+    "model_name=openai-community/gpt2" \
     "leaderboard|truthfulqa:mc|0|0"
 ```
 
@@ -59,7 +59,7 @@ When specifying a path to file, it should start with `./`.
 
 ```bash
 lighteval accelerate \
-    "pretrained=gpt2" \
+    "model_name=openai-community/gpt2" \
     ./path/to/lighteval/examples/tasks/recommended_set.txt
 # or, e.g., "leaderboard|truthfulqa:mc|0|0|,leaderboard|gsm8k|3|1"
 ```
@@ -79,7 +79,7 @@ You can then evaluate a model using data parallelism on 8 GPUs like follows:
 ```bash
 accelerate launch --multi_gpu --num_processes=8 -m \
     lighteval accelerate \
-    "pretrained=gpt2" \
+    "model_name=openai-community/gpt2" \
     "leaderboard|truthfulqa:mc|0|0"
 ```
 
@@ -92,7 +92,7 @@ To evaluate a model using pipeline parallelism on 2 or more GPUs, run:
 
 ```bash
 lighteval accelerate \
-    "pretrained=gpt2,model_parallel=True" \
+    "model_name=openai-community/gpt2,model_parallel=True" \
     "leaderboard|truthfulqa:mc|0|0"
 ```
 
@@ -129,7 +129,7 @@ accelerate).
 - **add_special_tokens** (bool, optional, defaults to True): Whether to add special tokens to the input sequences.
   If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and
   `False` for causal models.
-- **model_parallel** (bool, optional, defaults to False):
+- **model_parallel** (bool, optional, defaults to None):
   True/False: force to use or not the `accelerate` library to load a large
   model across multiple devices.
   Default: None which corresponds to comparing the number of processes with

docs/source/saving-and-reading-results.mdx

Lines changed: 14 additions & 0 deletions
@@ -31,6 +31,20 @@ This will create a Tensorboard dashboard in a HF org set with the `--results-org
 option.
 
 
+## Pushing results to WandB
+
+You can push the results to WandB by setting `--wandb`. This will initialize a WandB
+run and log the results.
+
+WandB arguments need to be set as environment variables:
+
+```
+export WANDB_PROJECT="lighteval"
+```
+
+You can find a list of variables in the [wandb documentation](https://docs.wandb.ai/guides/track/environment-variables/).
+
+
 ## How to load and investigate details
 
 ### Load from local detail files
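
For reference, an end-to-end sketch of the WandB flow added above; the model and task below are illustrative:

```bash
# Sketch: log an evaluation run to WandB (model and task chosen for illustration)
export WANDB_PROJECT="lighteval"
lighteval accelerate \
    "model_name=openai-community/gpt2" \
    "leaderboard|truthfulqa:mc|0|0" \
    --wandb
```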

docs/source/evaluate-the-model-on-a-server-or-container.mdx renamed to docs/source/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx

Lines changed: 6 additions & 26 deletions
@@ -25,15 +25,12 @@ be deleted afterwards).
 __configuration file example:__
 
 ```yaml
-model:
-  base_params:
-    # Pass either model_name, or endpoint_name and true reuse_existing
-    # endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters
-    # reuse_existing: true # defaults to false; if true, ignore all params in instance, and don't delete the endpoint after evaluation
+model_parameters:
+  reuse_existing: false # if true, ignore all params in instance, and don't delete the endpoint after evaluation
+  # endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters
   model_name: "meta-llama/Llama-2-7b-hf"
-  # revision: "main" # defaults to "main"
+  revision: "main" # defaults to "main"
   dtype: "float16" # can be any of "awq", "eetq", "gptq", "4bit' or "8bit" (will use bitsandbytes), "bfloat16" or "float16"
-  instance:
   accelerator: "gpu"
   region: "eu-west-1"
   vendor: "aws"
@@ -44,7 +41,7 @@ model:
   namespace: null # The namespace under which to launch the endpoint. Defaults to the current user's namespace
   image_url: null # Optionally specify the docker image to use when launching the endpoint model. E.g., launching models with later releases of the TGI container with support for newer models.
   env_vars:
-      null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
+    null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
 ```
 
 ### Text Generation Inference (TGI)
@@ -55,25 +52,8 @@ serverless inference.
 __configuration file example:__
 
 ```yaml
-model:
-  instance:
+model_parameters:
   inference_server_address: ""
   inference_server_auth: null
   model_id: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory
 ```
-
-### OpenAI API
-
-Lighteval also supports evaluating models on the OpenAI API. To do so you need to set your OpenAI API key in the environment variable.
-
-```bash
-export OPENAI_API_KEY={your_key}
-```
-
-And then run the following command:
-
-```bash
-lighteval endpoint openai \
-    {model-name} \
-    <task parameters>
-```

docs/source/use-inference-providers-as-backend.mdx

Lines changed: 3 additions & 3 deletions
@@ -11,7 +11,7 @@ Lighteval allows to use Hugging Face's Inference Providers to evaluate llms on s
 
 ```bash
 lighteval endpoint inference-providers \
-    "model=deepseek-ai/DeepSeek-R1,provider=hf-inference" \
+    "model_name=deepseek-ai/DeepSeek-R1,provider=hf-inference" \
     "lighteval|gsm8k|0|0"
 ```
 
@@ -28,13 +28,13 @@ lighteval endpoint inference-providers \
 with the following config file:
 
 ```yaml
-model:
+model_parameters:
   model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
   provider: "novita"
   timeout: null
   proxies: null
   parallel_calls_count: 10
-generation:
+generation_parameters:
   temperature: 0.8
   top_k: 10
   max_new_tokens: 10000
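
For context, a config file like the one above would typically be passed to the CLI in place of the inline model string; the file path below is illustrative, not taken from this commit:

```bash
# Sketch: point the inference-providers backend at a YAML config (path is illustrative)
lighteval endpoint inference-providers \
    examples/model_configs/inference_providers.yaml \
    "lighteval|gsm8k|0|0"
```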

docs/source/use-litellm-as-backend.mdx

Lines changed: 15 additions & 12 deletions
@@ -10,10 +10,14 @@ Documentation for available APIs and compatible endpoints can be found [here](ht
 
 ```bash
 lighteval endpoint litellm \
-    "gpt-3.5-turbo" \
-    "lighteval|gsm8k|0|0"
+    "provider=openai,model_name=gpt-3.5-turbo" \
+    "lighteval|gsm8k|0|0" \
+    --use-chat-template
 ```
 
+> [!WARNING]
+> `--use-chat-template` is required for litellm to work properly.
+
 ## Using a config file
 
 Litellm allows generation with any OpenAI compatible endpoint, for example you
@@ -22,17 +26,16 @@ can evaluate a model running on a local vllm server.
 To do so you will need to use a config file like so:
 
 ```yaml
-model:
-  base_params:
+model_parameters:
   model_name: "openai/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
   base_url: "URL OF THE ENDPOINT YOU WANT TO USE"
   api_key: "" # remove or keep empty as needed
-generation:
-  temperature: 0.5
-  max_new_tokens: 256
-  stop_tokens: [""]
-  top_p: 0.9
-  seed: 0
-  repetition_penalty: 1.0
-  frequency_penalty: 0.0
+  generation_parameters:
+    temperature: 0.5
+    max_new_tokens: 256
+    stop_tokens: [""]
+    top_p: 0.9
+    seed: 0
+    repetition_penalty: 1.0
+    frequency_penalty: 0.0
 ```
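
As a companion sketch for the "local vllm server" case mentioned above (assumes vLLM is installed locally; the model and port are illustrative):

```bash
# Sketch: serve an OpenAI-compatible endpoint locally with vLLM, then set
# base_url in the config above to it (e.g. "http://localhost:8000/v1")
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-32B --port 8000
```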
