Add mixtral dockerfile and standalone inference script (#2029)

nvzhihanj · mrmhodak · web-flow · commit 88f4d236819f · 2025-01-14T11:13:37.000-06:00
* Add dockerfile and standalone accuracy evaluation scripts

* Minor fixes

---------

Co-authored-by: Miro <mirhodak@amd.com>
diff --git a/language/llama2-70b/evaluate-accuracy.py b/language/llama2-70b/evaluate-accuracy.py
@@ -59,6 +59,7 @@ def main():
     checkpoint_path = args.checkpoint_path
     metric = evaluate.load("rouge")
     nltk.download("punkt")
+    nltk.download("punkt_tab")
 
     tokenizer = AutoTokenizer.from_pretrained(
         checkpoint_path,
diff --git a/language/mixtral-8x7b/Dockerfile b/language/mixtral-8x7b/Dockerfile
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
+FROM nvcr.io/nvidia/pytorch:24.07-py3
 SHELL ["/bin/bash", "-c"]
 
 ENV LC_ALL=C.UTF-8
@@ -22,7 +22,7 @@ ENV TZ=US/Pacific
 ENV DEBIAN_FRONTEND=noninteractive
 
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
-RUN rm -rf /var/lib/apt/lists/* && rm /etc/apt/sources.list.d/* \
+RUN rm -rf /var/lib/apt/lists/* && rm -rf /etc/apt/sources.list.d/* \
  && apt update \
  && apt install -y --no-install-recommends build-essential autoconf \
         libtool git ccache curl wget pkg-config sudo ca-certificates \
@@ -44,5 +44,5 @@ WORKDIR /tmp
 RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh \
     && bash Miniconda3-* -b -p /opt/miniconda3
 ENV PATH="$PATH:/opt/miniconda3/bin"
-RUN conda create -n llama2-70b python=3.10
+RUN conda create -n llm python=3.10
 RUN chmod -R 777 /opt/miniconda3
diff --git a/language/mixtral-8x7b/README.md b/language/mixtral-8x7b/README.md
@@ -10,7 +10,7 @@
 
 
 Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/mixtral-8x7b) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker.
- 
+
 ## Prepare environment
 
 For a CPU-only run:
@@ -234,11 +234,11 @@ Recreating the enviroment for evaluating the quality metrics can be quite tediou
 ```bash
 docker build . -f Dockerfile.eval -t evaluation
 ```
-2. Run the docker in interactive mode and with 
+2. Run the docker container in interactive mode with
 ```bash
-sudo docker run -it -v $(pwd):/eval -t evaluation
+docker run -it --rm --net=host --runtime=nvidia --ipc=host -v $PWD:$PWD -w $PWD evaluation
 ```
-3. 
+3.
 ```bash
 cd eval
 python -u evaluate-accuracy.py --checkpoint-path [path_to_model_checkpoint] \
diff --git a/language/mixtral-8x7b/standalone_infer/README.md b/language/mixtral-8x7b/standalone_infer/README.md
@@ -0,0 +1,34 @@
+# Mixtral reference standalone inference script
+
+The reference output and accuracy can be checked using the standalone Hugging Face inference script by following the instructions below:
+
+```
+cd language/mixtral-8x7b
+docker build -t mlc-ngc .
+nvidia-docker run -it --rm --net=host --runtime=nvidia --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --cap-add=SYS_PTRACE --cap-add=SYS_ADMIN --cap-add=DAC_READ_SEARCH --security-opt seccomp=unconfined -w $PWD -v $PWD:$PWD -t mlc-ngc
+
+pip install -r requirements.txt
+cd standalone_infer
+# Make sure the checkpoint and the reference pickle file are already downloaded
+python3 hf_eval_all.py --input_pkl=09292024_mixtral_15k_mintoken2_v1.pkl --checkpoint_path=/raid/data/mlperf-llm/Mixtral-8x7B-Instruct-v0.1 --output_pkl=mixtral_8x7b_15000_greedy_reference_fp16_mintoken2.pkl --batch_size=64
+
+# Exit the container and enter the evaluation container
+exit
+docker build . -f Dockerfile.eval -t evaluation
+docker run -it --rm --net=host --runtime=nvidia --ipc=host -v $PWD:$PWD -w $PWD evaluation
+cd standalone_infer
+python3 run_accuracy.py --results_path=mixtral_8x7b_15000_greedy_reference_fp16_mintoken2.pkl
+```
+
+Expected output:
+```
+EM: 0.7366, correct: 3683 / 5000, gen_token_per_sample: 129.9604
+Evaluating OpenOrca score...
+OpenOrca score: {'rouge1': np.float64(45.5989), 'rouge2': np.float64(23.3526), 'rougeL': np.float64(30.4608), 'rougeLsum': np.float64(42.5396)}, gen_token_per_sample: 205.8656
+Evaluating MBXP score...
+100%|| 5000/5000 [02:33<00:00, 32.50it/s]
+Processed 5000 in 153.89411109898356s
+ 60.16% pass@1
+{'cpp': 381, 'typescript': 438, 'ruby': 419, 'python': 492, 'php': 809, 'javascript': 469}  out of  {'cpp': 743, 'typescript': 868, 'ruby': 846, 'python': 863, 'php': 846, 'javascript': 834}
+gen_tokens_per_sample: 98.7026
+```
diff --git a/language/mixtral-8x7b/standalone_infer/hf_eval_all.py b/language/mixtral-8x7b/standalone_infer/hf_eval_all.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessor, LogitsProcessorList
+import torch
+import pandas as pd
+import time
+from pathlib import Path
+import argparse
+
+
+def run_infer(df, ckpt_path, bs):
+    """
+    dataset                                                           GSM8K
+    id                                                            train.548
+    question              Gary manages two Amazon distribution centers. ...
+    input                 <s> [INST] As an expert problem solver solve s...
+    ref_output            The first center processes 10000 packages per ...
+    gt_output                                                         14000
+    tok_input             [1, 1, 28705, 733, 16289, 28793, 1136, 396, 75...
+    tok_ref_output        [415, 907, 4982, 9537, 28705, 28740, 28734, 28...
+    stop_sequence                                                      </s>
+    tok_stop_sequence                                                   [2]
+    tok_input_len                                                       662
+    tok_ref_output_len                                                  174
+    Name: 0, dtype: object
+    """
+    device = "cuda"  # the device to load the model onto
+
+    # Load the model from local if possible.
+    model_path = Path(ckpt_path)
+    if not model_path.exists():
+        raise RuntimeError(f"{ckpt_path} not existed. Please download the checkpoint from mlcommon")
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_path, padding_side="left", trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path, device_map="auto", trust_remote_code=True)
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.pad_token_id = tokenizer.eos_token_id
+
+    # gen parameter. We stop at 1024. Starting from v5.0, min_token is set to 2 to avoid 0-output issue
+    gen_kwargs = {
+        # "min_new_tokens": 1,
+        "min_new_tokens": 2,
+        "max_new_tokens": 1024,
+        "do_sample": False,
+        "temperature": None,
+        "top_p": None,
+    }
+
+    # Start inference
+    BS = bs
+    bidx = 0
+    model.eval()
+
+    input_tokens = []
+    input_tokens_lens = []
+    output_tokens = []
+    output_tokens_lens = []
+    output_texts = []
+
+    tic = time.time()
+    for idx in range(0, len(df), BS):
+        tac = time.time()
+        print(f"Processing {idx}/{len(df)}, time: {tac - tic}s")
+        sidx = idx
+        eidx = min(sidx + BS, len(df))
+
+        # We use batch_encode_plus for batch inference.
+        # Note 9/29/2024: Mixtral changed its tokenizer in Jun. Using the Feb 29 2024 version.
+        batch_texts = df['input'][sidx:eidx].tolist()
+        batch_ids = tokenizer.batch_encode_plus(batch_texts, return_tensors="pt", padding=True)
+        # tok_input_length = batch_ids['attention_mask'].sum(
+        #     axis=1).to(torch.int32).tolist()
+        # input_tokens_lens += tok_input_length
+        tok_input_id = batch_ids['input_ids'].to(torch.int32).tolist()
+        # Remove eos from the input id
+        tok_input_id = [[element for element in sublist if element !=
+                        tokenizer.eos_token_id] for sublist in tok_input_id]
+        input_tokens += tok_input_id
+        tok_input_length = [len(seq) for seq in tok_input_id]
+        input_tokens_lens += tok_input_length
+
+        batch_ids = batch_ids.to(device)
+        _, length = batch_ids.input_ids.shape
+        outputs = model.generate(**batch_ids, num_return_sequences=1,
+                                **gen_kwargs)
+
+        output_ids = outputs[:, length:].cpu().tolist()
+        output_tokens += output_ids
+
+        # Filter out EOS
+        id_filtered = [[num for num in sublist if num !=
+                        tokenizer.eos_token_id] for sublist in output_ids]
+        output_id_len = [len(out) for out in id_filtered]
+        output_tokens_lens += output_id_len
+
+        # Detokenizer
+        output_msgs = tokenizer.batch_decode(
+            output_ids, skip_special_tokens=True)
+        output_texts += output_msgs
+        bidx += 1
+
+    # Assemble the output
+    output_df = df[:len(output_tokens)].copy()
+    output_df["infer_tok_input"] = input_tokens
+    output_df["infer_tok_input_length"] = input_tokens_lens
+    output_df["infer_ref_output"] = output_texts
+    output_df["infer_tok_ref_output"] = output_tokens
+    output_df["infer_tok_ref_output_length"] = output_tokens_lens
+
+    # output_df.to_pickle(f"mixtral_8x7b_all15k_{len(output_tokens)}_BS{BS}_greedy_reference_fp16_mintoken1.pkl")
+
+    return output_df
+
+def trim_twos(df):
+    # Remove all trailing 2s except for 1
+    def remove_trailing_twos(lst):
+        count = 0
+        for num in reversed(lst):
+            if num == 2:
+                count += 1
+            else:
+                break
+        return lst[:-count] if count > 0 else lst
+
+    df['infer_tok_ref_output'] = df['infer_tok_ref_output'].apply(remove_trailing_twos)
+    df['trim_lengths'] = df['infer_tok_ref_output'].apply(len)
+    df['tok_ref_output'] = df['tok_ref_output'].apply(remove_trailing_twos)
+    df['tok_ref_output_len'] = df['tok_ref_output'].apply(len)
+    return df
+
+def mbxp_stop(df):
+    stop_tokens = [13, 13940, 28832, 13]
+    def modify_list(lst):
+        for i in range(len(lst) - len(stop_tokens) + 1):
+            if lst[i:i+len(stop_tokens)] == stop_tokens:
+                return lst[:i+len(stop_tokens)]
+        return lst
+
+    df.loc[df['dataset'] == 'MBXP', 'infer_tok_ref_output'] = df[df['dataset'] == 'MBXP']['infer_tok_ref_output'].apply(modify_list)
+    df['trim_lengths'] = df['infer_tok_ref_output'].apply(len)
+    return df
+
+
+def fix_name(df):
+    df.drop(columns=['ref_output'], inplace=True)
+    df.drop(columns=['tok_ref_output'], inplace=True)
+    df.drop(columns=['tok_ref_output_len'], inplace=True)
+    df.drop(columns=['infer_tok_ref_output_length'], inplace=True)
+    df.drop(columns=['infer_tok_input'], inplace=True)
+    df.drop(columns=['infer_tok_input_length'], inplace=True)
+    df.rename(columns={'infer_ref_output': 'ref_output'}, inplace=True)
+    df.rename(columns={'infer_tok_ref_output': 'tok_ref_output'}, inplace=True)
+    df.rename(columns={'trim_lengths': 'tok_ref_output_len'}, inplace=True)
+
+    return df
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_pkl", type=str, default="09292024_mixtral_15k_mintoken2_v1.pkl",
+                        help="The path to the input pkl file")
+    parser.add_argument("--output_pkl", type=str, default="mixtral_8x7b_15000_greedy_reference_fp16_mintoken2.pkl",
+                        help="The path to the output pickle.")
+    parser.add_argument("--checkpoint_path", type=str, default="/raid/data/mlperf-llm/Mixtral-8x7B-Instruct-v0.1",
+                        help="The path to the mixtral checkpoint")
+    parser.add_argument("--batch_size", type=int, default=64,
+                        help="Batch size of the refernece inference")
+    args = parser.parse_args()
+
+    df = pd.read_pickle(args.input_pkl)
+    df = run_infer(df, args.checkpoint_path, args.batch_size)
+
+    df = trim_twos(df)
+    df = mbxp_stop(df)
+    df = fix_name(df)
+
+    df.to_pickle(args.output_pkl)
+
+
diff --git a/language/mixtral-8x7b/standalone_infer/run_accuracy.py b/language/mixtral-8x7b/standalone_infer/run_accuracy.py