Commit e6a89cd

Merge branch 'main' into patch-15

2 parents cee3835 + edc2cfb commit e6a89cd

File tree

7 files changed: +108 −22 lines

.ci/scripts/run-docs

Lines changed: 20 additions & 0 deletions

@@ -91,3 +91,23 @@ if [ "$1" == "evaluation" ]; then
   echo "*******************************************"
   bash -x ./run-evaluation.sh
 fi
+
+if [ "$1" == "multimodal" ]; then
+
+  # Expecting that this test might fail as-is, because it is the first
+  # on-PR test that depends on GitHub secrets for HF token access
+
+  echo "::group::Create script to run multimodal"
+  python3 torchchat/utils/scripts/updown.py --file docs/multimodal.md > ./run-multimodal.sh
+  # For good measure: if something happened to the updown processor
+  # and it did not error out, fail with an exit 1
+  echo "exit 1" >> ./run-multimodal.sh
+  echo "::endgroup::"
+
+  echo "::group::Run multimodal"
+  echo "*******************************************"
+  cat ./run-multimodal.sh
+  echo "*******************************************"
+  bash -x ./run-multimodal.sh
+  echo "::endgroup::"
+fi
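The sentinel trick above is worth spelling out: updown.py turns docs/multimodal.md into a shell script, and the appended `exit 1` only executes if the generated script falls through without terminating on its own. Below is a minimal Python sketch of the same guard, assuming (as the CI script appears to) that a completely generated script ends with its own explicit exit; the helper name `build_doc_script` is invented for this illustration.

```python
import subprocess

def build_doc_script(markdown_path: str, script_path: str) -> None:
    # Extract the runnable commands from the markdown doc into a script.
    with open(script_path, "w") as out:
        subprocess.run(
            ["python3", "torchchat/utils/scripts/updown.py", "--file", markdown_path],
            stdout=out,
            check=True,
        )
    # Sentinel: reached only if the generated script did not end with its
    # own explicit exit, i.e. generation was truncated or incomplete.
    with open(script_path, "a") as out:
        out.write("exit 1\n")

if __name__ == "__main__":
    build_doc_script("docs/multimodal.md", "./run-multimodal.sh")
    subprocess.run(["bash", "-x", "./run-multimodal.sh"], check=True)
```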

.github/workflows/run-readme-pr.yml

Lines changed: 44 additions & 1 deletion

@@ -243,4 +243,47 @@ jobs:
       echo "::group::Completion"
       echo "tests complete"
       echo "*******************************************"
-      echo "::endgroup::"
+      echo "::endgroup::"
+
+  test-multimodal-any:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        .ci/scripts/run-docs multimodal
+
+        echo "::group::Completion"
+        echo "tests complete"
+        echo "*******************************************"
+        echo "::endgroup::"
+
+  test-multimodal-cpu:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs multimodal

docs/multimodal.md

Lines changed: 6 additions & 0 deletions

@@ -14,9 +14,11 @@ This page goes over the different commands you can run with LLama 3.2 11B Vision
 
 While we strongly encourage you to use the Hugging Face checkpoint (which is the default for torchchat when utilizing the commands with the argument `llama3.2-11B`), we also provide support for manually providing the checkpoint. This can be done by replacing the `llama3.2-11B` argument in the commands below with the following:
 
+[skip default]: begin
 ```
 --checkpoint-path <file.pth> --tokenizer-path <tokenizer.model> --params-path torchchat/model_params/Llama-3.2-11B-Vision.json
 ```
+[skip default]: end
 
 ## Generation
 This generates text output based on a text prompt and (optional) image prompt.
@@ -48,6 +50,7 @@ Setting `stream` to "true" in the request emits a response in chunks. If `stream
 
 **Example Input + Output**
 
+[skip default]: begin
 ```
 curl http://127.0.0.1:5000/v1/chat/completions \
 -H "Content-Type: application/json" \
@@ -75,6 +78,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \
 ```
 {"id": "chatcmpl-cb7b39af-a22e-4f71-94a8-17753fa0d00c", "choices": [{"message": {"role": "assistant", "content": "The image depicts a simple black and white cartoon-style drawing of an animal face. It features a profile view, complete with two ears, expressive eyes, and a partial snout. The animal looks to the left, with its eye and mouth implied, suggesting that the drawn face might belong to a rabbit, dog, or pig. The graphic face has a bold black outline and a smaller, solid black nose. A small circle, forming part of the face, has a white background with two black quirkly short and long curved lines forming an outline of what was likely a mouth, complete with two teeth. The presence of the curve lines give the impression that the animal is smiling or speaking. Grey and black shadows behind the right ear and mouth suggest that this face is looking left and upwards. Given the prominent outline of the head and the outline of the nose, it appears that the depicted face is most likely from the side profile of a pig, although the ears make it seem like a dog and the shape of the nose makes it seem like a rabbit. Overall, it seems that this image, possibly part of a character illustration, is conveying a playful or expressive mood through its design and positioning."}, "finish_reason": "stop"}], "created": 1727487574, "model": "llama3.2", "system_fingerprint": "cpu_torch.float16", "object": "chat.completion"}%
 ```
+[skip default]: end
 
 </details>
 
@@ -90,6 +94,8 @@ First, follow the steps in the Server section above to start a local server. The
 streamlit run torchchat/usages/browser.py
 ```
 
+[skip default]: end
+
 ---
 
 # Future Work
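The `[skip default]: begin` / `[skip default]: end` markers added here tell the updown doc-to-script processor to leave the enclosed blocks out of the default generated test script (otherwise the manual-checkpoint flags and the curl example would be executed in CI). As a rough illustration only, not the actual updown.py implementation, marker-based filtering can be as simple as:

```python
def strip_skipped(lines: list[str]) -> list[str]:
    # Drop every line between "[skip default]: begin" and "[skip default]: end".
    kept, skipping = [], False
    for line in lines:
        marker = line.strip()
        if marker == "[skip default]: begin":
            skipping = True
        elif marker == "[skip default]: end":
            skipping = False
        elif not skipping:
            kept.append(line)
    return kept

with open("docs/multimodal.md") as f:
    runnable_lines = strip_skipped(f.read().splitlines())
```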

torchchat/cli/builder.py

Lines changed: 2 additions & 2 deletions

@@ -74,7 +74,7 @@ def __post_init__(self):
             or (self.pte_path and Path(self.pte_path).is_file())
         ):
             raise RuntimeError(
-                "need to specify a valid checkpoint path, checkpoint dir, gguf path, DSO path, or PTE path"
+                "need to specify a valid checkpoint path, checkpoint dir, gguf path, DSO path, AOTI package path, or PTE path"
             )
 
         if self.aoti_package_path and self.pte_path:
@@ -91,7 +91,7 @@ def __post_init__(self):
         for param, param_msg in ignored_params:
             if param:
                 print(
-                    f"Warning: {param_msg} ignored because an exported DSO or PTE path was specified"
+                    f"Warning: {param_msg} ignored because an exported model was specified using a DSO, AOTI package, or PTE path argument"
                 )
             else:
                 self.prefill_possible = True
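For context, these messages come from a `__post_init__` validation that requires at least one usable model source and treats exported artifacts as alternatives to an eager checkpoint. A condensed sketch of that pattern, with field names assumed to mirror the real `BuilderArgs` dataclass rather than copied from it:

```python
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

@dataclass
class SourceArgs:
    # Condensed sketch of the validation; not the real BuilderArgs class.
    checkpoint_path: Optional[str] = None
    dso_path: Optional[str] = None
    aoti_package_path: Optional[str] = None
    pte_path: Optional[str] = None

    def __post_init__(self):
        sources = (self.checkpoint_path, self.dso_path,
                   self.aoti_package_path, self.pte_path)
        # Require at least one source that actually exists on disk.
        if not any(s and Path(s).is_file() for s in sources):
            raise RuntimeError(
                "need to specify a valid checkpoint path, DSO path, "
                "AOTI package path, or PTE path"
            )
        # Exported artifacts are alternatives, not combinable.
        if self.aoti_package_path and self.pte_path:
            raise RuntimeError("specify either an AOTI package or a PTE path, not both")
```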

torchchat/cli/convert_hf_checkpoint.py

Lines changed: 24 additions & 11 deletions

@@ -39,19 +39,14 @@ def convert_hf_checkpoint(
     config = TransformerArgs.from_params(config_args)
     print(f"Model config {config.__dict__}")
 
-    # Load the json file containing weight mapping
+    # Find all candidate weight mapping index files
     model_map_json_matches = [Path(m) for m in glob.glob(str(model_dir / "*.index.json"))]
-    assert len(model_map_json_matches) <= 1, "Found multiple weight mapping files"
-    if len(model_map_json_matches):
-        model_map_json = model_map_json_matches[0]
-    else:
-        model_map_json = model_dir / "pytorch_model.bin.index.json"
 
     # If there is no weight mapping, check for a consolidated model and
     # tokenizer we can move. Llama 2 and Mistral have weight mappings, while
     # Llama 3 has a consolidated model and tokenizer.
     # Otherwise raise an error.
-    if not model_map_json.is_file():
+    if not model_map_json_matches:
         consolidated_pth = model_dir / "original" / "consolidated.00.pth"
         tokenizer_pth = model_dir / "original" / "tokenizer.model"
         if consolidated_pth.is_file() and tokenizer_pth.is_file():
@@ -68,11 +63,30 @@ def convert_hf_checkpoint(
             return
         else:
             raise RuntimeError(
-                f"Could not find {model_map_json} or {consolidated_pth} plus {tokenizer_pth}"
+                f"Could not find a valid model weight map or {consolidated_pth} plus {tokenizer_pth}"
             )
 
-    with open(model_map_json) as json_map:
-        bin_index = json.load(json_map)
+    # Load the json file(s) containing the weight mapping
+    #
+    # NOTE: If there are multiple index files, there are two possibilities:
+    #   1. The files could map to different weight format files (e.g. .bin
+    #      vs .safetensors)
+    #   2. The files could be split subsets of the mappings that need to be
+    #      merged
+    #
+    # In either case, we can simply keep the mappings whose target file is
+    # present in the model dir.
+    bin_index = {}
+    for weight_map_file in model_map_json_matches:
+        with open(weight_map_file, "r") as handle:
+            weight_map = json.load(handle)
+        valid_mappings = {
+            k: model_dir / v
+            for (k, v) in weight_map.get("weight_map", {}).items()
+            if (model_dir / v).is_file()
+        }
+        bin_index.update(valid_mappings)
+    bin_files = set(bin_index.values())
 
     weight_map = {
         "model.embed_tokens.weight": "tok_embeddings.weight",
@@ -96,7 +110,6 @@ def convert_hf_checkpoint(
         "model.norm.weight": "norm.weight",
         "lm_head.weight": "output.weight",
     }
-    bin_files = {model_dir / bin for bin in bin_index["weight_map"].values()}
 
     def permute(w, n_heads):
         return (
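To make the new merge behavior concrete, here is a small self-contained sketch (file and shard names invented) of how several `*.index.json` weight maps are unioned while keeping only entries whose shard actually exists on disk, which is how a `.bin` index and a `.safetensors` index can coexist without tripping the old single-file assertion:

```python
import json
from pathlib import Path

def merge_weight_maps(model_dir: Path, index_files: list[Path]) -> dict:
    """Union of all index-file weight maps, restricted to shards on disk."""
    merged = {}
    for index_file in index_files:
        weight_map = json.loads(index_file.read_text()).get("weight_map", {})
        merged.update({
            key: model_dir / shard
            for key, shard in weight_map.items()
            if (model_dir / shard).is_file()
        })
    return merged

# E.g. with both a model.safetensors.index.json and a
# pytorch_model.bin.index.json present, each tensor key resolves to
# whichever referenced shard file actually exists under model_dir.
```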

torchchat/cli/download.py

Lines changed: 5 additions & 4 deletions

@@ -35,22 +35,23 @@ def _download_hf_snapshot(
     model_info = model_info(model_config.distribution_path, token=hf_token)
     model_fnames = [f.rfilename for f in model_info.siblings]
 
-    # Check the model config for preference between safetensors and pth
+    # Check the model config for preference between safetensors and pth/bin
     has_pth = any(f.endswith(".pth") for f in model_fnames)
+    has_bin = any(f.endswith(".bin") for f in model_fnames)
     has_safetensors = any(f.endswith(".safetensors") for f in model_fnames)
 
-    # If told to prefer safetensors, ignore pth files
+    # If told to prefer safetensors, ignore pth/bin files
     if model_config.prefer_safetensors:
        if not has_safetensors:
            print(
                f"Model {model_config.name} does not have safetensors files, but prefer_safetensors is set to True. Using pth files instead.",
                file=sys.stderr,
            )
            exit(1)
-        ignore_patterns = "*.pth"
+        ignore_patterns = ["*.pth", "*.bin"]
 
     # If the model has both, prefer pth files over safetensors
-    elif has_pth and has_safetensors:
+    elif (has_pth or has_bin) and has_safetensors:
         ignore_patterns = "*safetensors*"
 
     # Otherwise, download everything
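Downstream, the computed `ignore_patterns` is handed to Hugging Face Hub's snapshot downloader, which accepts either a single glob or a list of globs, so the switch from `"*.pth"` to `["*.pth", "*.bin"]` is API-compatible. A minimal standalone sketch, with a placeholder repo id not taken from the commit:

```python
from huggingface_hub import snapshot_download

# "org/model" is a placeholder repo id for illustration.
local_dir = snapshot_download(
    "org/model",
    ignore_patterns=["*.pth", "*.bin"],  # keep safetensors, skip eager-format weights
)
print(local_dir)
```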

torchchat/generate.py

Lines changed: 7 additions & 4 deletions

@@ -1149,9 +1149,11 @@ def callback(x, *, done_generating=False):
             print(
                 f"just-in-time compilation time (incl run time): {compilation_time:.2} seconds"
             )
-        aggregate_metrics["tokens_per_sec"].append(tokens_sec)
-        aggregate_metrics["first_token_per_sec"].append(first_token_sec)
-        aggregate_metrics["next_tokens_per_sec"].append(next_tokens_sec)
+        else:
+            # Skip metrics for the jit-compile iteration so it does not skew the averages.
+            aggregate_metrics["tokens_per_sec"].append(tokens_sec)
+            aggregate_metrics["first_token_per_sec"].append(first_token_sec)
+            aggregate_metrics["next_tokens_per_sec"].append(next_tokens_sec)
 
         logging.info(
             f"\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\
@@ -1205,7 +1207,8 @@ def callback(x, *, done_generating=False):
             or torch.isnan(torch.tensor(avg_next_tokens_sec))
         ):
             print(
-                f"\n Average tokens/sec (total): {avg_tokens_sec:.2f} \
+                f"\nWarning: Excluding compile in calculations \
+                \n Average tokens/sec (total): {avg_tokens_sec:.2f} \
                 \nAverage tokens/sec (first token): {avg_first_token_sec:.2f} \
                 \nAverage tokens/sec (next tokens): {avg_next_tokens_sec:.2f} \n\
                 "
