Commit 7cae773

convert-*.py: refactor to per_model_weight_count_estimation()

1 parent: 0d0a5fe

3 files changed: +58, -49 lines

convert-hf-to-gguf.py
Lines changed: 32 additions & 24 deletions

@@ -121,9 +121,12 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
         self.metadata = metadata
+
+        model_tensors = self.get_tensors()
+
         if self.ftype == gguf.LlamaFileType.GUESSED:
             # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
-            _, first_tensor = next(self.get_tensors())
+            _, first_tensor = next(model_tensors)
             if first_tensor.dtype == torch.float16:
                 logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
                 self.ftype = gguf.LlamaFileType.MOSTLY_F16
@@ -160,8 +163,35 @@ def get_model_name(metadata, huggingface_parameters, dir_model, model_arch):
         # Get Expert Count From huggingface_parameters
         expert_count = self.hparams["num_local_experts"] if "num_local_experts" in self.hparams else None

+        def per_model_weight_count_estimation(tensors, expert_count):
+            # TODO: Ensure parameter count is accurate throughout various model type
+            # May currently overestimate parameter count in Mamba model because
+            # output weights is tied with token embeddings.
+            sum_weight_estimate = 0
+            for name, data_torch in tensors:
+                # Got A Tensor
+
+                # We don't need these
+                if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
+                    continue
+
+                # Calculate Tensor Volume
+                sum_weights_in_tensor = 1
+                for dim in data_torch.shape:
+                    sum_weights_in_tensor *= dim
+
+                # Add Tensor Volume To Running Count
+                sum_weight_estimate += sum_weights_in_tensor
+
+            # Calculate weight estimate per model
+            per_model_weight_estimate = (sum_weight_estimate / expert_count) if (expert_count > 0) else sum_weight_estimate
+
+            return per_model_weight_estimate
+
+        weight_estimate = per_model_weight_count_estimation(model_tensors, expert_count)
+
         # Generate default filename based on model specification and available metadata
-        self.fname_default = gguf.naming_convention(self.model_name, self.metadata.version, expert_count, self.parameter_count(), encodingScheme)
+        self.fname_default = gguf.naming_convention(self.model_name, self.metadata.version, expert_count, weight_estimate, encodingScheme)

         # Filename Output
         if fname_out is not None:
@@ -343,28 +373,6 @@ def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: i

         return False

-    def parameter_count(self):
-        # TODO: Ensure parameter count is accurate throughout various model type
-        # May currently overestimate parameter count in Mamba model because
-        # output weights is tied with token embeddings.
-        total_model_parameters = 0
-        for name, data_torch in self.get_tensors():
-            # Got A Tensor
-
-            # We don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
-                continue
-
-            # Calculate Tensor Volume
-            sum_weights_in_tensor = 1
-            for dim in data_torch.shape:
-                sum_weights_in_tensor *= dim
-
-            # Add Tensor Volume To Running Count
-            total_model_parameters += sum_weights_in_tensor
-
-        return total_model_parameters
-
     def write_tensors(self):
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

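As a sanity check on what the new nested helper computes, here is a minimal runnable sketch of the same estimation logic; it takes (name, shape) pairs instead of torch tensors, and the tensor names and shapes below are invented for illustration:

# Standalone sketch of the per_model_weight_count_estimation() logic;
# shapes are plain tuples so the example runs without torch installed.
def estimate_weight_count(tensors, expert_count):
    sum_weight_estimate = 0
    for name, shape in tensors:
        # Skip attention-bias/rotary buffers, as the converter does
        if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
            continue
        # Tensor volume is the product of its dimensions
        sum_weights_in_tensor = 1
        for dim in shape:
            sum_weights_in_tensor *= dim
        sum_weight_estimate += sum_weights_in_tensor
    # For MoE checkpoints, report per-expert weights; also guard None,
    # since expert_count defaults to None when hparams lacks num_local_experts
    if expert_count is not None and expert_count > 0:
        return sum_weight_estimate / expert_count
    return sum_weight_estimate

tensors = [
    ("token_embd.weight", (32000, 4096)),   # 131,072,000 weights
    ("blk.0.attn_q.weight", (4096, 4096)),  #  16,777,216 weights
    ("blk.0.attention.bias", (4096,)),      # skipped
]
print(estimate_weight_count(tensors, expert_count=None))  # 147849216

The extra None guard is an assumption of this sketch; the commit's helper compares expert_count > 0 directly.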
examples/convert-legacy-llama.py
Lines changed: 23 additions & 21 deletions

@@ -1020,18 +1020,28 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
     raise ValueError(f"Unexpected combination of types: {name_to_type}")


-def model_parameter_count(model: LazyModel) -> int:
+def per_model_weight_count_estimation(model: LazyModel, expert_count:int) -> int:
     # TODO: Ensure parameter count is accurate throughout various model type
-    total_model_parameters = 0
+    sum_weight_estimate = 0
     for name, lazy_tensor in model.items():
+        # We don't need these
+        if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
+            continue
+
         # Got A Tensor
         sum_weights_in_tensor = 1
+
         # Tensor Volume
         for dim in lazy_tensor.shape:
             sum_weights_in_tensor *= dim
+
         # Add Tensor Volume To Running Count
-        total_model_parameters += sum_weights_in_tensor
-    return total_model_parameters
+        sum_weight_estimate += sum_weights_in_tensor
+
+    # Calculate weight estimate per model
+    per_model_weight_estimate = (sum_weight_estimate / expert_count) if (expert_count > 0) else sum_weight_estimate
+
+    return per_model_weight_estimate


def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@@ -1213,18 +1223,10 @@ def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) ->
         return vocab, special_vocab


-def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
-
-    name = None
-    if metadata is not None and metadata.name is not None:
-        name = metadata.name
-    elif params.path_model is not None:
-        name = params.path_model.name
-
+def default_convention_outfile(file_type: GGMLFileType, model_name:str, expert_count:int, model_params_count: int, metadata: Metadata) -> str:
+    name = metadata.name if metadata is not None and metadata.name is not None else model_name
     version = metadata.version if metadata is not None and metadata.version is not None else None

-    expert_count = params.n_experts if params.n_experts is not None else None
-
     encodingScheme = {
         GGMLFileType.AllF32: "F32",
         GGMLFileType.MostlyF16: "F16",
@@ -1234,8 +1236,8 @@ def default_convention_outfile(file_type: GGMLFileType, params: Params, model_pa
     return gguf.naming_convention(name, version, expert_count, model_params_count, encodingScheme)


-def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
-    default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType, model_name:str, expert_count:int, model_params_count: int, metadata: Metadata) -> Path:
+    default_filename = default_convention_outfile(file_type, model_name, expert_count, model_params_count, metadata)
     ret = model_paths[0].parent / f"{default_filename}.gguf"
     if ret in model_paths:
         logger.error(
@@ -1293,9 +1295,9 @@ def main(args_in: list[str] | None = None) -> None:
         model_plus = load_some_model(args.model)
         params = Params.load(model_plus)
         model = convert_model_names(model_plus.model, params, args.skip_unknown)
-        model_params_count = model_parameter_count(model_plus.model)
+        model_params_count = per_model_weight_count_estimation(model_plus.model, params.n_experts)
         ftype = pick_output_type(model, args.outtype)
-        print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
+        print(f"{default_convention_outfile(ftype, params.path_model.name, params.n_experts, model_params_count, metadata)}") # noqa: NP100
         return

     if args.no_vocab and args.vocab_only:
@@ -1311,8 +1313,8 @@ def main(args_in: list[str] | None = None) -> None:
     else:
         model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)

-    model_params_count = model_parameter_count(model_plus.model)
-    logger.info(f"model parameters count : {model_params_count} ({gguf.model_parameter_count_rounded_notation(model_params_count)})")
+    model_params_count = per_model_weight_count_estimation(model_plus.model, params.n_experts)
+    logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count)})")

     if args.dump:
         do_dump_model(model_plus)
@@ -1380,7 +1382,7 @@ def main(args_in: list[str] | None = None) -> None:
     model = convert_model_names(model, params, args.skip_unknown)
     ftype = pick_output_type(model, args.outtype)
     model = convert_to_output_type(model, ftype)
-    outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.path_model.name, params.n_experts, model_params_count, metadata)

     params.ftype = ftype
     logger.info(f"Writing {outfile}, format {ftype}")

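To make the per-expert division concrete, a short illustrative calculation (the weight total and expert count are invented, not read from a real checkpoint):

# Illustrative only: how the expert_count argument changes the reported size.
total_weights = 46_700_000_000  # sum over all tensors of an 8-expert model
expert_count = 8

per_model = total_weights / expert_count if expert_count > 0 else total_weights
print(f"{per_model * 1e-9:.1f}B weights per expert")  # 5.8B weights per expert

The filename chunk then reads roughly "8x5.8B" instead of a misleading "46.7B", with the exact digits depending on how model_weight_count_rounded_notation rounds.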
gguf-py/gguf/utility.py
Lines changed: 3 additions & 4 deletions

@@ -1,7 +1,6 @@
 from __future__ import annotations

-
-def model_parameter_count_rounded_notation(model_params_count: int) -> str:
+def model_weight_count_rounded_notation(model_params_count: int) -> str:
     if model_params_count > 1e15 :
         # Quadrillion Of Parameters
         scaled_model_params = model_params_count * 1e-15
@@ -29,7 +28,7 @@ def naming_convention(model_name: str, version_string:str, expert_count_int:int,
     # Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention
     name = model_name.strip().replace(' ', '-') if model_name is not None else "ggml-model"
     version = f"-{version_string}" if version_string is not None else ""
-    expert_count_chunk = f"{expert_count_int}x" if expert_count_int is not None else ""
-    parameters = model_parameter_count_rounded_notation(model_params_count)
+    expert_count_chunk = f"{expert_count_int}x" if expert_count_int is not None and expert_count_int > 0 else ""
+    parameters = model_weight_count_rounded_notation(model_params_count)
     encodingScheme = encodingScheme.upper()
     return f"{name}{version}-{expert_count_chunk}{parameters}-{encodingScheme}"

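For the utility changes, a standalone sketch of the guarded expert-count chunk in naming_convention; the build_name wrapper is hypothetical, and only the two changed lines mirror the diff:

# Mini reimplementation of the name assembly, for illustration only.
def build_name(model_name, version_string, expert_count_int, parameters_str, encoding_scheme):
    name = model_name.strip().replace(' ', '-') if model_name is not None else "ggml-model"
    version = f"-{version_string}" if version_string is not None else ""
    # New guard: an expert count of 0 no longer produces a bogus "0x" chunk
    expert_count_chunk = f"{expert_count_int}x" if expert_count_int is not None and expert_count_int > 0 else ""
    return f"{name}{version}-{expert_count_chunk}{parameters_str}-{encoding_scheme.upper()}"

print(build_name("Mixtral", "v0.1", 8, "7B", "f16"))    # Mixtral-v0.1-8x7B-F16
print(build_name("TinyLlama", None, 0, "1.1B", "f16"))  # TinyLlama-1.1B-F16

Here parameters_str stands in for the output of model_weight_count_rounded_notation(), whose exact formatting is outside this diff.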