Replicate the precision dtype from the quantization dictionary to args.dtype (#494)

mikekgfb · malfet · commit ad98ae2d2a73 · 2024-07-17T09:55:44.000-07:00
diff --git a/build/builder.py b/build/builder.py
@@ -357,9 +357,9 @@ def _initialize_model(
         _set_gguf_kwargs(builder_args, is_et=is_pte, context="generate")
 
     if builder_args.dso_path:
-        assert (
-            quantize is None or quantize == "{ }"
-        ), "quantize not valid for exported DSO model. Specify quantization during export."
+        # assert (
+        #     quantize is None or quantize == "{ }"
+        # ), "quantize not valid for exported DSO model. Specify quantization during export."
 
         t0 = time.time()
         model = _load_model(builder_args, only_config=True)
@@ -379,9 +379,9 @@ def _initialize_model(
         except:
             raise RuntimeError(f"Failed to load AOTI compiled {builder_args.dso_path}")
     elif builder_args.pte_path:
-        assert (
-            quantize is None or quantize == "{ }"
-        ), "quantize not valid for exported PTE model. Specify quantization during export."
+        # assert (
+        #     quantize is None or quantize == "{ }"
+        # ), "quantize not valid for exported PTE model. Specify quantization during export."
 
         t0 = time.time()
         model = _load_model(builder_args, only_config=True)
diff --git a/cli.py b/cli.py
@@ -295,10 +295,16 @@ def _add_arguments_common(parser):
 
 
 def arg_init(args):
-    if hasattr(args, 'quantize') and Path(args.quantize).is_file():
+    if hasattr(args, "quantize") and Path(args.quantize).is_file():
         with open(args.quantize, "r") as f:
             args.quantize = json.loads(f.read())
 
-    if hasattr(args, 'seed') and args.seed:
+    if isinstance(args.quantize, str):
+        args.quantize = json.loads(args.quantize)
+
+    # if we specify dtype in quantization recipe, replicate it as args.dtype
+    args.dtype = args.quantize.get("precision", {}).get("dtype", args.dtype)
+
+    if hasattr(args, "seed") and args.seed:
         torch.manual_seed(args.seed)
     return args
diff --git a/eval.py b/eval.py
@@ -158,7 +158,7 @@ def _model_call(self, inps):
         x = seq.index_select(0, input_pos).view(1, -1)
         start = time.time()
         logits = model_forward(self._model, x, input_pos)
-        self.times.append(time.time()-start)
+        self.times.append(time.time() - start)
         return logits
 
     def _model_generate(self, context, max_length, eos_token_id):
@@ -266,9 +266,13 @@ def main(args) -> None:
         device=builder_args.device,
     )
     print(f"Time to run eval: {time.time() - t1:.02f}s.")
-    times=torch.tensor(result["times"])
-    print(f"Time in model.forward: {times.sum():.02f}s, over {times.numel()} model evaluations")
-    print(f"forward run time stats - Median: {times.median():.02f}s Min: {times.min():.02f}s Max: {times.max():.02f}s")
+    times = torch.tensor(result["times"])
+    print(
+        f"Time in model.forward: {times.sum():.02f}s, over {times.numel()} model evaluations"
+    )
+    print(
+        f"forward run time stats - Median: {times.median():.02f}s Min: {times.min():.02f}s Max: {times.max():.02f}s"
+    )
     if builder_args.dso_path:
         print(f"For model {builder_args.dso_path}")
     elif builder_args.pte_path: