macos12 full build (x86) (#125)

mikekgfb · malfet · commit 395317586145 · 2024-07-16T23:03:10.000-07:00
* macos-12 full build (x86): add the macos-12 runner to the compile workflow matrix

* Add support for setting the model precision via --dtype (export.py and generate.py set it; KVCache defaults to it)
diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
@@ -11,7 +11,7 @@ jobs:
   run-tinystories:
     strategy:
       matrix:
-        runner: [ubuntu-latest, macos-14]
+        runner: [ubuntu-latest, macos-14, macos-12]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout repo
diff --git a/export.py b/export.py
@@ -12,6 +12,8 @@
 import torch.nn as nn
 from torch.export import Dim, export
 
+from quantize import quantize_model, name_to_dtype, set_precision, get_precision
+
 try:
     executorch_export_available = True
     from export_et import export_model as export_model_et
@@ -62,8 +64,9 @@ def main(checkpoint_path, device, quantize = "{ }", args = None):
     assert checkpoint_path.is_file(), checkpoint_path
 
     print(f"Using device={device}")
-    precision = torch.float  # bfloat16
-
+    precision = name_to_dtype(args.dtype)  # torch.float  # bfloat16
+    set_precision(precision)
+    
     print("Loading model ...")
     t0 = time.time()
     model = _load_model(
diff --git a/generate.py b/generate.py
@@ -13,7 +13,7 @@
 import torch._dynamo.config
 import torch._inductor.config
 
-from quantize import quantize_model, name_to_dtype
+from quantize import quantize_model, name_to_dtype, set_precision, get_precision
 
 
 def device_sync(device):
@@ -344,7 +344,8 @@ def main(
     #            print = lambda *args, **kwargs: None
 
     print(f"Using device={device}")
-    precision = torch.float  # bfloat16
+    precision = name_to_dtype(model_dtype)
+    set_precision(precision)
     is_speculative = draft_checkpoint_path is not None
     is_chat = "chat" in str(checkpoint_path)
 
diff --git a/model.py b/model.py
@@ -11,6 +11,7 @@
 from torch import Tensor
 from torch.nn import functional as F
 
+from quantize import get_precision
 
 def find_multiple(n: int, k: int) -> int:
     if n % k == 0:
@@ -99,8 +100,11 @@ def from_name(cls, name: str):
 
 class KVCache(nn.Module):
     def __init__(
-        self, max_batch_size, max_seq_length, n_heads, head_dim, dtype=torch.float): # bfloat16    ):
+        self, max_batch_size, max_seq_length, n_heads, head_dim, dtype=None):
+        # torch.float): # bfloat16    ):
         super().__init__()
+        if not dtype:
+            dtype=get_precision()
         cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim)
         self.register_buffer("k_cache", torch.zeros(cache_shape, dtype=dtype))
         self.register_buffer("v_cache", torch.zeros(cache_shape, dtype=dtype))
diff --git a/parking_lot/macos12.yml b/parking_lot/macos12.yml
diff --git a/quantize.py b/quantize.py
@@ -23,6 +23,16 @@
 ##########################################################################
 ###               dtype name to torch.dtype mapping                    ###
 
+precision = torch.float
+
+def set_precision(dtype):
+    global precision
+    precision = dtype
+
+def get_precision():
+    global precision
+    return precision
+
 def name_to_dtype(name):
     if name in name_to_dtype_dict:
         return name_to_dtype_dict[name]
@@ -33,6 +43,11 @@ def name_to_dtype(name):
     "fp32" : torch.float,
     "fp16" : torch.float16,
     "bf16" : torch.bfloat16,
+    "float" : torch.float,
+    "half" : torch.float16,
+    "float32" : torch.float,
+    "float16" : torch.float16,
+    "bfloat16" : torch.bfloat16,
 }
 
 ##########################################################################