do weight transform on cpu (#508)

mikekgfb · malfet · commit e437feadf0a8 · 2024-07-16T22:49:53.000-07:00
diff --git a/build/utils.py b/build/utils.py
@@ -9,7 +9,7 @@
 import logging
 import os
 from pathlib import Path
-from typing import List
+from typing import List, Dict
 
 import torch
 
@@ -133,10 +133,20 @@ def device_sync(device="cpu"):
 
 
 #########################################################################
-###                   general utilkity functions                      ###
+###                    general utility functions                      ###
 
 
 # in fbcode, we can intercept certain local paths that
 # should be interpreted as part of an XAR package
 def canonical_path(path):
     return path
+
+
+#########################################################################
+###                   state dict utility functions                    ###
+
+def state_dict_device(d, device="cpu") -> Dict:
+    """Move each entry of ``d`` to ``device`` in place; return the same ``d``."""
+    for name in d:  # only existing keys are rebound, so this is safe while iterating
+        d[name] = d[name].to(device=device)
+    return d
diff --git a/quantize.py b/quantize.py
@@ -15,7 +15,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from build.utils import find_multiple, get_precision, name_to_dtype, use_et_backend
+from build.utils import find_multiple, get_precision, name_to_dtype, use_et_backend, state_dict_device
 
 
 #########################################################################
@@ -63,7 +63,7 @@ def convert_for_runtime(self) -> nn.Module:
         pass
 
     def quantized_model(self) -> nn.Module:
-        model_updated_state_dict = self.create_quantized_state_dict()
+        model_updated_state_dict = state_dict_device(self.create_quantized_state_dict())
         self.convert_for_runtime()
         self.model_.load_state_dict(model_updated_state_dict)
         return self.model_
@@ -406,8 +406,9 @@ def __init__(
 
     @torch.no_grad()
     def create_quantized_state_dict(self) -> Dict:
-        cur_state_dict = self.model_.state_dict()
-
+        cur_state_dict = state_dict_device(self.model_.state_dict())
+        dict_device = "cpu" # self.device
+                                                     
         if self.bitwidth == 4:
             range_min = -8
             range_max = 7
@@ -446,8 +447,8 @@ def create_quantized_state_dict(self) -> Dict:
                         scales_dtype=mod.weight.dtype,
                     )
 
-                    weight = weight.to(device=self.device)
-                    scales = scales.to(device=self.device)
+                    weight = weight.to(device=dict_device)
+                    scales = scales.to(device=dict_device)
                     cur_state_dict[f"{fqn}.weight"] = weight
                     # squeeze makes groupsize=rowsize unidimensional
                     cur_state_dict[f"{fqn}.scales"] = scales.squeeze(dim=-1)
@@ -553,7 +554,8 @@ def __init__(
 
     @torch.no_grad()
     def create_quantized_state_dict(self) -> Dict:
-        cur_state_dict = self.model_.state_dict()
+        cur_state_dict = state_dict_device(self.model_.state_dict())
+        dict_device = "cpu"  # self.device
 
         if self.bitwidth == 4:
             range_min = -8
@@ -595,8 +597,8 @@ def create_quantized_state_dict(self) -> Dict:
                     weight_packed = weight_even + weight_odd
                     weight = weight_packed
 
-                weight = weight.to(device=self.device)
-                scales = scales.to(device=self.device)
+                weight = weight.to(device=dict_device)
+                scales = scales.to(device=dict_device)
                 # Update state dict
                 cur_state_dict[f"{fqn}.weight"] = weight
                 # squeeze makes groupsize=rowsize unidimensional
@@ -822,9 +824,21 @@ def __init__(
         assert groupsize in [32, 64, 128, 256]
         assert inner_k_tiles in [2, 4, 8]
 
+        
+    # Disabled debug helper (see `# self.p()` in quantized_model): prints weight devices.
+    # @torch.no_grad()
+    # def p(self):
+    #     cur_state_dict = state_dict_device(self.model_.state_dict())
+    #     dict_device = "cpu"  # self.device
+    #     for fqn, mod in self.model_.named_modules():
+    #         if hasattr(mod, "weight"):
+    #             print(f"device={str(mod.weight.data.device)}")
+
     @torch.no_grad()
     def create_quantized_state_dict(self):
-        cur_state_dict = self.model_.state_dict()
+        cur_state_dict = state_dict_device(self.model_.state_dict())
+        dict_device = "cpu"  # self.device
+        
         for fqn, mod in self.model_.named_modules():
             if isinstance(mod, torch.nn.Linear):
                 assert not mod.bias
@@ -856,8 +870,8 @@ def create_quantized_state_dict(self):
                         weight.to(torch.float), self.groupsize, self.inner_k_tiles
                     )
                 )
-                weight_int4pack = weight_int4pack.to(device=self.device)
-                scales_and_zeros = scales_and_zeros.to(device=self.device)
+                weight_int4pack = weight_int4pack.to(device=dict_device)
+                scales_and_zeros = scales_and_zeros.to(device=dict_device)
                 cur_state_dict[f"{fqn}.weight"] = weight_int4pack
                 cur_state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros
 
@@ -877,6 +891,7 @@ def quantized_model(self) -> nn.Module:
         model_updated_state_dict = self.create_quantized_state_dict()
         self.convert_for_runtime()
         self.model_.load_state_dict(model_updated_state_dict)
+        # self.p()
         return self.model_