Skip to content

Commit 2b46b72

Browse files
mikekgfb
authored and malfet committed
user linear_int8 (#135)
1 parent b20db51 commit 2b46b72

File tree

1 file changed

+7
-13
lines changed

1 file changed

+7
-13
lines changed

quantize.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import torch
1313
import torch.nn as nn
1414
import torch.nn.functional as F
15+
import quantized_ops
1516

1617

1718
try:
@@ -471,19 +472,12 @@ def __init__(
471472
self.register_buffer("scales", torch.ones(out_features, groups, dtype=torch.bfloat16))
472473

473474
def forward(self, input: torch.Tensor) -> torch.Tensor:
474-
scales = self.scales
475-
weight = self.weight
476-
scales = scales.view(scales.shape[0], -1)
477-
no_groups = scales.shape[1]
478-
479-
# need a formulation / custom op for good performance on both eager, CUDA compiled, CPU compiled and ET exported
480-
# maybe use IR-based rewriting?
481-
482-
# for now, we special-case channel-wise, because we know how to make that fast (but does not work for groupwise)
483-
if scales.shape[1] == 1:
484-
return F.linear(input, weight.to(dtype=input.dtype)) * self.scales
485-
else:
486-
return F.linear(input, (weight.to(dtype=input.dtype).view(weight.shape[0], no_groups, -1) * scales.view(weight.shape[0], no_groups, -1)).view(weight.shape[0], -1))
475+
return torch.ops.torchat.linear_int8(
476+
input,
477+
self.weight,
478+
self.scales,
479+
None
480+
)
487481

488482

489483
#########################################################################

0 commit comments

Comments
 (0)