Update on "[ET-VK][int4] Wrap int4 linear calls with view_copy nodes to squeeze/unsqueeze inputs"

Nathanael See · Nathanael See · commit fe111eb64817 · 2025-02-05T14:04:13.000-08:00
Squeezing/unsqueezing of inputs is done automatically for full-precision linear/mm nodes at torch.export graph-tracing time, but it is not done for the int4 linear op. The new pass wraps int4 linear calls with view_copy nodes, because subsequent passes can fuse redundant view_copy nodes and convert view_copy nodes to squeeze/unsqueeze nodes. Differential Revision: [D69065866](https://our.internmc.facebook.com/intern/diff/D69065866/) [ghstack-poisoned]
diff --git a/backends/vulkan/_passes/squeeze_int4_linear_inputs.py b/backends/vulkan/_passes/squeeze_int4_linear_inputs.py
@@ -34,7 +34,7 @@ def _squeezable(shape: List[int]) -> bool:
         output_shape = meta["val"].shape
         if not _squeezable(input_shape):
             return super().call_operator(op, args, kwargs, meta)
-        
+
         # squeeze input tensor
         squeeze_shape = list(input_shape)
         while _squeezable(squeeze_shape):
@@ -43,23 +43,23 @@ def _squeezable(shape: List[int]) -> bool:
         squeeze_out = super().call_operator(
             exir_ops.edge.aten.view_copy.default,
             (args[0], squeeze_shape),
-            kwargs, 
+            kwargs,
             meta,
         )
         # call linear on squeezed output
         new_args = (squeeze_out, *args[1:])
         linear_out = super().call_operator(
             op,
-            new_args, 
-            kwargs, 
+            new_args,
+            kwargs,
             meta,
         )
         # unsqueeze output
         unsqueeze_shape = list(output_shape)
         return super().call_operator(
             exir_ops.edge.aten.view_copy.default,
             (linear_out, unsqueeze_shape),
-            kwargs, 
+            kwargs,
             meta,
         )
     
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
@@ -352,7 +352,8 @@ void add_q_4w_linear_node(
       local_wg_size,
       // Inputs and Outputs
       {{out_W_packed, vkapi::MemoryAccessType::WRITE},
-       {{mat1_W_packed, mat2, scales_and_zeros}, vkapi::MemoryAccessType::READ}},
+       {{mat1_W_packed, mat2, scales_and_zeros}, 
+       vkapi::MemoryAccessType::READ}},
       // Shader params buffers
       ubos,
       // Specialization Constants