Add quantized linear layer lowering (#549)

Jerry-Ge · facebook-github-bot · commit 99824b1f7000 · 2023-10-02T23:51:43.000-07:00
Summary: Edge->TOSA lowering for quantized linear layer. Pull Request resolved: #549 Reviewed By: larryliu0820, cccclai Differential Revision: D49855197 Pulled By: digantdesai fbshipit-source-id: cfe9278b1e3ad18dd7ca02c097f86ee1e61b60a0
diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
@@ -217,6 +217,12 @@ def getNodeArgs(node):
     return [tosa_mapping.TosaArg(arg) for arg in node.args]
 
 
+def getQuantNodeArgs(node):
+    quant_args = [tosa_mapping.TosaArg(arg) for arg in node.args]
+    # Return the scale and zero point (args 1 and 2 of the quant node)
+    return quant_args[1].number, quant_args[2].number
+
+
 @final
 class ArmBackend(BackendDetails):
     @staticmethod
@@ -253,6 +259,7 @@ def preprocess(  # noqa: C901
                 outp = tosa_mapping.TosaArg(node)
 
                 is_quant_node = tosa_quant_utils.isQuantNode(node)
+
                 if is_quant_node:
                     tosa_fb.currRegion.currBasicBlock.addTensor(
                         outp.name, outp.shape, ts.DType.INT8
@@ -345,13 +352,17 @@ def preprocess(  # noqa: C901
                 elif exir_ops.edge.aten.addmm.default == node.target:
                     bias, input, weight = inputs
 
+                    output_dtype = ts.DType.INT8 if is_quant_node else outp.dtype
+
                     # Reshape input, weight, bias tensors
                     input_reshape_res = promote_shape(
-                        tosa_fb, input, (1,) + input.shape, outp.dtype
+                        tosa_fb, input, (1,) + input.shape, output_dtype
                     )
                     weight_reshape_res = promote_shape(
-                        tosa_fb, weight, (1,) + weight.shape, outp.dtype
+                        tosa_fb, weight, (1,) + weight.shape, output_dtype
                     )
+
+                    bias_dtype = ts.DType.INT32 if is_quant_node else outp.dtype
                     bias_reshape_res = promote_shape(
                         tosa_fb,
                         bias,
@@ -360,36 +371,87 @@ def preprocess(  # noqa: C901
                             1,
                         )
                         + bias.shape,
-                        outp.dtype,
+                        bias_dtype,
                     )
 
                     # Add dummy batch 1 to mm_shape
                     mm_shape = (1, input.shape[0], weight.shape[1])
                     # Define Intermediate tensor for MatMul res
-                    mm_res = tosa_fb.addIntermediate(mm_shape, outp.dtype)
+                    mm_res = tosa_fb.addIntermediate(
+                        mm_shape, ts.DType.INT32 if is_quant_node else output_dtype
+                    )
 
                     # Add MatMulOp
+                    attr_matmul = ts.TosaSerializerAttribute()
+                    a_zp, b_zp = (-128, 0) if is_quant_node else (0, 0)
+                    attr_matmul.MatMulAttribute(a_zp, b_zp)
                     tosa_fb.addOperator(
                         TosaOp.Op().MATMUL,
                         [input_reshape_res.name, weight_reshape_res.name],
                         [mm_res.name],
-                        attr_torch_to_tosa(TosaOp.Op().MATMUL, node),
+                        attr_matmul,
                     )
 
                     # Add AddOp
-                    add_res = tosa_fb.addIntermediate(mm_shape, outp.dtype)
+                    add_res = tosa_fb.addIntermediate(
+                        mm_shape, ts.DType.INT32 if is_quant_node else output_dtype
+                    )
+
                     tosa_fb.addOperator(
                         TosaOp.Op().ADD,
                         [bias_reshape_res.name, mm_res.name],
                         [add_res.name],
                         None,
                     )
 
+                    if is_quant_node:
+                        # Read the scales from the input node and from the node
+                        # feeding the weight node, plus the consumer's scale/zp.
+                        _, input_node, weight_node = node.all_input_nodes
+                        input_scale, _ = getQuantNodeArgs(input_node)
+                        weight_node_q_node = weight_node.all_input_nodes[0]
+                        weight_scale, _ = getQuantNodeArgs(weight_node_q_node)
+
+                        consumer_node = list(node.users)[0]
+                        consumer_node_scale, consumer_node_node_zp = getQuantNodeArgs(
+                            consumer_node
+                        )
+
+                        output_rescale_scale = (
+                            input_scale * weight_scale
+                        ) / consumer_node_scale
+                        (
+                            multiplier_output,
+                            shift_output,
+                        ) = tosa_quant_utils.computeMultiplierAndShift(
+                            output_rescale_scale
+                        )
+
+                        attr_rescale_output = ts.TosaSerializerAttribute()
+                        attr_rescale_output.RescaleAttribute(
+                            input_zp=0,
+                            output_zp=consumer_node_node_zp,
+                            multiplier=[multiplier_output],
+                            shift=[shift_output],
+                            scale32=True,
+                            double_round=True,
+                            per_channel=False,
+                        )
+                        add_res_int8 = tosa_fb.addIntermediate(mm_shape, ts.DType.INT8)
+                        tosa_fb.addOperator(
+                            TosaOp.Op().RESCALE,
+                            [add_res.name],
+                            [add_res_int8.name],
+                            attr_rescale_output,
+                        )
                     # Reshape final result to original shape
                     attr_out = ts.TosaSerializerAttribute()
                     attr_out.ReshapeAttribute(outp.shape)
                     tosa_fb.addOperator(
-                        TosaOp.Op().RESHAPE, [add_res.name], [outp.name], attr_out
+                        TosaOp.Op().RESHAPE,
+                        [add_res_int8.name if is_quant_node else add_res.name],
+                        [outp.name],
+                        attr_out,
                     )
                 elif exir_ops.edge.aten.permute_copy.default == node.target:
                     attr = ts.TosaSerializerAttribute()
@@ -700,20 +762,11 @@ def preprocess(  # noqa: C901
                         [outp.name],
                         attr_mul,
                     )
-                elif operator.getitem == node.target:
-                    item_name = inputs[0].name
-                    ## Simply add an identityOp
-                    tosa_fb.addOperator(TosaOp.Op().IDENTITY, [item_name], [outp.name])
-                elif (
-                    exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
-                    == node.target
-                ):
-                    item_name = inputs[0].name
-                    tosa_fb.addOperator(TosaOp.Op().IDENTITY, [item_name], [outp.name])
-                elif (
-                    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
-                    == node.target
-                ):
+                elif node.target in [
+                    operator.getitem,
+                    tosa_quant_utils.q_op,
+                    tosa_quant_utils.dq_op,
+                ]:
                     item_name = inputs[0].name
                     ## Simply add an identityOp
                     tosa_fb.addOperator(TosaOp.Op().IDENTITY, [item_name], [outp.name])
@@ -740,9 +793,54 @@ def preprocess(  # noqa: C901
 
                     assert isinstance(p_data, torch.Tensor), "Expect Attr to be tensor"
                     weight_values = p_data.detach().numpy()
-                    tosa_fb.addConst(
-                        inputs[0].shape, inputs[0].dtype, weight_values, name=out
-                    )
+
+                    # Check whether this parameter feeds quantized nodes
+                    consumer_node = list(node.users)[0]
+                    if consumer_node.target in tosa_quant_utils.dq_q_ops:
+                        _, weight_node_scale, weight_node_zp, _, _, _ = getNodeArgs(
+                            consumer_node
+                        )
+
+                        weight_values_quantized = (
+                            (weight_values / weight_node_scale.number)
+                            + weight_node_zp.number
+                        ).astype(np.int8)
+                        tosa_fb.addConst(
+                            inputs[0].shape,
+                            ts.DType.INT8,
+                            weight_values_quantized,
+                            name=out,
+                        )
+                    elif (
+                        consumer_node.target == exir_ops.edge.aten.addmm.default
+                        and list(consumer_node.users)[0].target == tosa_quant_utils.q_op
+                    ):
+                        (
+                            _,
+                            input_node,
+                            weight_node_permuted,
+                        ) = consumer_node.all_input_nodes
+                        weight_node = weight_node_permuted.all_input_nodes[0]
+
+                        input_node_scale, _ = getQuantNodeArgs(input_node)
+                        weight_node_scale, weight_node_zp = getQuantNodeArgs(
+                            weight_node
+                        )
+
+                        weight_values_quantized = (
+                            weight_values / (input_node_scale * weight_node_scale)
+                        ).astype(np.int32)
+
+                        tosa_fb.addConst(
+                            inputs[0].shape,
+                            ts.DType.INT32,
+                            weight_values_quantized,
+                            name=out,
+                        )
+                    else:
+                        tosa_fb.addConst(
+                            inputs[0].shape, inputs[0].dtype, weight_values, name=out
+                        )
                 elif out in edge_program.graph_signature.inputs_to_buffers:
                     parameter_name = edge_program.graph_signature.inputs_to_buffers[
                         node.name
diff --git a/backends/arm/test/test_models.py b/backends/arm/test/test_models.py
@@ -69,19 +69,18 @@ def forward(self, x, y):
     @register_test
     class simple_linear(torch.nn.Module):
         inputs = {
-            TosaProfile.BI: (torch.ones(128, 20),),
-            TosaProfile.MI: (torch.ones(128, 20),),
+            TosaProfile.BI: (torch.ones(100, 20),),
+            TosaProfile.MI: (torch.ones(100, 20),),
         }
 
         def __init__(self):
             super().__init__()
+            torch.manual_seed(42)
             self.fc = torch.nn.Linear(20, 30)
-            self.relu6 = torch.nn.ReLU6()
 
         def forward(self, x):
             x = self.fc(x)
-            x = self.relu6(x)
-            return x + x
+            return x
 
     @register_test
     class simple_conv2d(torch.nn.Module):
diff --git a/backends/arm/tosa_quant_utils.py b/backends/arm/tosa_quant_utils.py
@@ -12,25 +12,24 @@
 from serializer.tosa_serializer import TosaOp, TosaSerializerTensor
 
 
+q_op = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
+dq_op = exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
+dq_q_ops = [q_op, dq_op]
+
+
 def isQuantNode(node):
     consumer_node = list(node.users)[0]
+    input = node.all_input_nodes[0]
     return (
-        consumer_node.target
-        == exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
-        or node.target
-        in [
-            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
-            exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
-        ]
+        consumer_node.target == q_op
+        or node.target in dq_q_ops
+        or input.target in dq_q_ops
     )
 
 
 def isQuantArg(arg):
     consumer_node = list(arg.users)[0]
-    return (
-        consumer_node.target
-        == exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
-    )
+    return consumer_node.target == q_op
 
 
 # TOSA uses the RESCALE operation to scale between values with differing precision.
diff --git a/examples/arm/arm_tosa_e2e.py b/examples/arm/arm_tosa_e2e.py
@@ -36,7 +36,7 @@
     _check_ir_validity=False,
 )
 
-SUPPORTED_BI_TEST_LIST = ["simple_add", "simple_add_broadcast"]
+SUPPORTED_BI_TEST_LIST = ["simple_add", "simple_add_broadcast", "simple_linear"]
 
 
 def get_input_quantization_params(captured_model):
@@ -234,7 +234,10 @@ def tosa_run_test(op, profile=TosaProfile.MI):  # noqa: C901
     torch_output = np.load(torch_file)
 
     ## Compare Tosa and Torch Results
-    if np.allclose(tosa_output, torch_output, 1e-1, equal_nan=True):
+    ## TODO: Torch runs [Q, DQ, Operation (FP32), Q, DQ] for quantization,
+    ## while TOSA does everything in INT8, which causes a large diff
+    ## between the two final results. Need to fix this to allow a smaller error margin.
+    if np.allclose(tosa_output, torch_output, rtol=1e-1, atol=1e-1, equal_nan=True):
         print(
             "\033[92m"
             + "Torch and Tosa Reference results are matching for operator: "