
Commit 0193cff

Update base for Update on "Only include base64.h in tiktoken.cpp"
Because it is only used inside `tiktoken.cpp`. Differential Revision: [D59664316](https://our.internmc.facebook.com/intern/diff/D59664316/) [ghstack-poisoned]
2 parents 853a937 + 4b45264 commit 0193cff


47 files changed: +1201 −739 lines

.github/workflows/android.yml

Lines changed: 15 additions & 16 deletions
@@ -27,8 +27,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     strategy:
       matrix:
-        include:
-          - build-tool: buck2
+        tiktoken: [OFF, ON]
     with:
       # NB: The example model dl3 requires lots of memory (T161064121)
       runner: linux.12xlarge
@@ -44,30 +43,30 @@ jobs:
       CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
       conda activate "${CONDA_ENV}"

-      BUILD_TOOL=${{ matrix.build-tool }}
       # Setup MacOS dependencies as there is no Docker support on MacOS atm
-      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
+      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2
       # Build Android library
+      export EXECUTORCH_USE_TIKTOKEN=${{ matrix.tiktoken }}
       bash build/build_android_library.sh
       # Build Android demo app
       bash build/test_android_ci.sh

-      mkdir -p artifacts-to-be-uploaded
-      mkdir -p artifacts-to-be-uploaded/arm64-v8a/
-      mkdir -p artifacts-to-be-uploaded/x86_64/
+      mkdir -p artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN
+      mkdir -p artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/arm64-v8a/
+      mkdir -p artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/x86_64/
       # Copy the jar to S3
-      cp extension/android/build/libs/executorch.jar artifacts-to-be-uploaded/
+      cp extension/android/build/libs/executorch.jar artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/
       # Copy the app and its test suite to S3
-      cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk artifacts-to-be-uploaded/
-      cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/androidTest/debug/*.apk artifacts-to-be-uploaded/
+      cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/
+      cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/androidTest/debug/*.apk artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/
       # Also copy the libraries
-      cp cmake-out-android-arm64-v8a/lib/*.a artifacts-to-be-uploaded/arm64-v8a/
-      cp cmake-out-android-arm64-v8a/extension/android/*.so artifacts-to-be-uploaded/arm64-v8a/
-      cp cmake-out-android-x86_64/lib/*.a artifacts-to-be-uploaded/x86_64/
-      cp cmake-out-android-x86_64/extension/android/*.so artifacts-to-be-uploaded/x86_64/
+      cp cmake-out-android-arm64-v8a/lib/*.a artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/arm64-v8a/
+      cp cmake-out-android-arm64-v8a/extension/android/*.so artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/arm64-v8a/
+      cp cmake-out-android-x86_64/lib/*.a artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/x86_64/
+      cp cmake-out-android-x86_64/extension/android/*.so artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/x86_64/
       # Copyp AAR to S3
-      cp executorch.aar artifacts-to-be-uploaded/
-      cp executorch-llama.aar artifacts-to-be-uploaded/
+      cp executorch.aar artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/
+      cp executorch-llama.aar artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/

   # Upload the app and its test suite to S3 so that they can be downloaded by the test job
   upload-artifacts:

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -485,7 +485,7 @@ if(MAX_KERNEL_NUM)
   )
 endif()

-if(EXECUTORCH_BUILD_PYBIND)
+if(EXECUTORCH_BUILD_PYBIND AND APPLE)
   # shared version
   add_library(
     executorch_no_prim_ops_shared SHARED ${_executorch_no_prim_ops__srcs}

backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.cpp

Lines changed: 0 additions & 6 deletions
@@ -115,12 +115,6 @@ InMemoryFileSystem::InMemoryNode* get_node(InMemoryFileSystem::InMemoryNode* nod
     return node;
 }

-std::string toString(time_t time) {
-    constexpr auto format = "%Y-%m-%dT%TZ";
-    std::stringstream stream;
-    stream << std::put_time(gmtime(&time), format);
-    return stream.str();
-}

 time_t toTime(const std::string& str) {
     constexpr auto format = "%Y-%m-%dT%TZ";

backends/cadence/aot/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ python_library(
         "compiler.py",
     ],
     deps = [
+        "fbsource//third-party/pypi/pyre-extensions:pyre-extensions",
         ":passes",
         ":utils",
         "//caffe2:torch",

backends/cadence/aot/compiler.py

Lines changed: 9 additions & 3 deletions
@@ -18,9 +18,13 @@
     ReplaceSqueezeAndUnsqueezeWithViewPass,
 )
 from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion
-from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer
+from executorch.backends.cadence.aot.quantizer.quantizer import (
+    CadenceGenericQuantizer,
+    CadenceQuantizer,
+)
 from executorch.backends.cadence.aot.utils import model_is_quantized
 from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge
+from pyre_extensions import assert_is_instance
 from torch._export import capture_pre_autograd_graph
 from torch.ao.quantization.pt2e.export_utils import model_is_exported
 from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
@@ -53,8 +57,10 @@ def quantize_pt2(
     converted_model = convert_pt2e(prepared_model)

     # Get patterns and apply fusion of dq -> op -> q to qop
-    # pyre-fixme[16]: Pyre doesn't get that CadenceQuantizer has a patterns attribute
-    patterns = [q.pattern for q in quantizer.quantizers]
+    patterns = [
+        assert_is_instance(q, CadenceGenericQuantizer).pattern
+        for q in quantizer.quantizers
+    ]
     QuantFusion(patterns)(converted_model)

     return converted_model
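
The `assert_is_instance` change replaces a blanket `# pyre-fixme[16]` suppression with a check that narrows each entry of `quantizer.quantizers` to `CadenceGenericQuantizer` for both Pyre and the runtime. A minimal sketch of the idiom, assuming pyre-extensions is installed and using stand-in classes rather than the real quantizer types:

```python
# Minimal sketch of the assert_is_instance idiom; BaseQuantizer and
# PatternQuantizer are illustrative stand-ins, not ExecuTorch classes.
from typing import List

from pyre_extensions import assert_is_instance


class BaseQuantizer:
    """Stand-in for the generic quantizer base class."""


class PatternQuantizer(BaseQuantizer):
    """Stand-in for a quantizer that carries a fusion pattern."""

    def __init__(self, pattern: str) -> None:
        self.pattern = pattern


quantizers: List[BaseQuantizer] = [PatternQuantizer("dq -> linear -> q")]

# Instead of silencing Pyre with a fixme, assert the concrete subclass.
# assert_is_instance returns the value typed as PatternQuantizer (and raises
# at runtime if the instance has the wrong type), so `.pattern` type-checks.
patterns = [assert_is_instance(q, PatternQuantizer).pattern for q in quantizers]
print(patterns)  # ['dq -> linear -> q']
```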

backends/cadence/aot/quantizer/TARGETS

Lines changed: 3 additions & 0 deletions
@@ -17,6 +17,7 @@ python_library(
     srcs = [
         "patterns.py",
     ],
+    typing = True,
     deps = [
         ":utils",
         "//caffe2:torch",
@@ -28,7 +29,9 @@ python_library(
     srcs = [
         "quantizer.py",
     ],
+    typing = True,
     deps = [
+        "fbsource//third-party/pypi/pyre-extensions:pyre-extensions",
         ":patterns",
         ":utils",
         "//caffe2:torch",

backends/cadence/aot/quantizer/fusion_pass.py

Lines changed: 3 additions & 4 deletions
@@ -11,6 +11,7 @@
 import torch
 from executorch.backends.cadence.aot.quantizer.patterns import (
     AddmmPattern,
+    BmmPattern,
     Conv1dPattern,
     Conv2dPattern,
     LayerNormFunctionalPattern,
@@ -361,9 +362,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901
                 inputs_inputs + weights_inputs + other_inputs + bias_inputs
             )
             kwargs = {}
-            if isinstance(pattern, Conv1dPattern) or isinstance(
-                pattern, Conv2dPattern
-            ):
+            if isinstance(pattern, (Conv1dPattern, Conv2dPattern)):
                 args, kwargs = get_args_and_kwargs_conv(
                     graph_module,
                     inputs_inputs,
@@ -396,7 +395,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901
                     other_inputs,
                     quant_node,
                 )
-            elif isinstance(pattern, MatmulPattern):
+            elif isinstance(pattern, (BmmPattern, MatmulPattern)):
                 args, kwargs = get_args_and_kwargs_matmul(
                     inputs_inputs,
                     dequants_inputs,
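
The fusion-pass change does two things: it imports the new `BmmPattern` and folds it into the matmul branch, while switching to `isinstance` with a tuple of types instead of chained checks. A simplified sketch of that dispatch, using stand-in pattern classes and handler names rather than the real fusion-pass helpers:

```python
# Stand-in pattern classes; the real ones come from
# executorch.backends.cadence.aot.quantizer.patterns.
class Conv1dPattern: ...
class Conv2dPattern: ...
class BmmPattern: ...
class MatmulPattern: ...


def pick_arg_builder(pattern: object) -> str:
    # isinstance accepts a tuple of types, which is what the diff switches to:
    # one branch covers both conv variants, another covers bmm and matmul.
    if isinstance(pattern, (Conv1dPattern, Conv2dPattern)):
        return "get_args_and_kwargs_conv"
    if isinstance(pattern, (BmmPattern, MatmulPattern)):
        return "get_args_and_kwargs_matmul"
    return "unhandled"


print(pick_arg_builder(BmmPattern()))     # get_args_and_kwargs_matmul
print(pick_arg_builder(Conv1dPattern()))  # get_args_and_kwargs_conv
```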

backends/cadence/aot/quantizer/patterns.py

Lines changed: 50 additions & 19 deletions
@@ -4,14 +4,17 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+# pyre-strict
+
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from typing import Any, Callable, List, Optional, Tuple, Type, Union
+from typing import Callable, List, Optional, Tuple, Type, Union

 import torch
 from executorch.backends.cadence.aot.quantizer.utils import get_bias_qparams

 from torch import fx
+from torch._ops import OpOverload
 from torch.ao.quantization.quantizer import (
     DerivedQuantizationSpec,
     SharedQuantizationSpec,
@@ -44,18 +47,22 @@ class PartitionAnchors:

 class QuantizationPattern(ABC):
     @abstractmethod
-    def partition_types(self):
+    def partition_types(
+        self,
+    ) -> Union[List[Type[torch.nn.Module]], List[Callable[..., torch.Tensor]]]:
         """
         List of types to be passed to find_sequential_partitions.
         """
         pass

     @abstractmethod
-    def get_anchors(self, gm, fused_partition) -> Optional[PartitionAnchors]:
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> Optional[PartitionAnchors]:
         pass

     @abstractmethod
-    def replacement_op(self) -> Callable[..., Any]:
+    def replacement_op(self) -> OpOverload:
         """
         Operator (most likely a custom one) that this partition should be fused into in
         the backend. Refer to the QuantFusion pass for examples.
@@ -91,10 +98,30 @@ def get_anchors(
             output=[(addmm_node,)],
         )

-    def replacement_op(self):
+    def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_linear


+class BmmPattern(QuantizationPattern):
+    def partition_types(self) -> List[Callable[..., torch.Tensor]]:
+        return [torch.bmm]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> PartitionAnchors:
+        bmm_node = fused_partition[0].nodes[-1]
+
+        return PartitionAnchors(
+            inputs=[(bmm_node, 0), (bmm_node, 1)],
+            weights=[],
+            biases=[],
+            output=[(bmm_node,)],
+        )
+
+    def replacement_op(self) -> OpOverload:
+        return torch.ops.cadence.quantized_matmul.default
+
+
 class Conv1dPattern(QuantizationPattern):
     def partition_types(self) -> List[Type[torch.nn.Module]]:
         return [torch.nn.Conv1d]
@@ -129,7 +156,7 @@ def get_anchors(
             output=[(conv1d_node,)],
         )

-    def replacement_op(self):
+    def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_conv.default


@@ -167,15 +194,17 @@ def get_anchors(
             output=[(conv2d_node,)],
         )

-    def replacement_op(self):
+    def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_conv.default


 class LayerNormPattern(QuantizationPattern):
-    def partition_types(self):
+    def partition_types(self) -> List[Type[torch.nn.Module]]:
         return [torch.nn.LayerNorm]

-    def get_anchors(self, gm, fused_partition) -> PartitionAnchors:
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> PartitionAnchors:
         layer_norm_node = fused_partition[0].nodes[-1]

         # Weights and biases are used as fp32 by our kernel, so they are
@@ -189,15 +218,17 @@ def get_anchors(self, gm, fused_partition) -> PartitionAnchors:
             output=[(layer_norm_node,)],
         )

-    def replacement_op(self):
+    def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_layer_norm.default


 class LayerNormFunctionalPattern(QuantizationPattern):
-    def partition_types(self):
+    def partition_types(self) -> List[Callable[..., torch.Tensor]]:
         return [torch.nn.functional.layer_norm]

-    def get_anchors(self, gm, fused_partition) -> PartitionAnchors:
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> PartitionAnchors:
         layer_norm_node = fused_partition[0].nodes[-1]

         others = [(layer_norm_node, 1)]
@@ -221,7 +252,7 @@ def get_anchors(self, gm, fused_partition) -> PartitionAnchors:
             output=[(layer_norm_node,)],
         )

-    def replacement_op(self):
+    def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_layer_norm.default


@@ -259,12 +290,12 @@ def get_anchors(
             output=[(linear_node,)],
         )

-    def replacement_op(self):
+    def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_linear.default


 class LinearFunctionalPattern(QuantizationPattern):
-    def partition_types(self):
+    def partition_types(self) -> List[Callable[..., torch.Tensor]]:
         return [torch.nn.functional.linear]

     def get_anchors(
@@ -297,12 +328,12 @@ def get_anchors(
             output=[(linear_node,)],
         )

-    def replacement_op(self):
+    def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_linear.default


 class MatmulPattern(QuantizationPattern):
-    def partition_types(self):
+    def partition_types(self) -> List[Callable[..., torch.Tensor]]:
         return [torch.matmul]

     def get_anchors(
@@ -317,7 +348,7 @@ def get_anchors(
             output=[(matmul_node,)],
         )

-    def replacement_op(self):
+    def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_matmul.default


@@ -339,5 +370,5 @@ def get_anchors(
             ],
         )

-    def replacement_op(self):
+    def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_relu.default
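
Every pattern in this file implements the same three hooks: `partition_types()` tells the partitioner which ops or modules to look for, `get_anchors()` marks which inputs and outputs get quantization specs, and `replacement_op()` names the fused kernel. The toy driver below mirrors that contract for the newly added bmm case, assuming only that PyTorch is available; `ToyBmmPattern` and the simplified `Anchors` container are illustrative stand-ins, not code from this repository:

```python
from dataclasses import dataclass, field
from typing import Any, Callable, List, Tuple

import torch


@dataclass
class Anchors:
    # Simplified stand-in for PartitionAnchors: (node, input_index) pairs for
    # inputs and a (node,) tuple for the output.
    inputs: List[Tuple[Any, int]] = field(default_factory=list)
    weights: List[Tuple[Any, int]] = field(default_factory=list)
    biases: List[Tuple[Any, int]] = field(default_factory=list)
    output: List[Tuple[Any, ...]] = field(default_factory=list)


class ToyBmmPattern:
    """Illustrative stand-in following the same contract as BmmPattern."""

    def partition_types(self) -> List[Callable[..., torch.Tensor]]:
        # What the partitioner searches the FX graph for.
        return [torch.bmm]

    def get_anchors(self, bmm_node: Any) -> Anchors:
        # bmm has two activation inputs and no weights or biases, so only the
        # inputs and the output are anchored for quantization.
        return Anchors(inputs=[(bmm_node, 0), (bmm_node, 1)], output=[(bmm_node,)])

    def replacement_op(self) -> str:
        # The real pattern returns torch.ops.cadence.quantized_matmul.default.
        return "cadence::quantized_matmul"


pattern = ToyBmmPattern()
print(pattern.partition_types())        # [<built-in method bmm ...>]
print(pattern.get_anchors("bmm_node"))  # anchors both inputs and the output
print(pattern.replacement_op())         # cadence::quantized_matmul
```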
