
Commit 04e034f

Merge branch 'main' into export-D75228037

2 parents 6b3810f + df5e7df

File tree

9 files changed: +533, -66 lines

backends/cadence/aot/tests/test_fusion_ops_passes.py

Lines changed: 109 additions & 63 deletions
@@ -13,10 +13,6 @@
 import executorch.backends.cadence.aot.ops_registrations  # noqa
 import torch
 from executorch.backends.cadence.aot import compiler
-from executorch.backends.cadence.aot.compiler import (
-    export_to_edge,
-    quantize_and_export_to_edge,
-)
 from executorch.backends.cadence.aot.fuse_ops import (
     FuseFullThenReshapePass,
     FuseMulScalarIntoDequantPass,

@@ -336,94 +332,144 @@ def test_replace_quant_view_dequant_with_requantize(self):
         )

     def test_replace_dequant_quant_with_requantize(self):
-        class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
-            def forward(self, x):
-                x = torch.ops.quantized_decomposed.dequantize_per_tensor(
-                    x, 1.2, 3, 0, 127, torch.int8
-                )
-                x = torch.permute(x, [2, 0, 1, 3])
-                x = torch.ops.quantized_decomposed.quantize_per_tensor(
-                    x, 4.5, 6, 0, 127, torch.int8
-                )
-                return x
-
-        inputs = torch.randn(2, 12, 1, 6).to(torch.int8)
-        model = M()
-        graph_module = export_to_edge(model, (inputs,)).exported_program().graph_module
-        graph_module = FuseQuantDequantToRequantizePass()(graph_module).graph_module
+        builder = GraphBuilder()
+        x = builder.placeholder("x", torch.randn(2, 12, 1, 6, dtype=torch.float32))
+        dequant = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+            args=(x, 1.2, 3, 0, 127, torch.int8),
+        )
+        quant = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            args=(dequant, 4.5, 6, 0, 127, torch.int8),
+        )
+        builder.output(quant)
+        graph_module = FuseQuantDequantToRequantizePass()(
+            builder.get_graph_module()
+        ).graph_module

         self.check_op_counts(
             graph_module,
             expected_op_counts={
-                # Verify that dequant -> permute -> quant was replaced with permute -> requantize.
+                # Verify that dequant -> quant was replaced with requantize.
                 exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 0,
                 exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 0,
                 exir_ops.edge.cadence.requantize.default: 1,
             },
         )

     def test_replace_dequant_permute_quant_with_requantize(self):
-        class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
-            def forward(self, x):
-                x = torch.ops.quantized_decomposed.dequantize_per_tensor(
-                    x, 1.2, 3, 0, 127, torch.int8
-                )
-                x = torch.permute(x, [2, 0, 1, 3])
-                x = torch.ops.quantized_decomposed.quantize_per_tensor(
-                    x, 4.5, 6, 0, 127, torch.int8
-                )
-                return x
-
-        inputs = torch.randn(2, 12, 1, 6).to(torch.int8)
-        model = M()
-        graph_module = export_to_edge(model, (inputs,)).exported_program().graph_module
-        graph_module = FuseQuantDequantToRequantizePass()(graph_module).graph_module
+        builder = GraphBuilder()
+        x = builder.placeholder("x", torch.randn(2, 12, 1, 6, dtype=torch.float32))
+        dequant = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+            args=(x, 1.2, 3, 0, 127, torch.int8),
+        )
+        permute = builder.call_operator(
+            op=exir_ops.edge.aten.permute_copy.default, args=(dequant, [2, 0, 1, 3])
+        )
+        quant = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            args=(permute, 4.5, 6, 0, 127, torch.int8),
+        )
+        builder.output(quant)
+        graph_module = FuseQuantDequantToRequantizePass()(
+            builder.get_graph_module()
+        ).graph_module

         self.check_op_counts(
             graph_module,
             expected_op_counts={
                 # Verify that dequant -> permute -> quant was replaced with permute -> requantize.
                 exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 0,
                 exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 0,
+                exir_ops.edge.aten.permute_copy.default: 1,
                 exir_ops.edge.cadence.requantize.default: 1,
             },
         )

     def test_remove_nop_dequant_quant(self):
-        class M(torch.nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-                self.lin1 = torch.nn.Linear(6, 12, bias=False)
-                self.lin2 = torch.nn.Linear(12, 24, bias=False)
+        LEADING_DIMS: Final[int] = 12
+        IN_DIM: Final[int] = 6
+        OUT_DIM: Final[int] = 12

-            def forward(self, x):
-                x = self.lin1(x)
-                # redundant dequant+quant will be created around this permute
-                x = torch.permute(x, [0, 2, 1, 3])
-                x = self.lin2(x)
-                return x
-
-        inputs = torch.randn(2, 12, 1, 6)
-        model = M()
-        graph_module = (
-            quantize_and_export_to_edge(model, (inputs,))
-            .exported_program()
-            .graph_module
+        builder = GraphBuilder()
+        x = builder.placeholder(
+            "x", torch.randn(LEADING_DIMS, IN_DIM, dtype=torch.float32)
+        )
+        quant1 = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            args=(x, 4.5, 6, 0, 127, torch.int8),
+        )
+        weights = builder.call_operator(
+            op=exir_ops.edge.aten.full.default, args=([OUT_DIM, IN_DIM], 1)
+        )
+        bias = builder.call_operator(
+            op=exir_ops.edge.aten.full.default, args=([OUT_DIM], 1)
+        )
+        weight_zero_point = builder.call_operator(
+            op=exir_ops.edge.aten.full.default, args=([IN_DIM], 0)
+        )
+        out_multiplier = builder.call_operator(
+            op=exir_ops.edge.aten.full.default, args=([OUT_DIM], 1)
+        )
+        out_shift = builder.call_operator(
+            op=exir_ops.edge.aten.full.default, args=([OUT_DIM], 0)
         )
-        graph_module = FuseQuantDequantToRequantizePass()(graph_module).graph_module
+        linear1 = builder.call_operator(
+            op=exir_ops.edge.cadence.quantized_linear.default,
+            args=(
+                quant1,
+                weights,
+                bias,
+                0,  # src_zero_point
+                weight_zero_point,
+                out_multiplier,
+                out_shift,
+                0,  # out_zero_point
+                None,
+            ),
+        )
+        dequant1 = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+            args=(linear1, 1.2, 3, 0, 127, torch.int8),
+        )
+        permute = builder.call_operator(
+            op=exir_ops.edge.aten.permute_copy.default, args=(dequant1, [1, 0])
+        )
+        quant2 = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            args=(permute, 4.5, 6, 0, 127, torch.int8),
+        )
+        linear2 = builder.call_operator(
+            op=exir_ops.edge.cadence.quantized_linear.default,
+            args=(
+                quant2,
+                weights,
+                bias,
+                0,  # src_zero_point
+                weight_zero_point,
+                out_multiplier,
+                out_shift,
+                0,  # out_zero_point
+                None,
+            ),
+        )
+        dequant2 = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+            args=(linear2, 1.2, 3, 0, 127, torch.int8),
+        )
+        builder.output(dequant2)
+        graph_module = FuseQuantDequantToRequantizePass()(
+            builder.get_graph_module()
+        ).graph_module
         self.check_op_counts(
             graph_module,
             expected_op_counts={
-                # Verify that one dequant/quant pair was removed
-                # Expect 1 quantize ops: 1 input
+                # Verify that one dequant/quant pair was removed from chain:
+                # quant->linear->dequant->permute->quant->linear->dequant
+                # gets converted to:
+                # quant->linear->permute->linear->dequant
                 exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 1,
-                # Expect 1 dequant op at the end (output of second linear)
                 exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 1,
             },
         )
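
The tests above now build their graphs directly with GraphBuilder instead of exporting an nn.Module. As a reference, here is a minimal standalone sketch of that pattern; the import paths for GraphBuilder and FuseQuantDequantToRequantizePass are assumptions based on the modules this diff touches, and inspecting the node targets directly plays the role that check_op_counts plays in the test class.

# Hypothetical standalone sketch, not part of this commit. Import paths are assumed.
import torch
from executorch.backends.cadence.aot.fuse_ops import FuseQuantDequantToRequantizePass
from executorch.backends.cadence.aot.graph_builder import GraphBuilder  # assumed module path
from executorch.exir.dialects._ops import ops as exir_ops

# Build a float placeholder followed by a dequantize -> quantize pair.
builder = GraphBuilder()
x = builder.placeholder("x", torch.randn(2, 12, 1, 6, dtype=torch.float32))
dequant = builder.call_operator(
    op=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
    args=(x, 1.2, 3, 0, 127, torch.int8),
)
quant = builder.call_operator(
    op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
    args=(dequant, 4.5, 6, 0, 127, torch.int8),
)
builder.output(quant)

# Run the fusion pass; it returns a result whose graph_module holds the rewritten graph.
graph_module = FuseQuantDequantToRequantizePass()(builder.get_graph_module()).graph_module

# The dequant/quant pair should now be a single cadence.requantize node.
targets = [n.target for n in graph_module.graph.nodes if n.op == "call_function"]
assert exir_ops.edge.cadence.requantize.default in targets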

examples/qualcomm/executor_runner/qnn_executor_runner.cpp

Lines changed: 6 additions & 0 deletions
@@ -481,6 +481,12 @@ int main(int argc, char** argv) {

       ++inference_index;
     }
+    ET_LOG(
+        Info,
+        "%d inference took %f ms, avg %f ms",
+        inference_index,
+        elapsed_time,
+        elapsed_time / inference_index);
   } else {
     // if no input is provided, fill the inputs with default values
     auto inputs = prepare_input_tensors(*method);
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+@_exported import ExecuTorch
+
+/// A protocol that types conform to in order to be used as tensor element types.
+/// Provides the mapping from the Swift type to the underlying `DataType`.
+@available(*, deprecated, message: "This API is experimental.")
+protocol Scalar {
+  /// The `DataType` corresponding to this scalar type.
+  static var dataType: DataType { get }
+}
+
+@available(*, deprecated, message: "This API is experimental.")
+extension UInt8: Scalar { static var dataType: DataType { .byte } }
+@available(*, deprecated, message: "This API is experimental.")
+extension Int8: Scalar { static var dataType: DataType { .char } }
+@available(*, deprecated, message: "This API is experimental.")
+extension Int16: Scalar { static var dataType: DataType { .short } }
+@available(*, deprecated, message: "This API is experimental.")
+extension Int32: Scalar { static var dataType: DataType { .int } }
+@available(*, deprecated, message: "This API is experimental.")
+extension Int64: Scalar { static var dataType: DataType { .long } }
+@available(*, deprecated, message: "This API is experimental.")
+extension Int: Scalar { static var dataType: DataType { .long } }
+@available(macOS 11.0, *)
+@available(*, deprecated, message: "This API is experimental.")
+extension Float16: Scalar { static var dataType: DataType { .half } }
+@available(*, deprecated, message: "This API is experimental.")
+extension Float: Scalar { static var dataType: DataType { .float } }
+@available(*, deprecated, message: "This API is experimental.")
+extension Double: Scalar { static var dataType: DataType { .double } }
+@available(*, deprecated, message: "This API is experimental.")
+extension Bool: Scalar { static var dataType: DataType { .bool } }
+@available(*, deprecated, message: "This API is experimental.")
+extension UInt16: Scalar { static var dataType: DataType { .uInt16 } }
+@available(*, deprecated, message: "This API is experimental.")
+extension UInt32: Scalar { static var dataType: DataType { .uInt32 } }
+@available(*, deprecated, message: "This API is experimental.")
+extension UInt64: Scalar { static var dataType: DataType { .uInt64 } }
+@available(*, deprecated, message: "This API is experimental.")
+extension UInt: Scalar { static var dataType: DataType { .uInt64 } }
+
+@available(*, deprecated, message: "This API is experimental.")
+extension Tensor {
+  /// Calls the closure with a typed, immutable buffer pointer over the tensor’s elements.
+  ///
+  /// - Parameter body: A closure that receives an `UnsafeBufferPointer<T>` bound to the tensor’s data.
+  /// - Returns: The value returned by `body`.
+  /// - Throws: `Error(code: .invalidArgument)` if `T.dataType` doesn’t match the tensor’s `dataType`,
+  ///           or any error thrown by `body`.
+  func withUnsafeBytes<T: Scalar, R>(_ body: (UnsafeBufferPointer<T>) throws -> R) throws -> R {
+    guard dataType == T.dataType else { throw Error(code: .invalidArgument) }
+    var result: Result<R, Error>?
+    bytes { pointer, count, _ in
+      result = Result { try body(
+        UnsafeBufferPointer(
+          start: pointer.assumingMemoryBound(to: T.self),
+          count: count
+        )
+      ) }
+    }
+    return try result!.get()
+  }
+
+  /// Calls the closure with a typed, mutable buffer pointer over the tensor’s elements.
+  ///
+  /// - Parameter body: A closure that receives an `UnsafeMutableBufferPointer<T>` bound to the tensor’s data.
+  /// - Returns: The value returned by `body`.
+  /// - Throws: `Error(code: .invalidArgument)` if `T.dataType` doesn’t match the tensor’s `dataType`,
+  ///           or any error thrown by `body`.
+  func withUnsafeMutableBytes<T: Scalar, R>(_ body: (UnsafeMutableBufferPointer<T>) throws -> R) throws -> R {
+    guard dataType == T.dataType else { throw Error(code: .invalidArgument) }
+    var result: Result<R, Error>?
+    mutableBytes { pointer, count, _ in
+      result = Result { try body(
+        UnsafeMutableBufferPointer(
+          start: pointer.assumingMemoryBound(to: T.self),
+          count: count
+        )
+      ) }
+    }
+    return try result!.get()
+  }
+}

extension/apple/ExecuTorch/__tests__/TensorTest.swift

Lines changed: 49 additions & 1 deletion
@@ -148,6 +148,54 @@ class TensorTest: XCTestCase {
     }
   }

+  func testWithUnsafeBytes() throws {
+    var data: [Float] = [1, 2, 3, 4, 5, 6]
+    let tensor = data.withUnsafeMutableBytes {
+      Tensor(bytesNoCopy: $0.baseAddress!, shape: [2, 3], dataType: .float)
+    }
+    let array: [Float] = try tensor.withUnsafeBytes { Array($0) }
+    XCTAssertEqual(array, data)
+  }
+
+  func testWithUnsafeMutableBytes() throws {
+    var data = [1, 2, 3, 4]
+    let tensor = data.withUnsafeMutableBytes {
+      Tensor(bytes: $0.baseAddress!, shape: [4], dataType: .long)
+    }
+    try tensor.withUnsafeMutableBytes { (buffer: UnsafeMutableBufferPointer<Int>) in
+      for i in buffer.indices {
+        buffer[i] *= 2
+      }
+    }
+    try tensor.withUnsafeBytes { buffer in
+      XCTAssertEqual(Array(buffer), [2, 4, 6, 8])
+    }
+  }
+
+  func testWithUnsafeBytesFloat16() throws {
+    var data: [Float16] = [1, 2, 3, 4, 5, 6]
+    let tensor = data.withUnsafeMutableBytes {
+      Tensor(bytesNoCopy: $0.baseAddress!, shape: [6], dataType: .half)
+    }
+    let array: [Float16] = try tensor.withUnsafeBytes { Array($0) }
+    XCTAssertEqual(array, data)
+  }
+
+  func testWithUnsafeMutableBytesFloat16() throws {
+    var data: [Float16] = [1, 2, 3, 4]
+    let tensor = data.withUnsafeMutableBytes { buffer in
+      Tensor(bytes: buffer.baseAddress!, shape: [4], dataType: .half)
+    }
+    try tensor.withUnsafeMutableBytes { (buffer: UnsafeMutableBufferPointer<Float16>) in
+      for i in buffer.indices {
+        buffer[i] *= 2
+      }
+    }
+    try tensor.withUnsafeBytes { buffer in
+      XCTAssertEqual(Array(buffer), data.map { $0 * 2 })
+    }
+  }
+
   func testInitWithTensor() {
     var data: [Int] = [10, 20, 30, 40]
     let tensor1 = data.withUnsafeMutableBytes {

@@ -618,7 +666,7 @@ class TensorTest: XCTestCase {
       }
     }
   }
-
+
  func testZeros() {
    let tensor = Tensor.zeros(shape: [2, 3], dataType: .double)
    XCTAssertEqual(tensor.shape, [2, 3])
kernels/aten/functions.yaml

Lines changed: 2 additions & 0 deletions
@@ -201,6 +201,8 @@

 - op: index_put.out

+- op: index_put_
+
 - op: index_select.out

 - op: index.Tensor_out
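
For context, index_put_ is the in-place variant of index_put; the new entry above registers its ATen kernel alongside the existing index_put.out entry. A quick sketch of its semantics in plain PyTorch (not part of this diff):

import torch

t = torch.zeros(4)
idx = torch.tensor([0, 2])
vals = torch.tensor([1.0, 3.0])

# In-place write of `vals` at the given indices.
t.index_put_((idx,), vals)                   # t is now [1., 0., 3., 0.]

# With accumulate=True the values are added to the existing entries instead.
t.index_put_((idx,), vals, accumulate=True)  # t is now [2., 0., 6., 0.]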
