
Commit fbd1a7f

Merge branch 'main' into export-D69871232
2 parents: 06bdbc0 + 3e188fe

27 files changed: +330 / -69 lines


CMakeLists.txt

Lines changed: 5 additions & 0 deletions

@@ -258,6 +258,11 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
   set(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
 endif()

+if(EXECUTORCH_BUILD_EXTENSION_MODULE)
+  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
+  set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
+endif()
+
 if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
   set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
   set(EXECUTORCH_BUILD_KERNELS_CUSTOM ON)

CODEOWNERS

Lines changed: 85 additions & 0 deletions

@@ -0,0 +1,85 @@
+# IMPORTANT:
+# This file is ONLY used to subscribe for notifications for PRs
+# related to a specific file path. Approvals from people in this
+# file are not required for merges.
+
+/backends/apple @shoumikhin @cccclai
+/backends/apple/mps @cccclai @DenisVieriu97
+/backends/arm @digantdesai
+/backends/cadence @tarun292
+/backends/example @iseeyuan @JacobSzwejbka @larryliu0820
+/backends/mediatek @cccclai @neuropilot-captain
+/backends/qualcomm @cccclai @chunit-quic @haowhsu-quic @shewu-quic @winskuo-quic
+/backends/test @cccclai
+/backends/transforms @kimishpatel
+/backends/vulkan @SS-JIA
+/backends/xnnpack @digantdesai @mcr229
+
+/build @GregoryComer @dbort @kirklandsign
+
+/codegen @larryliu0820 @lucylq
+
+/devtools @tarun292 @Gasoonjia
+
+/docs @mergennachin
+
+/examples/apple @shoumikhin
+/examples/apple/coreml @cccclai @metascroy @cymbalrush @YifanShenSZ
+/examples/arm @digantdesai
+/examples/cadence @tarun292
+/examples/demo-apps @shoumikhin @kirklandsign
+/examples/devtools @tarun292
+/examples/llm_manual @larryliu0820
+/examples/llm_pte_finetuning @JacobSzwejbka
+/examples/mediatek @cccclai
+/examples/models @lucylq
+/examples/portable @larryliu0820 @manuelcandales
+/examples/qualcomm @cccclai
+/examples/selective_build @lucylq @larryliu0820 @JacobSzwejbka
+/examples/xnnpack @digantdesai @mcr229
+
+/exir/backend @cccclai @kimishpatel @JacobSzwejbka @tarun292
+/exir @JacobSzwejbka @tarun292 @larryliu0820
+
+
+/extension/android @kirklandsign
+/extension/android_test @kirklandsign
+/extension/apple @shoumikhin
+/extension/aten_util @JacobSzwejbka
+/extension/benchmark @tarun292
+/extension/data_loader @JacobSzwejbka @lucylq @dbort
+/extension/evalue_util @GregoryComer @dbort
+/extension/export_util @kimishpatel
+/extension/flat_tensor @lucylq
+/extension/gguf_util @larryliu0820
+/extension/kernel_util @kimishpatel @manuelcandales
+/extension/llm @jackzhxng @iseeyuan @larryliu0820
+/extension/memory_allocator @JacobSzwejbka @dbort
+/extension/module @shoumikhin
+/extension/parallel @kimishpatel
+/extension/pybindings @JacobSzwejbka @larryliu0820
+/extension/pytree @JacobSzwejbka
+/extension/runner_util @dbort
+/extension/tensor @shoumikhin
+/extension/testing_util @dbort
+/extension/threadpool @kimishpatel
+/extension/training @JacobSzwejbka
+
+/kernels @manuelcandales
+
+/profiler @tarun292 @Gasoonjia
+
+/runtime @dbort @JacobSzwejbka @lucylq
+/runtime/backend @cccclai
+
+/schema @dbort @JacobSzwejbka @lucylq
+
+/scripts @GregoryComer
+
+/shim @larryliu0820 @GregoryComer
+
+/third-party @GregoryComer
+
+/test @larryliu0820 @kirklandsign
+
+/util @tarun292

backends/arm/operators/TARGETS

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ python_library(

 python_library(
     name = "ops",
-    srcs = glob(["op_*.py"]),
+    srcs = glob(["op_*.py", "ops_*.py"]),
     typing = True,
     deps = [
         "fbsource//third-party/serialization_lib/python/tosa:tosa",

backends/cadence/CMakeLists.txt

Lines changed: 8 additions & 3 deletions

@@ -22,7 +22,10 @@ endif()
 include(${EXECUTORCH_ROOT}/build/Utils.cmake)

 # Let files say "include <executorch/path/to/header.h>".
-set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+set(_common_include_directories ${EXECUTORCH_ROOT}/..
+    ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
+
+add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)

 if(EXECUTORCH_CADENCE_CPU_RUNNER)
   include(${EXECUTORCH_ROOT}/build/Codegen.cmake)

@@ -74,10 +77,12 @@ endif()

 if(EXECUTORCH_NNLIB_OPT)
   set(TARGET_DIR hifi)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib
+                   ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
 elseif(EXECUTORCH_FUSION_G3_OPT)
   set(TARGET_DIR fusion_g3)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib
+                   ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
 else()
   set(TARGET_DIR reference)
 endif()

backends/cadence/aot/functions_hifi.yaml

Lines changed: 0 additions & 5 deletions

@@ -219,11 +219,6 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_relu_per_tensor_out

-- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
-  kernels:
-    - arg_meta: null
-      kernel_name: cadence::impl::HiFi::quantized_linear_per_tensor_out
-
 - func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null

backends/cadence/fusion_g3/operators/CMakeLists.txt

Lines changed: 2 additions & 1 deletion

@@ -64,7 +64,8 @@ target_link_libraries(aten_ops_cadence PUBLIC executorch)
 target_link_libraries(aten_ops_cadence PRIVATE xa_nnlib)

 # Let files say "include <executorch/path/to/header.h>".
-set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+set(_common_include_directories ${EXECUTORCH_ROOT}/..
+    ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)

 target_include_directories(
   aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}

backends/cadence/hifi/kernels/CMakeLists.txt

Lines changed: 2 additions & 1 deletion

@@ -25,7 +25,8 @@ add_library(
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
 )
 # Let files say "include <executorch/path/to/header.h>".
-set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+set(_common_include_directories ${EXECUTORCH_ROOT}/..
+    ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)

 target_include_directories(
   cadence_kernels

backends/cadence/hifi/operators/CMakeLists.txt

Lines changed: 3 additions & 2 deletions

@@ -67,7 +67,8 @@ target_link_libraries(aten_ops_cadence PUBLIC executorch)
 target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels)

 # Let files say "include <executorch/path/to/header.h>".
-set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+set(_common_include_directories ${EXECUTORCH_ROOT}/..
+    ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)

 target_include_directories(
   aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}

@@ -77,7 +78,7 @@ target_include_directories(
 # Custom ops that are needed to run the test model.
 add_library(
   custom_ops "op_quantized_linear_out.cpp" "op_quantized_layer_norm.cpp"
-  "op_quantize_per_tensor.cpp" "op_quantized_relu_out.cpp" "op_dequantize_per_tensor.cpp"
+  "op_quantize_per_tensor.cpp" "op_quantized_relu_out.cpp" "op_dequantize_per_tensor.cpp" "op_quantized_fully_connected_out"
 )
 target_include_directories(
   custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}

backends/cadence/hifi/operators/op_clamp.cpp

Lines changed: 1 addition & 1 deletion

@@ -328,7 +328,7 @@ Tensor& clamp_tensor_out(
     const executorch::aten::optional<Tensor>& min_opt,
     const executorch::aten::optional<Tensor>& max_opt,
     Tensor& out) {
-  clamp_Tensor_out(ctx, in, min_opt, max_opt, out);
+  return clamp_Tensor_out(ctx, in, min_opt, max_opt, out);
 }

 } // namespace native

backends/cadence/hifi/operators/op_mean.cpp

Lines changed: 1 addition & 1 deletion

@@ -175,7 +175,7 @@ Tensor& mean_dim_out(
     bool keepdim,
     optional<ScalarType> dtype,
     Tensor& out) {
-  mean_out(ctx, in, dim_list, keepdim, dtype, out);
+  return mean_out(ctx, in, dim_list, keepdim, dtype, out);
 }

 } // namespace native

backends/cadence/hifi/operators/op_quantized_relu_out.cpp

Lines changed: 3 additions & 3 deletions

@@ -100,10 +100,10 @@ void quantized_relu_per_tensor_out(
 void quantized_relu_out(
     KernelRuntimeContext& ctx,
     const Tensor& input,
-    const int64_t in_zero_point,
+    const Tensor& in_zero_point,
     const int64_t out_zero_point,
-    const int64_t out_multiplier,
-    const int64_t out_shift,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
     Tensor& output) {
   quantized_relu_per_tensor_out(
       ctx,

backends/cadence/hifi/operators/op_softmax.cpp

Lines changed: 1 addition & 1 deletion

@@ -200,7 +200,7 @@ Tensor& softmax_out(
     int64_t dim,
     bool half_to_float,
     Tensor& out) {
-  _softmax_out(ctx, in, dim, half_to_float, out);
+  return _softmax_out(ctx, in, dim, half_to_float, out);
 }

 } // namespace native

backends/cadence/hifi/operators/op_where.cpp

Lines changed: 9 additions & 0 deletions

@@ -183,6 +183,15 @@ Tensor& where_self_out(
   return out;
 }

+Tensor& where_out(
+    RuntimeContext& ctx,
+    const Tensor& cond,
+    const Tensor& a,
+    const Tensor& b,
+    Tensor& out) {
+  return where_out(ctx, cond, a, b, out);
+}
+
 } // namespace native
 } // namespace HiFi
 } // namespace impl

backends/cadence/reference/kernels/CMakeLists.txt

Lines changed: 2 additions & 1 deletion

@@ -8,7 +8,8 @@
 add_library(cadence_kernels kernels.cpp)

 # Let files say "include <executorch/path/to/header.h>".
-set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+set(_common_include_directories ${EXECUTORCH_ROOT}/..
+    ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)

 target_include_directories(cadence_kernels PUBLIC .
                            ${_common_include_directories}

backends/cadence/reference/operators/CMakeLists.txt

Lines changed: 2 additions & 1 deletion

@@ -71,7 +71,8 @@ target_link_libraries(aten_ops_cadence PUBLIC executorch)
 target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels)

 # Let files say "include <executorch/path/to/header.h>".
-set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+set(_common_include_directories ${EXECUTORCH_ROOT}/..
+    ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)

 target_include_directories(
   aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}

backends/transforms/view_copy_to_squeeze_unsqueeze.py

Lines changed: 5 additions & 1 deletion

@@ -75,7 +75,11 @@ def find_unsqueeze_dim(
        j = 0
        idx = -1
        while j < len(view_shape):
-            if input_shape[i] != view_shape[j]:
+            # account for added dim being last dim in view_shape
+            if i == j and j == len(input_shape):
+                if view_shape[j] != 1:
+                    return None
+            elif input_shape[i] != view_shape[j]:
                if view_shape[j] == 1:
                    idx = j
                    i -= 1
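For reference, a minimal standalone sketch of the dimension search this pass performs, under the assumption that a view_copy qualifies as an unsqueeze when view_shape is input_shape with exactly one size-1 dim inserted. The function name and structure below are illustrative, not the pass's actual helper; the branch added above corresponds to the trailing-dim case, where the inserted size-1 dim is the last dim of view_shape.

from typing import List, Optional

def find_unsqueeze_dim_sketch(
    input_shape: List[int], view_shape: List[int]
) -> Optional[int]:
    # Illustrative re-implementation: walk both shapes in lockstep and note
    # the single position where view_shape has an extra size-1 dim.
    if len(view_shape) != len(input_shape) + 1:
        return None
    i = 0  # index into input_shape
    j = 0  # index into view_shape
    idx = -1  # position of the inserted size-1 dim
    while j < len(view_shape):
        if i == j and j == len(input_shape):
            # Added dim is the last dim of view_shape; input_shape[i] does
            # not exist here, so the trailing dim must be 1 to qualify.
            if view_shape[j] != 1:
                return None
            idx = j
        elif input_shape[i] != view_shape[j]:
            if view_shape[j] != 1:
                return None
            idx = j
            i -= 1  # re-compare the same input dim against the next view dim
        i += 1
        j += 1
    return idx

# (2, 3) -> (2, 3, 1) is an unsqueeze at dim 2; (2, 3) -> (2, 1, 3) at dim 1.
assert find_unsqueeze_dim_sketch([2, 3], [2, 3, 1]) == 2
assert find_unsqueeze_dim_sketch([2, 3], [2, 1, 3]) == 1
assert find_unsqueeze_dim_sketch([2, 3], [2, 3, 4]) is None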

examples/models/llama/static_attention.py

Lines changed: 27 additions & 11 deletions

@@ -114,15 +114,30 @@ def update(
         return all_data, (out_k_cache, out_v_cache)


-def _apply_rotary_embedding(
-    x: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor
-) -> torch.Tensor:
-    x_r, x_i = x[..., ::2], x[..., 1::2]
-    x_out_r = x_r * freqs_cos - x_i * freqs_sin
-    x_out_i = x_r * freqs_sin + x_i * freqs_cos
+class _Rope(nn.Module):
+    def __init__(self, use_hf_rope):
+        super().__init__()
+        self.use_hf_rope = use_hf_rope
+
+    def forward(
+        self, x: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor
+    ) -> torch.Tensor:
+        if self.use_hf_rope:
+            if len(freqs_cos.shape) == 2:
+                freqs_cos = freqs_cos.unsqueeze(0)
+            if len(freqs_sin.shape) == 2:
+                freqs_sin = freqs_sin.unsqueeze(0)
+            x1 = x[..., : x.shape[-1] // 2]
+            x2 = x[..., x.shape[-1] // 2 :]
+            x_rotated = torch.cat((-x2, x1), dim=-1)
+            return x * freqs_cos + x_rotated * freqs_sin
+        else:
+            x_r, x_i = x[..., ::2], x[..., 1::2]
+            x_out_r = x_r * freqs_cos - x_i * freqs_sin
+            x_out_i = x_r * freqs_sin + x_i * freqs_cos

-    x_out = torch.cat([x_out_r, x_out_i], dim=-1)
-    return x_out
+            x_out = torch.cat([x_out_r, x_out_i], dim=-1)
+            return x_out


 @register_attention("static")

@@ -172,6 +187,7 @@ def __init__(self, config: ModelArgs, layer_id: int, rope: Rope):
             [StaticVCache(layer_id, i) for i in range(self.n_kv_heads)]
         )
         self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False)
+        self.rope = _Rope(rope.params.use_hf_rope)

     def forward(
         self,

@@ -191,8 +207,8 @@ def forward(
         new_qs = [self.wqs[i](x) for i in range(self.n_heads)]
         new_ks = [self.wks[i](x) for i in range(self.n_kv_heads)]
         new_vs = [self.wvs[i](x) for i in range(self.n_kv_heads)]
-        new_qs = [_apply_rotary_embedding(q, freqs_cos, freqs_sin) for q in new_qs]
-        new_ks = [_apply_rotary_embedding(k, freqs_cos, freqs_sin) for k in new_ks]
+        new_qs = [self.rope(q, freqs_cos, freqs_sin) for q in new_qs]
+        new_ks = [self.rope(k, freqs_cos, freqs_sin) for k in new_ks]

         all_ks = []
         all_vs = []

@@ -211,7 +227,7 @@ def forward(
             kv_idx = i // self.n_heads_per_kv_group
             attn = new_qs[i] @ all_ks[kv_idx].transpose(-2, -1)
             attn = attn * self.inv_scale
-            attn = attn + mask  # pyre-ignore
+            attn = attn + mask
             attn = F.softmax(attn, dim=-1)
             heads.append(attn @ all_vs[kv_idx])
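The HF-style branch of the new _Rope module uses the "rotate half" formulation. A small hedged sketch in plain torch (the shapes, angle construction, and names below are invented for illustration and are not taken from Rope.get_freqs): when freqs_cos and freqs_sin repeat cos(theta) and sin(theta) across the two halves of head_dim, the branch reduces to an elementwise 2-D rotation of each (x[..., i], x[..., i + head_dim // 2]) pair.

import torch

def hf_rope(x, freqs_cos, freqs_sin):
    # Mirrors the use_hf_rope branch of _Rope.forward in this diff.
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    x_rotated = torch.cat((-x2, x1), dim=-1)
    return x * freqs_cos + x_rotated * freqs_sin

seq_len, head_dim = 8, 16
theta = torch.rand(seq_len, head_dim // 2)  # hypothetical per-position angles
freqs_cos = torch.cat([theta.cos(), theta.cos()], dim=-1)
freqs_sin = torch.cat([theta.sin(), theta.sin()], dim=-1)

x = torch.rand(seq_len, head_dim)
out = hf_rope(x, freqs_cos, freqs_sin)

# Reference: rotate each (x1_i, x2_i) pair by theta_i with a 2x2 rotation matrix.
x1, x2 = x[..., : head_dim // 2], x[..., head_dim // 2 :]
ref = torch.cat(
    [x1 * theta.cos() - x2 * theta.sin(), x1 * theta.sin() + x2 * theta.cos()],
    dim=-1,
)
assert torch.allclose(out, ref, atol=1e-6)

The interleaved branch (the previous _apply_rotary_embedding) instead pairs x[..., ::2] with x[..., 1::2], so the two conventions order the rotated components differently; the use_hf_rope flag, passed through rope.params in the diff above, selects between them.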

examples/models/llama/tests/test_static_attention.py

Lines changed: 29 additions & 0 deletions

@@ -43,6 +43,35 @@ def test_without_cache(self):
         )
         self.assertTrue(torch.isclose(y, expected, rtol=1e-3).all())

+    def test_hf_rope_without_cache(self):
+        config = ModelArgs(
+            dim=64,
+            n_heads=4,
+            n_kv_heads=2,
+            max_seq_len=8,
+            use_hf_rope=True,
+        )
+        layer_id = 0
+        rope = Rope(config)
+        attn_mha = AttentionMHA(config, layer_id, rope).eval()
+        static_attn = StaticAttention(config, layer_id, rope).eval()
+        static_attn.load_weights_from_attention_mha(attn_mha)
+
+        x = torch.rand(1, config.max_seq_len, config.dim)
+        freqs_cos, freqs_sin = rope.get_freqs(None, config.max_seq_len)
+        expected, _ = attn_mha(x, freqs_cos, freqs_sin)
+        mask = torch.triu(
+            torch.full((1, config.max_seq_len, config.max_seq_len), float("-inf")),
+            diagonal=1,
+        )
+        y, _ = static_attn(
+            x,
+            freqs_cos.unsqueeze(0),
+            freqs_sin.unsqueeze(0),
+            mask=mask,
+        )
+        self.assertTrue(torch.isclose(y, expected, rtol=1e-3).all())
+
     def test_with_cache(self):
         config = ModelArgs(
             dim=64,

extension/flat_tensor/targets.bzl

Lines changed: 3 additions & 1 deletion

@@ -9,13 +9,15 @@ def define_common_targets():
        exported_headers = ["flat_tensor_data_map.h"],
        deps = [
            "//executorch/extension/flat_tensor/serialize:generated_headers",
-            "//executorch/extension/flat_tensor/serialize:flat_tensor_header",
            "//executorch/runtime/core:core",
            "//executorch/runtime/core:evalue",
            "//executorch/runtime/core:named_data_map",
            "//executorch/runtime/core/exec_aten:lib",
            "//executorch/runtime/core/exec_aten/util:tensor_util",
        ],
+        exported_deps = [
+            "//executorch/extension/flat_tensor/serialize:flat_tensor_header",
+        ],
        visibility = [
            "//executorch/...",
        ],
