Merge remote-tracking branch 'origin/main' into aar-for-bench-2

kirklandsign · kirklandsign · commit fd88afa03c79 · 2024-08-30T17:07:17.000-07:00
diff --git a/extension/android/benchmark/app/build.gradle.kts b/extension/android/benchmark/app/build.gradle.kts
@@ -8,8 +8,8 @@ android {
 
     defaultConfig {
         applicationId = "org.pytorch.minibench"
-        minSdk = 24
-        targetSdk = 34
+        minSdk = 28
+        targetSdk = 33
         versionCode = 1
         versionName = "1.0"
 
diff --git a/extension/android/benchmark/build.gradle.kts b/extension/android/benchmark/build.gradle.kts
@@ -1,4 +1,4 @@
 // Top-level build file where you can add configuration options common to all sub-projects/modules.
 plugins {
-    id("com.android.application") version "8.2.2" apply false
+    id("com.android.application") version "8.1.0" apply false
 }
diff --git a/extension/llm/custom_ops/TARGETS b/extension/llm/custom_ops/TARGETS
@@ -14,7 +14,7 @@ runtime.python_test(
         "test_sdpa_with_kv_cache.py",
     ],
     preload_deps = [
-        ":custom_ops_aot_lib",
+        ":custom_ops_aot_lib_mkl_noomp",
         ":custom_ops_aot_py",
     ],
     deps = [
diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl
@@ -6,47 +6,48 @@ def define_common_targets():
     The directory containing this targets.bzl file should also contain both
     TARGETS and BUCK files that call this function.
     """
-    runtime.cxx_library(
-        name = "custom_ops",
-        srcs = ["op_sdpa.cpp", "op_fallback.cpp"],
-        exported_headers = ["op_sdpa.h", "op_fallback.h"],
-        exported_deps = [
-            "//executorch/runtime/kernel:kernel_includes",
-            "//executorch/kernels/portable/cpu:scalar_utils",
-            "//executorch/kernels/optimized:libblas",
-            "//executorch/kernels/optimized:libvec",
-            "//executorch/extension/kernel_util:kernel_util",
-            "//executorch/extension/parallel:thread_parallel",
-            "//executorch/extension/threadpool:threadpool",
-        ],
-        compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"],
-        visibility = [
-            "//executorch/...",
-            "//executorch/extension/llm/custom_ops/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
-        # @lint-ignore BUCKLINT link_whole
-        link_whole = True,
-        force_static = True,
-    )
+    for mkl_dep in ["", "_mkl_noomp"]:
+        runtime.cxx_library(
+            name = "custom_ops" + mkl_dep,
+            srcs = ["op_sdpa.cpp", "op_fallback.cpp"],
+            exported_headers = ["op_sdpa.h", "op_fallback.h"],
+            exported_deps = [
+                "//executorch/runtime/kernel:kernel_includes",
+                "//executorch/kernels/portable/cpu:scalar_utils",
+                "//executorch/kernels/optimized:libblas{}".format(mkl_dep),
+                "//executorch/kernels/optimized:libvec",
+                "//executorch/extension/kernel_util:kernel_util",
+                "//executorch/extension/parallel:thread_parallel",
+                "//executorch/extension/threadpool:threadpool",
+            ],
+            compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"],
+            visibility = [
+                "//executorch/...",
+                "//executorch/extension/llm/custom_ops/...",
+                "@EXECUTORCH_CLIENTS",
+            ],
+            # @lint-ignore BUCKLINT link_whole
+            link_whole = True,
+            force_static = True,
+        )
 
-    runtime.cxx_library(
-        name = "custom_ops_aot_lib",
-        srcs = [
-            "op_sdpa_aot.cpp",
-        ],
-        visibility = [
-            "//executorch/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
-        external_deps = [
-            "libtorch",
-        ],
-        deps = [
-            ":custom_ops",
-            "//executorch/extension/aten_util:aten_bridge",
-        ],
-    )
+        runtime.cxx_library(
+            name = "custom_ops_aot_lib" + mkl_dep,
+            srcs = [
+                "op_sdpa_aot.cpp",
+            ],
+            visibility = [
+                "//executorch/...",
+                "@EXECUTORCH_CLIENTS",
+            ],
+            external_deps = [
+                "libtorch",
+            ],
+            deps = [
+                ":custom_ops" + mkl_dep,
+                "//executorch/extension/aten_util:aten_bridge",
+            ],
+        )
 
     runtime.python_library(
         name = "custom_ops_aot_py",
diff --git a/extension/llm/custom_ops/test_sdpa_with_kv_cache.py b/extension/llm/custom_ops/test_sdpa_with_kv_cache.py
@@ -392,17 +392,50 @@ def setUp(self):
         self.max_seq_len = 2048
         self.setup_caches()
 
+    def _scale_tensor(self, tensor, min_value, max_value, scale=True):
+        """Linearly map `tensor` onto [min_value, max_value] when `scale` is set."""
+        if not scale:
+            return tensor  # unscaled: hand back the input without any min/max reductions
+        normalized_tensor = (tensor - tensor.min()) / (tensor.max() - tensor.min())
+        return normalized_tensor * (max_value - min_value) + min_value
+
     def _test_sdpa_common(
-        self, n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len, next_iter_seq_len=1
+        self,
+        n_heads_kv,
+        n_heads_q,
+        head_dim,
+        max_seq_len,
+        seq_len,
+        next_iter_seq_len=1,
+        scale_tensors=False,
     ):
+        # Range arbitrarily chosen to reproduce a numerical error on x86 in some of the long context tests
+        tensor_scale_max = 20
+        tensor_scale_min = -20
         self.n_heads_kv = n_heads_kv
         self.n_heads_q = n_heads_q
         self.head_dim = head_dim
         self.max_seq_len = max_seq_len
         self.setup_caches()
-        q = torch.rand((1, seq_len, self.n_heads_kv, self.head_dim))
-        k = torch.rand((1, seq_len, self.n_heads_kv, self.head_dim))
-        v = torch.rand((1, seq_len, self.n_heads_kv, self.head_dim))
+        q = self._scale_tensor(  # argument order: (tensor, min_value, max_value, scale)
+            torch.rand((1, seq_len, self.n_heads_kv, self.head_dim)),
+            tensor_scale_min,
+            tensor_scale_max,
+            scale_tensors,
+        )
+        k = self._scale_tensor(
+            torch.rand((1, seq_len, self.n_heads_kv, self.head_dim)),
+            tensor_scale_min,
+            tensor_scale_max,
+            scale_tensors,
+        )
+        v = self._scale_tensor(
+            torch.rand((1, seq_len, self.n_heads_kv, self.head_dim)),
+            tensor_scale_min,
+            tensor_scale_max,
+            scale_tensors,
+        )
+
         start_pos = 0
         attn_mask = self.mask[start_pos : start_pos + seq_len, :]
         attn_mask = attn_mask[:, : start_pos + seq_len]
@@ -412,11 +445,27 @@ def _test_sdpa_common(
         op_output = torch.ops.llama.sdpa_with_kv_cache(
             q, k, v, self.k_cache, self.v_cache, start_pos, seq_len, None, 0, True
         )
-        self.assertTrue(torch.allclose(ref_output, op_output))
+        self.assertTrue(torch.allclose(ref_output, op_output, atol=1e-6))
+
+        q = self._scale_tensor(  # argument order: (tensor, min_value, max_value, scale)
+            torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim)),
+            tensor_scale_min,
+            tensor_scale_max,
+            scale_tensors,
+        )
+        k = self._scale_tensor(
+            torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim)),
+            tensor_scale_min,
+            tensor_scale_max,
+            scale_tensors,
+        )
+        v = self._scale_tensor(
+            torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim)),
+            tensor_scale_min,
+            tensor_scale_max,
+            scale_tensors,
+        )
 
-        q = torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim))
-        k = torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim))
-        v = torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim))
         start_pos = seq_len
         seq_len = q.size(1)
         attn_mask = self.mask[start_pos : start_pos + seq_len, :]
@@ -427,7 +476,7 @@ def _test_sdpa_common(
         op_output = torch.ops.llama.sdpa_with_kv_cache(
             q, k, v, self.k_cache, self.v_cache, start_pos, seq_len, None, 0, True
         )
-        self.assertTrue(torch.allclose(ref_output, op_output))
+        self.assertTrue(torch.allclose(ref_output, op_output, atol=1e-6))
 
 
 class SDPATestForLargeSeqLength(SDPATestCommon):
@@ -438,7 +487,9 @@ def test_sdpa_with_cache_seq_len_130(self):
         head_dim = 128
         max_seq_len = 2048
         seq_len = 130
-        self._test_sdpa_common(n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len)
+        self._test_sdpa_common(
+            n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len, scale_tensors=True
+        )  # keyword required: a sixth positional True would bind to next_iter_seq_len
 
     def test_sdpa_with_cache_seq_len_small(self):
         n_heads_kv = 4
@@ -462,7 +513,9 @@ def test_sdpa_with_cache_seq_len_130_gqa(self):
         head_dim = 128
         max_seq_len = 2048
         seq_len = 130
-        self._test_sdpa_common(n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len)
+        self._test_sdpa_common(
+            n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len, scale_tensors=True
+        )  # keyword required: a sixth positional True would bind to next_iter_seq_len
 
     def test_sdpa_with_cache_seq_len_llava_example_gqa(self):
         n_heads_kv = 16
@@ -483,7 +536,13 @@ def test_sdpa_with_cache_seq_len_130(self):
         seq_len = 130
         next_iter_seq_len = 17
         self._test_sdpa_common(
-            n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len, next_iter_seq_len
+            n_heads_kv,
+            n_heads_q,
+            head_dim,
+            max_seq_len,
+            seq_len,
+            next_iter_seq_len,
+            True,
         )
 
     def test_sdpa_with_cache_seq_len_llava_example(self):
@@ -505,7 +564,13 @@ def test_sdpa_with_cache_seq_len_130_gqa(self):
         seq_len = 130
         next_iter_seq_len = 33
         self._test_sdpa_common(
-            n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len, next_iter_seq_len
+            n_heads_kv,
+            n_heads_q,
+            head_dim,
+            max_seq_len,
+            seq_len,
+            next_iter_seq_len,
+            True,
         )
 
     def test_sdpa_with_cache_seq_len_llava_example_gqa(self):
diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl
@@ -1,4 +1,5 @@
 load("@fbsource//tools/build_defs:default_platform_defs.bzl", "DEVSERVER_PLATFORM_REGEX")
+load("@fbsource//tools/build_defs:fb_native_wrapper.bzl", "fb_native")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
 # Because vec exists as a collection of header files, compile and preprocessor
@@ -99,44 +100,64 @@ def define_libs():
         ],
     )
 
-    runtime.cxx_library(
-        name = "libblas",
-        srcs = native.glob([
-            "blas/**/*.cpp",
-        ]),
-        exported_headers = native.glob([
-            "blas/**/*.h",
-        ]),
-        header_namespace = "executorch/kernels/optimized",
-        visibility = [
-            "//executorch/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
-        fbandroid_platform_preprocessor_flags = [
-            (
-                "^android-arm64.*$",
-                [
-                    "-DET_BUILD_WITH_BLAS",
-                ],
-            ),
-        ],
-        fbandroid_platform_deps = [
-            (
-                "^android-arm64.*$",
-                [
-                    "fbsource//third-party/openblas:openblas",
-                ],
-            ),
-        ],
-        fbobjc_exported_preprocessor_flags = [
-            "-DET_BUILD_WITH_BLAS",
-            "-DET_BUILD_FOR_APPLE",
-        ],
-        fbobjc_frameworks = [
-            "Accelerate",
-        ],
-        exported_deps = [
-            "//executorch/kernels/optimized:libutils",
-            "//executorch/runtime/core/exec_aten:lib",
+    # OSS doesn't have ovr_config//os:linux-x86_64
+    fb_native.config_setting(
+        name = "linux-x86_64",
+        constraint_values = [
+            "ovr_config//os/constraints:linux",
+            "ovr_config//cpu/constraints:x86_64",
         ],
     )
+
+    for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp"), ("libblas_mkl_noomp", "fbsource//third-party/mkl:mkl")]:
+        runtime.cxx_library(
+            name = libblas_name,
+            srcs = native.glob([
+                "blas/**/*.cpp",
+            ]),
+            exported_headers = native.glob([
+                "blas/**/*.h",
+            ]),
+            header_namespace = "executorch/kernels/optimized",
+            visibility = [
+                "//executorch/...",
+                "@EXECUTORCH_CLIENTS",
+            ],
+            preprocessor_flags = select({
+                ":linux-x86_64": [
+                    "-DET_BUILD_WITH_BLAS",
+                ] if not runtime.is_oss else [],
+                "DEFAULT": [],
+            }),
+            fbandroid_platform_preprocessor_flags = [
+                (
+                    "^android-arm64.*$",
+                    [
+                        "-DET_BUILD_WITH_BLAS",
+                    ],
+                ),
+            ],
+            fbandroid_platform_deps = [
+                (
+                    "^android-arm64.*$",
+                    [
+                        "fbsource//third-party/openblas:openblas",
+                    ],
+                ),
+            ],
+            fbobjc_exported_preprocessor_flags = [
+                "-DET_BUILD_WITH_BLAS",
+                "-DET_BUILD_FOR_APPLE",
+            ],
+            fbobjc_frameworks = [
+                "Accelerate",
+            ],
+            deps = select({
+                ":linux-x86_64": [mkl_dep] if not runtime.is_oss else [],
+                "DEFAULT": [],
+            }),
+            exported_deps = [
+                "//executorch/kernels/optimized:libutils",
+                "//executorch/runtime/core/exec_aten:lib",
+            ],
+        )
diff --git a/shim/tools/build_defs/fb_native_wrapper.bzl b/shim/tools/build_defs/fb_native_wrapper.bzl
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under both the MIT license found in the
+# LICENSE-MIT file in the root directory of this source tree and the Apache
+# License, Version 2.0 found in the LICENSE-APACHE file in the root directory
+# of this source tree.
+
+fb_native = struct(  # presumably the OSS stand-in for fbsource's fb_native; exposes only config_setting — TODO confirm
+    config_setting = native.config_setting,
+)

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,4 +1,4 @@` |
| `1` | `1` | `// Top-level build file where you can add configuration options common to all sub-projects/modules.` |
| `2` | `2` | `plugins {` |
| `3` |  | `- id("com.android.application") version "8.2.2" apply false` |
|  | `3` | `+ id("com.android.application") version "8.1.0" apply false` |
| `4` | `4` | `}` |