Reapply #9842: Save some size in dtype_util when dtype selective build is not in use

swolchok · swolchok · commit fb91dee891b5 · 2025-04-25T13:02:14.000-07:00
We duplicate a lot of functions depending on the operator name so that dtype selective build will work. We can just detect if dtype selective build is in use and, if not, stop duplicating.

Test Plan: compared results of bash test/build_optimized_size_test.sh before/after this rev.

Before:
```
ExecuTorch with no ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 153928 Apr 25 12:24 cmake-out/test/size_test
ExecuTorch with portable ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 2150960 Apr 25 12:24 cmake-out/test/size_test_all_ops
ExecuTorch with optimized ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 5887368 Apr 25 12:24 cmake-out/test/size_test_all_optimized_ops
(.venv) swolchok@swolchok-mac ~/src/executorch> size cmake-out/test/size_test*
__TEXT __DATA __OBJC others dec hex
81920 81920 0 4295049216 4295213056 10003c000 cmake-out/test/size_test
1474560 81920 0 4295655424 4297211904 100224000 cmake-out/test/size_test_all_ops
4489216 98304 0 4296359936 4300947456 1005b4000 cmake-out/test/size_test_all_optimized_ops
```

After:
```
ExecuTorch with no ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 153928 Apr 25 12:51 cmake-out/test/size_test
ExecuTorch with portable ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 1796928 Apr 25 12:51 cmake-out/test/size_test_all_ops
ExecuTorch with optimized ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 5605176 Apr 25 12:51 cmake-out/test/size_test_all_optimized_ops
(.venv) swolchok@swolchok-mac ~/src/executorch> size cmake-out/test/size_test*
__TEXT __DATA __OBJC others dec hex
81920 81920 0 4295049216 4295213056 10003c000 cmake-out/test/size_test
1310720 81920 0 4295458816 4296851456 1001cc000 cmake-out/test/size_test_all_ops
4358144 98304 0 4296212480 4300668928 100570000 cmake-out/test/size_test_all_optimized_ops
```

(This was reverted because the diff it was stacked on was a size regression.
This time around, the order is reversed, and the part of the change that was actually regressing size has been reverted.) ghstack-source-id: 8d82c29 ghstack-comment-id: 2831329046 Pull-Request-resolved: #10490
diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h
@@ -228,7 +228,7 @@ enum class SupportedTensorDtypes {
 namespace internal {
 
 template <typename CTYPE_COMPUTE, const char* op_name>
-load_to_compute_fn<CTYPE_COMPUTE> get_load_to_compute_fn(
+load_to_compute_fn<CTYPE_COMPUTE> get_load_to_compute_fn_impl(
     const Tensor& t,
     SupportedTensorDtypes dtypes) {
   switch (dtypes) {
@@ -251,6 +251,10 @@ load_to_compute_fn<CTYPE_COMPUTE> get_load_to_compute_fn(
   return nullptr;
 }
 
+// NOTE: applying the #ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE
+// technique used for get_load_to_compute_fn in this path was a size
+// regression rather than an improvement. Haven't fully investigated
+// why; just be aware when trying to improve size further.
 template <typename CTYPE_COMPUTE, const char* op_name>
 store_compute_to_tensor_fn<CTYPE_COMPUTE> get_store_compute_to_tensor_fn(
     const Tensor& t,
@@ -285,6 +289,28 @@ store_compute_to_tensor_fn<CTYPE_COMPUTE> get_store_compute_to_tensor_fn(
   return nullptr;
 }
 
+#ifndef EXECUTORCH_SELECTIVE_BUILD_DTYPE
+inline constexpr const char kGenericElementwiseOpName[] = "generic_elementwise_op";
+#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE
+
+template <typename CTYPE_COMPUTE, const char* op_name>
+load_to_compute_fn<CTYPE_COMPUTE> get_load_to_compute_fn(
+    const Tensor& t,
+    SupportedTensorDtypes dtypes) {
+  // NOTE: Dtype selective build relies on the operator name being
+  // passed here, so op_name is forwarded when it is active. When it is
+  // *not* active, every operator shares kGenericElementwiseOpName, so the
+  // impl is instantiated once per CTYPE_COMPUTE instead of per operator.
+  return get_load_to_compute_fn_impl<
+      CTYPE_COMPUTE,
+#ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE
+      op_name
+#else // EXECUTORCH_SELECTIVE_BUILD_DTYPE
+      kGenericElementwiseOpName
+#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE
+      >(t, dtypes);
+}
+
 bool check_tensor_dtype(
     const Tensor t,
     SupportedTensorDtypes dtypes,