Precompute multiplicative inverse when possible in op_div (#5209)

swolchok · facebook-github-bot · commit 07e15a90e16b · 2024-09-13T09:56:32.000-07:00
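Summary: Pull Request resolved: #5209. Division is generally slower than multiplication in hardware, so when the divisor is a single scalar, op_div now computes its reciprocal once up front and multiplies each element by it instead of dividing element by element. ghstack-source-id: 242226478 Reviewed By: kimishpatel Differential Revision: D62412539 fbshipit-source-id: 78c659d84579c42da3dc44a8d8c88e31ea369430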
diff --git a/kernels/optimized/cpu/op_div.cpp b/kernels/optimized/cpu/op_div.cpp
@@ -84,8 +84,11 @@ Tensor& opt_div_out(
                 tensor->const_data_ptr<CTYPE>(),
                 out.numel());
           } else {
+            Vec inv_scalar_casted_vec(CTYPE(1) / scalar_casted);
             executorch::vec::map<CTYPE>(
-                [scalar_casted](Vec x) { return x / Vec(scalar_casted); },
+                [inv_scalar_casted_vec](Vec x) {
+                  return x * inv_scalar_casted_vec;
+                },
                 out.mutable_data_ptr<CTYPE>(),
                 tensor->const_data_ptr<CTYPE>(),
                 out.numel());
@@ -220,8 +223,9 @@ Tensor& opt_div_scalar_out(
             CTYPE b_casted = static_cast<CTYPE>(b_val);
 
             using Vec = executorch::vec::Vectorized<CTYPE>;
+            Vec inv_b_casted_vec(CTYPE(1) / b_casted);
             executorch::vec::map<CTYPE>(
-                [b_casted](Vec x) { return x / Vec(b_casted); },
+                [inv_b_casted_vec](Vec x) { return x * inv_b_casted_vec; },
                 out.mutable_data_ptr<CTYPE>(),
                 a.const_data_ptr<CTYPE>(),
                 out.numel());
@@ -239,14 +243,16 @@ Tensor& opt_div_scalar_out(
                             CTYPE_B b_val;
                             ET_EXTRACT_SCALAR(b, b_val);
                             CTYPE_IN b_casted = static_cast<CTYPE_IN>(b_val);
+                            CTYPE_IN inv_b_casted = CTYPE_IN(1) / b_casted;
 
                             const size_t n = a.numel();
                             const CTYPE_A* a_data = a.const_data_ptr<CTYPE_A>();
                             CTYPE_OUT* out_data =
                                 out.mutable_data_ptr<CTYPE_OUT>();
                             for (auto i = 0; i < n; ++i) {
                               out_data[i] = static_cast<CTYPE_OUT>(
-                                  static_cast<CTYPE_IN>(a_data[i]) / b_casted);
+                                  static_cast<CTYPE_IN>(a_data[i]) *
+                                  inv_b_casted);
                             }
                           });
                     });