Special-case dim-list ops when the dim list has size 1 (#9111)

swolchok · web-flow · commit c5e457a597ee · 2025-03-11T15:39:57.000-07:00
Applying over a single dim is much faster; the general
apply-over-dim-list logic has to do extra work (tracking the current
delinearized index). When the dim list has exactly one entry, take the
single-dim path instead of the general one.
diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h
@@ -347,6 +347,10 @@ class ApplyOverDimListPlan {
       return;
     }
     dim_list_ = dim_list.value();
+    if (dim_list_.value().size() == 1) {
+      mode_ = ExecutionMode::OnlyOneDim;
+      return;
+    }
     is_in_dim_list_.fill(0);
     for (const auto& d : dim_list.value()) {
       const size_t non_neg_d = d < 0 ? d + in.dim() : d;
@@ -367,6 +371,14 @@ class ApplyOverDimListPlan {
         apply_on_flat_ix_with_stride_and_base(
             fn, /*stride=*/1, /*base=*/0, ustart_, uend_);
         return;
+      case ExecutionMode::OnlyOneDim:
+        apply_on_flat_and_dim_ix_with_stride_and_base(
+            [&](const auto in_ix, const auto dim_ix) { fn(in_ix); },
+            in_.strides()[ET_NORMALIZE_IX(dim_list_.value()[0], in_.dim())],
+            get_init_index(in_, dim_list_.value(), out_ix),
+            ustart_,
+            uend_);
+        return;
       case ExecutionMode::NormalDimMask:
         apply_on_flat_ix_with_dim_mask_and_base(
             fn,
@@ -399,6 +411,9 @@ class ApplyOverDimListPlan {
     // Iterate over the entire tensor with
     // apply_on_flat_ix_with_stride_and_base.
     NoDimMaskOrZeroDimension,
+    // dim_list has size 1, iterate with
+    // apply_on_flat_and_dim_ix_with_stride_and_base
+    OnlyOneDim,
     // General mode, iterate with
     // apply_on_flat_ix_with_dim_mask_and_base.
     NormalDimMask