
Commit 50f8971

SS-JIA authored and facebook-github-bot committed
Implement avg_pool2d
Reviewed By: guangy10, kirklandsign

Differential Revision: D48483282

fbshipit-source-id: 5197acbc525bc2fd530ac92d542c2e15816b21d5
1 parent 91bf785 commit 50f8971

File tree: 8 files changed, +1367 −53 lines changed

kernels/portable/cpu/op_avg_pool2d.cpp

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cstring>

#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
namespace executor {
namespace native {

using Tensor = exec_aten::Tensor;
using ScalarType = exec_aten::ScalarType;
using IntArrayRef = exec_aten::ArrayRef<int64_t>;

Tensor& avg_pool2d_out(
    RuntimeContext& ctx,
    const Tensor& in,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    bool ceil_mode,
    bool count_include_pad,
    exec_aten::optional<int64_t> divisor_override,
    Tensor& out) {
  ET_KERNEL_CHECK(
      ctx,
      check_avg_pool2d_args(
          in,
          kernel_size,
          stride,
          padding,
          ceil_mode,
          count_include_pad,
          divisor_override,
          out),
      InvalidArgument,
      out);

  size_t output_ndim = 0;
  exec_aten::SizesType output_sizes[kTensorDimensionLimit];
  get_avg_pool2d_out_target_size(
      in, kernel_size, stride, padding, ceil_mode, output_sizes, &output_ndim);

  ET_KERNEL_CHECK(
      ctx,
      output_size_is_valid({output_sizes, output_ndim}),
      InvalidArgument,
      out);

  ET_KERNEL_CHECK(
      ctx,
      resize_tensor(out, {output_sizes, output_ndim}) == Error::Ok,
      InvalidArgument,
      out);

  ScalarType in_type = in.scalar_type();
  ET_SWITCH_FLOAT_TYPES_AND(Long, in_type, ctx, __func__, CTYPE, [&]() {
    if (divisor_override.has_value()) {
      int64_t divisor = divisor_override.value();
      // If divisor_override is specified, then we don't need to use `count` in
      // the calculation. Simply sum x / divisor to get the output.
      apply_kernel_2d_reduce_then_map_fn<CTYPE>(
          [](const CTYPE in_val,
             int64_t in_idx,
             CTYPE accum,
             int64_t accum_idx) {
            // Average pooling does not track indexes, so return 0 for accum_idx
            return std::tuple<CTYPE, int64_t>(in_val + accum, 0);
          },
          [divisor](const int64_t count, const CTYPE accum) {
            return accum / static_cast<CTYPE>(divisor);
          },
          count_include_pad,
          in,
          kernel_size,
          stride,
          padding,
          {},
          out);
    } else {
      apply_kernel_2d_reduce_then_map_fn<CTYPE>(
          [](const CTYPE in_val,
             int64_t in_idx,
             CTYPE accum,
             int64_t accum_idx) {
            // Average pooling does not track indexes, so return 0 for accum_idx
            return std::tuple<CTYPE, int64_t>(in_val + accum, 0);
          },
          [](const int64_t count, const CTYPE accum) {
            return accum / static_cast<CTYPE>(count);
          },
          count_include_pad,
          in,
          kernel_size,
          stride,
          padding,
          {},
          out);
    }
  });

  return out;
}

} // namespace native
} // namespace executor
} // namespace torch
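
The kernel above folds each 2D window into a running sum via apply_kernel_2d_reduce_then_map_fn, then divides either by the number of contributing elements (`count`) or, when `divisor_override` is set, by that fixed divisor. The following is a minimal, self-contained sketch of that math in plain C++. It is illustrative only: the function name naive_avg_pool2d is not part of the ExecuTorch API, it handles a single channel, and it assumes ceil_mode is false.

// Sketch only: single channel, row-major H x W input, floor output sizing.
#include <cstdint>
#include <optional>
#include <vector>

std::vector<float> naive_avg_pool2d(
    const std::vector<float>& in,
    int64_t H, int64_t W,
    int64_t kH, int64_t kW,
    int64_t sH, int64_t sW,
    int64_t padH, int64_t padW,
    bool count_include_pad,
    std::optional<int64_t> divisor_override) {
  const int64_t outH = (H + 2 * padH - kH) / sH + 1;
  const int64_t outW = (W + 2 * padW - kW) / sW + 1;
  std::vector<float> out(outH * outW, 0.0f);
  for (int64_t oy = 0; oy < outH; ++oy) {
    for (int64_t ox = 0; ox < outW; ++ox) {
      float accum = 0.0f;
      int64_t count = 0;
      for (int64_t ky = 0; ky < kH; ++ky) {
        for (int64_t kx = 0; kx < kW; ++kx) {
          const int64_t iy = oy * sH - padH + ky;
          const int64_t ix = ox * sW - padW + kx;
          const bool inside = iy >= 0 && iy < H && ix >= 0 && ix < W;
          if (inside) {
            accum += in[iy * W + ix]; // padded positions never add to the sum
          }
          if (inside || count_include_pad) {
            ++count; // count_include_pad only affects the divisor
          }
        }
      }
      // divisor_override, when present, replaces count entirely.
      const float div = divisor_override.has_value()
          ? static_cast<float>(divisor_override.value())
          : static_cast<float>(count);
      out[oy * outW + ox] = accum / div;
    }
  }
  return out;
}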

kernels/portable/cpu/op_max_pool2d_with_indices.cpp

Lines changed: 9 additions & 4 deletions
@@ -9,7 +9,6 @@
 #include <cstring>
 
 #include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
-#include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
@@ -55,7 +54,7 @@ std::tuple<Tensor&, Tensor&> max_pool2d_with_indices_out(
       ctx,
       output_size_is_valid({output_sizes, output_ndim}),
       InvalidArgument,
-      out);
+      ret_val);
 
   ET_KERNEL_CHECK(
       ctx,
@@ -71,13 +70,19 @@ std::tuple<Tensor&, Tensor&> max_pool2d_with_indices_out(
 
   ScalarType in_type = in.scalar_type();
   ET_SWITCH_REAL_TYPES(in_type, ctx, __func__, CTYPE, [&]() {
-    apply_kernel_2d_reduce_fn<CTYPE>(
-        [](const CTYPE in_val, int64_t in_idx, CTYPE accum, int64_t accum_idx) {
+    apply_kernel_2d_reduce_then_map_fn<CTYPE>(
+        [](const CTYPE in_val,
+           const int64_t in_idx,
+           const CTYPE accum,
+           const int64_t accum_idx) {
           if (in_val > accum) {
             return std::tuple<CTYPE, int64_t>(in_val, in_idx);
           }
           return std::tuple<CTYPE, int64_t>(accum, accum_idx);
         },
+        // Max pooling does not need to post-process the accumulated output
+        [](const int64_t count, const CTYPE accum) { return accum; },
+        /*include_pad=*/false,
         in,
         kernel_size,
         stride,
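
This hunk moves max pooling onto the same reduce-then-map helper used by avg_pool2d: a first lambda folds each window element (and optionally its index) into an accumulator, and a second lambda post-processes the (count, accum) pair. A rough sketch of that two-phase shape is below; reduce_then_map_window and its signature are assumptions for illustration, not the actual apply_kernel_2d_reduce_then_map_fn helper.

// Illustrative sketch of the reduce-then-map pattern shared by the pooling
// kernels; not the real ExecuTorch helper.
#include <cstdint>
#include <tuple>
#include <vector>

template <typename CTYPE, typename ReduceFn, typename MapFn>
CTYPE reduce_then_map_window(
    const std::vector<CTYPE>& window_vals, // values inside one kernel window
    ReduceFn reduce_fn, // folds (val, idx, accum, accum_idx) -> (accum, accum_idx)
    MapFn map_fn) {     // post-processes (count, accum) -> output value
  if (window_vals.empty()) {
    return CTYPE(0);
  }
  CTYPE accum = window_vals[0];
  int64_t accum_idx = 0;
  for (int64_t i = 1; i < static_cast<int64_t>(window_vals.size()); ++i) {
    std::tie(accum, accum_idx) = reduce_fn(window_vals[i], i, accum, accum_idx);
  }
  return map_fn(static_cast<int64_t>(window_vals.size()), accum);
}

// Max pooling: the reduce step keeps the larger value, the map step is the
// identity. Average pooling: the reduce step sums, the map step divides by
// count (or by divisor_override).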

kernels/portable/cpu/targets.bzl

Lines changed: 6 additions & 0 deletions
@@ -119,6 +119,12 @@ _ATEN_OPS = (
             "//executorch/kernels/portable/cpu/pattern:pattern",
         ],
     ),
+    op_target(
+        name = "op_avg_pool2d",
+        deps = [
+            "//executorch/kernels/portable/cpu/util:kernel_ops_util",
+        ],
+    ),
     op_target(
         name = "op_bitwise_and",
         deps = [

kernels/portable/cpu/util/kernel_ops_util.cpp

Lines changed: 52 additions & 0 deletions
@@ -192,6 +192,58 @@ void calculate_kernel_output_sizes(
   }
 }
 
+bool check_avg_pool2d_args(
+    const Tensor& in,
+    const IntArrayRef kernel_size,
+    const IntArrayRef stride,
+    const IntArrayRef padding,
+    const bool ceil_mode,
+    const bool count_include_pad,
+    const exec_aten::optional<int64_t>& divisor_override,
+    const Tensor& out) {
+  ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out));
+
+  ET_LOG_AND_RETURN_IF_FALSE(tensor_is_default_or_channels_last_dim_order(in));
+  ET_LOG_AND_RETURN_IF_FALSE(tensor_is_default_or_channels_last_dim_order(out));
+
+  ET_LOG_AND_RETURN_IF_FALSE(kernel_size_is_valid(kernel_size, 2));
+  if (stride.size() > 0) {
+    ET_LOG_AND_RETURN_IF_FALSE(stride_is_valid(kernel_size, 2));
+  }
+  ET_LOG_AND_RETURN_IF_FALSE(padding_is_valid(padding, kernel_size, 2, true));
+
+  if (divisor_override.has_value()) {
+    ET_LOG_MSG_AND_RETURN_IF_FALSE(
+        divisor_override.value() > 0,
+        "divisor_override must be > 0, but found %" PRId64,
+        divisor_override.value());
+  }
+
+  return true;
+}
+
+void get_avg_pool2d_out_target_size(
+    const Tensor& in,
+    const IntArrayRef kernel_size,
+    const IntArrayRef stride,
+    const IntArrayRef padding,
+    const bool ceil_mode,
+    exec_aten::SizesType* const out_sizes,
+    size_t* const out_ndim) {
+  *out_ndim = in.dim();
+
+  // Batch dim is optional, so in can be either 3 or 4 dim.
+  if (in.dim() == 4) {
+    out_sizes[0] = in.size(0);
+    out_sizes[1] = in.size(1);
+  } else {
+    out_sizes[0] = in.size(0);
+  }
+
+  calculate_kernel_output_sizes(
+      in, kernel_size, stride, padding, {}, out_sizes, ceil_mode);
+}
+
 bool check_convolution_args(
     const Tensor& in,
     const Tensor& weight,
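
get_avg_pool2d_out_target_size copies the batch/channel dims through and leaves the spatial dims to calculate_kernel_output_sizes. For reference, each spatial output dim follows the standard 2D pooling size formula; the helper below is a sketch of that formula under the usual PyTorch semantics, not the calculate_kernel_output_sizes implementation.

// Standard pooling output-size formula per spatial dimension (sketch).
#include <cmath>
#include <cstdint>

int64_t pool_output_size(
    int64_t in_size,
    int64_t kernel,
    int64_t stride,
    int64_t pad,
    bool ceil_mode) {
  const double raw =
      static_cast<double>(in_size + 2 * pad - kernel) / stride + 1.0;
  int64_t out = static_cast<int64_t>(ceil_mode ? std::ceil(raw) : std::floor(raw));
  // With ceil_mode, the last window must still start inside the padded input;
  // shrink by one if it would start past the end.
  if (ceil_mode && (out - 1) * stride >= in_size + pad) {
    --out;
  }
  return out;
}

// Example: in_size=7, kernel=3, stride=2, pad=0, ceil_mode=false
//   -> floor((7 - 3) / 2) + 1 = 3.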
