
Commit d44877b

David Lin authored and facebook-github-bot committed
Added optimizer implementation (#3699)
Summary:
Pull Request resolved: #3699

This adds the optimizer logic, reusing much of the logic from [LiteInterpreter](https://fburl.com/code/t5dqeyje). The main differences are:

1. SGDParamGroup takes a Span<const char*> and a Span<Tensor> which together represent named parameters. Unlike LiteInterpreter or core PyTorch, portable tensors don't use the autograd framework, and we won't be supporting it either; instead, we're likely to use the backwards graph to calculate the gradients of the parameters, so we need a way to map each gradient to its parameter. We expect the two spans to be the same size, with a given parameter at the same index in both spans.
2. SGD::step() takes a Span<const char*> and a Span<Tensor> which represent the named gradients, and uses them to match each gradient to the appropriate parameter. As above, we expect the spans to be the same size, with a gradient at the same index as its parameter name.
3. It uses the out-variant operations rather than the in-place or functional variants, since those are already implemented. I *believe* that since we're only using clone, add (same-sized tensors), and mul_scalar, there is no harm in overwriting the data.
4. For the momentum buffer, memory is allocated for the underlying data and the TensorImpl. This gets cleaned up when the SGD destructor is called.

Reviewed By: iseeyuan

Differential Revision: D57216865

fbshipit-source-id: 5ab49b6f584debc15976982a2e9eb964515e5c54
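
To make the named-span convention concrete, here is a minimal usage sketch of the API this commit adds. It is an illustration, not code from the diff: the parameter/gradient names and tensors are placeholders, and it assumes SGDOptions can be constructed from a learning rate (mirroring the LiteInterpreter/core PyTorch options).

#include <executorch/extension/training/optimizer/sgd.h>

using exec_aten::Tensor;
using torch::executor::Error;
using torch::executor::Span;
using torch::executor::training::optimizer::SGD;
using torch::executor::training::optimizer::SGDOptions;

// Hypothetical example: `weight`/`bias` are existing parameter tensors and
// `weight_grad`/`bias_grad` are gradients produced by the backwards graph.
Error run_one_step(
    Tensor weight,
    Tensor bias,
    Tensor weight_grad,
    Tensor bias_grad) {
  // Index i of param_names must correspond to index i of param_data.
  const char* param_name_storage[] = {"linear.weight", "linear.bias"};
  Tensor param_data_storage[] = {weight, bias};
  Span<const char*> param_names(param_name_storage, 2);
  Span<Tensor> param_data(param_data_storage, 2);

  // SGDOptions(lr) is assumed here; see sgd.h for the full set of options.
  SGD optimizer(param_names, param_data, SGDOptions(/*lr=*/0.1));

  // Gradients are matched to parameters by name, so their order may differ
  // from the parameter spans, but the two gradient spans must line up.
  const char* grad_name_storage[] = {"linear.bias", "linear.weight"};
  Tensor grad_data_storage[] = {bias_grad, weight_grad};
  Span<const char*> gradient_names(grad_name_storage, 2);
  Span<Tensor> gradient_data(grad_data_storage, 2);

  return optimizer.step(gradient_names, gradient_data);
}
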
1 parent 1343224 commit d44877b

File tree

5 files changed: +390 -15 lines changed

extension/training/optimizer/sgd.cpp

Lines changed: 176 additions & 0 deletions
@@ -0,0 +1,176 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/extension/training/optimizer/sgd.h>
#include <executorch/kernels/test/FunctionHeaderWrapper.h> // Declares the operator

#include <executorch/runtime/core/error.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace torch {
namespace executor {
namespace training {
namespace optimizer {

bool SGDParamGroup::has_options() const {
  return options_ != nullptr;
}

SGDOptions& SGDParamGroup::options() {
  return *options_.get();
}

const SGDOptions& SGDParamGroup::options() const {
  return *options_.get();
}

void SGDParamGroup::set_options(std::unique_ptr<SGDOptions> options) {
  options_ = std::move(options);
}

Span<const char*> SGDParamGroup::param_names() {
  return param_names_;
}

const Span<const char*> SGDParamGroup::param_names() const {
  return param_names_;
}

Span<Tensor> SGDParamGroup::param_data() {
  return param_data_;
}

const Span<Tensor> SGDParamGroup::param_data() const {
  return param_data_;
}

void SGD::add_param_group(const SGDParamGroup& param_group) {
  SGDParamGroup param_group_(
      param_group.param_names(), param_group.param_data());
  if (!param_group.has_options()) {
    param_group_.set_options(defaults_->clone());
  } else {
    param_group_.set_options(param_group.options().clone());
  }
  param_groups_.emplace_back(std::move(param_group_));
}

Error SGD::step(Span<const char*> gradient_names, Span<Tensor> gradient_data) {
  // check that the number of gradient names matches the number of gradients
  ET_CHECK_OR_RETURN_ERROR(
      gradient_names.size() == gradient_data.size(),
      InvalidState,
      "Gradient names and gradients must have the same length.");

  RuntimeContext context;
  for (auto& group : param_groups_) {
    auto& options = static_cast<SGDOptions&>(group.options());
    auto weight_decay = options.weight_decay();
    auto momentum = options.momentum();
    auto dampening = options.dampening();
    auto nesterov = options.nesterov();

    for (int i = 0; i < group.param_names().size(); i++) {
      for (int j = 0; j < gradient_names.size(); j++) {
        // if param name and gradient name match, run the optimizer step
        if (strcmp(group.param_names()[i], gradient_names[j]) == 0) {
          auto d_p = gradient_data[j];
          auto p = group.param_data()[i];
          if (weight_decay != 0) {
            // uses weight_decay specified and adds it to the gradient
            torch::executor::aten::add_outf(context, d_p, p, weight_decay, d_p);
            if (context.failure_state() != Error::Ok) {
              return context.failure_state();
            }
          }
          if (momentum != 0) {
            Tensor buf(nullptr);
            auto param_state = state_.find(p.unsafeGetTensorImpl());
            // look for the momentum buffer for the given parameter. this is the
            // momentum as of the previous epoch
            if (param_state == state_.end()) {
              // create a new momentum buffer if it doesn't exist. this memory
              // needs to be freed when the optimizer is destroyed
              void* buf_ptr = malloc(d_p.nbytes());

#ifdef USE_ATEN_LIB
              std::vector<int64_t> sizes(
                  d_p.sizes().begin(), d_p.sizes().end());
              buf = torch::from_blob(buf_ptr, sizes, d_p.scalar_type());
#else
              TensorImpl* buf_impl = new TensorImpl(
                  d_p.scalar_type(),
                  d_p.sizes().size(),
                  const_cast<TensorImpl::SizesType*>(d_p.sizes().data()),
                  buf_ptr,
                  const_cast<TensorImpl::DimOrderType*>(
                      d_p.dim_order().data()));
              buf = Tensor(buf_impl);
#endif
              torch::executor::aten::clone_outf(
                  context, d_p, exec_aten::MemoryFormat::Contiguous, buf);
              if (context.failure_state() != Error::Ok) {
                return context.failure_state();
              }

              // save the state of the momentum buffer to be reused in later
              // epochs
              auto state = std::make_unique<SGDParamState>(buf);
              state_[p.unsafeGetTensorImpl()] = std::move(state);
            } else {
              buf = static_cast<SGDParamState&>(*param_state->second)
                        .momentum_buffer();

              // update the momentum buffer and apply dampening
              torch::executor::aten::mul_outf(context, buf, momentum, buf);
              if (context.failure_state() != Error::Ok) {
                return context.failure_state();
              }
              torch::executor::aten::add_outf(
                  context, buf, d_p, 1 - dampening, buf);
              if (context.failure_state() != Error::Ok) {
                return context.failure_state();
              }
            }
            if (nesterov) {
              // apply nesterov momentum
              torch::executor::aten::add_outf(context, d_p, buf, momentum, d_p);
              if (context.failure_state() != Error::Ok) {
                return context.failure_state();
              }
            } else {
              d_p = buf;
            }
          }
          // update the parameter using the gradient and learning rate
          torch::executor::aten::add_outf(
              context, p, d_p, -1 * options.lr(), p);
          if (context.failure_state() != Error::Ok) {
            return context.failure_state();
          }
          break;
        }
      }
    }
  }
  return Error::Ok;
}

SGD::~SGD() {
  for (const auto& state_kv : state_) {
    auto state_tensor = static_cast<SGDParamState&>(*state_kv.second);
    free(state_tensor.momentum_buffer().unsafeGetTensorImpl()->mutable_data());
#ifndef USE_ATEN_LIB
    delete state_tensor.momentum_buffer().unsafeGetTensorImpl();
#endif
  }
}
} // namespace optimizer
} // namespace training
} // namespace executor
} // namespace torch
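
For reference, the loop above is the standard SGD-with-momentum recurrence, expressed with the out-variant kernels (add_outf, mul_outf, clone_outf) so that d_p, buf, and p are overwritten in place. A scalar-form sketch of the same update, with a hypothetical helper name purely for illustration:

// Hypothetical scalar view of one step() update for a single parameter
// element. `buf` is the momentum buffer entry carried across steps;
// `has_buf` is false the first time a parameter is seen.
float sgd_step_scalar(
    float p, // parameter value
    float g, // gradient value
    float& buf, // momentum buffer (state_)
    bool has_buf,
    float lr,
    float weight_decay,
    float momentum,
    float dampening,
    bool nesterov) {
  if (weight_decay != 0) {
    g = g + weight_decay * p; // add_outf(d_p, p, weight_decay, d_p)
  }
  if (momentum != 0) {
    if (!has_buf) {
      buf = g; // first time: clone_outf(d_p, ..., buf)
    } else {
      buf = momentum * buf + (1 - dampening) * g; // mul_outf + add_outf
    }
    g = nesterov ? g + momentum * buf // add_outf(d_p, buf, momentum, d_p)
                 : buf; // d_p = buf
  }
  return p - lr * g; // add_outf(p, d_p, -1 * lr, p)
}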

extension/training/optimizer/sgd.h

Lines changed: 104 additions & 3 deletions
@@ -16,14 +16,21 @@
  */
 #pragma once
 
+#include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/span.h>
 #include <memory>
+#include <unordered_map>
+#include <vector>
 
 namespace torch {
 namespace executor {
+namespace training {
 namespace optimizer {
 
 using Tensor = exec_aten::Tensor;
+using TensorImpl = exec_aten::TensorImpl;
+using ScalarType = exec_aten::ScalarType;
 
 /**
  * SGD optimizer state. This keeps track of the state of a given parameter to
@@ -123,16 +130,110 @@ class SGDOptions {
 
 /**
  * SGD optimizer param group. This contains the parameters and
- * the OptimizerOptions associated to it.
+ * the SGDOptions associated to it.
  */
-class SGDParamGroup {};
+class SGDParamGroup {
+ public:
+  // NOTE: In order to store `SGDParamGroup` in a `std::vector`, it has
+  // to be copy-constructible.
+  SGDParamGroup(const SGDParamGroup& param_group)
+      : param_data_(param_group.param_data()),
+        param_names_(param_group.param_names()),
+        options_(
+            param_group.has_options() ? param_group.options().clone()
+                                      : nullptr) {}
+  SGDParamGroup& operator=(const SGDParamGroup& param_group) {
+    this->param_data_ = param_group.param_data();
+    this->param_names_ = param_group.param_names();
+    this->options_ =
+        param_group.has_options() ? param_group.options().clone() : nullptr;
+    return *this;
+  }
+
+  /**
+   * This constructs a SGD param group. We expect that the two spans are of the
+   * same size, and that for a given param data, its index in param_data is the
+   * same as its param name in param_name.
+   *
+   * @param[in] param_names The names of the params for this group.
+   * @param[in] param_data The tensors representing the param data.
+   */
+  /* implicit */ SGDParamGroup(
+      Span<const char*> param_names,
+      Span<Tensor> param_data)
+      : param_data_(std::move(param_data)),
+        param_names_(std::move(param_names)) {}
+  SGDParamGroup(
+      Span<const char*> param_names,
+      Span<Tensor> param_data,
+      std::unique_ptr<SGDOptions> options)
+      : param_data_(std::move(param_data)),
+        param_names_(std::move(param_names)),
+        options_(std::move(options)) {}
+
+  bool has_options() const;
+  SGDOptions& options();
+  const SGDOptions& options() const;
+  void set_options(std::unique_ptr<SGDOptions> options);
+  Span<const char*> param_names();
+  const Span<const char*> param_names() const;
+  Span<Tensor> param_data();
+  const Span<Tensor> param_data() const;
+
+ private:
+  Span<Tensor> param_data_;
+  Span<const char*> param_names_;
+  std::unique_ptr<SGDOptions> options_;
+};
 
 /**
  * SGD optimizer class. This is responsible for performing the optimization
  * step.
  */
-class SGD {};
+class SGD {
+ public:
+  explicit SGD(
+      const std::vector<SGDParamGroup>& param_groups,
+      SGDOptions defaults)
+      : defaults_(std::make_unique<SGDOptions>(defaults)) {
+    for (const auto& param_group : param_groups) {
+      add_param_group(param_group);
+    }
+  }
+
+  explicit SGD(
+      Span<const char*> param_names,
+      Span<Tensor> param_data,
+      SGDOptions defaults)
+      : SGD({SGDParamGroup(std::move(param_names), std::move(param_data))},
+            defaults) {}
+
+  // Adds the given param_group to the optimizer's param_group list.
+  void add_param_group(const SGDParamGroup& param_group);
+
+  ~SGD();
+
+  /**
+   * Performs the optimization step.
+   *
+   * The two spans must be of the same size. It is expected that the gradient in
+   * 'gradient_data' at index 'i' represents the gradient calculated in the loss
+   * function for the parameter with the name in 'gradient_names' at index 'i'.
+   *
+   * @param[in] gradient_names The names of the params that matches the gradient
+   * in 'gradient_data' at the same index.
+   * @param[in] gradient_data The gradient tensors to be used for optimization
+   * step.
+   */
+  Error step(Span<const char*> gradient_names, Span<Tensor> gradient_data);
+
+ private:
+  std::vector<SGDParamGroup> param_groups_;
+  std::unordered_map<void*, std::unique_ptr<SGDParamState>> state_;
+  std::unique_ptr<SGDOptions> defaults_;
+};
 
 } // namespace optimizer
+} // namespace training
 } // namespace executor
 } // namespace torch
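
As a usage note, the std::vector<SGDParamGroup> constructor above allows per-group hyperparameters: groups constructed without options fall back to the optimizer-wide defaults (see add_param_group in sgd.cpp). A hedged sketch, again assuming a hypothetical SGDOptions(lr) constructor and pre-existing name/tensor spans of matching sizes:

#include <executorch/extension/training/optimizer/sgd.h>

#include <memory>
#include <vector>

using exec_aten::Tensor;
using torch::executor::Error;
using torch::executor::Span;
using torch::executor::training::optimizer::SGD;
using torch::executor::training::optimizer::SGDOptions;
using torch::executor::training::optimizer::SGDParamGroup;

// Hypothetical example: one group uses the optimizer-wide defaults and a
// second group (e.g. the model head) carries its own, smaller learning rate.
Error step_with_groups(
    Span<const char*> embed_names,
    Span<Tensor> embed_data,
    Span<const char*> head_names,
    Span<Tensor> head_data,
    Span<const char*> gradient_names,
    Span<Tensor> gradient_data) {
  std::vector<SGDParamGroup> groups;
  groups.emplace_back(embed_names, embed_data); // uses the defaults below
  groups.emplace_back(
      head_names,
      head_data,
      std::make_unique<SGDOptions>(/*lr=*/0.01)); // group-specific options

  SGD optimizer(groups, /*defaults=*/SGDOptions(/*lr=*/0.1));
  return optimizer.step(gradient_names, gradient_data);
}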

extension/training/optimizer/targets.bzl

Lines changed: 20 additions & 1 deletion
@@ -10,14 +10,33 @@ def define_common_targets():
     for aten_mode in (True, False):
         aten_suffix = "_aten" if aten_mode else ""
 
+        if aten_mode:
+            kernel_deps = [
+                "//executorch/kernels/aten:generated_lib",
+                "//executorch/kernels/aten:generated_lib_headers",
+                "//executorch/kernels/test:function_header_wrapper_aten",
+            ]
+        else:
+            kernel_deps = [
+                "//executorch/kernels/portable/cpu:op_add",
+                "//executorch/kernels/portable/cpu:op_mul",
+                "//executorch/kernels/portable/cpu:op_clone",
+                "//executorch/kernels/portable:generated_lib_headers",
+                "//executorch/kernels/test:function_header_wrapper_portable",
+            ]
+
         runtime.cxx_library(
             name = "optimizer" + aten_suffix,
+            srcs = [
+                "sgd.cpp",
+            ],
             exported_headers = [
                 "sgd.h",
             ],
             exported_deps = [
+                "//executorch/runtime/kernel:kernel_runtime_context" + aten_suffix,
                 "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
-            ],
+            ] + kernel_deps,
             visibility = [
                 "@EXECUTORCH_CLIENTS",
             ],
