@@ -264,41 +264,14 @@ public class SGD<Model: Differentiable>: Optimizer
     }
 }
 
-// MARK: - Manifold optimizers
-
-/// A Riemann manifold stochastic gradient descent (SGD) optimizer.
-public class RiemannSGD<Model: Differentiable>: Optimizer
-    where Model.TangentVector: VectorProtocol,
-          Model.TangentVector.VectorSpaceScalar: FloatingPoint {
-    public typealias Scalar = Model.TangentVector.VectorSpaceScalar
-    /// The learning rate.
-    public var learningRate: Model.TangentVector.VectorSpaceScalar
-
-    public init(learningRate: Model.TangentVector.VectorSpaceScalar) {
-        self.learningRate = learningRate
-    }
-
-    public convenience init(
-        for _: __shared Model,
-        learningRate: Scalar
-    ) {
-        self.init(learningRate: learningRate)
-    }
-
-    public func update(_ model: inout Model.AllDifferentiableVariables,
-                       along direction: Model.TangentVector) {
-        model.move(along: (.zero - direction).scaled(by: learningRate))
-    }
-}
 
 /// AdaGrad optimizer.
 ///
 /// Individually adapts the learning rates of all model parameters by scaling them inversely proportional to
 /// the square root of the sum of all the historical squared values of the gradient.
 ///
 /// Reference: ["Adaptive Subgradient Methods for Online Learning and Stochastic Optimization"](
-/// http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-///
+/// http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
 public class AdaGrad<Model: Layer>: Optimizer
     where Model.AllDifferentiableVariables == Model.TangentVector {
     public typealias Model = Model
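
The AdaGrad doc comment above captures the core idea: each parameter accumulates the sum of its squared gradients, and its step is divided by the square root of that sum, so frequently updated parameters take progressively smaller steps. As a rough scalar sketch (not the Tensor-based AdaGrad class in this diff; the struct, its names, and the 1e-8 smoothing constant are illustrative assumptions):

// Illustrative scalar AdaGrad step; `ScalarAdaGrad` and its epsilon are assumptions,
// not the AdaGrad class being diffed here.
struct ScalarAdaGrad {
    var learningRate: Float = 1e-2
    var accumulator: Float = 0      // running sum of squared gradients
    let epsilon: Float = 1e-8       // assumed smoothing term

    mutating func step(parameter: inout Float, gradient: Float) {
        accumulator += gradient * gradient
        // The effective step shrinks as the historical squared gradients grow.
        parameter -= learningRate * gradient / (accumulator.squareRoot() + epsilon)
    }
}
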
@@ -355,3 +328,122 @@ public class AdaGrad<Model: Layer>: Optimizer
         update(&model.allDifferentiableVariables, along: direction)
     }
 }
+
+/// ADADELTA optimizer.
+///
+/// ADADELTA is a more robust extension of AdaGrad. ADADELTA adapts learning rates based on a moving
+/// window of gradient updates rather than accumulating all past gradients. ADADELTA can continue to
+/// learn even after many update steps.
+///
+/// Reference: ["ADADELTA: An Adaptive Learning Rate Method"](https://arxiv.org/abs/1212.5701)
+public class AdaDelta<Model: Layer>: Optimizer
+    where Model.AllDifferentiableVariables == Model.TangentVector {
+    public typealias Model = Model
+    /// The learning rate.
+    public var learningRate: Float
+    /// The decay factor, corresponding to the fraction of gradient to keep at each time step.
+    public var rho: Float
+    /// A small scalar added to the denominator to improve numerical stability.
+    public var epsilon: Float
+    /// The learning rate decay.
+    public var decay: Float
+    /// The current step.
+    public var step: Int = 0
+    /// The accumulated, exponentially decaying average of squared gradients.
+    public var averageSquared: Model.TangentVector
+    /// The accumulated parameter updates.
+    public var accumulatedDelta: Model.TangentVector
+
+    public init(
+        for model: __shared Model,
+        learningRate: Float = 1,
+        rho: Float = 0.95,
+        epsilon: Float = 1e-6,
+        decay: Float = 0
+    ) {
+        precondition(learningRate >= 0, "Learning rate must be non-negative")
+        precondition(0 <= rho && rho <= 1, "Rho parameter must be between 0 and 1")
+        precondition(0 <= epsilon, "Epsilon parameter must be non-negative")
+        precondition(decay >= 0, "Learning rate decay must be non-negative")
+
+        self.learningRate = learningRate
+        self.rho = rho
+        self.epsilon = epsilon
+        self.decay = decay
+
+        averageSquared = model.allDifferentiableVariables
+        accumulatedDelta = model.allDifferentiableVariables
+
+        for kp in averageSquared.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self) {
+            averageSquared[keyPath: kp].resetToZero()
+            accumulatedDelta[keyPath: kp].resetToZero()
+        }
+        for kp in averageSquared.recursivelyAllWritableKeyPaths(to: Tensor<Double>.self) {
+            averageSquared[keyPath: kp].resetToZero()
+            accumulatedDelta[keyPath: kp].resetToZero()
+        }
+    }
+
+    // TODO: Deprecate this when `Differentiable.AllDifferentiableVariables` is removed.
+    public func update(_ model: inout Model.AllDifferentiableVariables,
+                       along direction: Model.AllDifferentiableVariables) {
+        step += 1
+        let learningRate = self.learningRate / (1 + decay * Float(step))
+
+        // Update `Tensor<Float>` and `Tensor<Double>` variables.
+        for kp in model.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self) {
+            averageSquared[keyPath: kp] *= rho
+            averageSquared[keyPath: kp] +=
+                (1 - rho) * (direction[keyPath: kp] * direction[keyPath: kp])
+            var stepSize = direction[keyPath: kp] *
+                sqrt(accumulatedDelta[keyPath: kp] + epsilon)
+            stepSize /= sqrt(averageSquared[keyPath: kp] + epsilon)
+            model[keyPath: kp] -= learningRate * stepSize
+            accumulatedDelta[keyPath: kp] *= rho
+            accumulatedDelta[keyPath: kp] += (1 - rho) * stepSize.squared()
+        }
+        for kp in model.recursivelyAllWritableKeyPaths(to: Tensor<Double>.self) {
+            averageSquared[keyPath: kp] *= Double(rho)
+            averageSquared[keyPath: kp] +=
+                (1 - Double(rho)) * (direction[keyPath: kp] * direction[keyPath: kp])
+            var stepSize = direction[keyPath: kp] *
+                sqrt(accumulatedDelta[keyPath: kp] + Double(epsilon))
+            stepSize /= sqrt(averageSquared[keyPath: kp] + Double(epsilon))
+            model[keyPath: kp] -= Double(learningRate) * stepSize
+            accumulatedDelta[keyPath: kp] *= Double(rho)
+            accumulatedDelta[keyPath: kp] += (1 - Double(rho)) * stepSize.squared()
+        }
+    }
+
+    public func update(_ model: inout Model,
+                       along direction: Model.TangentVector) {
+        update(&model.allDifferentiableVariables, along: direction)
+    }
+}
+
+// MARK: - Manifold optimizers
+
+/// A Riemann manifold stochastic gradient descent (SGD) optimizer.
+public class RiemannSGD<Model: Differentiable>: Optimizer
+    where Model.TangentVector: VectorProtocol,
+          Model.TangentVector.VectorSpaceScalar: FloatingPoint {
+    public typealias Scalar = Model.TangentVector.VectorSpaceScalar
+    /// The learning rate.
+    public var learningRate: Model.TangentVector.VectorSpaceScalar
+
+    public init(learningRate: Model.TangentVector.VectorSpaceScalar) {
+        self.learningRate = learningRate
+    }
+
+    public convenience init(
+        for _: __shared Model,
+        learningRate: Scalar
+    ) {
+        self.init(learningRate: learningRate)
+    }
+
+    public func update(_ model: inout Model.AllDifferentiableVariables,
+                       along direction: Model.TangentVector) {
+        model.move(along: (.zero - direction).scaled(by: learningRate))
+    }
+}
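
The AdaDelta class added above keeps two exponentially decayed accumulators per parameter: `averageSquared` for squared gradients and `accumulatedDelta` for squared updates, with the ratio of their square roots setting the step size. A minimal scalar sketch of the same update rule (the `ScalarAdaDelta` struct is an illustrative stand-in; its defaults mirror the hyperparameters above but it is not part of the diff):

// Scalar sketch of the per-key-path AdaDelta update performed above.
struct ScalarAdaDelta {
    var learningRate: Float = 1
    var rho: Float = 0.95            // decay factor
    var epsilon: Float = 1e-6        // numerical-stability term
    var averageSquared: Float = 0    // decayed average of squared gradients
    var accumulatedDelta: Float = 0  // decayed average of squared updates

    mutating func step(parameter: inout Float, gradient: Float) {
        averageSquared = rho * averageSquared + (1 - rho) * gradient * gradient
        var stepSize = gradient * (accumulatedDelta + epsilon).squareRoot()
        stepSize /= (averageSquared + epsilon).squareRoot()
        parameter -= learningRate * stepSize
        accumulatedDelta = rho * accumulatedDelta + (1 - rho) * stepSize * stepSize
    }
}

The class itself follows the same usage pattern as the other optimizers in this file: construct it with `AdaDelta(for: model, learningRate: 1, rho: 0.95)` and call `update(&model, along: gradient)` after each backward pass.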