@@ -406,10 +406,6 @@ public class RAdam<Model: Differentiable>: Optimizer
406
406
/// The second moments of the weights.
407
407
public var secondMoments : Model . TangentVector = . zero
408
408
409
- public var firstMoments_h : Model . TangentVector = . zero
410
-
411
- public var secondMoments_h : Model . TangentVector = . zero
412
-
413
409
public init (
414
410
for model: __shared Model,
415
411
learningRate: Float = 1e-3 ,
@@ -435,26 +431,24 @@ public class RAdam<Model: Differentiable>: Optimizer
435
431
let step = Float ( self . step)
436
432
let beta1Power = pow ( beta1, step)
437
433
let beta2Power = pow ( beta2, step)
438
- let stepSize = self . learningRate * step / ( 1 - beta1Power)
434
+ // let stepSize = self.learningRate * step / (1 - beta1Power)
439
435
secondMoments = beta2 * secondMoments + direction .* direction * ( 1 - beta2)
440
436
firstMoments = beta1 * firstMoments + direction * ( 1 - beta1)
441
-
442
437
// Compute maximum length SMA, bias-corrected moving average and approximate length
443
438
// SMA
444
439
let N_sma_inf = 2 / ( 1 - beta2) - 1
445
440
let N_sma_t = N_sma_inf - 2 * step*beta2Power / ( 1 - beta2Power)
446
- firstMoments_h = firstMoments
447
-
448
- if N_sma_t > 4 {
449
- // Comppute Bias corrected second moments, rectification and
450
- // adapted momentum
451
- secondMoments_h = Model . TangentVector. sqrt ( secondMoments)
452
- let r = sqrt ( ( N_sma_t- 4 ) * ( N_sma_t- 2 ) * N_sma_inf/ ( ( N_sma_inf- 4 ) * ( N_sma_inf- 2 ) * ( N_sma_t) ) )
453
- model. move ( along: - stepSize*sqrt( 1 - beta2Power) * firstMoments_h*r./ secondMoments_h)
441
+
442
+ if N_sma_t > 5 {
443
+ // Compute bias-corrected second moments, rectification, and adapted momentum.
444
+ let secondMoments_h = Model . TangentVector. sqrt ( secondMoments) + epsilon
445
+ let stepSize = sqrt ( ( N_sma_t- 4 ) * ( N_sma_t- 2 ) * N_sma_inf/ ( ( N_sma_inf- 4 ) * ( N_sma_inf- 2 ) * ( N_sma_t) ) )
446
+ model. move ( along: - stepSize*sqrt( 1 - beta2Power) * firstMoments./ secondMoments_h)
454
447
}
455
448
else {
456
449
// Update with un-adapted momentum
457
- model. move ( along: - stepSize*firstMoments_h)
450
+ let stepSize = self . learningRate * step / ( 1 - beta1Power)
451
+ model. move ( along: - stepSize*firstMoments)
458
452
}
459
453
}
460
- }
454
+ }
0 commit comments