This repository was archived by the owner on Jul 1, 2023. It is now read-only.

Normalization layers fix (fixes #384 and #426). #428

Merged 6 commits on Aug 9, 2019.

Sources/TensorFlow/Layers/Normalization.swift (41 changes: 27 additions & 14 deletions)
@@ -71,16 +71,24 @@ public struct BatchNorm<Scalar: TensorFlowFloatingPoint>: Layer {
/// - Returns: The output.
@differentiable
public func callAsFunction(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
+ let positiveAxis = (input.rank + axis) % input.rank
+ var offset = self.offset
+ var scale = self.scale
+ if positiveAxis != input.rank - 1 {
+     var broadcastShape = TensorShape([Int](repeating: 1, count: input.rank))
+     broadcastShape[positiveAxis] = input.shape[positiveAxis]
+     offset = offset.reshaped(to: broadcastShape)
+     scale = scale.reshaped(to: broadcastShape)
+ }
switch Context.local.learningPhase {
case .training:
- let positiveAxis = (input.rank + axis) % input.rank
var normalizedAxes = Array(0..<input.rank)
normalizedAxes.remove(at: positiveAxis)
let moments = input.moments(alongAxes: normalizedAxes)
runningMean.value += (moments.mean - runningMean.value) * (1 - momentum)
runningVariance.value += (moments.variance - runningVariance.value) * (1 - momentum)
- let inv = rsqrt(moments.variance + epsilon) * scale.reshaped(to: moments.variance.shape)
- return (input - moments.mean) * inv + offset.reshaped(to: moments.mean.shape)
+ let inv = rsqrt(moments.variance + epsilon) * scale
+ return (input - moments.mean) * inv + offset
case .inference:
let inv = rsqrt(runningVariance.value + epsilon) * scale
return (input - runningMean.value) * inv + offset
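
The hunk above moves the offset and scale broadcasting out of the training branch, so the inference path now also uses the reshaped parameters when the feature axis is not the last one (the root cause of #384 and #426). As a standalone sketch of the broadcast-shape logic, written in plain Swift with assumed example values rather than the library's `TensorShape`:

```swift
// Standalone sketch of the broadcast-shape computation above.
// Assumed example: a rank-4 input of shape [N, C, H, W] = [2, 5, 4, 4] with the
// feature axis at 1. Offset and scale of shape [C] must broadcast as [1, C, 1, 1].
let inputShape = [2, 5, 4, 4]
let axis = 1
let rank = inputShape.count
let positiveAxis = (rank + axis) % rank
var broadcastShape = [Int](repeating: 1, count: rank)
broadcastShape[positiveAxis] = inputShape[positiveAxis]
print(broadcastShape)  // prints [1, 5, 1, 1]
```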
@@ -100,13 +108,14 @@ public struct BatchNorm<Scalar: TensorFlowFloatingPoint>: Layer {
momentum: Tensor<Scalar> = Tensor(0.99),
epsilon: Tensor<Scalar> = Tensor(0.001)
) {
- self.axis = axis
- self.momentum = momentum
- self.scale = Tensor<Scalar>(ones: [featureCount])
- self.offset = Tensor<Scalar>(zeros: [featureCount])
- self.epsilon = epsilon
- self.runningMean = Parameter(Tensor(0))
- self.runningVariance = Parameter(Tensor(1))
+ self.init(
+     axis: axis,
+     momentum: momentum,
+     offset: Tensor(zeros: [featureCount]),
+     scale: Tensor(ones: [featureCount]),
+     epsilon: epsilon,
+     runningMean: Tensor(0),
+     runningVariance: Tensor(1))
}
}
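
The convenience initializer now delegates to the designated initializer instead of assigning stored properties directly. A minimal usage sketch, assuming a toolchain with this patched swift-apis module available:

```swift
import TensorFlow

// Hypothetical usage of the refactored convenience initializer; the delegated
// defaults should be unchanged: scale starts as ones, offset as zeros, and the
// running statistics as 0 and 1 respectively.
let bnLayer = BatchNorm<Float>(featureCount: 5, axis: -1)
print(bnLayer.scale.shape)   // expected: [5]
print(bnLayer.offset.shape)  // expected: [5]
```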

@@ -152,8 +161,7 @@ public struct LayerNorm<Scalar: TensorFlowFloatingPoint>: Layer {
offset: Tensor(zeros: [featureCount]),
scale: Tensor(ones: [featureCount]),
axis: axis,
- epsilon: epsilon
- )
+ epsilon: epsilon)
}

/// Returns the output obtained from applying the layer to the given input.
@@ -162,8 +170,13 @@ public struct LayerNorm<Scalar: TensorFlowFloatingPoint>: Layer {
/// - Returns: The output.
@differentiable
public func callAsFunction(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
+ let positiveAxis = (input.rank + axis) % input.rank
+ var broadcastShape = input.shape
+ broadcastShape[positiveAxis] = 1
+ let offset = self.offset.reshaped(to: broadcastShape)
+ let scale = self.scale.reshaped(to: broadcastShape)
let moments = input.moments(alongAxes: axis)
- let inv = rsqrt(moments.variance + epsilon) * scale.reshaped(to: moments.variance.shape)
- return (input - moments.mean) * inv + offset.reshaped(to: moments.mean.shape)
+ let inv = rsqrt(moments.variance + epsilon) * scale
+ return (input - moments.mean) * inv + offset
}
}
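
`LayerNorm.callAsFunction` gets the same treatment: offset and scale are reshaped up front so they broadcast against the input layout instead of assuming the last axis. A minimal usage sketch, assuming the patched swift-apis module and mirroring the shapes used in the tests below:

```swift
import TensorFlow

// Hypothetical example mirroring the test below: a [5, 5] input with features
// on axis 1. After the fix, offset and scale are reshaped to a broadcast shape
// derived from the input before the moments are applied.
let x = Tensor<Float>(shape: [5, 5], scalars: (0..<25).map(Float.init))
let lnLayer = LayerNorm<Float>(featureCount: 5, axis: 1, epsilon: Tensor(0.001))
let y = lnLayer(x)
print(y.shape)  // expected: [5, 5]
```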
Tests/TensorFlowTests/LayerTests.swift (18 changes: 11 additions & 7 deletions)
@@ -456,7 +456,7 @@ final class LayerTests: XCTestCase {
let grad = gradient(at: x, bnLayer) { $1($0).squared().sum() }
// The expected values and gradients were computed using the following Python code:
// ```
- // x = tf.constant(
+ //   x = tf.constant(
// [[ -1.0474433, -0.11914538, -0.08634827, 0.15446888, 1.0572497],
// [ 1.5165012, 0.3753972, -0.30856386, -0.3100725, -1.9584457],
// [ 0.006384419, 1.4424847, 0.91568077, 0.66328526, -1.0794537],
@@ -491,8 +491,10 @@ final class LayerTests: XCTestCase {
[ 1.2142579e-01, 1.7060755e-03, -6.5005139e-02, -9.3897656e-02, 3.5770576e-02]],
accuracy: 1e-5)
assertEqual(grad.1.offset, [0.0, 0.0, 0.0, 0.0, 0.0], accuracy: 1e-5)
- assertEqual(grad.1.scale, [9.977925, 9.992161, 9.986738, 9.990202, 9.886292],
-     accuracy: 1e-5)
+ assertEqual(
+     grad.1.scale,
+     [9.977925, 9.992161, 9.986738, 9.990202, 9.886292],
+     accuracy: 1e-5)
}
}

@@ -507,8 +509,8 @@ final class LayerTests: XCTestCase {
let value = lnLayer(x)
let grad = gradient(at: x, lnLayer) { $1($0).squared().sum() }

- // Uses the same values as `testBatchNorm()` above because `LayerNorm` with features on axis
- // `1` is equivalent to `BatchNorm` with features on axis `0`.
+ // Uses the same values as `testBatchNorm()` above because `LayerNorm` with features on
+ // axis `1` is equivalent to `BatchNorm` with features on axis `0`.
assertEqual(
value,
[[-1.5439795 , -0.16477099, -0.11604305, 0.24174842, 1.5830451 ],
@@ -525,9 +527,11 @@ final class LayerTests: XCTestCase {
[ 1.8438101e-03, 8.9146197e-05, -3.6990643e-03, 6.1964989e-04, 1.1463165e-03],
[ 1.2142579e-01, 1.7060755e-03, -6.5005139e-02, -9.3897656e-02, 3.5770576e-02]],
accuracy: 1e-5)
- assertEqual(grad.1.scale, [9.977925, 9.992161, 9.986738, 9.990202, 9.886292],
-     accuracy: 1e-5)
assertEqual(grad.1.offset, [0.0, 0.0, 0.0, 0.0, 0.0], accuracy: 1e-5)
+ assertEqual(
+     grad.1.scale,
+     [9.977925, 9.992161, 9.986738, 9.990202, 9.886292],
+     accuracy: 1e-5)
}

static var allTests = [
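
The comment in `testLayerNorm` notes that `LayerNorm` with features on axis `1` is equivalent to `BatchNorm` with features on axis `0`, which is why both tests share the same expected values. A hedged sketch of that equivalence, assuming the patched swift-apis module (names and shapes here are illustrative, not taken from the test):

```swift
import TensorFlow

// For a square [5, 5] input with default scale (ones) and offset (zeros):
// - LayerNorm with axis 1 normalizes along axis 1 (each row), and
// - BatchNorm with axis 0, in training mode, normalizes over every axis except
//   axis 0, which for a rank-2 input is again axis 1 (each row),
// so the two outputs should agree up to floating-point error.
let x = Tensor<Float>(shape: [5, 5], scalars: (0..<25).map(Float.init))
let lnLayer = LayerNorm<Float>(featureCount: 5, axis: 1, epsilon: Tensor(0.001))
let bnLayer = BatchNorm<Float>(featureCount: 5, axis: 0, epsilon: Tensor(0.001))
Context.local.learningPhase = .training
let lnOut = lnLayer(x)
let bnOut = bnLayer(x)
print(abs(lnOut - bnOut).max())  // expected: close to 0
```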