This repository was archived by the owner on Jul 1, 2023. It is now read-only.

Commit f218a34

Normalization layers fix (fixes #384 and #426). (#428)
1 parent daff615

File tree

Sources/TensorFlow/Layers/Normalization.swift
Tests/TensorFlowTests/LayerTests.swift

2 files changed: +38 -21 lines changed

Sources/TensorFlow/Layers/Normalization.swift

Lines changed: 27 additions & 14 deletions
@@ -71,16 +71,24 @@ public struct BatchNorm<Scalar: TensorFlowFloatingPoint>: Layer {
     /// - Returns: The output.
     @differentiable
     public func callAsFunction(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
+        let positiveAxis = (input.rank + axis) % input.rank
+        var offset = self.offset
+        var scale = self.scale
+        if positiveAxis != input.rank - 1 {
+            var broadcastShape = TensorShape([Int](repeating: 1, count: input.rank))
+            broadcastShape[positiveAxis] = input.shape[positiveAxis]
+            offset = offset.reshaped(to: broadcastShape)
+            scale = scale.reshaped(to: broadcastShape)
+        }
         switch Context.local.learningPhase {
         case .training:
-            let positiveAxis = (input.rank + axis) % input.rank
             var normalizedAxes = Array(0..<input.rank)
             normalizedAxes.remove(at: positiveAxis)
             let moments = input.moments(alongAxes: normalizedAxes)
             runningMean.value += (moments.mean - runningMean.value) * (1 - momentum)
             runningVariance.value += (moments.variance - runningVariance.value) * (1 - momentum)
-            let inv = rsqrt(moments.variance + epsilon) * scale.reshaped(to: moments.variance.shape)
-            return (input - moments.mean) * inv + offset.reshaped(to: moments.mean.shape)
+            let inv = rsqrt(moments.variance + epsilon) * scale
+            return (input - moments.mean) * inv + offset
         case .inference:
             let inv = rsqrt(runningVariance.value + epsilon) * scale
             return (input - runningMean.value) * inv + offset
@@ -100,13 +108,14 @@ public struct BatchNorm<Scalar: TensorFlowFloatingPoint>: Layer {
         momentum: Tensor<Scalar> = Tensor(0.99),
         epsilon: Tensor<Scalar> = Tensor(0.001)
     ) {
-        self.axis = axis
-        self.momentum = momentum
-        self.scale = Tensor<Scalar>(ones: [featureCount])
-        self.offset = Tensor<Scalar>(zeros: [featureCount])
-        self.epsilon = epsilon
-        self.runningMean = Parameter(Tensor(0))
-        self.runningVariance = Parameter(Tensor(1))
+        self.init(
+            axis: axis,
+            momentum: momentum,
+            offset: Tensor(zeros: [featureCount]),
+            scale: Tensor(ones: [featureCount]),
+            epsilon: epsilon,
+            runningMean: Tensor(0),
+            runningVariance: Tensor(1))
     }
 }

@@ -152,8 +161,7 @@ public struct LayerNorm<Scalar: TensorFlowFloatingPoint>: Layer {
             offset: Tensor(zeros: [featureCount]),
             scale: Tensor(ones: [featureCount]),
             axis: axis,
-            epsilon: epsilon
-        )
+            epsilon: epsilon)
     }
 
     /// Returns the output obtained from applying the layer to the given input.
@@ -162,8 +170,13 @@ public struct LayerNorm<Scalar: TensorFlowFloatingPoint>: Layer {
     /// - Returns: The output.
     @differentiable
     public func callAsFunction(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
+        let positiveAxis = (input.rank + axis) % input.rank
+        var broadcastShape = input.shape
+        broadcastShape[positiveAxis] = 1
+        let offset = self.offset.reshaped(to: broadcastShape)
+        let scale = self.scale.reshaped(to: broadcastShape)
         let moments = input.moments(alongAxes: axis)
-        let inv = rsqrt(moments.variance + epsilon) * scale.reshaped(to: moments.variance.shape)
-        return (input - moments.mean) * inv + offset.reshaped(to: moments.mean.shape)
+        let inv = rsqrt(moments.variance + epsilon) * scale
+        return (input - moments.mean) * inv + offset
     }
 }

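For context, a minimal usage sketch of what the `BatchNorm` change above enables: with the broadcast reshape in place, `offset` and `scale` no longer have to sit on the last axis, so a channels-first (NCHW) input with `axis: 1` works. This sketch is not part of the commit; the shapes and values are arbitrary, chosen only for illustration.

```swift
import TensorFlow

// Hypothetical example: BatchNorm over a channels-first input. With the fix,
// `offset` and `scale` (shape [featureCount]) are reshaped to [1, 3, 1, 1]
// so they broadcast against the [batch, channels, height, width] input.
var layer = BatchNorm<Float>(featureCount: 3, axis: 1)
let input = Tensor<Float>(randomNormal: [2, 3, 4, 4])
Context.local.learningPhase = .training
let output = layer(input)
print(output.shape)  // [2, 3, 4, 4]
```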
Tests/TensorFlowTests/LayerTests.swift

Lines changed: 11 additions & 7 deletions
@@ -474,7 +474,7 @@ final class LayerTests: XCTestCase {
         let grad = gradient(at: x, bnLayer) { $1($0).squared().sum() }
         // The expected values and gradients were computed using the following Python code:
         // ```
-        // x = tf.constant(
+        // x = tf.constant(
         //     [[ -1.0474433, -0.11914538, -0.08634827, 0.15446888, 1.0572497],
         //      [ 1.5165012, 0.3753972, -0.30856386, -0.3100725, -1.9584457],
         //      [ 0.006384419, 1.4424847, 0.91568077, 0.66328526, -1.0794537],
@@ -509,8 +509,10 @@ final class LayerTests: XCTestCase {
             [ 1.2142579e-01, 1.7060755e-03, -6.5005139e-02, -9.3897656e-02, 3.5770576e-02]],
             accuracy: 1e-5)
         assertEqual(grad.1.offset, [0.0, 0.0, 0.0, 0.0, 0.0], accuracy: 1e-5)
-        assertEqual(grad.1.scale, [9.977925, 9.992161, 9.986738, 9.990202, 9.886292],
-                    accuracy: 1e-5)
+        assertEqual(
+            grad.1.scale,
+            [9.977925, 9.992161, 9.986738, 9.990202, 9.886292],
+            accuracy: 1e-5)
     }
 }

@@ -525,8 +527,8 @@ final class LayerTests: XCTestCase {
         let value = lnLayer(x)
         let grad = gradient(at: x, lnLayer) { $1($0).squared().sum() }
 
-        // Uses the same values as `testBatchNorm()` above because `LayerNorm` with features on axis
-        // `1` is equivalent to `BatchNorm` with features on axis `0`.
+        // Uses the same values as `testBatchNorm()` above because `LayerNorm` with features on
+        // axis `1` is equivalent to `BatchNorm` with features on axis `0`.
         assertEqual(
             value,
             [[-1.5439795 , -0.16477099, -0.11604305, 0.24174842, 1.5830451 ],
@@ -543,9 +545,11 @@ final class LayerTests: XCTestCase {
             [ 1.8438101e-03, 8.9146197e-05, -3.6990643e-03, 6.1964989e-04, 1.1463165e-03],
             [ 1.2142579e-01, 1.7060755e-03, -6.5005139e-02, -9.3897656e-02, 3.5770576e-02]],
             accuracy: 1e-5)
-        assertEqual(grad.1.scale, [9.977925, 9.992161, 9.986738, 9.990202, 9.886292],
-                    accuracy: 1e-5)
         assertEqual(grad.1.offset, [0.0, 0.0, 0.0, 0.0, 0.0], accuracy: 1e-5)
+        assertEqual(
+            grad.1.scale,
+            [9.977925, 9.992161, 9.986738, 9.990202, 9.886292],
+            accuracy: 1e-5)
     }
 
     static var allTests = [

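The comment in `testLayerNorm` above leans on an equivalence that is easy to check directly. A minimal sketch, not part of the commit: it assumes a square `[5, 5]` input with arbitrary values (not the fixture used by the tests). In the training phase both layers take moments along axis 1 with the same epsilon, so their outputs agree.

```swift
import TensorFlow

// Hypothetical check: for a [5, 5] input, `LayerNorm` with features on
// axis 1 and `BatchNorm` with features on axis 0 both normalize along
// axis 1 during training, so the two outputs match.
let x = Tensor<Float>(randomNormal: [5, 5])
let lnLayer = LayerNorm<Float>(featureCount: 5, axis: 1, epsilon: Tensor(0.001))
let bnLayer = BatchNorm<Float>(featureCount: 5, axis: 0, epsilon: Tensor(0.001))
Context.local.learningPhase = .training
let difference = abs(lnLayer(x) - bnLayer(x)).max()
print(difference)  // ~0, up to floating-point accuracy
```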