This repository was archived by the owner on Jul 1, 2023. It is now read-only.

Commit f5222cd

mikowalsrxwei authored and committed
use .moments() in LayerNorm and BatchNorm layers (#384)
Mean and variance in these layers are now calculated using `Tensor.moments()`. I also added tests for both the BatchNorm and LayerNorm layers. The tests turned up a flaw in the shapes of `scale` and `offset`, which were always `[featureCount]` regardless of the input shape or the normalization axis. That shape leads to incorrect broadcasting when the axis being normalized along is not the last axis. I have fixed this by always reshaping `scale` and `offset` before they are used. It seems hacky in that I take the shapes from the calculated `mean` and `variance`, but since the input shape is not known at initialization time I couldn't see a better way to do it. I think the axis argument is probably there for consistency with Keras, but most of the Swift API layers assume inputs and activations are NHWC. So requiring NHWC, eliminating the axis argument, and setting the correct shapes in `init()` would be another option.
1 parent ef48ae9 · commit f5222cd
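As context for the reshaping described in the commit message, here is a minimal standalone sketch, not part of the commit; the input values, shapes, and the epsilon of 0.001 are illustrative. It shows why a `[featureCount]`-shaped `scale` broadcasts incorrectly when the feature axis is not the last axis, and how reshaping to the moments' shape fixes it:

```swift
import TensorFlow

// Features live on axis 0 of a [3, 4] input, so we normalize along axis 1.
// The moments then have shape [3, 1], but a scale of shape [3] would broadcast
// against the last axis, pairing features with the wrong dimension.
let x = Tensor<Float>(shape: [3, 4], scalars: (0..<12).map(Float.init))
let moments = x.moments(alongAxes: [1])  // mean and variance have shape [3, 1]
let scale = Tensor<Float>(ones: [3])     // parameter stored as [featureCount]
let offset = Tensor<Float>(zeros: [3])

// Reshaping the parameters to the moments' shape restores per-feature broadcasting.
let inv = rsqrt(moments.variance + 0.001) * scale.reshaped(to: moments.variance.shape)
let normalized = (x - moments.mean) * inv + offset.reshaped(to: moments.mean.shape)
print(normalized.shape)  // [3, 4]
```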

File tree: 2 files changed, +99 −13 lines


Sources/TensorFlow/Layers/Normalization.swift

Lines changed: 12 additions & 12 deletions
@@ -70,18 +70,19 @@ public struct BatchNorm<Scalar: TensorFlowFloatingPoint>: Layer {
         let positiveAxis = (input.rank + axis) % input.rank
         var normalizedAxes = Array(0..<input.rank)
         normalizedAxes.remove(at: positiveAxis)
-        let mean = input.mean(alongAxes: normalizedAxes)
-        let variance = input.variance(alongAxes: normalizedAxes)
-        runningMean.value += (mean - runningMean.value) * (1 - momentum)
-        runningVariance.value += (variance - runningVariance.value) * (1 - momentum)
-        let inv = rsqrt(variance + epsilon) * scale
-        return (input - mean) * inv + offset
+        let moments = input.moments(alongAxes: normalizedAxes)
+        runningMean.value += (moments.mean - runningMean.value) * (1 - momentum)
+        runningVariance.value += (moments.variance - runningVariance.value) * (1 - momentum)
+        let inv = rsqrt(moments.variance + epsilon) * scale.reshaped(to: moments.variance.shape)
+        return (input - moments.mean) * inv + offset.reshaped(to: moments.mean.shape)
     }

     @differentiable
     private func applyingInference(to input: Tensor<Scalar>) -> Tensor<Scalar> {
-        let inv = rsqrt(runningVariance.value + epsilon) * scale
-        return (input - runningMean.value) * inv + offset
+        let scaleShape = runningVariance.value.shape
+        let offsetShape = runningMean.value.shape
+        let inv = rsqrt(runningVariance.value + epsilon) * scale.reshaped(to: scaleShape)
+        return (input - runningMean.value) * inv + offset.reshaped(to: offsetShape)
     }

     /// Returns the output obtained from applying the layer to the given input.
@@ -187,9 +188,8 @@ public struct LayerNorm<Scalar: TensorFlowFloatingPoint>: Layer {
     /// - Returns: The output.
     @differentiable
     public func callAsFunction(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
-        let mean = input.mean(alongAxes: axis)
-        let variance = input.variance(alongAxes: axis)
-        let inv = rsqrt(variance + epsilon) * scale
-        return (input - mean) * inv + offset
+        let moments = input.moments(alongAxes: axis)
+        let inv = rsqrt(moments.variance + epsilon) * scale.reshaped(to: moments.variance.shape)
+        return (input - moments.mean) * inv + offset.reshaped(to: moments.mean.shape)
     }
 }

Tests/TensorFlowTests/LayerTests.swift

Lines changed: 87 additions & 1 deletion
@@ -410,6 +410,90 @@ final class LayerTests: XCTestCase {
         let expected = Tensor<Float>([[0.0], [0.7615942], [0.9640276], [0.9950547], [0.9993292]])
         XCTAssertEqual(output, expected)
     }
+
+    func testBatchNorm() {
+        let x = Tensor<Float>([
+            [  -1.0474433,  -0.11914538,  -0.08634827,   0.15446888,    1.0572497],
+            [   1.5165012,    0.3753972,  -0.30856386,   -0.3100725,   -1.9584457],
+            [ 0.006384419,    1.4424847,   0.91568077,   0.66328526,   -1.0794537],
+            [    1.056803,   0.14263044,   -1.8308276,    0.4189805,    0.6933893],
+            [  0.30175626,  -0.16121633,   -0.4191958,  -0.53092813, -0.029484272]])
+        let bnLayer = BatchNorm<Float>(featureCount: 5, axis: 0)
+        Context.local.learningPhase = .training
+        let trainingValue = bnLayer(x)
+        let grad = gradient(at: x, bnLayer) { $1($0).squared().sum() }
+        // The expected values and gradients were computed using the following Python code:
+        // ```
+        // x = tf.constant(
+        //     [[ -1.0474433, -0.11914538, -0.08634827, 0.15446888, 1.0572497],
+        //      [ 1.5165012, 0.3753972, -0.30856386, -0.3100725, -1.9584457],
+        //      [ 0.006384419, 1.4424847, 0.91568077, 0.66328526, -1.0794537],
+        //      [ 1.056803, 0.14263044, -1.8308276, 0.4189805, 0.6933893],
+        //      [ 0.30175626, -0.16121633, -0.4191958, -0.53092813, -0.029484272]])
+        // scale = tf.reshape(tf.constant([1., 1., 1., 1., 1.]), [5, 1])
+        // offset = tf.reshape(tf.constant([0., 0., 0., 0., 0.]), [5, 1])
+        // (mean, var) = tf.nn.moments(x, axes=1, keepdims=True)
+        // bn = tf.nn.batch_normalization(x, mean, var, offset=offset, scale=scale, variance_epsilon=0.001)
+        // scaled = tf.reduce_sum(tf.square(bn))
+        // g = tf.gradients(scaled, [x, offset, scale])
+        // init = tf.initialize_all_variables()
+        // with tf.Session() as sess:
+        //     sess.run(init)
+        //     print(sess.run([bn, g]))
+        // ```
+        let expectedTrainingValue = Tensor<Float>([
+            [-1.5439795,  -0.16477099, -0.11604305,  0.24174842,  1.5830451],
+            [ 1.4639764,   0.45368853, -0.15186328, -0.15319899, -1.6126028],
+            [-0.44139984,  1.2124169,   0.60574806,  0.3150888,  -1.6918538],
+            [ 0.9507547,   0.04595902, -1.9072568,   0.31947452,  0.5910686],
+            [ 1.5834246,   0.02224666, -0.8476793,  -1.2244489,   0.46645695]])
+
+        let expectedInputGradient = Tensor<Float>([
+            [-1.0127544e-02, -1.0807812e-03, -7.6115131e-04,  1.5857220e-03,  1.0383606e-02],
+            [ 2.0323221e-03,  6.2976527e-04, -2.1077941e-04, -2.1265696e-04, -2.2384699e-03],
+            [-1.3483668e-03,  3.7030075e-03,  1.8500184e-03,  9.6232636e-04, -5.1673558e-03],
+            [ 1.8438101e-03,  8.9146197e-05, -3.6990643e-03,  6.1964989e-04,  1.1463165e-03],
+            [ 1.2142579e-01,  1.7060755e-03, -6.5005139e-02, -9.3897656e-02,  3.5770576e-02]])
+        let expectedScaleGradient = Tensor<Float>([9.977925, 9.992161, 9.986738, 9.990202, 9.886292])
+        let expectedOffsetGradient = Tensor<Float>([0.0, 0.0, 0.0, 0.0, 0.0])
+        assertEqual(expectedTrainingValue, trainingValue, accuracy: 1e-5)
+        assertEqual(expectedInputGradient, grad.0, accuracy: 1e-5)
+        assertEqual(expectedScaleGradient, grad.1.scale, accuracy: 1e-5)
+        assertEqual(expectedOffsetGradient, grad.1.offset, accuracy: 1e-5)
+    }
+
+    func testLayerNorm() {
+        let x = Tensor<Float>([
+            [  -1.0474433,  -0.11914538,  -0.08634827,   0.15446888,    1.0572497],
+            [   1.5165012,    0.3753972,  -0.30856386,   -0.3100725,   -1.9584457],
+            [ 0.006384419,    1.4424847,   0.91568077,   0.66328526,   -1.0794537],
+            [    1.056803,   0.14263044,   -1.8308276,    0.4189805,    0.6933893],
+            [  0.30175626,  -0.16121633,   -0.4191958,  -0.53092813, -0.029484272]])
+        let lnLayer = LayerNorm<Float>(featureCount: 5, axis: 1)
+        let value = lnLayer(x)
+        let grad = gradient(at: x, lnLayer) { $1($0).squared().sum() }
+        // Uses the same values as testBatchNorm() above because LayerNorm with features on axis 1
+        // is equivalent to BatchNorm with features on axis 0.
+        let expectedValue = Tensor<Float>([
+            [-1.5439795,  -0.16477099, -0.11604305,  0.24174842,  1.5830451],
+            [ 1.4639764,   0.45368853, -0.15186328, -0.15319899, -1.6126028],
+            [-0.44139984,  1.2124169,   0.60574806,  0.3150888,  -1.6918538],
+            [ 0.9507547,   0.04595902, -1.9072568,   0.31947452,  0.5910686],
+            [ 1.5834246,   0.02224666, -0.8476793,  -1.2244489,   0.46645695]])
+
+        let expectedInputGradient = Tensor<Float>([
+            [-1.0127544e-02, -1.0807812e-03, -7.6115131e-04,  1.5857220e-03,  1.0383606e-02],
+            [ 2.0323221e-03,  6.2976527e-04, -2.1077941e-04, -2.1265696e-04, -2.2384699e-03],
+            [-1.3483668e-03,  3.7030075e-03,  1.8500184e-03,  9.6232636e-04, -5.1673558e-03],
+            [ 1.8438101e-03,  8.9146197e-05, -3.6990643e-03,  6.1964989e-04,  1.1463165e-03],
+            [ 1.2142579e-01,  1.7060755e-03, -6.5005139e-02, -9.3897656e-02,  3.5770576e-02]])
+        let expectedScaleGradient = Tensor<Float>([9.977925, 9.992161, 9.986738, 9.990202, 9.886292])
+        let expectedOffsetGradient = Tensor<Float>([0.0, 0.0, 0.0, 0.0, 0.0])
+        assertEqual(expectedValue, value, accuracy: 1e-5)
+        assertEqual(expectedInputGradient, grad.0, accuracy: 1e-5)
+        assertEqual(expectedScaleGradient, grad.1.scale, accuracy: 1e-5)
+        assertEqual(expectedOffsetGradient, grad.1.offset, accuracy: 1e-5)
+    }

     static var allTests = [
         ("testSequential", testSequential),
@@ -443,6 +527,8 @@ final class LayerTests: XCTestCase {
         ("testSimpleRNNCell", testSimpleRNNCell),
         ("testDense", testDense),
         ("testRNN", testRNN),
-        ("testFunction", testFunction)
+        ("testFunction", testFunction),
+        ("testBatchNorm", testBatchNorm),
+        ("testLayerNorm", testLayerNorm)
     ]
 }

0 commit comments
