Skip to content
This repository was archived by the owner on Jul 1, 2023. It is now read-only.

Commit b5a49b7

Browse files
jon-tow authored and marcrasi committed
[Initializers] Refactor random and variance-scaling initializers (#335)
1 parent fb164d1 commit b5a49b7

File tree

4 files changed

+160
-62
lines changed

4 files changed

+160
-62
lines changed

Sources/TensorFlow/Initializers.swift

Lines changed: 54 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -453,63 +453,75 @@ public extension Tensor where Scalar: TensorFlowFloatingPoint {
453453
}
454454
}
455455

456-
// TODO: Can become fileprivate after the 0.4 release.
457-
internal extension Tensor where Scalar: TensorFlowFloatingPoint {
458-
static func glorot(
459-
fromStandardUniform randomUniform: __shared Tensor<Scalar>,
460-
shape: __shared TensorShape
461-
) -> Tensor<Scalar> {
462-
let spatialDimCount = shape.count - 2
463-
let receptiveField = shape[0..<spatialDimCount].contiguousSize
464-
let fanIn = shape[shape.count - 2] * receptiveField
465-
let fanOut = shape[shape.count - 1] * receptiveField
466-
let minusOneToOne = 2 * randomUniform - 1
467-
return Scalar.sqrt(Scalar(6) / Scalar(fanIn + fanOut)) * minusOneToOne
456+
//===------------------------------------------------------------------------------------------===//
457+
// Variance Scaling
458+
//===------------------------------------------------------------------------------------------===//
459+
460+
fileprivate extension TensorShape {
461+
// Returns the `fanIn` and `fanOut` counts for `TensorShape`s where the last two axes represent
462+
// the input channel count and output channel count, respectively.
463+
func fans() -> (in: Int, out: Int) {
464+
precondition(
465+
count > 1,
466+
"Fans cannot be computed for tensors with fewer than 2 dimensions. Got: \(count)")
467+
468+
// Fans for a 2-D tensor, e.g. `Dense`/`Embedding` weights.
469+
if count == 2 {
470+
return (self[0], self[1])
471+
}
472+
// Fans for tensors with rank greater than `2`, specifically convolution filters.
473+
let lastSpatialAxis = endIndex - 3
474+
let spatialSize = self[0..<(lastSpatialAxis + 1)].contiguousSize
475+
let inputAxis = endIndex - 2
476+
let fanIn = self[inputAxis] * spatialSize
477+
let outputAxis = endIndex - 1
478+
let fanOut = self[outputAxis] * spatialSize
479+
return (fanIn, fanOut)
468480
}
469481
}
470482

471483
public extension Tensor where Scalar: TensorFlowFloatingPoint {
472-
/// Creates a tensor by performing Glorot uniform initialization for the specified shape,
473-
/// randomly sampling scalar values from a uniform distribution between `-limit` and `limit`,
474-
/// generated by the default random number generator, where limit is
484+
/// Creates a tensor with the specified shape by performing Glorot uniform initialization.
485+
///
486+
/// It draws random samples from a uniform distribution between `-limit` and `limit`
487+
/// generated by the default random number generator, where `limit` is
475488
/// `sqrt(6 / (fanIn + fanOut))` and `fanIn`/`fanOut` represent the number of input and output
476-
/// features multiplied by the receptive field if present.
489+
/// features multiplied by the receptive field size.
490+
///
491+
/// Reference: ["Understanding the difficulty of training deep feedforward neural networks"](
492+
/// http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf)
477493
///
478494
/// - Parameters:
479495
/// - shape: The dimensions of the tensor.
480496
init(glorotUniform shape: TensorShape, seed: TensorFlowSeed = Context.local.randomSeed) {
481-
let uniform = Tensor(randomUniform: shape, seed: seed)
482-
self = Tensor.glorot(fromStandardUniform: uniform, shape: shape)
497+
let (fanIn, fanOut) = shape.fans()
498+
let limit = Tensor<Scalar>(6 / Scalar(fanIn + fanOut))
499+
self.init(randomUniform: shape, lowerBound: -limit, upperBound: limit, seed: seed)
483500
}
484-
}
485501

486-
// TODO: Can become fileprivate after the 0.4 release.
487-
internal extension Tensor where Scalar: TensorFlowFloatingPoint {
488-
static func glorot(
489-
fromStandardNormal standardNormal: __shared Tensor<Scalar>,
490-
shape: __shared TensorShape
491-
) -> Tensor<Scalar> {
492-
let spatialDimCount = shape.count - 2
493-
let receptiveField = shape[0..<spatialDimCount].contiguousSize
494-
let fanIn = shape[shape.count - 2] * receptiveField
495-
let fanOut = shape[shape.count - 1] * receptiveField
496-
let minusOneToOne = 2 * standardNormal - 1
497-
return Scalar.sqrt(Scalar(2) / Scalar(fanIn + fanOut)) * minusOneToOne
498-
}
499-
}
500-
501-
public extension Tensor where Scalar: TensorFlowFloatingPoint {
502-
/// Creates a tensor by performing Glorot normal initialization for the specified shape,
503-
/// randomly sampling scalar values from a uniform distribution between `-limit` and `limit`,
504-
/// generated by the default random number generator, where limit is
505-
/// `sqrt(2 / (fanIn + fanOut))` and `fanIn`/`fanOut` represent the number of input and output
506-
/// features multiplied by the receptive field if present.
502+
/// Creates a tensor with the specified shape by performing Glorot normal initialization.
503+
///
504+
/// It draws random samples from a truncated normal distribution centered on `0` with
505+
/// standard deviation `sqrt(2 / (fanIn + fanOut))` generated by the default random number
506+
/// generator, where `fanIn`/`fanOut` represent the number of input and output features
507+
/// multiplied by the receptive field size.
508+
///
509+
/// Reference: ["Understanding the difficulty of training deep feedforward neural networks"](
510+
/// http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf)
507511
///
508512
/// - Parameters:
509513
/// - shape: The dimensions of the tensor.
510514
init(glorotNormal shape: TensorShape, seed: TensorFlowSeed = Context.local.randomSeed) {
511-
let normal = Tensor(randomNormal: shape, seed: seed)
512-
self = Tensor.glorot(fromStandardNormal: normal, shape: shape)
515+
let (fanIn, fanOut) = shape.fans()
516+
var standardDeviation = Tensor<Scalar>(Scalar.sqrt(2 / Scalar(fanIn + fanOut)))
517+
// Standard deviation of truncated standard normal between `-2` and `2` standard deviations.
518+
let truncationDeviation = Tensor<Scalar>(0.87962566103423978)
519+
standardDeviation /= truncationDeviation // Smooths the tails of the clipped normal.
520+
self.init(
521+
randomTruncatedNormal: shape,
522+
mean: Tensor<Scalar>(0),
523+
standardDeviation: standardDeviation,
524+
seed: seed)
513525
}
514526
}
515527

Tests/TensorFlowTests/InitializerTests.swift

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,12 +94,88 @@ final class InitializerTests: XCTestCase {
9494
XCTAssertEqual(ShapedArray(shape: [2, 2], scalars: [1, 0, 1, 0]), i8s.array)
9595
}
9696

97+
// Constants for testing distribution based initializers.
98+
private let fcShape = TensorShape([200, 100])
99+
private let convShape = TensorShape([25, 25, 20, 20])
100+
private let tolerance = Float(3e-2)
101+
102+
func testDistribution(
103+
_ t: Tensor<Float>,
104+
expectedMean: Float? = nil,
105+
expectedStandardDeviation: Float? = nil,
106+
expectedMin: Float? = nil,
107+
expectedMax: Float? = nil
108+
) {
109+
if let expectedMean = expectedMean {
110+
let mean = t.mean().scalarized()
111+
XCTAssertTrue(abs(mean - expectedMean) < tolerance)
112+
}
113+
if let expectedStandardDeviation = expectedStandardDeviation {
114+
let standardDeviation = t.standardDeviation().scalarized()
115+
XCTAssertTrue(abs(standardDeviation - expectedStandardDeviation) < tolerance)
116+
}
117+
if let expectedMin = expectedMin {
118+
let min = t.min().scalarized()
119+
XCTAssertTrue(abs(min - expectedMin) < tolerance)
120+
}
121+
if let expectedMax = expectedMax {
122+
let max = t.max().scalarized()
123+
XCTAssertTrue(abs(max - expectedMax) < tolerance)
124+
}
125+
}
126+
127+
func testRandomUniform() {
128+
do {
129+
let t = Tensor<Float>(
130+
randomUniform: fcShape,
131+
lowerBound: Tensor(2),
132+
upperBound: Tensor(3))
133+
testDistribution(t, expectedMean: 2.5, expectedMin: 2, expectedMax: 3)
134+
}
135+
do {
136+
let t = Tensor<Float>(
137+
randomUniform: fcShape,
138+
lowerBound: Tensor(-1),
139+
upperBound: Tensor(1))
140+
testDistribution(t, expectedMean: 0, expectedMin: -1, expectedMax: 1)
141+
}
142+
}
143+
144+
func testRandomNormal() {
145+
let t = Tensor<Float>(
146+
randomNormal: convShape,
147+
mean: Tensor(1),
148+
standardDeviation: Tensor(2))
149+
testDistribution(t, expectedMean: 1, expectedStandardDeviation: 2)
150+
}
151+
152+
func testRandomTruncatedNormal() {
153+
let t = Tensor<Float>(randomTruncatedNormal: convShape)
154+
testDistribution(t, expectedMean: 0, expectedMin: -2, expectedMax: 2)
155+
}
156+
157+
func testGlorotUniform() {
158+
let t = Tensor<Float>(glorotUniform: convShape)
159+
let spatialSize = convShape[0..<2].contiguousSize
160+
let (fanIn, fanOut) = (convShape[2] * spatialSize, convShape[3] * spatialSize)
161+
let stdDev = sqrt(Float(2.0) / Float(fanIn + fanOut))
162+
testDistribution(t, expectedMean: 0, expectedStandardDeviation: stdDev)
163+
}
164+
165+
func testGlorotNormal() {
166+
let t = Tensor<Float>(glorotNormal: convShape)
167+
let spatialSize = convShape[0..<2].contiguousSize
168+
let (fanIn, fanOut) = (convShape[2] * spatialSize, convShape[3] * spatialSize)
169+
let stdDev = sqrt(Float(2.0) / Float(fanIn + fanOut))
170+
testDistribution(t, expectedMean: 0, expectedStandardDeviation: stdDev)
171+
}
172+
97173
func testOrthogonalShapesValues() {
98174
for shape in [[10, 10], [10, 9, 8], [100, 5, 5], [50, 40], [3, 3, 32, 64]] {
99175
// Check the shape.
100176
var t = Tensor<Float>(orthogonal: TensorShape(shape))
101177
XCTAssertEqual(shape, t.shape.dimensions)
102-
178+
103179
// Check orthogonality by computing the inner product.
104180
t = t.reshaped(to: [t.shape.dimensions.dropLast().reduce(1, *), t.shape[t.rank - 1]])
105181
if t.shape[0] > t.shape[1] {
@@ -120,6 +196,11 @@ final class InitializerTests: XCTestCase {
120196
("testArrayConversion", testArrayConversion),
121197
("testDataTypeCast", testDataTypeCast),
122198
("testBoolToNumericCast", testBoolToNumericCast),
199+
("testRandomUniform", testRandomUniform),
200+
("testRandomNormal", testRandomNormal),
201+
("testRandomTruncatedNormal", testRandomTruncatedNormal),
202+
("testGlorotUniform", testGlorotUniform),
203+
("testGlorotNormal", testGlorotNormal),
123204
("testOrthogonalShapesValues", testOrthogonalShapesValues)
124205
]
125206
}

Tests/TensorFlowTests/LayerTests.swift

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1120,11 +1120,13 @@ final class LayerTests: XCTestCase {
11201120
let (outputs, _) = rnn.valueWithPullback(at: inputs) { rnn, inputs in
11211121
return rnn(inputs)
11221122
}
1123-
XCTAssertEqual(outputs.map { $0.value },
1124-
[[[ 0.20775771, 0.20080023, -0.13768704, -0.18534681]],
1125-
[[ 0.22666009, 0.30019346, -0.19720285, -0.14683801]],
1126-
[[ 0.23758979, 0.32101023, -0.20359215, -0.1787096]],
1127-
[[ 0.24337786, 0.3389194, -0.21143384, -0.1675081]]])
1123+
assertEqual(
1124+
outputs.map { $0.value.squeezingShape(at: 0) }[0],
1125+
[[ 0.14798240, 0.14295710, -0.09766942, -0.131820890],
1126+
[ 0.15757358, 0.19475500, -0.12810913, -0.112212844],
1127+
[ 0.16168950, 0.20306197, -0.13058113, -0.123917180],
1128+
[ 0.16325668, 0.20822097, -0.13273866, -0.121018395]],
1129+
accuracy: 1e-6)
11281130
}
11291131
// TODO: Figure out why the following is numerically unstable.
11301132
// let (𝛁rnn, _) = pullback(.init(inputs.map { SimpleRNNCell<Float>.State($0) }))
@@ -1149,18 +1151,20 @@ final class LayerTests: XCTestCase {
11491151
let (outputs, _) = rnn.valueWithPullback(at: inputs) { rnn, inputs in
11501152
return rnn(inputs)
11511153
}
1152-
XCTAssertEqual(
1153-
outputs.map { $0.cell },
1154-
[[[ 0.08981595, 0.027691621, -0.059235442, -0.075101905]],
1155-
[[ 0.12952757, 0.040402323, -0.084273980, -0.116252676]],
1156-
[[ 0.14727503, 0.046511370, -0.094689950, -0.138459030]],
1157-
[[ 0.15532997, 0.049573865, -0.098824400, -0.150242210]]])
1158-
XCTAssertEqual(
1159-
outputs.map { $0.hidden },
1160-
[[[ 0.046985064, 0.012670102, -0.031083463, -0.038572006]],
1161-
[[ 0.066482050, 0.018388016, -0.044252350, -0.058907583]],
1162-
[[ 0.074910110, 0.021107012, -0.049724963, -0.069670826]],
1163-
[[ 0.078670055, 0.022462710, -0.051899005, -0.075331904]]])
1154+
assertEqual(
1155+
outputs.map { $0.cell.squeezingShape(at: 0) }[0],
1156+
[[ 0.047114454, 0.013959665, -0.030737250, -0.038524970],
1157+
[ 0.069171116, 0.020617897, -0.044740470, -0.058878290],
1158+
[ 0.079530790, 0.023841830, -0.051080680, -0.069567055],
1159+
[ 0.084416830, 0.025424266, -0.053918116, -0.075140170]],
1160+
accuracy: 1e-6)
1161+
assertEqual(
1162+
outputs.map { $0.hidden.squeezingShape(at: 0) }[0],
1163+
[[ 0.024117637, 0.0066833394, -0.015753632, -0.019533360],
1164+
[ 0.035230752, 0.0098582430, -0.022934474, -0.029750597],
1165+
[ 0.040405065, 0.0113919870, -0.026185552, -0.035087958],
1166+
[ 0.042834233, 0.0121438510, -0.027640648, -0.037863784]],
1167+
accuracy: 1e-6)
11641168
}
11651169
}
11661170
}

Tests/TensorFlowTests/SequentialTests.swift

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,9 @@ final class SequentialTests: XCTestCase {
6060
adadelta.update(&model, along: 𝛁model)
6161
}
6262
}
63-
XCTAssertEqual(model.inferring(from: [[0, 0], [0, 1], [1, 0], [1, 1]]),
64-
[[0.4884567], [0.4884567], [0.4884567], [0.4884567]])
63+
assertEqual(model.inferring(from: [[0, 0], [0, 1], [1, 0], [1, 1]]),
64+
[[0.5115531], [0.5115531], [0.5115531], [0.5115531]],
65+
accuracy: 1e-6)
6566
}
6667

6768
static var allTests = [

0 commit comments

Comments (0)