
Commit c84d4c0

Fix derivatives for min reduction ops. (#590)
Fix VJPs for:
- `Tensor.min(alongAxes:)`
- `Tensor.min(squeezingAxes:)`

Previously, the VJPs returned the result of `max` as the original value. Now, the result of `min` is correctly returned.

Add tests, checking against Python TensorFlow.
1 parent eebb487 commit c84d4c0
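
In effect, differentiating a `min` reduction now yields `min`-consistent values and gradients. A minimal sketch of the fixed behavior (assuming a Swift for TensorFlow toolchain; the values mirror the tests added in this commit):

import TensorFlow

let x: Tensor<Float> = [[0, 1, 2], [3, 4, 5]]
// Before this fix, the `min(squeezingAxes:)` VJP returned the result of
// `max` as the original value, so the pullback was computed against `max`.
let (value, grad) = valueWithGradient(at: x) { $0.min(squeezingAxes: 0).sum() }
// With the fix: value == 3 (0 + 1 + 2), and the gradient flows to the minima:
// grad == [[1, 1, 1], [0, 0, 0]]
print(value, grad)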

File tree: 2 files changed (+211, -61 lines)

Sources/TensorFlow/Operators/Math.swift

Lines changed: 70 additions & 30 deletions
@@ -1396,6 +1396,8 @@ public func min<T>(_ lhs: Tensor<T>, _ rhs: T) -> Tensor<T> where T: Numeric & C
     min(lhs, Tensor(rhs))
 }
 
+// Note: adapted from `_MinOrMaxGrad`:
+// https://github.com/tensorflow/tensorflow/blob/r2.1/tensorflow/python/ops/math_grad.py#L223.
 @inlinable
 internal func _vjpMinMaxHelper<T: TensorFlowFloatingPoint>(
     _ x: Tensor<T>,
@@ -1554,7 +1556,7 @@ public extension Tensor where Scalar: Numeric & Comparable {
     @inlinable
     @differentiable(
         wrt: self,
-        vjp: _vjpMinOrMax(squeezingAxes:) where Scalar: TensorFlowFloatingPoint)
+        vjp: _vjpMax(squeezingAxes:) where Scalar: TensorFlowFloatingPoint)
     func max(squeezingAxes axes: Tensor<Int32>) -> Tensor {
         return _Raw.max(self, reductionIndices: axes, keepDims: false)
     }
@@ -1585,7 +1587,7 @@ public extension Tensor where Scalar: Numeric & Comparable {
     @inlinable
     @differentiable(
         wrt: self,
-        vjp: _vjpMinOrMax(squeezingAxes:) where Scalar: TensorFlowFloatingPoint)
+        vjp: _vjpMin(squeezingAxes:) where Scalar: TensorFlowFloatingPoint)
     func min(squeezingAxes axes: Tensor<Int32>) -> Tensor {
         _Raw.min(self, reductionIndices: axes, keepDims: false)
     }
@@ -1633,7 +1635,7 @@ public extension Tensor where Scalar: Numeric & Comparable {
     /// - Parameter axes: The dimensions to reduce.
     /// - Precondition: Each value in `axes` must be in the range `-rank..<rank`.
     @inlinable
-    @differentiable(wrt: self, vjp: _vjpMinOrMax(alongAxes:) where Scalar: TensorFlowFloatingPoint)
+    @differentiable(wrt: self, vjp: _vjpMin(alongAxes:) where Scalar: TensorFlowFloatingPoint)
     func min(alongAxes axes: Tensor<Int32>) -> Tensor {
         _Raw.min(self, reductionIndices: axes, keepDims: true)
     }
@@ -1665,7 +1667,7 @@ public extension Tensor where Scalar: Numeric & Comparable {
     /// - Parameter axes: The dimensions to reduce.
     /// - Precondition: Each value in `axes` must be in the range `-rank..<rank`.
     @inlinable
-    @differentiable(wrt: self, vjp: _vjpMinOrMax(alongAxes:) where Scalar: TensorFlowFloatingPoint)
+    @differentiable(wrt: self, vjp: _vjpMax(alongAxes:) where Scalar: TensorFlowFloatingPoint)
     func max(alongAxes axes: Tensor<Int32>) -> Tensor {
         _Raw.max(self, reductionIndices: axes, keepDims: true)
     }
@@ -1706,35 +1708,73 @@ public extension Tensor where Scalar: Numeric & Comparable {
 }
 
 internal extension Tensor where Scalar: TensorFlowFloatingPoint {
-    @inlinable
-    func _vjpMinOrMax(squeezingAxes axes: Tensor<Int32>) -> (Tensor, (Tensor) -> Tensor) {
-        let result = max(squeezingAxes: axes)
-        return (result, { v in
-            let yUnsqueezed = result.expandingShape(at: axes.scalars.map { Int($0) })
-            let gradientUnsqueezed = v.expandingShape(at: axes.scalars.map { Int($0) })
+    // Note: adapted from `_MinOrMaxGrad`:
+    // https://github.com/tensorflow/tensorflow/blob/r2.1/tensorflow/python/ops/math_grad.py#L223.
+    @inlinable
+    func _vjpMinMaxHelper(
+        squeezingAxes axes: Tensor<Int32>,
+        originalValue: Tensor,
+        seed: Tensor
+    ) -> Tensor {
+        let yUnsqueezed = originalValue.expandingShape(at: axes.scalars.map { Int($0) })
+        let gradientUnsqueezed = seed.expandingShape(at: axes.scalars.map { Int($0) })
 
-            // Compute the number of selected (maximum or minimum) elements in each reduction dimension.
-            // If there are multiple minimum or maximum elements then the gradient will be divided between
-            // them.
-            let indicators = Tensor(yUnsqueezed .== self)
-            let selectedCount = indicators.sum(alongAxes: axes)
+        // Compute the number of selected (maximum or minimum) elements in each reduction dimension.
+        // If there are multiple minimum or maximum elements then the gradient will be divided
+        // between them.
+        let indicators = Tensor(yUnsqueezed .== self)
+        let selectedCount = indicators.sum(alongAxes: axes)
 
-            return gradientUnsqueezed.broadcasted(toShape: self.shapeTensor) * indicators / selectedCount
-        })
-    }
+        return gradientUnsqueezed.broadcasted(toShape: self.shapeTensor) * indicators / selectedCount
+    }
 
-    @inlinable
-    func _vjpMinOrMax(alongAxes axes: Tensor<Int32>) -> (Tensor, (Tensor) -> Tensor) {
-        let result = max(alongAxes: axes)
-        return (result, { v in
-            // Compute the number of selected (maximum or minimum) elements in each reduction dimension.
-            // If there are multiple minimum or maximum elements then the gradient will be divided between
-            // them.
-            let indicators = Tensor(result .== self)
-            let selectedCount = indicators.sum(alongAxes: axes)
-            return v.broadcasted(toShape: self.shapeTensor) * indicators / selectedCount
-        })
-    }
+    @inlinable
+    func _vjpMax(squeezingAxes axes: Tensor<Int32>) -> (Tensor, (Tensor) -> Tensor) {
+        let result = max(squeezingAxes: axes)
+        return (result, { v in
+            self._vjpMinMaxHelper(squeezingAxes: axes, originalValue: result, seed: v)
+        })
+    }
+
+    @inlinable
+    func _vjpMin(squeezingAxes axes: Tensor<Int32>) -> (Tensor, (Tensor) -> Tensor) {
+        let result = min(squeezingAxes: axes)
+        return (result, { v in
+            self._vjpMinMaxHelper(squeezingAxes: axes, originalValue: result, seed: v)
+        })
+    }
+
+    // Note: adapted from `_MinOrMaxGrad`:
+    // https://github.com/tensorflow/tensorflow/blob/r2.1/tensorflow/python/ops/math_grad.py#L223.
+    @inlinable
+    func _vjpMinMaxHelper(
+        alongAxes axes: Tensor<Int32>,
+        originalValue: Tensor,
+        seed: Tensor
+    ) -> Tensor {
+        // Compute the number of selected (maximum or minimum) elements in each reduction dimension.
+        // If there are multiple minimum or maximum elements then the gradient will be divided
+        // between them.
+        let indicators = Tensor(originalValue .== self)
+        let selectedCount = indicators.sum(alongAxes: axes)
+        return seed.broadcasted(toShape: self.shapeTensor) * indicators / selectedCount
+    }
+
+    @inlinable
+    func _vjpMax(alongAxes axes: Tensor<Int32>) -> (Tensor, (Tensor) -> Tensor) {
+        let result = max(alongAxes: axes)
+        return (result, { v in
+            self._vjpMinMaxHelper(alongAxes: axes, originalValue: result, seed: v)
+        })
+    }
+
+    @inlinable
+    func _vjpMin(alongAxes axes: Tensor<Int32>) -> (Tensor, (Tensor) -> Tensor) {
+        let result = min(alongAxes: axes)
+        return (result, { v in
+            self._vjpMinMaxHelper(alongAxes: axes, originalValue: result, seed: v)
+        })
+    }
 }
 
 // MARK: - Numeric Reductions
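
For intuition on the shared `_vjpMinMaxHelper` logic: when several elements tie for the extremum along a reduced dimension, the incoming gradient for that slice is split evenly among them (`indicators / selectedCount`). A minimal sketch (assuming a Swift for TensorFlow toolchain; values taken from the tie case in the tests below):

import TensorFlow

let x: Tensor<Float> = [[0, 1, 2], [2, 1, 0]]
// Column 1 ties (1 vs. 1), so its unit gradient is split 0.5 / 0.5;
// columns 0 and 2 route the full gradient to their unique minimum.
let grad = gradient(at: x) { $0.min(alongAxes: 0).sum() }
// grad == [[1.0, 0.5, 0.0], [0.0, 0.5, 1.0]]
print(grad)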

Tests/TensorFlowTests/TensorAutoDiffTests.swift

Lines changed: 141 additions & 31 deletions
@@ -284,77 +284,187 @@ final class TensorAutoDiffTests: XCTestCase {
         XCTAssertEqual(varianceGradAlongAxes(input), expected)
     }
 
-    func testMin() {
-        // The expected gradient values were computed using the following TensorFlow 2.0 Beta1
-        // Python code with respective `a` and `b` tensors:
+    func testMax() {
+        // Expected gradient values were computed using the following TensorFlow Python code:
         // ```
+        // import tensorflow as tf
         // with tf.GradientTape() as t:
         //     t.watch([a, b])
-        //     y = tf.math.reduce_sum(tf.minimum(a, b))
+        //     y = tf.reduce_sum(tf.maximum(a, b))
         // print(t.gradient(y, [a, b]))
         // ```
         do {
             let a = Tensor<Float>([4, 5, 3])
             let b = Tensor<Float>([4, 2, 6])
-            let computedGradient1 = gradient(at: a, b) { a, b in min(a, b).sum() }
-            let expectedGradient1: (Tensor<Float>, Tensor<Float>) = (
-                [1.0, 0.0, 1.0], [0.0, 1.0, 0.0])
+            let computedGradient1 = gradient(at: a, b) { a, b in max(a, b).sum() }
+            let expectedGradient1: (Tensor<Float>, Tensor<Float>) = ([1, 1, 0], [0, 0, 1])
             XCTAssertEqual(computedGradient1.0, expectedGradient1.0)
             XCTAssertEqual(computedGradient1.1, expectedGradient1.1)
 
-            let computedGradient2 = gradient(at: a, b) { a, b in min(b, a).sum() }
-            let expectedGradient2: (Tensor<Float>, Tensor<Float>) = (
-                [0.0, 0.0, 1.0], [1.0, 1.0, 0.0])
+            let computedGradient2 = gradient(at: a, b) { a, b in max(b, a).sum() }
+            let expectedGradient2: (Tensor<Float>, Tensor<Float>) = ([0, 1, 0], [1, 0, 1])
             XCTAssertEqual(computedGradient2.0, expectedGradient2.0)
             XCTAssertEqual(computedGradient2.1, expectedGradient2.1)
         }
-
         do {
-            let a = Tensor<Float>([[3.0, -2.0], [0.3, 10.0]])
-            let b = Tensor<Float>([9.0, -3.0])
-            let computedGradient = gradient(at: a, b) { a, b in min(a, b).sum() }
-            let expectedGradient: (Tensor<Float>, Tensor<Float>) = (
-                [[1.0, 0.0], [1.0, 0.0]], [0.0, 2.0])
+            let a = Tensor<Float>([[3, -2], [0.3, 10]])
+            let b = Tensor<Float>([9, -3])
+            let computedGradient = gradient(at: a, b) { a, b in max(a, b).sum() }
+            let expectedGradient: (Tensor<Float>, Tensor<Float>) = ([[0, 1], [0, 1]], [2, 0])
             XCTAssertEqual(computedGradient.0, expectedGradient.0)
             XCTAssertEqual(computedGradient.1, expectedGradient.1)
         }
     }
 
-    func testMax() {
-        // The expected gradient values were computed using the following TensorFlow 2.0 Beta1
-        // Python code with respective `a` and `b` tensors:
+    func testMin() {
+        // Expected gradient values were computed using the following TensorFlow Python code:
         // ```
+        // import tensorflow as tf
         // with tf.GradientTape() as t:
         //     t.watch([a, b])
-        //     y = tf.math.reduce_sum(tf.maximum(a, b))
+        //     y = tf.reduce_sum(tf.minimum(a, b))
         // print(t.gradient(y, [a, b]))
         // ```
         do {
             let a = Tensor<Float>([4, 5, 3])
             let b = Tensor<Float>([4, 2, 6])
-            let computedGradient1 = gradient(at: a, b) { a, b in max(a, b).sum() }
-            let expectedGradient1: (Tensor<Float>, Tensor<Float>) = (
-                [1.0, 1.0, 0.0], [0.0, 0.0, 1.0])
+            let computedGradient1 = gradient(at: a, b) { a, b in min(a, b).sum() }
+            let expectedGradient1: (Tensor<Float>, Tensor<Float>) = ([1, 0, 1], [0, 1, 0])
             XCTAssertEqual(computedGradient1.0, expectedGradient1.0)
             XCTAssertEqual(computedGradient1.1, expectedGradient1.1)
 
-            let computedGradient2 = gradient(at: a, b) { a, b in max(b, a).sum() }
-            let expectedGradient2: (Tensor<Float>, Tensor<Float>) = (
-                [0.0, 1.0, 0.0], [1.0, 0.0, 1.0])
+            let computedGradient2 = gradient(at: a, b) { a, b in min(b, a).sum() }
+            let expectedGradient2: (Tensor<Float>, Tensor<Float>) = ([0, 0, 1], [1, 1, 0])
             XCTAssertEqual(computedGradient2.0, expectedGradient2.0)
             XCTAssertEqual(computedGradient2.1, expectedGradient2.1)
         }
+
         do {
-            let a = Tensor<Float>([[3.0, -2.0], [0.3, 10.0]])
-            let b = Tensor<Float>([9.0, -3.0])
-            let computedGradient = gradient(at: a, b) { a, b in max(a, b).sum() }
-            let expectedGradient: (Tensor<Float>, Tensor<Float>) = (
-                [[0.0, 1.0], [0.0, 1.0]], [2.0, 0.0])
+            let a = Tensor<Float>([[3, -2], [0.3, 10]])
+            let b = Tensor<Float>([9, -3])
+            let computedGradient = gradient(at: a, b) { a, b in min(a, b).sum() }
+            let expectedGradient: (Tensor<Float>, Tensor<Float>) = ([[1, 0], [1, 0]], [0, 2])
             XCTAssertEqual(computedGradient.0, expectedGradient.0)
             XCTAssertEqual(computedGradient.1, expectedGradient.1)
         }
     }
 
+    func testMaxAlongAxes() {
+        // Expected gradient values were computed using the following TensorFlow Python code:
+        // ```
+        // import tensorflow as tf
+        // x = tf.constant(range(6), shape=(2, 3), dtype=float)
+        // with tf.GradientTape() as t:
+        //     t.watch(x)
+        //     y = tf.reduce_sum(tf.reduce_max(x, axis=0, keepdims=True))
+        // print(t.gradient(y, x))
+        // ```
+        func maxAlongAxesSum(_ x: Tensor<Float>) -> Tensor<Float> {
+            x.max(alongAxes: 0).sum()
+        }
+        do {
+            let x: Tensor<Float> = [[0, 1, 2], [3, 4, 5]]
+            let (value, computedGradient) = valueWithGradient(at: x, in: maxAlongAxesSum)
+            XCTAssertEqual(value, maxAlongAxesSum(x))
+            let expectedGradient: Tensor<Float> = [[0, 0, 0], [1, 1, 1]]
+            XCTAssertEqual(computedGradient, expectedGradient)
+        }
+        do {
+            let x: Tensor<Float> = [[0, 1, 2], [2, 1, 0]]
+            let (value, computedGradient) = valueWithGradient(at: x, in: maxAlongAxesSum)
+            XCTAssertEqual(value, maxAlongAxesSum(x))
+            let expectedGradient: Tensor<Float> = [[0, 0.5, 1], [1, 0.5, 0]]
+            XCTAssertEqual(computedGradient, expectedGradient)
+        }
+    }
+
+    func testMinAlongAxes() {
+        // Expected gradient values were computed using the following TensorFlow Python code:
+        // ```
+        // import tensorflow as tf
+        // x = tf.constant(range(6), shape=(2, 3), dtype=float)
+        // with tf.GradientTape() as t:
+        //     t.watch(x)
+        //     y = tf.reduce_sum(tf.reduce_min(x, axis=0, keepdims=True))
+        // print(t.gradient(y, x))
+        // ```
+        func minAlongAxesSum(_ x: Tensor<Float>) -> Tensor<Float> {
+            x.min(alongAxes: 0).sum()
+        }
+        do {
+            let x: Tensor<Float> = [[0, 1, 2], [3, 4, 5]]
+            let (value, computedGradient) = valueWithGradient(at: x, in: minAlongAxesSum)
+            XCTAssertEqual(value, minAlongAxesSum(x))
+            let expectedGradient: Tensor<Float> = [[1, 1, 1], [0, 0, 0]]
+            XCTAssertEqual(computedGradient, expectedGradient)
+        }
+        do {
+            let x: Tensor<Float> = [[0, 1, 2], [2, 1, 0]]
+            let (value, computedGradient) = valueWithGradient(at: x, in: minAlongAxesSum)
+            XCTAssertEqual(value, minAlongAxesSum(x))
+            let expectedGradient: Tensor<Float> = [[1, 0.5, 0], [0, 0.5, 1]]
+            XCTAssertEqual(computedGradient, expectedGradient)
+        }
+    }
+
+    func testMaxSqueezingAxes() {
+        // Expected gradient values were computed using the following TensorFlow Python code:
+        // ```
+        // import tensorflow as tf
+        // x = tf.constant(range(6), shape=(2, 3), dtype=float)
+        // with tf.GradientTape() as t:
+        //     t.watch(x)
+        //     y = tf.reduce_sum(tf.reduce_max(x, axis=0, keepdims=False))
+        // print(t.gradient(y, x))
+        // ```
+        func maxSqueezingAxesSum(_ x: Tensor<Float>) -> Tensor<Float> {
+            x.max(squeezingAxes: 0).sum()
+        }
+        do {
+            let x: Tensor<Float> = [[0, 1, 2], [3, 4, 5]]
+            let (value, computedGradient) = valueWithGradient(at: x, in: maxSqueezingAxesSum)
+            XCTAssertEqual(value, maxSqueezingAxesSum(x))
+            let expectedGradient: Tensor<Float> = [[0, 0, 0], [1, 1, 1]]
+            XCTAssertEqual(computedGradient, expectedGradient)
+        }
+        do {
+            let x: Tensor<Float> = [[0, 1, 2], [2, 1, 0]]
+            let (value, computedGradient) = valueWithGradient(at: x, in: maxSqueezingAxesSum)
+            XCTAssertEqual(value, maxSqueezingAxesSum(x))
+            let expectedGradient: Tensor<Float> = [[0, 0.5, 1], [1, 0.5, 0]]
+            XCTAssertEqual(computedGradient, expectedGradient)
+        }
+    }
+
+    func testMinSqueezingAxes() {
+        // Expected gradient values were computed using the following TensorFlow Python code:
+        // ```
+        // import tensorflow as tf
+        // x = tf.constant(range(6), shape=(2, 3), dtype=float)
+        // with tf.GradientTape() as t:
+        //     t.watch(x)
+        //     y = tf.reduce_sum(tf.reduce_min(x, axis=0, keepdims=False))
+        // print(t.gradient(y, x))
+        // ```
+        func minSqueezingAxesSum(_ x: Tensor<Float>) -> Tensor<Float> {
+            x.min(squeezingAxes: 0).sum()
+        }
+        do {
+            let x: Tensor<Float> = [[0, 1, 2], [3, 4, 5]]
+            let (value, computedGradient) = valueWithGradient(at: x, in: minSqueezingAxesSum)
+            XCTAssertEqual(value, minSqueezingAxesSum(x))
+            let expectedGradient: Tensor<Float> = [[1, 1, 1], [0, 0, 0]]
+            XCTAssertEqual(computedGradient, expectedGradient)
+        }
+        do {
+            let x: Tensor<Float> = [[0, 1, 2], [2, 1, 0]]
+            let (value, computedGradient) = valueWithGradient(at: x, in: minSqueezingAxesSum)
+            XCTAssertEqual(value, minSqueezingAxesSum(x))
+            let expectedGradient: Tensor<Float> = [[1, 0.5, 0], [0, 0.5, 1]]
+            XCTAssertEqual(computedGradient, expectedGradient)
+        }
+    }
+
     func testTensorInitStacking() {
         let a1 = Tensor<Float>([1, 2, 3, 4, 5])
         let b1 = Tensor<Float>([6, 7, 8, 9, 10])
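
To run just these tests from a swift-apis checkout (a usage sketch, assuming a Swift for TensorFlow toolchain and SwiftPM; `--filter` matches test names by regex):

swift test --filter TensorAutoDiffTests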
