
Commit 528fb67

[TF] Reimplement unbroadcast using on-host axis calculation for performance. (#24907)
The inefficiency of `unbroadcast(toShape:)`, `unbroadcast(to:)`, and `unbroadcast(like:)` has caused significant performance problems during model training, because they perform many TensorFlow operations just to compute the reduction axes. We were forced to implement them this way in the early GPE era, when neither send/receive nor per-op dispatch was available. This PR reimplements the unbroadcast operations in terms of host-side logic that computes the axes to reduce along, which significantly reduces TensorFlow operation dispatch overhead. The base implementation changed from `unbroadcast(toShape:)` to `unbroadcast(to:)`. With the new implementation, differentiating broadcasting operators is 37% faster (see the simple test script [here](https://gist.github.com/rxwei/e1488cac5379ba2bc3aff7490e18158f)).

Note:
- Since we now rely less on the TensorFlow runtime, more precondition checks and assertions have been added to the newly implemented `unbroadcast(to:)` method.
- The part of #24408 that uses `Raw.broadcastGradientArgs(s0:s1:)` is still necessary for broadcasting binary operations to become faster.

TODO:
- Change the `unbroadcast(toShape:)` tests added in #24899 to use `unbroadcast(to:)`, since `unbroadcast(to:)` is now the base implementation.
1 parent 9e20d2e commit 528fb67
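
To make the on-host axis calculation concrete, here is a minimal standalone sketch of the logic using plain Swift arrays rather than the library's `Tensor`/`TensorShape` types; the function name and signature are illustrative only, not part of this commit.

// Sketch: compute the axes a tensor of shape `dimensions` must be summed
// along so that it can be reshaped to `targetDimensions` (the inverse of
// broadcasting). Plain Swift, runnable without the TensorFlow runtime.
func unbroadcastAxes(from dimensions: [Int], to targetDimensions: [Int]) -> [Int] {
    var padded = targetDimensions
    let rankDifference = dimensions.count - padded.count
    precondition(rankDifference >= 0,
                 "The source rank must be at least the destination rank")
    // Left-pad the destination shape with 1s so both shapes have equal rank.
    if rankDifference > 0 {
        padded.insert(contentsOf: repeatElement(1, count: rankDifference), at: 0)
    }
    var axes: [Int] = []
    for (i, (dim, targetDim)) in zip(dimensions, padded).enumerated() {
        if dim == targetDim { continue }
        if targetDim == 1 { axes.append(i); continue }
        preconditionFailure("Cannot unbroadcast \(dimensions) to \(targetDimensions)")
    }
    return axes
}

// Example: a [2, 3, 4] tensor unbroadcast to [3, 1] reduces along axes 0 and 2.
print(unbroadcastAxes(from: [2, 3, 4], to: [3, 1]))  // [0, 2]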

2 files changed: 37 additions & 21 deletions

stdlib/public/TensorFlow/Gradients.swift

Lines changed: 3 additions & 5 deletions
@@ -645,16 +645,14 @@ extension Tensor where Scalar : TensorFlowFloatingPoint {
   func _vjpBroadcast(
     toShape shape: Tensor<Int32>
   ) -> (Tensor, (Tensor) -> Tensor) {
-    return (broadcast(toShape: shape), { [origShape = self.shapeTensor] v in
+    return (broadcast(toShape: shape), { [origShape = shapeTensor] v in
       v.unbroadcast(toShape: origShape)
     })
   }

   @inlinable
-  func _vjpUnbroadcast(
-    toShape shape: Tensor<Int32>
-  ) -> (Tensor, (Tensor) -> Tensor) {
-    return (unbroadcast(toShape: shape), { [origShape = self.shapeTensor] v in
+  func _vjpUnbroadcast(to shape: TensorShape) -> (Tensor, (Tensor) -> Tensor) {
+    return (unbroadcast(to: shape), { [origShape = shapeTensor] v in
       v.broadcast(toShape: origShape)
     })
   }
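
For intuition on why these two pullbacks mirror each other: broadcasting copies a value along new or size-1 axes, and the adjoint of a copy is a sum over the copies, which is exactly what unbroadcast performs. A minimal sketch with plain Swift values (illustrative names, not the library API):

// Broadcasting a scalar to `count` copies; the pullback of that copy
// operation sums the incoming cotangents back into a single scalar.
func broadcastScalar(_ x: Double, count: Int) -> [Double] {
    Array(repeating: x, count: count)
}

func broadcastScalarPullback(_ cotangent: [Double]) -> Double {
    cotangent.reduce(0, +)  // adjoint of "copy n times" is "sum n values"
}

let y = broadcastScalar(2.0, count: 3)              // [2.0, 2.0, 2.0]
let dx = broadcastScalarPullback([1.0, 1.0, 1.0])   // 3.0
print(y, dx)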

stdlib/public/TensorFlow/Ops.swift

Lines changed: 34 additions & 16 deletions
@@ -1601,40 +1601,36 @@ public extension Tensor {
 public extension Tensor {
   @inlinable
   @differentiable(wrt: self, vjp: _vjpBroadcast(toShape:)
-    where Scalar : TensorFlowFloatingPoint)
+                  where Scalar : TensorFlowFloatingPoint)
   func broadcast(toShape shape: Tensor<Int32>) -> Tensor {
     return Raw.broadcastTo(self, shape: shape)
   }

   @inlinable
   @differentiable(wrt: self where Scalar : TensorFlowFloatingPoint)
   func broadcast(to shape: TensorShape) -> Tensor {
-    return broadcast(toShape: Tensor<Int32>({ shape.dimensions.map(Int32.init) }()))
+    return broadcast(
+      toShape: Tensor<Int32>({ shape.dimensions.map(Int32.init) }()))
   }

   /// Broadcast to the same shape as the specified `Tensor`.
   /// - Precondition: The specified shape must be compatible for broadcasting.
   @inlinable
-  @differentiable(wrt: self
-    where Scalar : TensorFlowFloatingPoint)
+  @differentiable(wrt: self where Scalar : TensorFlowFloatingPoint)
   func broadcast<OtherScalar>(like other: Tensor<OtherScalar>) -> Tensor {
     return broadcast(toShape: other.shapeTensor)
   }
 }

 public extension Tensor where Scalar : Numeric {
   @inlinable
-  @differentiable(wrt: self, vjp: _vjpUnbroadcast(toShape:)
-    where Scalar : TensorFlowFloatingPoint)
+  @differentiable(wrt: self where Scalar : TensorFlowFloatingPoint)
   func unbroadcast(toShape otherShape: Tensor<Int32>) -> Tensor {
-    let rankDiff = (rankTensor - otherShape.scalarCountTensor).rankLifted()
-    let ones: Tensor<Int32> = Raw.fill(dims: rankDiff, value: Tensor<Int32>(1))
-    let paddedShape = ones ++ otherShape
-    let nonEqualIndices = paddedShape .!= shapeTensor
-    let broadcastIndices = Raw.where_(nonEqualIndices).flattened()
-    let unbroadcasted: Tensor = Raw.sum(
-      self, reductionIndices: Tensor<Int32>(broadcastIndices), keepDims: false)
-    return Raw.reshape(unbroadcasted, shape: otherShape)
+    // TODO: Simplify this once differentiating control flow is supported.
+    return unbroadcast(to: {
+      precondition(otherShape.rank == 1)
+      return TensorShape(otherShape.scalars.map(Int.init))
+    }())
   }

   @inlinable
@@ -1644,9 +1640,31 @@ public extension Tensor where Scalar : Numeric {
   }

   @inlinable
-  @differentiable(wrt: self where Scalar : TensorFlowFloatingPoint)
+  @differentiable(wrt: self, vjp: _vjpUnbroadcast(to:)
+                  where Scalar : TensorFlowFloatingPoint)
   func unbroadcast(to shape: TensorShape) -> Tensor {
-    return unbroadcast(toShape: Tensor<Int32>({ shape.dimensions.map(Int32.init) }()))
+    let dimensions = self.shape.dimensions
+    var otherDimensions = shape.dimensions
+    let rankDifference = dimensions.count - otherDimensions.count
+    precondition(rankDifference >= 0, """
+      The rank of 'self' must be greater than or equal to the number of \
+      dimensions in the destination shape
+      """)
+    if rankDifference > 0 {
+      otherDimensions.insert(
+        contentsOf: repeatElement(1, count: rankDifference),
+        at: 0
+      )
+    }
+    assert(dimensions.count == otherDimensions.count)
+    var axes: [Int] = []
+    axes.reserveCapacity(dimensions.count)
+    for (i, (dim, otherDim)) in zip(dimensions, otherDimensions).enumerated() {
+      if dim == otherDim { continue }
+      if otherDim == 1 { axes.append(i); continue }
+      preconditionFailure("Cannot unbroadcast \(self.shape) to \(shape)")
+    }
+    return sum(alongAxes: axes).reshaped(to: shape)
   }

   @inlinable
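
As a usage note, broadcasting binary operators typically call `unbroadcast(to:)` in their pullbacks so each operand's gradient is reduced back to that operand's original shape. The following is a hedged sketch under that assumption, not the library's actual implementation (the commit message notes that the faster path for binary operators still relies on `Raw.broadcastGradientArgs(s0:s1:)` from #24408):

// Illustrative VJP for a broadcasting addition: the incoming cotangent `v`
// has the broadcasted result shape; each operand's gradient is unbroadcast
// back to that operand's captured original shape.
func vjpBroadcastingAdd<Scalar: TensorFlowFloatingPoint>(
  _ lhs: Tensor<Scalar>, _ rhs: Tensor<Scalar>
) -> (Tensor<Scalar>, (Tensor<Scalar>) -> (Tensor<Scalar>, Tensor<Scalar>)) {
  let result = lhs + rhs
  return (result, { [lhsShape = lhs.shape, rhsShape = rhs.shape] v in
    (v.unbroadcast(to: lhsShape), v.unbroadcast(to: rhsShape))
  })
}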
