This repository was archived by the owner on Jul 1, 2023. It is now read-only.

Commit 9a47e3c

joaogui1, Brad Larson, and Bart Chrzaszcz authored
[WIP] Equations for losses (#579)
* Draft equations for common losses
* Better wording (Co-Authored-By: Brad Larson <[email protected]>)
* Fix typo (Co-Authored-By: Brad Larson <[email protected]>)
* Fix typo (Co-Authored-By: Brad Larson <[email protected]>)
* Fix typo (Co-Authored-By: Brad Larson <[email protected]>)
* Fix typo (Co-Authored-By: Brad Larson <[email protected]>)
* Fix typo (Co-Authored-By: Brad Larson <[email protected]>)
* All non-categorical losses
* Compatibility with tf2.x documentation
* hinge -> Hinge (Co-Authored-By: Bart Chrzaszcz <[email protected]>)
* Fix typo (Co-Authored-By: Bart Chrzaszcz <[email protected]>)
* Hinge -> hinge (Co-Authored-By: Bart Chrzaszcz <[email protected]>)
* Remove extra space (Co-Authored-By: Bart Chrzaszcz <[email protected]>)
* Add spaces for consistency (Co-Authored-By: Bart Chrzaszcz <[email protected]>)
* Add spaces for consistency (Co-Authored-By: Bart Chrzaszcz <[email protected]>)
* L losses draft and Hinge losses
* mean errors
* cosh and poisson
* cross-entropies and KL divergence
* L* losses and Huber
* Typo (Co-Authored-By: Brad Larson <[email protected]>)

Co-authored-by: Brad Larson <[email protected]>
Co-authored-by: Bart Chrzaszcz <[email protected]>
1 parent f2acd6c commit 9a47e3c

1 file changed (+45, −23 lines)

Sources/TensorFlow/Loss.swift

Lines changed: 45 additions & 23 deletions
@@ -12,7 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-/// Returns the L1 loss between predictions and expectations.
+/// Computes the L1 loss between `expected` and `predicted`.
+/// `loss = reduction(abs(expected - predicted))`
 ///
 /// - Parameters:
 ///   - predicted: Predicted outputs from a neural network.
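For illustration, a minimal sketch of the `l1Loss` call documented above, passing the reduction explicitly as a closure (the tensor values are made up):

```swift
import TensorFlow

let predicted = Tensor<Float>([1.0, 2.0, 3.0])
let expected = Tensor<Float>([0.5, 2.5, 3.0])
// abs(expected - predicted) = [0.5, 0.5, 0.0]; summing gives 1.0.
let loss = l1Loss(predicted: predicted, expected: expected, reduction: { $0.sum() })
```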
@@ -27,7 +28,8 @@ public func l1Loss<Scalar: TensorFlowFloatingPoint>(
     reduction(abs(expected - predicted))
 }
 
-/// Returns the L2 loss between predictions and expectations.
+/// Computes the L2 loss between `expected` and `predicted`.
+/// `loss = reduction(square(expected - predicted))`
 ///
 /// - Parameters:
 ///   - predicted: Predicted outputs from a neural network.
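The L2 variant squares the element-wise differences before reducing; a matching sketch with the same illustrative values:

```swift
import TensorFlow

let predicted = Tensor<Float>([1.0, 2.0, 3.0])
let expected = Tensor<Float>([0.5, 2.5, 3.0])
// square(expected - predicted) = [0.25, 0.25, 0.0]; summing gives 0.5.
let loss = l2Loss(predicted: predicted, expected: expected, reduction: { $0.sum() })
```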
@@ -42,7 +44,8 @@ public func l2Loss<Scalar: TensorFlowFloatingPoint>(
     reduction((expected - predicted).squared())
 }
 
-/// Returns the mean absolute error between predictions and expectations.
+/// Computes the mean of absolute difference between labels and predictions.
+/// `loss = mean(abs(expected - predicted))`
 ///
 /// - Parameters:
 ///   - predicted: Predicted outputs from a neural network.
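`meanAbsoluteError` is `l1Loss` with the mean reduction baked in, so no `reduction` argument is needed (illustrative values):

```swift
import TensorFlow

let predicted = Tensor<Float>([1.0, 2.0, 3.0])
let expected = Tensor<Float>([0.5, 2.5, 3.0])
// mean(abs(expected - predicted)) = mean([0.5, 0.5, 0.0]) ≈ 0.3333
let loss = meanAbsoluteError(predicted: predicted, expected: expected)
```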
@@ -55,7 +58,8 @@ public func meanAbsoluteError<Scalar: TensorFlowFloatingPoint>(
     l1Loss(predicted: predicted, expected: expected, reduction: _mean)
 }
 
-/// Returns the mean squared error between predictions and expectations.
+/// Computes the mean of squares of errors between labels and predictions.
+/// `loss = mean(square(expected - predicted))`
 ///
 /// - Parameters:
 ///   - predicted: Predicted outputs from a neural network.
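Likewise, `meanSquaredError` is `l2Loss` reduced by the mean (illustrative values):

```swift
import TensorFlow

let predicted = Tensor<Float>([1.0, 2.0, 3.0])
let expected = Tensor<Float>([0.5, 2.5, 3.0])
// mean(square(expected - predicted)) = mean([0.25, 0.25, 0.0]) ≈ 0.1667
let loss = meanSquaredError(predicted: predicted, expected: expected)
```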
@@ -68,7 +72,8 @@ public func meanSquaredError<Scalar: TensorFlowFloatingPoint>(
     l2Loss(predicted: predicted, expected: expected, reduction: _mean)
 }
 
-/// Returns the mean squared logarithmic error between predictions and expectations.
+/// Computes the mean squared logarithmic error between `predicted` and `expected`.
+/// `loss = square(log(expected) - log(predicted))`
 ///
 /// - Note: Negative tensor entries will be clamped at `0` to avoid undefined
 ///   logarithmic behavior, as `log(_:)` is undefined for negative reals.
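A sketch of `meanSquaredLogarithmicError` with non-negative entries; per the note above, negative entries would be clamped to `0` before the logarithm:

```swift
import TensorFlow

let predicted = Tensor<Float>([1.0, 3.0])
let expected = Tensor<Float>([2.0, 3.0])
// mean(square(log(expected) - log(predicted))) over the two elements.
let loss = meanSquaredLogarithmicError(predicted: predicted, expected: expected)
```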
@@ -86,7 +91,8 @@ public func meanSquaredLogarithmicError<Scalar: TensorFlowFloatingPoint>(
     return l2Loss(predicted: logPredicted, expected: logExpected, reduction: _mean)
 }
 
-/// Returns the mean absolute percentage error between predictions and expectations.
+/// Computes the mean absolute percentage error between `predicted` and `expected`.
+/// `loss = 100 * mean(abs((expected - predicted) / abs(expected)))`
 ///
 /// - Parameters:
 ///   - predicted: Predicted outputs from a neural network.
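A sketch of `meanAbsolutePercentageError`; with these illustrative values each relative error is 0.1, so the loss is 10:

```swift
import TensorFlow

let predicted = Tensor<Float>([110.0, 90.0])
let expected = Tensor<Float>([100.0, 100.0])
// 100 * mean(abs((expected - predicted) / abs(expected))) = 100 * mean([0.1, 0.1]) = 10
let loss = meanAbsolutePercentageError(predicted: predicted, expected: expected)
```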
@@ -99,7 +105,9 @@ public func meanAbsolutePercentageError<Scalar: TensorFlowFloatingPoint>(
     100 * abs((expected - predicted) / abs(expected)).mean()
 }
 
-/// Returns the hinge loss between predictions and expectations.
+/// Computes the hinge loss between `predicted` and `expected`.
+/// `loss = reduction(max(0, 1 - predicted * expected))`
+/// `expected` values are expected to be -1 or 1.
 ///
 /// - Parameters:
 ///   - predicted: Predicted outputs from a neural network.
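A hinge-loss sketch; the labels are -1 or 1 as the new doc line requires, and the library's default reduction is assumed (pass `reduction:` explicitly to control it):

```swift
import TensorFlow

let predicted = Tensor<Float>([0.8, -0.4, 1.2])
let expected = Tensor<Float>([1, -1, 1])
// max(0, 1 - predicted * expected) = [0.2, 0.6, 0.0], then reduced.
let loss = hingeLoss(predicted: predicted, expected: expected)
```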
@@ -114,7 +122,9 @@ public func hingeLoss<Scalar: TensorFlowFloatingPoint>(
     reduction(max(Tensor(0), Tensor(1) - expected * predicted))
 }
 
-/// Returns the squared hinge loss between predictions and expectations.
+/// Computes the squared hinge loss between `predicted` and `expected`.
+/// `loss = reduction(square(max(0, 1 - predicted * expected)))`
+/// `expected` values are expected to be -1 or 1.
 ///
 /// - Parameters:
 ///   - predicted: Predicted outputs from a neural network.
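The squared variant simply squares each hinge term (same illustrative values, default reduction assumed):

```swift
import TensorFlow

let predicted = Tensor<Float>([0.8, -0.4, 1.2])
let expected = Tensor<Float>([1, -1, 1])
// square(max(0, 1 - predicted * expected)) = [0.04, 0.36, 0.0], then reduced.
let loss = squaredHingeLoss(predicted: predicted, expected: expected)
```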
@@ -129,7 +139,10 @@ public func squaredHingeLoss<Scalar: TensorFlowFloatingPoint>(
     reduction(hingeLoss(predicted: predicted, expected: expected).squared())
 }
 
-/// Returns the hinge loss between predictions and expectations.
+/// Computes the categorical hinge loss between `predicted` and `expected`.
+/// `loss = maximum(negative - positive + 1, 0)`,
+/// where `negative = max((1 - expected) * predicted)` and
+/// `positive = sum(predicted * expected)`.
 ///
 /// - Parameters:
 ///   - predicted: Predicted outputs from a neural network.
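For the categorical hinge loss, `expected` is one-hot over the classes; a sketch with made-up scores for two examples and three classes:

```swift
import TensorFlow

// Shape [2, 3]: two examples, three classes (illustrative values).
let predicted = Tensor<Float>(shape: [2, 3], scalars: [0.3, 0.5, 0.2,
                                                       0.6, 0.1, 0.3])
let expected = Tensor<Float>(shape: [2, 3], scalars: [0, 1, 0,
                                                      1, 0, 0])
let loss = categoricalHingeLoss(predicted: predicted, expected: expected)
```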
@@ -146,8 +159,9 @@ public func categoricalHingeLoss<Scalar: TensorFlowFloatingPoint>(
     return reduction(max(Tensor(0), negative - positive + Tensor(1)))
 }
 
-/// Returns the logarithm of the hyperbolic cosine of the error between predictions and
-/// expectations.
+/// Computes the logarithm of the hyperbolic cosine of the prediction error.
+/// `logcosh = log((exp(x) + exp(-x))/2)`,
+/// where `x` is the error `predicted - expected`.
 ///
 /// - Parameters:
 ///   - predicted: Predicted outputs from a neural network.
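A log-cosh sketch; the implementation in the next hunk uses the numerically stable form `x + softplus(-2x) - log(2)` rather than evaluating `cosh` directly (illustrative values, default reduction assumed):

```swift
import TensorFlow

let predicted = Tensor<Float>([1.0, 3.0])
let expected = Tensor<Float>([0.0, 3.0])
// log(cosh(x)) for x = predicted - expected = [1.0, 0.0], then reduced.
let loss = logCoshLoss(predicted: predicted, expected: expected)
```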
@@ -163,7 +177,9 @@ public func logCoshLoss<Scalar: TensorFlowFloatingPoint>(
     return reduction(x + softplus(Tensor(-2) * x) - log(Tensor(2)))
 }
 
-/// Returns the Poisson loss between predictions and expectations.
+/// Computes the Poisson loss between `predicted` and `expected`.
+/// The Poisson loss is the mean of the elements of the `Tensor`
+/// `predicted - expected * log(predicted)`.
 ///
 /// - Parameters:
 ///   - predicted: Predicted outputs from a neural network.
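A Poisson-loss sketch; `predicted` must be positive since its logarithm is taken (illustrative values, default reduction assumed):

```swift
import TensorFlow

let predicted = Tensor<Float>([1.0, 2.0])
let expected = Tensor<Float>([1.0, 3.0])
// Element-wise: predicted - expected * log(predicted), then reduced.
let loss = poissonLoss(predicted: predicted, expected: expected)
```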
@@ -178,8 +194,8 @@ public func poissonLoss<Scalar: TensorFlowFloatingPoint>(
     reduction(predicted - expected * log(predicted))
 }
 
-/// Returns the Kullback-Leibler divergence (KL divergence) between between expectations and
-/// predictions. Given two distributions `p` and `q`, KL divergence computes `p * log(p / q)`.
+/// Computes the Kullback-Leibler divergence loss between `expected` and `predicted`.
+/// `loss = reduction(expected * log(expected / predicted))`
 ///
 /// - Parameters:
 ///   - predicted: Predicted outputs from a neural network.
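A KL-divergence sketch; both tensors are treated as already-normalized probability distributions (illustrative values, default reduction assumed):

```swift
import TensorFlow

let predicted = Tensor<Float>([0.25, 0.25, 0.5])
let expected = Tensor<Float>([0.5, 0.25, 0.25])
// Element-wise: expected * log(expected / predicted), then reduced.
let loss = kullbackLeiblerDivergence(predicted: predicted, expected: expected)
```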
@@ -194,7 +210,10 @@ public func kullbackLeiblerDivergence<Scalar: TensorFlowFloatingPoint>(
     reduction(expected * log(expected / predicted))
 }
 
-/// Returns the softmax cross entropy (categorical cross entropy) between logits and labels.
+/// Computes the sparse softmax cross entropy (categorical cross entropy) between logits and labels.
+/// Use this cross-entropy loss function when there are two or more label classes.
+/// We expect labels to be provided as integers. There should be `# classes`
+/// floating point values per feature for `logits` and a single floating point value per feature for `expected`.
 ///
 /// - Parameters:
 ///   - logits: One-hot encoded outputs from a neural network.
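A sketch of the sparse variant: integer class indices for the labels, one row of unnormalized scores per example (illustrative values; `Int32` labels assumed):

```swift
import TensorFlow

// Two examples over three classes.
let logits = Tensor<Float>(shape: [2, 3], scalars: [2.0, 1.0, 0.1,
                                                    0.4, 2.2, 0.6])
let labels = Tensor<Int32>([0, 1])  // Correct class index per example.
let loss = softmaxCrossEntropy(logits: logits, labels: labels)
```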
@@ -228,7 +247,10 @@ func _vjpSoftmaxCrossEntropyHelper<Scalar: TensorFlowFloatingPoint>(
     return (loss, { $0.expandingShape(at: -1) * grad })
 }
 
-/// Returns the softmax cross entropy (categorical cross entropy) between logits and labels.
+/// Computes the softmax cross entropy (categorical cross entropy) between logits and labels.
+/// Use this cross-entropy loss function when there are two or more label classes.
+/// We expect labels to be provided in a `one_hot` representation.
+/// There should be `# classes` floating point values per feature.
 ///
 /// - Parameters:
 ///   - logits: Unscaled log probabilities from a neural network.
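A sketch of the one-hot variant; the `probabilities:` argument label is assumed from the library's overload for this representation (illustrative values):

```swift
import TensorFlow

let logits = Tensor<Float>(shape: [2, 3], scalars: [2.0, 1.0, 0.1,
                                                    0.4, 2.2, 0.6])
// One-hot targets: one `# classes`-wide row per example.
let probabilities = Tensor<Float>(shape: [2, 3], scalars: [1, 0, 0,
                                                           0, 1, 0])
let loss = softmaxCrossEntropy(logits: logits, probabilities: probabilities)
```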
@@ -263,10 +285,10 @@ func _vjpSoftmaxCrossEntropyHelper<Scalar: TensorFlowFloatingPoint>(
     return (loss, { $0.expandingShape(at: -1) * grad })
 }
 
-/// Returns the sigmoid cross entropy (binary cross entropy) between logits and labels.
-///
-/// The reduction is reduced over all elements. If reduced over batch size is intended, please
-/// consider to scale the loss.
+/// Computes the sigmoid cross entropy (binary cross entropy) between logits and labels.
+/// Use this cross-entropy loss when there are only two label classes (assumed to
+/// be 0 and 1). For each example, there should be a single floating-point value
+/// per prediction.
 ///
 /// - Parameters:
 ///   - logits: The unscaled output of a neural network.
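A sigmoid cross-entropy sketch with binary targets in {0, 1} and one floating-point logit per prediction (illustrative values, default reduction assumed):

```swift
import TensorFlow

let logits = Tensor<Float>([1.2, -0.3, 2.4])
let labels = Tensor<Float>([1, 0, 1])  // Binary targets, one per logit.
let loss = sigmoidCrossEntropy(logits: logits, labels: labels)
```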
@@ -284,10 +306,10 @@ public func sigmoidCrossEntropy<Scalar: TensorFlowFloatingPoint>(
     return reduction(maxLogitsWithZero - logits * labels + log1p(exp(-negAbsLogits)))
 }
 
-/// Returns the Huber loss between predictions and expectations.
+/// Computes the Huber loss between `predicted` and `expected`.
 ///
-/// For each value `x` in the difference `expected - predicted`, the loss is:
-/// - `0.5 * x^2` if `abs(x) <= δ`.
+/// For each value `x` in `error = expected - predicted`:
+/// - `0.5 * x^2` if `|x| <= δ`.
 /// - `0.5 * δ^2 + δ * (|x| - δ)` otherwise.
 ///
 /// - Source: [Wikipedia article](https://en.wikipedia.org/wiki/Huber_loss).
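A Huber-loss sketch; the `delta:` parameter is assumed to correspond to δ in the piecewise definition above (illustrative values):

```swift
import TensorFlow

// δ = 1: quadratic for small errors, linear beyond |x| = 1.
let predicted = Tensor<Float>([0.9, 3.0])
let expected = Tensor<Float>([1.0, 0.0])
// Errors are [0.1, -3.0]: the first falls in the quadratic regime,
// the second in the linear one.
let loss = huberLoss(predicted: predicted, expected: expected, delta: 1)
```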
