Change significance to be determined by IQR fencing

rylev · rylev · commit d07ca42f9a33 · 2021-09-08T14:34:33.000+02:00
diff --git a/docs/comparison-analysis.md b/docs/comparison-analysis.md
@@ -18,11 +18,11 @@ At the core of comparison analysis are the collection of test results for the tw
 100 * ((statisticForArtifactB - statisticForArtifactA) / statisticForArtifactA)
 ```
 
-## High-level analysis description 
+## High-level analysis description
 
 Analysis of the changes is performed in order to determine whether artifact B represents a performance change over artifact A. At a high level the analysis performed takes the following form:
 
-How many _significant_ test results indicate performance changes and what is the magnitude of the changes (i.e., how large are the changes regardless of the direction of change)? 
+How many _significant_ test results indicate performance changes and what is the magnitude of the changes (i.e., how large are the changes regardless of the direction of change)?
 
 * If there are improvements and regressions with magnitude of medium or above then the comparison is mixed.
 * If there are only either improvements or regressions then the comparison is labeled with that kind.
@@ -37,7 +37,16 @@ Whether we actually _report_ an analysis or not depends on the context and how _
 
 ### What makes a test result significant?
 
-A test result is significant if the relative change percentage meets some threshold. What the threshold is depends of whether the test case is "dodgy" or not (see below for an examination of "dodginess"). For dodgy test cases, the threshold is set at 1%. For non-dodgy test cases, the threshold is set to 0.1%.
+A test result is significant if its relative change is an outlier when compared against historical data. Outliers are detected with interquartile range (IQR) "fencing": a value is an outlier if it exceeds the upper fence, i.e., the third quartile plus 1.5 times the interquartile range:
+
+```
+interquartile_range = Q3 - Q1
+result > Q3 + (interquartile_range * 1.5)
+```
+
+(Assuming the data is ordered, Q3 is the median of the upper half of the data while Q1 is the median of the lower half.)
+
+We ignore the lower fence because the data being fenced (the magnitudes of relative changes) is bounded below by 0.
 
 ### What makes a test case "dodgy"?
 
diff --git a/site/src/comparison.rs b/site/src/comparison.rs
@@ -639,24 +639,30 @@ impl BenchmarkVariance {
         self.data.iter().sum::<f64>() / self.data.len() as f64
     }
 
+    /// Each absolute delta between adjacent results divided by the first result of its pair (a ratio, not scaled by 100), sorted ascending
+    fn percent_changes(&self) -> Vec<f64> {
+        let mut deltas = self
+            .deltas()
+            .zip(self.data.iter()) // delta `i` is paired with `data[i]`; zip stops at the shorter side, so the last result is unused
+            .map(|(d, &r)| d / r) // NOTE(review): a zero result yields inf/NaN here -- confirm results are always non-zero
+            .collect::<Vec<_>>();
+        deltas.sort_by(|d1, d2| d1.partial_cmp(d2).unwrap_or(std::cmp::Ordering::Equal)); // incomparable values (NaN) are treated as equal
+        deltas
+    }
+
     fn calculate_description(&mut self) {
         self.description = BenchmarkVarianceDescription::Normal;
 
         let results_mean = self.mean();
-        let mut deltas = self
-            .data
-            .windows(2)
-            .map(|window| (window[0] - window[1]).abs())
-            .collect::<Vec<_>>();
-        deltas.sort_by(|d1, d2| d1.partial_cmp(d2).unwrap_or(std::cmp::Ordering::Equal));
-        let non_significant = deltas
+        let percent_changes = self.percent_changes();
+        let non_significant = percent_changes
             .iter()
-            .zip(self.data.iter())
-            .take_while(|(&d, &r)| d / r < Self::SIGNFICANT_DELTA_THRESHOLD)
+            .take_while(|&&c| c < Self::SIGNFICANT_DELTA_THRESHOLD)
             .collect::<Vec<_>>();
 
-        let percent_significant_changes =
-            ((deltas.len() - non_significant.len()) as f64 / deltas.len() as f64) * 100.0;
+        let percent_significant_changes = ((percent_changes.len() - non_significant.len()) as f64
+            / percent_changes.len() as f64)
+            * 100.0;
         debug!(
             "Percent significant changes: {:.1}%",
             percent_significant_changes
@@ -668,14 +674,20 @@ impl BenchmarkVariance {
         }
 
         let delta_mean =
-            non_significant.iter().map(|(&d, _)| d).sum::<f64>() / (non_significant.len() as f64);
+            non_significant.iter().cloned().sum::<f64>() / (non_significant.len() as f64);
         let ratio_change = delta_mean / results_mean;
-        debug!("Ratio change: {:.3}", ratio_change);
         if ratio_change > Self::NOISE_THRESHOLD {
             self.description = BenchmarkVarianceDescription::Noisy;
         }
     }
 
+    /// Absolute difference between each pair of adjacent results, in `data` order; yields nothing when there are fewer than two results
+    fn deltas(&self) -> impl Iterator<Item = f64> + '_ {
+        self.data
+            .windows(2) // overlapping pairs: (data[0], data[1]), (data[1], data[2]), ...
+            .map(|window| (window[0] - window[1]).abs())
+    }
+
     /// Whether we can trust this benchmark or not
     fn is_dodgy(&self) -> bool {
         matches!(
@@ -740,13 +752,9 @@ pub struct TestResultComparison {
 
 impl TestResultComparison {
     /// The amount of relative change considered significant when
-    /// the test case is not dodgy
+    /// we cannot determine from historical data
     const SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD: f64 = 0.002;
 
-    /// The amount of relative change considered significant when
-    /// the test case is dodgy
-    const SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD_DODGY: f64 = 0.008;
-
     fn is_regression(&self) -> bool {
         let (a, b) = self.results;
         b > a
@@ -761,8 +769,28 @@ impl TestResultComparison {
     }
 
     fn signifcance_threshold(&self) -> f64 {
-        if self.is_dodgy() {
-            Self::SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD_DODGY
+        if let Some(pcs) = self.variance.as_ref().map(|s| s.percent_changes()) {
+            fn median(data: &[f64]) -> f64 {
+                if data.len() % 2 == 0 {
+                    (data[(data.len() - 1) / 2] + data[data.len() / 2]) / 2.0
+                } else {
+                    data[data.len() / 2]
+                }
+            }
+
+            let len = pcs.len();
+            let (h1_end, h2_begin) = if len % 2 == 0 {
+                (len / 2 - 2, len / 2 + 1)
+            } else {
+                (len / 2 - 1, len / 2 + 1)
+            };
+            // significance is determined by the upper
+            // interquartile range fence
+            let q1 = median(&pcs[..=h1_end]);
+            let q3 = median(&pcs[h2_begin..]);
+            let iqr = q3 - q1;
+            let upper_fence = q3 + (iqr * 1.5);
+            upper_fence
         } else {
             Self::SIGNIFICANT_RELATIVE_CHANGE_THRESHOLD
         }