Clarify relevance

rylev · rylev · commit 87ba918961bb · 2022-04-04T11:23:18.000+02:00
diff --git a/docs/glossary.md b/docs/glossary.md
@@ -25,20 +25,20 @@ The following is a glossary of domain specific terminology. Although benchmarks
 * **test case**: a combination of a benchmark, a profile, and a scenario.
 * **test**: the act of running an artifact under a test case. Each test result is composed of many iterations.
 * **test iteration**: a single iteration that makes up a test. Note: we currently normally run 2 test iterations for each test. 
-* **test result**: the result of the collection of all statistics from running a test. Currently the minimum of the statistics.
+* **test result**: the result of the collection of all statistics from running a test. Currently, the minimum value of a statistic from all the test iterations is used.
 * **statistic**: a single value of a metric in a test result
 * **statistic description**: the combination of a metric and a test case which describes a statistic.
 * **statistic series**: statistics for the same statistic description over time.
 * **run**: a collection of test results for all currently available test cases run on a given artifact. 
-* **test result delta**: the delta between two test results for the same test case but (optionally) different artifacts. The [comparison page](https://perf.rust-lang.org/compare.html) lists all the test result deltas as percentages comparing two runs.  
 
 ## Analysis
 
-* **test result delta**: the difference between two test results for the same metric and test case.
-* **significance threshold**: the threshold at which a test result delta is considered "significant" (i.e., a real change in performance and not just noise). This is calculated using [the upper IQR fence](https://www.statisticshowto.com/upper-and-lower-fences/#:~:text=Upper%20and%20lower%20fences%20cordon,%E2%80%93%20(1.5%20*%20IQR)) as seen [here](https://github.com/rust-lang/rustc-perf/blob/8ba845644b4cfcffd96b909898d7225931b55557/site/src/comparison.rs#L935-L941).
-* **significant test result delta**: a test result delta above the significance threshold. Significant test result deltas can be thought of as "statistically significant".
+* **test result comparison**: the delta between two test results for the same test case but (optionally) different artifacts. The [comparison page](https://perf.rust-lang.org/compare.html) lists all the test result comparisons as percentages between two runs.  
+* **significance threshold**: the threshold at which a test result comparison is considered "significant" (i.e., a real change in performance and not just noise). This is calculated using [the upper IQR fence](https://www.statisticshowto.com/upper-and-lower-fences/#:~:text=Upper%20and%20lower%20fences%20cordon,%E2%80%93%20(1.5%20*%20IQR)) as seen [here](https://github.com/rust-lang/rustc-perf/blob/8ba845644b4cfcffd96b909898d7225931b55557/site/src/comparison.rs#L935-L941).
+* **significant test result comparison**: a test result comparison above the significance threshold. Significant test result comparisons can be thought of as being "statistically significant".
+* **relevant test result comparison**: a test result comparison can be significant but still not be relevant (i.e., worth paying attention to). Relevance is a function of the test result comparison's *magnitude*. Comparisons are considered relevant if their magnitude is small or above. This term is often used to mean "significant *and* relevant" since relevant changes are necessarily also significant.
+* **test result comparison magnitude**: how "large" the delta is between the two test results under comparison. This is determined by the average of two factors: the absolute size of the change (e.g., a change of 5% is larger than a change of 1%) and the amount above the significance threshold (e.g., a change that is 5x the significance threshold is larger than a change that is 1.5x the significance threshold).
 * **dodgy test case**: a test case for which the significance threshold is significantly large indicating a high amount of variability in the test and thus making it necessary to be somewhat skeptical of any results too close to the significance threshold.
-* **relevant test result delta**: a synonym for *significant test result delta* in situations where the term "significant" might be ambiguous and readers may potentially interpret *significant* as "large" or "statistically significant". For example, in try run results, we use the term relevant instead of significant.
 
 ## Other 
 
diff --git a/site/src/comparison.rs b/site/src/comparison.rs
@@ -145,11 +145,11 @@ async fn populate_report(
     report: &mut HashMap<Option<Direction>, Vec<String>>,
 ) {
     if let Some(summary) = ComparisonSummary::summarize_comparison(comparison) {
-        let confidence = summary.confidence();
-        if confidence.is_atleast_probably_relevant() {
+        let relevance = summary.relevance_level();
+        if relevance.at_least_somewhat_relevant() {
             if let Some(direction) = summary.direction() {
                 let entry = report
-                    .entry(confidence.is_definitely_relevant().then(|| direction))
+                    .entry(relevance.very_relevant().then(|| direction))
                     .or_default();
 
                 entry.push(summary.write(comparison).await)
@@ -160,7 +160,7 @@ async fn populate_report(
 
 /// A summary of a given comparison
 ///
-/// This summary only includes changes that are significant and relevant (as determined by a changes magnitude).
+/// This summary only includes changes that are significant and relevant (as determined by a change's magnitude).
 pub struct ComparisonSummary {
     /// Significant comparisons of magnitude small and above
     /// and ordered by magnitude from largest to smallest
@@ -182,7 +182,7 @@ impl ComparisonSummary {
             .statistics
             .iter()
             .filter(|c| c.is_significant())
-            .filter(|c| c.magnitude().is_small_or_above())
+            .filter(|c| c.is_relevant())
             .inspect(|c| {
                 if c.is_improvement() {
                     num_improvements += 1;
@@ -361,24 +361,25 @@ impl ComparisonSummary {
         self.comparisons.iter().find(|s| s.is_regression())
     }
 
-    pub fn confidence(&self) -> ComparisonConfidence {
+    /// The relevance level of the entire comparison
+    pub fn relevance_level(&self) -> RelevanceLevel {
         let mut num_small_changes = 0;
         let mut num_medium_changes = 0;
         for c in self.comparisons.iter() {
             match c.magnitude() {
                 Magnitude::Small => num_small_changes += 1,
                 Magnitude::Medium => num_medium_changes += 1,
-                Magnitude::Large => return ComparisonConfidence::DefinitelyRelevant,
-                Magnitude::VeryLarge => return ComparisonConfidence::DefinitelyRelevant,
+                Magnitude::Large => return RelevanceLevel::High,
+                Magnitude::VeryLarge => return RelevanceLevel::High,
                 Magnitude::VerySmall => unreachable!(),
             }
         }
 
         match (num_small_changes, num_medium_changes) {
-            (_, m) if m > 1 => ComparisonConfidence::DefinitelyRelevant,
-            (_, 1) => ComparisonConfidence::ProbablyRelevant,
-            (s, 0) if s > 10 => ComparisonConfidence::ProbablyRelevant,
-            _ => ComparisonConfidence::MaybeRelevant,
+            (_, m) if m > 1 => RelevanceLevel::High,
+            (_, 1) => RelevanceLevel::Medium,
+            (s, 0) if s > 10 => RelevanceLevel::Medium,
+            _ => RelevanceLevel::Low,
         }
     }
 
@@ -553,22 +554,21 @@ pub fn write_summary_table(
     .unwrap();
 }
 
-/// The amount of confidence we have that a comparison actually represents a real
-/// change in the performance characteristics.
+/// How relevant a set of test result comparisons is.
 #[derive(Clone, Copy, Debug)]
-pub enum ComparisonConfidence {
-    MaybeRelevant,
-    ProbablyRelevant,
-    DefinitelyRelevant,
+pub enum RelevanceLevel {
+    Low,
+    Medium,
+    High,
 }
 
-impl ComparisonConfidence {
-    pub fn is_definitely_relevant(self) -> bool {
-        matches!(self, Self::DefinitelyRelevant)
+impl RelevanceLevel {
+    pub fn very_relevant(self) -> bool {
+        matches!(self, Self::High)
     }
 
-    pub fn is_atleast_probably_relevant(self) -> bool {
-        matches!(self, Self::DefinitelyRelevant | Self::ProbablyRelevant)
+    pub fn at_least_somewhat_relevant(self) -> bool {
+        matches!(self, Self::High | Self::Medium)
     }
 }
 
@@ -1040,6 +1040,15 @@ impl TestResultComparison {
         Some(change.abs() / threshold)
     }
 
+    /// Whether the comparison is relevant or not
+    fn is_relevant(&self) -> bool {
+        self.magnitude().is_small_or_above()
+    }
+
+    /// The magnitude of the change.
+    ///
+    /// This is the average of the absolute magnitude of the change
+    /// and the amount above the significance threshold.
     fn magnitude(&self) -> Magnitude {
         let change = self.relative_change().abs();
         let threshold = self.significance_threshold();
@@ -1054,7 +1063,7 @@ impl TestResultComparison {
         } else {
             Magnitude::VeryLarge
         };
-        let change_magnitude = if change < 0.002 {
+        let absolute_magnitude = if change < 0.002 {
             Magnitude::VerySmall
         } else if change < 0.01 {
             Magnitude::Small
@@ -1084,7 +1093,9 @@ impl TestResultComparison {
             }
         }
 
-        from_u8((as_u8(over_threshold) + as_u8(change_magnitude)) / 2)
+        // Take the average of the absolute magnitude and the magnitude
+        // above the significance threshold.
+        from_u8((as_u8(over_threshold) + as_u8(absolute_magnitude)) / 2)
     }
 
     fn is_dodgy(&self) -> bool {
diff --git a/site/src/github.rs b/site/src/github.rs
@@ -1,6 +1,6 @@
 use crate::api::github::Issue;
 use crate::comparison::{
-    write_summary_table, Comparison, ComparisonConfidence, ComparisonSummary, Direction,
+    write_summary_table, Comparison, ComparisonSummary, Direction, RelevanceLevel,
 };
 use crate::load::{Config, SiteCtxt, TryCommit};
 
@@ -741,7 +741,7 @@ fn generate_short_summary(
 
     match summary {
         Some(summary) => {
-            if comparison_is_relevant(summary.confidence(), is_master_commit) {
+            if comparison_is_relevant(summary.relevance_level(), is_master_commit) {
                 let direction = summary.direction().unwrap();
                 let num_improvements = summary.number_of_improvements();
                 let num_regressions = summary.number_of_regressions();
@@ -811,12 +811,12 @@ fn split_comparison(
 }
 
 /// Whether a comparison is relevant enough to show
-fn comparison_is_relevant(confidence: ComparisonConfidence, is_master_commit: bool) -> bool {
+fn comparison_is_relevant(relevance: RelevanceLevel, is_master_commit: bool) -> bool {
     if is_master_commit {
-        confidence.is_definitely_relevant()
+        relevance.very_relevant()
     } else {
         // is try run
-        confidence.is_atleast_probably_relevant()
+        relevance.at_least_somewhat_relevant()
     }
 }