1use rand::{rngs::StdRng, RngCore, SeedableRng};
2use serde::{Deserialize, Serialize};
3
4use crate::metrics::RunResult;
5
6#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
8pub enum MetricKey {
9 WallClockNs,
10 CpuNs,
11 GasPerSec,
12 RssBytes,
13}
14
15#[derive(Debug, Clone, Serialize, Deserialize)]
17pub struct BootstrapDelta {
18 pub n: usize,
19 pub baseline_mean: f64,
20 pub feature_mean: f64,
21 pub mean: f64,
22 pub ci_low_95: f64,
23 pub ci_high_95: f64,
24}
25
26#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
28pub enum Verdict {
29 Improvement,
30 Neutral,
31 Regression { metric: String, delta_pct: f64 },
32}
33
34impl Verdict {
35 pub fn label(&self) -> &'static str {
36 match self {
37 Self::Improvement => "IMPROVEMENT",
38 Self::Neutral => "NEUTRAL",
39 Self::Regression { .. } => "REGRESSION",
40 }
41 }
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize)]
46pub struct ComparisonReport {
47 pub manifest_name: String,
48 pub baseline_summary: crate::metrics::SummaryMetrics,
49 pub feature_summary: crate::metrics::SummaryMetrics,
50 pub deltas: Vec<(MetricKey, BootstrapDelta)>,
51 pub verdict: Verdict,
52}
53
54pub fn compare(
56 baseline: &RunResult,
57 feature: &RunResult,
58 bootstrap_iters: usize,
59 tolerance_pct: f64,
60 seed: u64,
61) -> ComparisonReport {
62 let n = baseline.blocks.len().min(feature.blocks.len());
63 let mut deltas = Vec::new();
64 for m in [
65 MetricKey::WallClockNs,
66 MetricKey::CpuNs,
67 MetricKey::GasPerSec,
68 MetricKey::RssBytes,
69 ] {
70 let mut paired: Vec<(f64, f64)> = Vec::with_capacity(n);
71 for i in 0..n {
72 let b = match m {
73 MetricKey::WallClockNs => baseline.blocks[i].wall_clock_ns as f64,
74 MetricKey::CpuNs => baseline.blocks[i].cpu_ns as f64,
75 MetricKey::GasPerSec => baseline.blocks[i].gas_per_sec(),
76 MetricKey::RssBytes => baseline.blocks[i].rss_bytes as f64,
77 };
78 let f = match m {
79 MetricKey::WallClockNs => feature.blocks[i].wall_clock_ns as f64,
80 MetricKey::CpuNs => feature.blocks[i].cpu_ns as f64,
81 MetricKey::GasPerSec => feature.blocks[i].gas_per_sec(),
82 MetricKey::RssBytes => feature.blocks[i].rss_bytes as f64,
83 };
84 paired.push((b, f));
85 }
86 deltas.push((m, bootstrap_paired_delta(&paired, bootstrap_iters, seed)));
87 }
88 let verdict = decide_verdict_for_compare(&deltas, tolerance_pct);
89 ComparisonReport {
90 manifest_name: baseline.manifest_name.clone(),
91 baseline_summary: baseline.summary.clone(),
92 feature_summary: feature.summary.clone(),
93 deltas,
94 verdict,
95 }
96}
97
98fn decide_verdict_for_compare(
99 deltas: &[(MetricKey, BootstrapDelta)],
100 tolerance_pct: f64,
101) -> Verdict {
102 let mut worst: Option<(MetricKey, f64)> = None;
103 for (k, d) in deltas {
104 let baseline_mean = d.baseline_mean.max(1e-9);
105 let pct = match k {
106 MetricKey::GasPerSec => -d.mean / baseline_mean * 100.0,
107 _ => d.mean / baseline_mean * 100.0,
108 };
109 let ci_pct_lower = match k {
110 MetricKey::GasPerSec => -d.ci_low_95 / baseline_mean * 100.0,
111 _ => d.ci_low_95 / baseline_mean * 100.0,
112 };
113 if ci_pct_lower > tolerance_pct && worst.map(|(_, w)| pct > w).unwrap_or(true) {
114 worst = Some((*k, pct));
115 }
116 }
117 if let Some((m, pct)) = worst {
118 return Verdict::Regression {
119 metric: format!("{m:?}"),
120 delta_pct: pct,
121 };
122 }
123
124 let mut improved = false;
125 for (k, d) in deltas {
126 let baseline_mean = d.baseline_mean.max(1e-9);
127 let high = match k {
128 MetricKey::GasPerSec => -d.ci_high_95 / baseline_mean * 100.0,
129 _ => d.ci_high_95 / baseline_mean * 100.0,
130 };
131 if high < 0.0 {
132 improved = true;
133 }
134 }
135 if improved {
136 Verdict::Improvement
137 } else {
138 Verdict::Neutral
139 }
140}
141
142pub fn bootstrap_paired_delta(
144 paired: &[(f64, f64)],
145 iterations: usize,
146 seed: u64,
147) -> BootstrapDelta {
148 if paired.is_empty() {
149 return BootstrapDelta {
150 n: 0,
151 baseline_mean: 0.0,
152 feature_mean: 0.0,
153 mean: 0.0,
154 ci_low_95: 0.0,
155 ci_high_95: 0.0,
156 };
157 }
158 let baseline_mean = paired.iter().map(|p| p.0).sum::<f64>() / paired.len() as f64;
159 let feature_mean = paired.iter().map(|p| p.1).sum::<f64>() / paired.len() as f64;
160 let mean_delta = feature_mean - baseline_mean;
161
162 let mut rng = StdRng::seed_from_u64(seed);
163 let n = paired.len();
164 let mut sample_means = Vec::with_capacity(iterations);
165 for _ in 0..iterations {
166 let mut sum = 0.0;
167 for _ in 0..n {
168 let idx = (rng.next_u32() as usize) % n;
169 let (b, f) = paired[idx];
170 sum += f - b;
171 }
172 sample_means.push(sum / n as f64);
173 }
174 sample_means.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
175 let lo_idx = ((iterations as f64) * 0.025).floor() as usize;
176 let hi_idx = ((iterations as f64) * 0.975).ceil() as usize - 1;
177 let ci_low = sample_means[lo_idx.min(sample_means.len() - 1)];
178 let ci_high = sample_means[hi_idx.min(sample_means.len() - 1)];
179
180 BootstrapDelta {
181 n,
182 baseline_mean,
183 feature_mean,
184 mean: mean_delta,
185 ci_low_95: ci_low,
186 ci_high_95: ci_high,
187 }
188}
189
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a ten-sample `BootstrapDelta` fixture from its summary numbers.
    fn fixture(baseline_mean: f64, feature_mean: f64, mean: f64, ci: (f64, f64)) -> BootstrapDelta {
        BootstrapDelta {
            n: 10,
            baseline_mean,
            feature_mean,
            mean,
            ci_low_95: ci.0,
            ci_high_95: ci.1,
        }
    }

    /// Identical baseline and feature values must give a zero delta and a
    /// zero-width interval.
    #[test]
    fn bootstrap_identical_samples_has_zero_ci() {
        let mut pairs = Vec::new();
        for i in 0..50 {
            pairs.push((i as f64, i as f64));
        }

        let result = bootstrap_paired_delta(&pairs, 1000, 42);

        assert_eq!(result.n, 50);
        assert!(result.mean.abs() < 1e-9);
        assert!(result.ci_low_95.abs() < 1e-9);
        assert!(result.ci_high_95.abs() < 1e-9);
    }

    /// A feature that is uniformly 50 lower than the baseline must produce a
    /// negative mean and an interval entirely below zero.
    #[test]
    fn bootstrap_detects_clear_improvement() {
        let mut pairs = Vec::new();
        for i in 0..50 {
            pairs.push((100.0_f64 + i as f64, 50.0 + i as f64));
        }

        let result = bootstrap_paired_delta(&pairs, 1000, 42);

        assert!(result.mean < 0.0);
        assert!(result.ci_high_95 < 0.0);
    }

    /// A wall-clock increase whose whole interval exceeds the tolerance is a regression.
    #[test]
    fn verdict_regression_when_metric_strictly_worse() {
        let deltas = vec![(
            MetricKey::WallClockNs,
            fixture(1000.0, 1500.0, 500.0, (400.0, 600.0)),
        )];

        let verdict = decide_verdict_for_compare(&deltas, 5.0);

        assert!(matches!(verdict, Verdict::Regression { .. }));
    }

    /// A gas-per-second increase whose whole interval is positive is an improvement.
    #[test]
    fn verdict_improvement_when_gas_per_sec_strictly_higher() {
        let deltas = vec![(
            MetricKey::GasPerSec,
            fixture(100.0, 150.0, 50.0, (40.0, 60.0)),
        )];

        let verdict = decide_verdict_for_compare(&deltas, 5.0);

        assert_eq!(verdict, Verdict::Improvement);
    }
}