arb_bench/runner/
abba.rs

1use serde::{Deserialize, Serialize};
2
3use super::{BenchRunner, RunnerConfig, Workload};
4use crate::{
5    metrics::{BlockMetric, RunResult},
6    report::compare::{bootstrap_paired_delta, BootstrapDelta, MetricKey, Verdict},
7};
8
9/// Configuration for the ABBA scheduler.
10#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct AbbaConfig {
12    /// Number of A-B-B-A iterations. Total runs per side = `2 * iterations`.
13    pub iterations: usize,
14    /// Bootstrap iteration count for the paired CI computation.
15    pub bootstrap_iters: usize,
16    /// Allowed regression in percent before the verdict turns to `Regression`.
17    pub tolerance_pct: f64,
18    /// PRNG seed for the bootstrap.
19    pub seed: u64,
20    pub runner: RunnerConfig,
21}
22
23impl Default for AbbaConfig {
24    fn default() -> Self {
25        Self {
26            iterations: 3,
27            bootstrap_iters: 10_000,
28            tolerance_pct: 5.0,
29            seed: 0xC0FF_EE12_3456_789A,
30            runner: RunnerConfig::default(),
31        }
32    }
33}
34
35/// Paired baseline + feature runs on the same workload.
36#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct PairedSample {
38    pub iter_index: usize,
39    pub baseline: RunResult,
40    pub feature: RunResult,
41}
42
43/// Output of an ABBA-driven comparison.
44#[derive(Debug, Clone, Serialize, Deserialize)]
45pub struct AbbaResult {
46    pub manifest_name: String,
47    pub iterations: usize,
48    pub samples: Vec<PairedSample>,
49    pub deltas: Vec<(MetricKey, BootstrapDelta)>,
50    pub verdict: Verdict,
51}
52
53/// Each factory is called `2 * iterations` times and returns a fresh workload + runner.
54/// Workloads should be identical between sides; the runner is where the two sides differ.
55pub fn run_abba<FB, FF>(
56    config: &AbbaConfig,
57    manifest_name: &str,
58    mut build_baseline: FB,
59    mut build_feature: FF,
60) -> eyre::Result<AbbaResult>
61where
62    FB: FnMut() -> eyre::Result<(Workload, Box<dyn BenchRunner>)>,
63    FF: FnMut() -> eyre::Result<(Workload, Box<dyn BenchRunner>)>,
64{
65    let mut samples = Vec::with_capacity(config.iterations);
66    for i in 0..config.iterations {
67        let order = if i % 2 == 0 {
68            [Side::Baseline, Side::Feature, Side::Feature, Side::Baseline]
69        } else {
70            [Side::Feature, Side::Baseline, Side::Baseline, Side::Feature]
71        };
72
73        let mut runs: [Option<RunResult>; 4] = Default::default();
74        for (slot, side) in order.iter().enumerate() {
75            let (workload, mut runner) = match side {
76                Side::Baseline => build_baseline()?,
77                Side::Feature => build_feature()?,
78            };
79            runs[slot] = Some(runner.execute(workload)?);
80        }
81
82        let mut baseline_runs = Vec::new();
83        let mut feature_runs = Vec::new();
84        for (slot, side) in order.iter().enumerate() {
85            let r = runs[slot].take().unwrap();
86            match side {
87                Side::Baseline => baseline_runs.push(r),
88                Side::Feature => feature_runs.push(r),
89            }
90        }
91        let baseline = average_runs(&baseline_runs)?;
92        let feature = average_runs(&feature_runs)?;
93
94        samples.push(PairedSample {
95            iter_index: i,
96            baseline,
97            feature,
98        });
99    }
100
101    let deltas = compute_paired_deltas(&samples, config);
102    let verdict = decide_verdict(&deltas, config.tolerance_pct);
103
104    Ok(AbbaResult {
105        manifest_name: manifest_name.to_string(),
106        iterations: config.iterations,
107        samples,
108        deltas,
109        verdict,
110    })
111}
112
/// Which of the two compared builds a scheduled run belongs to.
#[derive(Clone, Copy, Debug)]
enum Side {
    /// The reference side of the comparison.
    Baseline,
    /// The side being evaluated against the baseline.
    Feature,
}
118
119/// Combine N runs by averaging per-block metrics.
120fn average_runs(runs: &[RunResult]) -> eyre::Result<RunResult> {
121    if runs.is_empty() {
122        eyre::bail!("average_runs: empty");
123    }
124    if runs.len() == 1 {
125        return Ok(runs[0].clone());
126    }
127    let n = runs[0].blocks.len();
128    if !runs.iter().all(|r| r.blocks.len() == n) {
129        eyre::bail!("average_runs: differing block counts");
130    }
131    let mut blocks: Vec<BlockMetric> = Vec::with_capacity(n);
132    for i in 0..n {
133        let wall: u64 =
134            runs.iter().map(|r| r.blocks[i].wall_clock_ns).sum::<u64>() / runs.len() as u64;
135        let cpu: u64 = runs.iter().map(|r| r.blocks[i].cpu_ns).sum::<u64>() / runs.len() as u64;
136        let rss: u64 = runs.iter().map(|r| r.blocks[i].rss_bytes).sum::<u64>() / runs.len() as u64;
137        blocks.push(BlockMetric {
138            block_number: runs[0].blocks[i].block_number,
139            wall_clock_ns: wall,
140            cpu_ns: cpu,
141            gas_used: runs[0].blocks[i].gas_used,
142            tx_count: runs[0].blocks[i].tx_count,
143            success_count: runs[0].blocks[i].success_count,
144            rss_bytes: rss,
145        });
146    }
147    let windows = crate::metrics::rolling::build_windows(&blocks, 500);
148    let summary = crate::metrics::SummaryMetrics::from_blocks(&blocks, &windows);
149    Ok(RunResult {
150        manifest_name: runs[0].manifest_name.clone(),
151        blocks,
152        windows,
153        summary,
154        host: runs[0].host.clone(),
155    })
156}
157
158/// Bootstrap paired deltas per metric.
159fn compute_paired_deltas(
160    samples: &[PairedSample],
161    config: &AbbaConfig,
162) -> Vec<(MetricKey, BootstrapDelta)> {
163    let mut out = Vec::new();
164    let metrics = [
165        MetricKey::WallClockNs,
166        MetricKey::GasPerSec,
167        MetricKey::CpuNs,
168        MetricKey::RssBytes,
169    ];
170    for m in metrics {
171        let mut paired = Vec::new();
172        for s in samples {
173            let n = s.baseline.blocks.len().min(s.feature.blocks.len());
174            for i in 0..n {
175                let b = metric_value(&s.baseline.blocks[i], m);
176                let f = metric_value(&s.feature.blocks[i], m);
177                paired.push((b, f));
178            }
179        }
180        if paired.is_empty() {
181            continue;
182        }
183        let delta = bootstrap_paired_delta(&paired, config.bootstrap_iters, config.seed);
184        out.push((m, delta));
185    }
186    out
187}
188
189fn metric_value(b: &BlockMetric, m: MetricKey) -> f64 {
190    match m {
191        MetricKey::WallClockNs => b.wall_clock_ns as f64,
192        MetricKey::CpuNs => b.cpu_ns as f64,
193        MetricKey::GasPerSec => b.gas_per_sec(),
194        MetricKey::RssBytes => b.rss_bytes as f64,
195    }
196}
197
198fn decide_verdict(deltas: &[(MetricKey, BootstrapDelta)], tolerance_pct: f64) -> Verdict {
199    let mut worst: Option<(MetricKey, f64)> = None;
200    for (k, d) in deltas {
201        let baseline_mean = d.baseline_mean.max(1e-9);
202        let pct = match k {
203            MetricKey::GasPerSec => -d.mean / baseline_mean * 100.0,
204            _ => d.mean / baseline_mean * 100.0,
205        };
206        let ci_pct = match k {
207            MetricKey::GasPerSec => -d.ci_low_95 / baseline_mean * 100.0,
208            _ => d.ci_low_95 / baseline_mean * 100.0,
209        };
210        if ci_pct > tolerance_pct {
211            match worst {
212                Some((_, w)) if w >= pct => {}
213                _ => worst = Some((*k, pct)),
214            }
215        }
216    }
217    if let Some((metric, pct)) = worst {
218        return Verdict::Regression {
219            metric: format!("{metric:?}"),
220            delta_pct: pct,
221        };
222    }
223
224    let mut any_improvement = false;
225    for (k, d) in deltas {
226        let baseline_mean = d.baseline_mean.max(1e-9);
227        let pct_high = match k {
228            MetricKey::GasPerSec => -d.ci_high_95 / baseline_mean * 100.0,
229            _ => d.ci_high_95 / baseline_mean * 100.0,
230        };
231        if pct_high < 0.0 {
232            any_improvement = true;
233        }
234    }
235    if any_improvement {
236        Verdict::Improvement
237    } else {
238        Verdict::Neutral
239    }
240}
241
#[cfg(test)]
mod tests {
    use super::*;
    use crate::capture::synthetic::generate;

    /// Runs a single ABBA iteration in which both sides are built by the same
    /// factory, and checks that the comparison completes without reporting a
    /// regression.
    #[test]
    fn abba_smoke_runs_and_yields_neutral_for_identical_runs() {
        let runner_cfg = RunnerConfig {
            rolling_window_blocks: 2,
            abort_on_block_error: false,
        };
        let cfg = AbbaConfig {
            iterations: 1,
            bootstrap_iters: 200,
            tolerance_pct: 50.0,
            seed: 1,
            runner: runner_cfg.clone(),
        };

        // One factory serves both sides, so the two sides are configured identically.
        let make_side = || -> eyre::Result<(Workload, Box<dyn BenchRunner>)> {
            let params = serde_json::json!({ "block_count": 2, "txs_per_block": 2 });
            let workload = generate("test/abba", 421614, 30, "transfer_train", &params)?;
            let runner: Box<dyn BenchRunner> = Box::new(
                crate::runner::in_process::InProcessRunner::new(runner_cfg.clone()),
            );
            Ok((workload, runner))
        };

        let outcome = run_abba(&cfg, "test/abba", make_side, make_side).unwrap();

        assert_eq!(outcome.iterations, 1);
        assert!(!outcome.deltas.is_empty());
        // With identical sides and a wide tolerance the verdict may be neutral or
        // improvement, but it must never be a regression.
        assert!(!matches!(outcome.verdict, Verdict::Regression { .. }));
    }
}