Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
55bb98a
Add `trimmedCount` to `TimerStatistics` and `BenchmarkResults` types
MajorLift Apr 8, 2026
80d89da
Export `trimOutliers` and expose IQR count in `calculateTimerStatistics`
MajorLift Apr 8, 2026
01bc5ca
Add `WARMUP_RUNS`, `MIN_SAMPLES_FOR_VERDICT`, and `POWER_USER_NUM_BRO…
MajorLift Apr 8, 2026
4008c78
Apply warm-up exclusion and IQR trimming in `runPageLoadBenchmark`, p…
MajorLift Apr 8, 2026
cd5b927
Increase default `browserLoads` to 15 for `POWER_USER_HOME` preset
MajorLift Apr 8, 2026
10dbb46
Emit `trimmedCount` as derived Sentry tag per metric
MajorLift Apr 8, 2026
56d4f41
Add unit tests for IQR outlier trimming edge cases
MajorLift Apr 20, 2026
775bb15
ci: rebalance startupPowerUserHome to 15 browserLoads × 7 pageLoads
MajorLift Apr 20, 2026
897ae1d
fix: resolve lint errors in statistics and outlier-trimming test
MajorLift Apr 21, 2026
7cd855d
fix: filter measuredWebVitalsRuns by iteration instead of slice
MajorLift Apr 22, 2026
f7a7d21
fix: address Bugbot review #4156995681 — guard empty measuredResults …
MajorLift Apr 22, 2026
df553e4
[skip-e2e] [force-builds]
MajorLift Apr 22, 2026
3909d41
Narrow `dataQuality` type to literal union
MajorLift Apr 23, 2026
e98d614
Remove unused code
MajorLift Apr 23, 2026
47db254
Tighten test assertion
MajorLift Apr 23, 2026
ece9e74
Drop \`trimOutliers\` wrapper and fix reference aliasing in \`detectO…
MajorLift Apr 23, 2026
b0197ef
Add \`outliers\` to \`BenchmarkResults\` type; document IQR-only poli…
MajorLift Apr 23, 2026
697c24f
Replace inline CV ladder with \`assessDataQuality\`, emit \`outliers\…
MajorLift Apr 23, 2026
0bc831c
Spread input on \`detectOutliersZScore\` small-array early return
MajorLift Apr 23, 2026
1a5457b
Collect \`outliers\` from \`TimerStatistics\` in \`convertTimerStatis…
MajorLift Apr 23, 2026
451bc12
Fix prettier line-length violation in \`send-to-sentry.ts\`
MajorLift Apr 23, 2026
0827815
Fix JSDoc lint errors on \`runPageLoadBenchmark\`
MajorLift Apr 23, 2026
a8c5df8
Populate \`outliers\` in \`runPageLoadBenchmark\` return for Sentry c…
MajorLift Apr 23, 2026
4c40363
Spread \`outliers\` copy in \`runPageLoadBenchmark\` to avoid shared …
MajorLift Apr 23, 2026
f1fc8fe
Move \`browserLoads <= WARMUP_RUNS\` guard before benchmark loop in \…
MajorLift Apr 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/run-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ jobs:
- name: Run the benchmark
if: ${{ env.BENCHMARK_GATED == 'true' }}
run: >-
${{ matrix.pageType == 'startupPowerUserHome' && format('yarn test:e2e:benchmark --preset startupPowerUserHome --browserLoads 10 --pageLoads 10 --out test-artifacts/benchmarks/benchmark-{0}-{1}-startupPowerUserHome.json --retries 2', matrix.browser, matrix.buildType) || format('yarn test:e2e:benchmark --preset {2} --out test-artifacts/benchmarks/benchmark-{0}-{1}-{2}.json --retries 2', matrix.browser, matrix.buildType, matrix.pageType) }}
${{ matrix.pageType == 'startupPowerUserHome' && format('yarn test:e2e:benchmark --preset startupPowerUserHome --browserLoads 15 --pageLoads 7 --out test-artifacts/benchmarks/benchmark-{0}-{1}-startupPowerUserHome.json --retries 2', matrix.browser, matrix.buildType) || format('yarn test:e2e:benchmark --preset {2} --out test-artifacts/benchmarks/benchmark-{0}-{1}-{2}.json --retries 2', matrix.browser, matrix.buildType, matrix.pageType) }}
shell: bash

- name: Send benchmark results to Sentry (main/release only)
Expand Down
2 changes: 2 additions & 0 deletions shared/constants/benchmarks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ export type TimerStatistics = {
p99: number;
samples: number;
outliers: number;
trimmedCount?: number;
dataQuality: 'good' | 'poor' | 'unreliable';
};

Expand Down Expand Up @@ -108,6 +109,7 @@ export type BenchmarkResults = {
stdDev: StatisticalResult;
p75: StatisticalResult;
p95: StatisticalResult;
trimmedCount?: StatisticalResult;
webVitals?: WebVitalsSummary;
};

Expand Down
10 changes: 8 additions & 2 deletions test/e2e/benchmarks/flows/startup/power-user-home.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ import {
type BenchmarkResults,
type WebVitalsMetrics,
} from '../../../../../shared/constants/benchmarks';
import { WITH_STATE_POWER_USER } from '../../utils/constants';
import {
WITH_STATE_POWER_USER,
POWER_USER_NUM_BROWSER_LOADS,
} from '../../utils/constants';
import { runPageLoadBenchmark, collectWebVitals } from '../../utils';
import type {
Metrics,
Expand Down Expand Up @@ -87,5 +90,8 @@ async function measurePagePowerUser(
/**
 * Runs the power-user home page-load benchmark.
 *
 * @param options - Page-load benchmark options. When `browserLoads` is not
 * provided, it falls back to `POWER_USER_NUM_BROWSER_LOADS`.
 * @returns The benchmark results produced by `runPageLoadBenchmark`.
 */
export async function run(
  options: PageLoadBenchmarkOptions,
): Promise<BenchmarkResults> {
  // `??` rather than `||`: only a missing value falls back to the preset
  // default; an explicitly passed value is always respected.
  const browserLoads = options.browserLoads ?? POWER_USER_NUM_BROWSER_LOADS;
  return runPageLoadBenchmark(measurePagePowerUser, {
    ...options,
    browserLoads,
  });
}
39 changes: 39 additions & 0 deletions test/e2e/benchmarks/send-to-sentry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -241,12 +241,51 @@ async function main() {
}
}

// Derived reliability metrics: CV, dataQuality, tailRatio, trimmedCount
// CV = (stdDev / mean) * 100 — coefficient of variation, directly
// comparable across steps with different magnitudes.
// dataQuality: 'good' (CV < 30), 'poor' (30 <= CV < 50),
// 'unreliable' (CV >= 50).
// tailRatio = p95/p75 — tail heaviness; closer to 1.0 = tighter.
// trimmedCount = per-metric number of samples removed by IQR trimming.
const derivedMetrics: Record<string, number | string> = {};
if (benchmark.mean && benchmark.stdDev) {
for (const [key, meanVal] of Object.entries(benchmark.mean)) {
const stdDevVal = benchmark.stdDev[key];
if (meanVal > 0 && stdDevVal !== undefined) {
const cv = (stdDevVal / meanVal) * 100;
derivedMetrics[`${type}.cv.${key}`] = cv;
let dataQuality: string;
Comment thread
MajorLift marked this conversation as resolved.
Outdated
if (cv < 30) {
dataQuality = 'good';
Comment thread
gauthierpetetin marked this conversation as resolved.
Outdated
} else if (cv < 50) {
dataQuality = 'poor';
} else {
dataQuality = 'unreliable';
}
derivedMetrics[`${type}.dataQuality.${key}`] = dataQuality;
}
}
}
if (benchmark.p95 && benchmark.p75) {
for (const [key, p95Val] of Object.entries(benchmark.p95)) {
const p75Val = benchmark.p75[key];
if (p75Val !== undefined && p75Val > 0) {
derivedMetrics[`${type}.tailRatio.${key}`] = p95Val / p75Val;
}
}
}
if (benchmark.trimmedCount) {
for (const [key, count] of Object.entries(benchmark.trimmedCount)) {
derivedMetrics[`${type}.trimmedCount.${key}`] = count;
}
}

// Timer data: structured logs (existing path, unchanged)
Sentry.logger.info(message, {
...baseCiAttributes,
'ci.persona': benchmark.persona || BENCHMARK_PERSONA.STANDARD,
'ci.testTitle': benchmark.testTitle,
...allMetrics,
...derivedMetrics,
});

// Web vitals: separate reporting path via spans
Expand Down
9 changes: 9 additions & 0 deletions test/e2e/benchmarks/utils/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,15 @@ export const DEFAULT_NUM_BROWSER_LOADS = DEFAULT_BENCHMARK_BROWSER_LOADS;
/** Same as {@link DEFAULT_BENCHMARK_PAGE_LOADS} in `shared/constants/benchmarks`. */
export const DEFAULT_NUM_PAGE_LOADS = DEFAULT_BENCHMARK_PAGE_LOADS;

/**
 * Browser loads for the POWER_USER_HOME preset (higher than default to offset warm-up exclusion).
 * Used by the power-user home flow as the fallback when the caller does not
 * pass an explicit `browserLoads` option.
 */
export const POWER_USER_NUM_BROWSER_LOADS = 15;

/**
 * Number of leading browser-load sessions to discard as warm-up before computing stats.
 * The page-load runner treats each session as contributing `pageLoads`
 * results, so it drops the first `WARMUP_RUNS * pageLoads` entries.
 */
export const WARMUP_RUNS = 1;

/**
 * Minimum effective sample count (after warm-up exclusion + IQR trimming) required to emit a Mann-Whitney verdict.
 * NOTE(review): no consumer of this constant is visible in this change —
 * confirm it is actually wired into the verdict logic.
 */
export const MIN_SAMPLES_FOR_VERDICT = 5;
Comment thread
gauthierpetetin marked this conversation as resolved.
Outdated
Comment thread
MajorLift marked this conversation as resolved.
Outdated

export const ALL_METRICS = {
uiStartup: 'UI Startup',
load: 'navigation[0].load',
Expand Down
122 changes: 122 additions & 0 deletions test/e2e/benchmarks/utils/outlier-trimming.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import { trimOutliers } from './statistics';

/**
 * Unit tests for `trimOutliers`: small-sample pass-through, tight
 * distributions, outlier removal on either tail, a realistic 15-sample
 * benchmark run, and independence from input ordering.
 */
describe('trimOutliers (IQR-based)', () => {
  describe('small sample edge cases', () => {
    it('returns empty array unchanged', () => {
      const result = trimOutliers([]);
      expect(result.samples).toEqual([]);
      expect(result.trimmedCount).toBe(0);
    });

    it('returns array of 1 unchanged', () => {
      const result = trimOutliers([500]);
      expect(result.samples).toEqual([500]);
      expect(result.trimmedCount).toBe(0);
    });

    it('returns array of 2 unchanged', () => {
      const result = trimOutliers([100, 200]);
      expect(result.samples).toEqual([100, 200]);
      expect(result.trimmedCount).toBe(0);
    });

    it('returns array of 3 unchanged (below IQR threshold)', () => {
      const result = trimOutliers([100, 200, 9000]);
      expect(result.samples).toEqual([100, 200, 9000]);
      expect(result.trimmedCount).toBe(0);
    });
  });

  describe('no outliers', () => {
    it('returns all values when distribution is tight', () => {
      const samples = [100, 101, 99, 102, 98, 100, 101, 99];
      const result = trimOutliers(samples);
      expect(result.trimmedCount).toBe(0);
      expect(result.samples).toHaveLength(samples.length);
    });

    it('returns all values when all samples are identical', () => {
      const samples = [200, 200, 200, 200, 200, 200];
      const result = trimOutliers(samples);
      expect(result.trimmedCount).toBe(0);
      expect(result.samples).toHaveLength(samples.length);
    });
  });

  describe('outlier removal', () => {
    it('removes a single high outlier', () => {
      const samples = [10, 11, 12, 10, 11, 12, 11, 1000];
      const result = trimOutliers(samples);
      expect(result.samples).not.toContain(1000);
      expect(result.trimmedCount).toBe(1);
    });

    it('removes a single low outlier', () => {
      const samples = [100, 102, 101, 103, 100, 101, 1];
      const result = trimOutliers(samples);
      expect(result.samples).not.toContain(1);
      expect(result.trimmedCount).toBe(1);
    });

    it('removes multiple outliers on both ends', () => {
      const samples = [1, 100, 101, 102, 100, 101, 103, 9999];
      const result = trimOutliers(samples);
      expect(result.samples).not.toContain(1);
      expect(result.samples).not.toContain(9999);
      expect(result.trimmedCount).toBe(2);
    });

    it('preserves non-outlier values exactly', () => {
      const core = [100, 105, 95, 102, 98, 101];
      const samples = [...core, 5000];
      const result = trimOutliers(samples);
      for (const v of core) {
        expect(result.samples).toContain(v);
      }
    });
  });

  describe('realistic benchmark scenario (n=15)', () => {
    it('removes both spikes from a 15-sample benchmark run', () => {
      // 13 steady browser-load sessions plus two slow ones (e.g. JIT/GC or
      // cold-start spikes), 15 samples in total.
      const normal = [
        320, 330, 315, 325, 318, 322, 328, 316, 319, 324, 321, 323, 317,
      ];
      const spikes = [900, 850];
      const result = trimOutliers([...normal, ...spikes]);
      // Pin the exact outcome: a range assertion would also pass if a
      // steady sample were trimmed or a spike were kept.
      expect(result.trimmedCount).toBe(spikes.length);
      expect(result.samples).toHaveLength(normal.length);
      for (const spike of spikes) {
        expect(result.samples).not.toContain(spike);
      }
    });

    it('does not over-trim a low-variance run', () => {
      const stable = [
        300, 302, 298, 301, 299, 303, 300, 301, 302, 298, 300, 301, 299, 302,
        300,
      ];
      const result = trimOutliers(stable);
      expect(result.trimmedCount).toBe(0);
      expect(result.samples).toHaveLength(stable.length);
    });
  });

  describe('input ordering', () => {
    it('produces the same trimmedCount regardless of input order', () => {
      const ordered = [10, 11, 12, 13, 14, 15, 1000];
      const shuffled = [1000, 13, 10, 15, 12, 11, 14];
      const r1 = trimOutliers(ordered);
      const r2 = trimOutliers(shuffled);
      expect(r1.trimmedCount).toBe(r2.trimmedCount);
      // Sort copies: Array.prototype.sort works in place and would otherwise
      // mutate the results under test.
      const ascending = (a: number, b: number) => a - b;
      expect([...r1.samples].sort(ascending)).toEqual(
        [...r2.samples].sort(ascending),
      );
    });

    it('does not mutate the input array', () => {
      const samples = [10, 11, 1000, 12, 13];
      const copy = [...samples];
      trimOutliers(samples);
      expect(samples).toEqual(copy);
    });
  });
});
34 changes: 26 additions & 8 deletions test/e2e/benchmarks/utils/runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import {
ALL_METRICS,
DEFAULT_NUM_BROWSER_LOADS,
DEFAULT_NUM_PAGE_LOADS,
WARMUP_RUNS,
} from './constants';
import {
aggregateWebVitals,
Expand All @@ -27,6 +28,7 @@ import {
calcStdDevResult,
calculateTimerStatistics,
checkExclusionRate,
detectOutliersIQR,
MAX_EXCLUSION_RATE,
MAX_TOTAL_DURATION_MS,
validateThresholds,
Expand Down Expand Up @@ -264,6 +266,7 @@ export function convertTimerStatisticsToBenchmarkResults(
const stdDev: StatisticalResult = {};
const p75: StatisticalResult = {};
const p95: StatisticalResult = {};
const trimmedCount: StatisticalResult = {};

// timers already includes promoted web vitals from runBenchmarkWithIterations
for (const timer of timers) {
Expand All @@ -273,8 +276,13 @@ export function convertTimerStatisticsToBenchmarkResults(
stdDev[timer.id] = timer.stdDev;
p75[timer.id] = timer.p75;
p95[timer.id] = timer.p95;
if (timer.trimmedCount !== undefined) {
trimmedCount[timer.id] = timer.trimmedCount;
}
}

const hasTrimmedCounts = Object.keys(trimmedCount).length > 0;

return {
testTitle,
persona,
Expand All @@ -287,6 +295,7 @@ export function convertTimerStatisticsToBenchmarkResults(
stdDev,
p75,
p95,
...(hasTrimmedCounts && { trimmedCount }),
Comment thread
cursor[bot] marked this conversation as resolved.
...(webVitals && { webVitals }),
};
}
Expand Down Expand Up @@ -366,10 +375,16 @@ export async function runPageLoadBenchmark(
resultPersona = persona;
}

if (runResults.some((result) => result.navigation.length > 1)) {
// Discard the first WARMUP_RUNS browser-load sessions before computing stats.
// Each session contributes exactly pageLoads metric objects to runResults.
const warmupSize = WARMUP_RUNS * pageLoads;
const measuredResults = runResults.slice(warmupSize);
Comment thread
cursor[bot] marked this conversation as resolved.
const measuredWebVitalsRuns = allWebVitalsRuns.slice(warmupSize);
Comment thread
cursor[bot] marked this conversation as resolved.
Outdated

if (measuredResults.some((result) => result.navigation.length > 1)) {
throw new Error(`Multiple navigations not supported`);
}
const firstNonNavigate = runResults.find(
const firstNonNavigate = measuredResults.find(
(result) => result.navigation[0].type !== 'navigate',
);
if (firstNonNavigate !== undefined) {
Expand All @@ -379,17 +394,19 @@ export async function runPageLoadBenchmark(
}

const result: Record<string, number[]> = {};
const trimmedCounts: StatisticalResult = {};
for (const [key, tracePath] of Object.entries(ALL_METRICS)) {
result[key] = runResults
.map((m) => get(m, tracePath) as number)
.sort((a, b) => a - b);
const rawSamples = measuredResults.map((m) => get(m, tracePath) as number);
const { filtered, outlierCount } = detectOutliersIQR(rawSamples);
Comment thread
gauthierpetetin marked this conversation as resolved.
result[key] = [...filtered].sort((a, b) => a - b);
trimmedCounts[key] = outlierCount;
}

let webVitals: WebVitalsSummary | undefined;
if (allWebVitalsRuns.length > 0) {
if (measuredWebVitalsRuns.length > 0) {
webVitals = {
runs: allWebVitalsRuns,
aggregated: aggregateWebVitals(allWebVitalsRuns),
runs: measuredWebVitalsRuns,
aggregated: aggregateWebVitals(measuredWebVitalsRuns),
};
}

Expand Down Expand Up @@ -423,6 +440,7 @@ export async function runPageLoadBenchmark(
stdDev: stdDevResult,
p75,
p95,
trimmedCount: trimmedCounts,
Comment thread
cursor[bot] marked this conversation as resolved.
...(webVitals && { webVitals }),
};
}
Expand Down
28 changes: 25 additions & 3 deletions test/e2e/benchmarks/utils/statistics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,23 @@ export const detectOutliersIQR = (
return { filtered, outlierCount: outliers.length, outliers };
};

/**
 * IQR-based outlier trimming.
 * Returns the filtered samples and the number of values removed.
 * Intended for use before stats computation and before Mann-Whitney U tests.
 *
 * Thin wrapper over {@link detectOutliersIQR} with a stable public interface.
 *
 * @param samples - Raw per-run durations (unsorted). This wrapper itself
 * never writes to the array.
 * @returns `samples` — a fresh array of the values kept by the IQR filter;
 * `trimmedCount` — the number of values the filter removed.
 */
export function trimOutliers(samples: number[]): {
  samples: number[];
  trimmedCount: number;
} {
  const { filtered, outlierCount } = detectOutliersIQR(samples);
  // Hand back a copy so callers may sort or otherwise mutate the result
  // without touching an array that `detectOutliersIQR` might share with its
  // input (its early-return paths are not guaranteed to allocate).
  return { samples: [...filtered], trimmedCount: outlierCount };
}
Comment thread
cursor[bot] marked this conversation as resolved.
Outdated

/**
* Combined outlier detection using both IQR and z-score methods
* A value is only kept if it passes both methods
Expand Down Expand Up @@ -353,14 +370,18 @@ export const calculateTimerStatistics = (
maxDuration,
minDuration,
);
const { filtered, outlierCount } = detectOutliers(sanityResult.filtered);
const iqrResult = detectOutliersIQR(sanityResult.filtered);
const zScoreResult = detectOutliersZScore(iqrResult.filtered);
const { filtered } = zScoreResult;
const totalExcluded =
sanityResult.excludedCount +
iqrResult.outlierCount +
zScoreResult.outlierCount;
const sorted = [...filtered].sort((a, b) => a - b);
const mean = calculateMean(filtered);
const stdDev = calculateStdDev(filtered);
const cv = mean > 0 ? (stdDev / mean) * 100 : 0;

const totalExcluded = sanityResult.excludedCount + outlierCount;

return {
id: timerId,
mean,
Expand All @@ -374,6 +395,7 @@ export const calculateTimerStatistics = (
p99: calculatePercentile(sorted, 99),
samples: filtered.length,
outliers: totalExcluded,
trimmedCount: iqrResult.outlierCount,
dataQuality: assessDataQuality(cv),
};
};
Expand Down
Loading