Skip to content
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
55bb98a
Add `trimmedCount` to `TimerStatistics` and `BenchmarkResults` types
MajorLift Apr 8, 2026
80d89da
Export `trimOutliers` and expose IQR count in `calculateTimerStatistics`
MajorLift Apr 8, 2026
01bc5ca
Add `WARMUP_RUNS`, `MIN_SAMPLES_FOR_VERDICT`, and `POWER_USER_NUM_BRO…
MajorLift Apr 8, 2026
4008c78
Apply warm-up exclusion and IQR trimming in `runPageLoadBenchmark`, p…
MajorLift Apr 8, 2026
cd5b927
Increase default `browserLoads` to 15 for `POWER_USER_HOME` preset
MajorLift Apr 8, 2026
10dbb46
Emit `trimmedCount` as derived Sentry tag per metric
MajorLift Apr 8, 2026
56d4f41
Add unit tests for IQR outlier trimming edge cases
MajorLift Apr 20, 2026
775bb15
ci: rebalance startupPowerUserHome to 15 browserLoads × 7 pageLoads
MajorLift Apr 20, 2026
897ae1d
fix: resolve lint errors in statistics and outlier-trimming test
MajorLift Apr 21, 2026
7cd855d
fix: filter measuredWebVitalsRuns by iteration instead of slice
MajorLift Apr 22, 2026
f7a7d21
fix: address Bugbot review #4156995681 — guard empty measuredResults …
MajorLift Apr 22, 2026
df553e4
[skip-e2e] [force-builds]
MajorLift Apr 22, 2026
3909d41
Narrow `dataQuality` type to literal union
MajorLift Apr 23, 2026
e98d614
Remove unused code
MajorLift Apr 23, 2026
47db254
Tighten test assertion
MajorLift Apr 23, 2026
ece9e74
Drop `trimOutliers` wrapper and fix reference aliasing in `detectO…
MajorLift Apr 23, 2026
b0197ef
Add `outliers` to `BenchmarkResults` type; document IQR-only poli…
MajorLift Apr 23, 2026
697c24f
Replace inline CV ladder with `assessDataQuality`, emit `outliers`…
MajorLift Apr 23, 2026
0bc831c
Spread input on `detectOutliersZScore` small-array early return
MajorLift Apr 23, 2026
1a5457b
Collect `outliers` from `TimerStatistics` in `convertTimerStatis…
MajorLift Apr 23, 2026
451bc12
Fix prettier line-length violation in `send-to-sentry.ts`
MajorLift Apr 23, 2026
0827815
Fix JSDoc lint errors on `runPageLoadBenchmark`
MajorLift Apr 23, 2026
a8c5df8
Populate `outliers` in `runPageLoadBenchmark` return for Sentry c…
MajorLift Apr 23, 2026
4c40363
Spread `outliers` copy in `runPageLoadBenchmark` to avoid shared …
MajorLift Apr 23, 2026
f1fc8fe
Move `browserLoads <= WARMUP_RUNS` guard before benchmark loop in `…
MajorLift Apr 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/run-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ jobs:
- name: Run the benchmark
if: ${{ env.BENCHMARK_GATED == 'true' }}
run: >-
${{ matrix.pageType == 'startupPowerUserHome' && format('yarn test:e2e:benchmark --preset startupPowerUserHome --browserLoads 10 --pageLoads 10 --out test-artifacts/benchmarks/benchmark-{0}-{1}-startupPowerUserHome.json --retries 2', matrix.browser, matrix.buildType) || format('yarn test:e2e:benchmark --preset {2} --out test-artifacts/benchmarks/benchmark-{0}-{1}-{2}.json --retries 2', matrix.browser, matrix.buildType, matrix.pageType) }}
${{ matrix.pageType == 'startupPowerUserHome' && format('yarn test:e2e:benchmark --preset startupPowerUserHome --browserLoads 15 --pageLoads 7 --out test-artifacts/benchmarks/benchmark-{0}-{1}-startupPowerUserHome.json --retries 2', matrix.browser, matrix.buildType) || format('yarn test:e2e:benchmark --preset {2} --out test-artifacts/benchmarks/benchmark-{0}-{1}-{2}.json --retries 2', matrix.browser, matrix.buildType, matrix.pageType) }}
shell: bash

- name: Send benchmark results to Sentry (main/release only)
Expand Down
3 changes: 3 additions & 0 deletions shared/constants/benchmarks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ export type TimerStatistics = {
p99: number;
samples: number;
outliers: number;
trimmedCount?: number;
dataQuality: 'good' | 'poor' | 'unreliable';
};

Expand Down Expand Up @@ -108,6 +109,8 @@ export type BenchmarkResults = {
stdDev: StatisticalResult;
p75: StatisticalResult;
p95: StatisticalResult;
trimmedCount?: StatisticalResult;
outliers?: StatisticalResult;
webVitals?: WebVitalsSummary;
};

Expand Down
10 changes: 8 additions & 2 deletions test/e2e/benchmarks/flows/startup/power-user-home.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ import {
type BenchmarkResults,
type WebVitalsMetrics,
} from '../../../../../shared/constants/benchmarks';
import { WITH_STATE_POWER_USER } from '../../utils/constants';
import {
WITH_STATE_POWER_USER,
POWER_USER_NUM_BROWSER_LOADS,
} from '../../utils/constants';
import { runPageLoadBenchmark, collectWebVitals } from '../../utils';
import type {
Metrics,
Expand Down Expand Up @@ -87,5 +90,8 @@ async function measurePagePowerUser(
export async function run(
options: PageLoadBenchmarkOptions,
): Promise<BenchmarkResults> {
return runPageLoadBenchmark(measurePagePowerUser, options);
return runPageLoadBenchmark(measurePagePowerUser, {
...options,
browserLoads: options.browserLoads ?? POWER_USER_NUM_BROWSER_LOADS,
});
}
39 changes: 38 additions & 1 deletion test/e2e/benchmarks/send-to-sentry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ import type {
} from '../../../shared/constants/benchmarks';
import { getGitBranch, getGitCommitHash } from './utils/git';
import type { UserActionResult } from './utils/types';
import { aggregateWebVitals } from './utils/statistics';
import { aggregateWebVitals, assessDataQuality } from './utils/statistics';

const packageJsonPath = path.resolve(__dirname, '../../../package.json');
const { version } = JSON.parse(readFileSync(packageJsonPath, 'utf-8')) as {
Expand Down Expand Up @@ -241,12 +241,49 @@ async function main() {
}
}

// Derived reliability metrics: CV, dataQuality, tailRatio
// CV = (stdDev / mean) * 100 — coefficient of variation, directly
// comparable across steps with different magnitudes.
// dataQuality: 'good' (CV<30), 'poor' (30-50), 'unreliable' (>50).
// tailRatio = p95/p75 — tail heaviness; closer to 1.0 = tighter.
const derivedMetrics: Record<string, number | string> = {};
if (benchmark.mean && benchmark.stdDev) {
for (const [key, meanVal] of Object.entries(benchmark.mean)) {
const stdDevVal = benchmark.stdDev[key];
if (meanVal > 0 && stdDevVal !== undefined) {
const cv = (stdDevVal / meanVal) * 100;
derivedMetrics[`${type}.cv.${key}`] = cv;
derivedMetrics[`${type}.dataQuality.${key}`] =
assessDataQuality(cv);
}
}
}
if (benchmark.p95 && benchmark.p75) {
for (const [key, p95Val] of Object.entries(benchmark.p95)) {
const p75Val = benchmark.p75[key];
if (p75Val !== undefined && p75Val > 0) {
derivedMetrics[`${type}.tailRatio.${key}`] = p95Val / p75Val;
}
}
}
if (benchmark.trimmedCount) {
for (const [key, count] of Object.entries(benchmark.trimmedCount)) {
derivedMetrics[`${type}.trimmedCount.${key}`] = count;
}
}
if (benchmark.outliers) {
for (const [key, count] of Object.entries(benchmark.outliers)) {
derivedMetrics[`${type}.outliers.${key}`] = count;
}
}

// Timer data: structured logs (existing path, unchanged)
Sentry.logger.info(message, {
...baseCiAttributes,
'ci.persona': benchmark.persona || BENCHMARK_PERSONA.STANDARD,
'ci.testTitle': benchmark.testTitle,
...allMetrics,
...derivedMetrics,
});

// Web vitals: separate reporting path via spans
Expand Down
6 changes: 6 additions & 0 deletions test/e2e/benchmarks/utils/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ export const DEFAULT_NUM_BROWSER_LOADS = DEFAULT_BENCHMARK_BROWSER_LOADS;
/** Same as {@link DEFAULT_BENCHMARK_PAGE_LOADS} in `shared/constants/benchmarks`. */
export const DEFAULT_NUM_PAGE_LOADS = DEFAULT_BENCHMARK_PAGE_LOADS;

/**
 * Browser loads for the POWER_USER_HOME preset (higher than default to offset warm-up exclusion).
 *
 * The power-user-home startup flow uses this value as the fallback when the
 * caller does not supply `options.browserLoads`.
 */
export const POWER_USER_NUM_BROWSER_LOADS = 15;

/**
 * Number of leading browser-load sessions to discard as warm-up before computing stats.
 *
 * `runPageLoadBenchmark` drops the first `WARMUP_RUNS * pageLoads` collected
 * results and throws when `browserLoads <= WARMUP_RUNS`, so every page-load
 * benchmark must request more browser loads than this value.
 */
export const WARMUP_RUNS = 1;

export const ALL_METRICS = {
uiStartup: 'UI Startup',
load: 'navigation[0].load',
Expand Down
131 changes: 131 additions & 0 deletions test/e2e/benchmarks/utils/outlier-trimming.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import { detectOutliersIQR } from './statistics';

/**
 * Unit tests for `detectOutliersIQR`.
 *
 * The suite pins the contract the benchmark runner relies on:
 * - the result exposes `filtered` (surviving samples) and `outlierCount`;
 * - inputs with fewer than 4 samples are returned as-is with zero outliers;
 * - the input array is never mutated and the returned array is a fresh copy;
 * - `outlierCount` does not depend on the order of the input samples.
 */
describe('detectOutliersIQR (IQR-based)', () => {
  // Below 4 samples the tests expect no trimming at all, even for an obvious
  // spike such as 9000 in a 3-element input.
  describe('small sample edge cases', () => {
    it('returns empty array unchanged', () => {
      const result = detectOutliersIQR([]);
      expect(result.filtered).toEqual([]);
      expect(result.outlierCount).toBe(0);
    });

    it('returns array of 1 unchanged', () => {
      const result = detectOutliersIQR([500]);
      expect(result.filtered).toEqual([500]);
      expect(result.outlierCount).toBe(0);
    });

    it('returns array of 2 unchanged', () => {
      const result = detectOutliersIQR([100, 200]);
      expect(result.filtered).toEqual([100, 200]);
      expect(result.outlierCount).toBe(0);
    });

    it('returns array of 3 unchanged (below IQR threshold)', () => {
      const result = detectOutliersIQR([100, 200, 9000]);
      expect(result.filtered).toEqual([100, 200, 9000]);
      expect(result.outlierCount).toBe(0);
    });
  });

  describe('no outliers', () => {
    it('returns all values when distribution is tight', () => {
      const samples = [100, 101, 99, 102, 98, 100, 101, 99];
      const result = detectOutliersIQR(samples);
      expect(result.outlierCount).toBe(0);
      expect(result.filtered).toHaveLength(samples.length);
    });

    // Identical samples give IQR = 0; nothing may be trimmed in that case.
    it('returns all values when all samples are identical', () => {
      const samples = [200, 200, 200, 200, 200, 200];
      const result = detectOutliersIQR(samples);
      expect(result.outlierCount).toBe(0);
      expect(result.filtered).toHaveLength(samples.length);
    });
  });

  describe('outlier removal', () => {
    it('removes a single high outlier', () => {
      const samples = [10, 11, 12, 10, 11, 12, 11, 1000];
      const result = detectOutliersIQR(samples);
      expect(result.filtered).not.toContain(1000);
      expect(result.outlierCount).toBe(1);
    });

    it('removes a single low outlier', () => {
      const samples = [100, 102, 101, 103, 100, 101, 1];
      const result = detectOutliersIQR(samples);
      expect(result.filtered).not.toContain(1);
      expect(result.outlierCount).toBe(1);
    });

    it('removes multiple outliers on both ends', () => {
      const samples = [1, 100, 101, 102, 100, 101, 103, 9999];
      const result = detectOutliersIQR(samples);
      expect(result.filtered).not.toContain(1);
      expect(result.filtered).not.toContain(9999);
      expect(result.outlierCount).toBe(2);
    });

    it('preserves non-outlier values exactly', () => {
      const core = [100, 105, 95, 102, 98, 101];
      const samples = [...core, 5000];
      const result = detectOutliersIQR(samples);
      for (const v of core) {
        expect(result.filtered).toContain(v);
      }
    });
  });

  describe('realistic benchmark scenario (n=15)', () => {
    it('removes exactly 2 outliers from a deterministic 15-sample benchmark run', () => {
      // Q1=318, Q3=328, IQR=10, upper fence=343 — 850 and 900 both exceed it
      const normal = [
        320, 330, 315, 325, 318, 322, 328, 316, 319, 324, 321, 323, 317,
      ];
      const withSpikes = [...normal, 900, 850]; // two cold-start spikes
      const result = detectOutliersIQR(withSpikes);
      expect(result.outlierCount).toBe(2);
      expect(result.filtered).toHaveLength(13);
    });

    it('does not over-trim a low-variance run', () => {
      const stable = [
        300, 302, 298, 301, 299, 303, 300, 301, 302, 298, 300, 301, 299, 302,
        300,
      ];
      const result = detectOutliersIQR(stable);
      expect(result.outlierCount).toBe(0);
      expect(result.filtered).toHaveLength(stable.length);
    });
  });

  describe('input ordering', () => {
    it('produces the same outlierCount regardless of input order', () => {
      const ordered = [10, 11, 12, 13, 14, 15, 1000];
      const shuffled = [1000, 13, 10, 15, 12, 11, 14];
      const r1 = detectOutliersIQR(ordered);
      const r2 = detectOutliersIQR(shuffled);
      expect(r1.outlierCount).toBe(r2.outlierCount);
      // Sort copies: Array.prototype.sort works in place, and the results
      // under test should not be mutated by the assertion itself.
      const ascending = (a: number, b: number) => a - b;
      expect([...r1.filtered].sort(ascending)).toEqual(
        [...r2.filtered].sort(ascending),
      );
    });

    it('does not mutate the input array (n >= 4, filter path)', () => {
      const samples = [10, 11, 1000, 12, 13];
      const copy = [...samples];
      detectOutliersIQR(samples);
      expect(samples).toEqual(copy);
    });

    it('does not mutate the input array (n < 4, early-return path)', () => {
      const samples = [100, 200, 9000];
      const copy = [...samples];
      const result = detectOutliersIQR(samples);
      expect(samples).toEqual(copy);
      // returned array must not be the same reference
      result.filtered.push(999);
      expect(samples).toEqual(copy);
    });
  });
});
76 changes: 68 additions & 8 deletions test/e2e/benchmarks/utils/runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import {
ALL_METRICS,
DEFAULT_NUM_BROWSER_LOADS,
DEFAULT_NUM_PAGE_LOADS,
WARMUP_RUNS,
} from './constants';
import {
aggregateWebVitals,
Expand All @@ -27,6 +28,7 @@ import {
calcStdDevResult,
calculateTimerStatistics,
checkExclusionRate,
detectOutliersIQR,
MAX_EXCLUSION_RATE,
MAX_TOTAL_DURATION_MS,
validateThresholds,
Expand Down Expand Up @@ -264,6 +266,8 @@ export function convertTimerStatisticsToBenchmarkResults(
const stdDev: StatisticalResult = {};
const p75: StatisticalResult = {};
const p95: StatisticalResult = {};
const trimmedCount: StatisticalResult = {};
const outliers: StatisticalResult = {};

// timers already includes promoted web vitals from runBenchmarkWithIterations
for (const timer of timers) {
Expand All @@ -273,8 +277,15 @@ export function convertTimerStatisticsToBenchmarkResults(
stdDev[timer.id] = timer.stdDev;
p75[timer.id] = timer.p75;
p95[timer.id] = timer.p95;
if (timer.trimmedCount !== undefined) {
trimmedCount[timer.id] = timer.trimmedCount;
}
outliers[timer.id] = timer.outliers;
}

const hasTrimmedCounts = Object.keys(trimmedCount).length > 0;
const hasOutliers = Object.keys(outliers).length > 0;

return {
testTitle,
persona,
Expand All @@ -287,6 +298,8 @@ export function convertTimerStatisticsToBenchmarkResults(
stdDev,
p75,
p95,
...(hasTrimmedCounts && { trimmedCount }),
Comment thread
cursor[bot] marked this conversation as resolved.
...(hasOutliers && { outliers }),
...(webVitals && { webVitals }),
};
}
Expand Down Expand Up @@ -321,6 +334,34 @@ export function convertSummaryToResults(
);
}

/**
* Run the dApp page-load benchmark and aggregate results.
*
* Uses IQR-only outlier trimming (`trimmedCount`). Z-score is intentionally
* excluded for two reasons:
*
* 1. Serial correlation: page loads run serially within each browser session
* (shared JIT cache, extension state). A slow startup elevates all `pageLoads`
* measurements in a session as a correlated cluster. Z-score assumes i.i.d.
* samples and would flag the cluster as outliers even when the elevation has a
* real underlying cause. IQR is rank-based and distribution-free, making it
* robust to correlated samples.
*
* 2. Multimodal distributions: `longTask*`, `tbt`, and `numNetworkReqs` are
* zero or near-zero most runs with occasional real spikes. Z-score would flag
* those spikes as noise, removing genuine signal.
*
* `calculateTimerStatistics` (iteration-based benchmarks) applies both IQR and
* z-score because each run is an independent browser session.
*
* @param measurePageFn - Function that drives one browser session and returns metrics
* @param options - Benchmark configuration
* @param options.browserLoads - Number of full browser sessions to run
* @param options.pageLoads - Number of page loads per browser session
* @param options.retries - Number of retries per browser session on failure
* @param options.platform - Optional platform label written to results
* @param options.buildType - Optional build label written to results
*/
export async function runPageLoadBenchmark(
measurePageFn: (
pageName: string,
Expand Down Expand Up @@ -366,10 +407,25 @@ export async function runPageLoadBenchmark(
resultPersona = persona;
}

if (runResults.some((result) => result.navigation.length > 1)) {
// Discard the first WARMUP_RUNS browser-load sessions before computing stats.
// Each session contributes exactly pageLoads metric objects to runResults.
if (browserLoads <= WARMUP_RUNS) {
throw new Error(
`browserLoads (${browserLoads}) must be greater than WARMUP_RUNS (${WARMUP_RUNS})`,
);
}
Comment thread
cursor[bot] marked this conversation as resolved.
Outdated
const warmupSize = WARMUP_RUNS * pageLoads;
const measuredResults = runResults.slice(warmupSize);
Comment thread
cursor[bot] marked this conversation as resolved.
// Web vitals entries are sparse (collection can fail silently), so filter by
// iteration index rather than slicing by position to avoid off-by-N errors.
const measuredWebVitalsRuns = allWebVitalsRuns.filter(
(wv) => wv.iteration >= warmupSize,
);

if (measuredResults.some((result) => result.navigation.length > 1)) {
throw new Error(`Multiple navigations not supported`);
}
const firstNonNavigate = runResults.find(
const firstNonNavigate = measuredResults.find(
(result) => result.navigation[0].type !== 'navigate',
);
if (firstNonNavigate !== undefined) {
Expand All @@ -379,17 +435,19 @@ export async function runPageLoadBenchmark(
}

const result: Record<string, number[]> = {};
const trimmedCounts: StatisticalResult = {};
for (const [key, tracePath] of Object.entries(ALL_METRICS)) {
result[key] = runResults
.map((m) => get(m, tracePath) as number)
.sort((a, b) => a - b);
const rawSamples = measuredResults.map((m) => get(m, tracePath) as number);
const { filtered, outlierCount } = detectOutliersIQR(rawSamples);
Comment thread
gauthierpetetin marked this conversation as resolved.
result[key] = [...filtered].sort((a, b) => a - b);
trimmedCounts[key] = outlierCount;
}

let webVitals: WebVitalsSummary | undefined;
if (allWebVitalsRuns.length > 0) {
if (measuredWebVitalsRuns.length > 0) {
webVitals = {
runs: allWebVitalsRuns,
aggregated: aggregateWebVitals(allWebVitalsRuns),
runs: measuredWebVitalsRuns,
aggregated: aggregateWebVitals(measuredWebVitalsRuns),
};
}

Expand Down Expand Up @@ -423,6 +481,8 @@ export async function runPageLoadBenchmark(
stdDev: stdDevResult,
p75,
p95,
trimmedCount: trimmedCounts,
Comment thread
cursor[bot] marked this conversation as resolved.
outliers: { ...trimmedCounts },
...(webVitals && { webVitals }),
};
}
Expand Down
Loading
Loading