diff --git a/.github/workflows/run-benchmarks.yml b/.github/workflows/run-benchmarks.yml index 6af843acc62e..ce59dc040732 100644 --- a/.github/workflows/run-benchmarks.yml +++ b/.github/workflows/run-benchmarks.yml @@ -128,7 +128,7 @@ jobs: - name: Run the benchmark if: ${{ env.BENCHMARK_GATED == 'true' }} run: >- - ${{ matrix.pageType == 'startupPowerUserHome' && format('yarn test:e2e:benchmark --preset startupPowerUserHome --browserLoads 10 --pageLoads 10 --out test-artifacts/benchmarks/benchmark-{0}-{1}-startupPowerUserHome.json --retries 2', matrix.browser, matrix.buildType) || format('yarn test:e2e:benchmark --preset {2} --out test-artifacts/benchmarks/benchmark-{0}-{1}-{2}.json --retries 2', matrix.browser, matrix.buildType, matrix.pageType) }} + ${{ matrix.pageType == 'startupPowerUserHome' && format('yarn test:e2e:benchmark --preset startupPowerUserHome --browserLoads 15 --pageLoads 7 --out test-artifacts/benchmarks/benchmark-{0}-{1}-startupPowerUserHome.json --retries 2', matrix.browser, matrix.buildType) || format('yarn test:e2e:benchmark --preset {2} --out test-artifacts/benchmarks/benchmark-{0}-{1}-{2}.json --retries 2', matrix.browser, matrix.buildType, matrix.pageType) }} shell: bash - name: Send benchmark results to Sentry (main/release only) diff --git a/shared/constants/benchmarks.ts b/shared/constants/benchmarks.ts index 396cdde552ea..33fb828ebdcf 100644 --- a/shared/constants/benchmarks.ts +++ b/shared/constants/benchmarks.ts @@ -71,6 +71,7 @@ export type TimerStatistics = { p99: number; samples: number; outliers: number; + trimmedCount?: number; dataQuality: 'good' | 'poor' | 'unreliable'; }; @@ -108,6 +109,8 @@ export type BenchmarkResults = { stdDev: StatisticalResult; p75: StatisticalResult; p95: StatisticalResult; + trimmedCount?: StatisticalResult; + outliers?: StatisticalResult; webVitals?: WebVitalsSummary; }; diff --git a/test/e2e/benchmarks/flows/startup/power-user-home.ts b/test/e2e/benchmarks/flows/startup/power-user-home.ts index 
e2b69e83bbe4..af67bfc48aaf 100644 --- a/test/e2e/benchmarks/flows/startup/power-user-home.ts +++ b/test/e2e/benchmarks/flows/startup/power-user-home.ts @@ -15,7 +15,10 @@ import { type BenchmarkResults, type WebVitalsMetrics, } from '../../../../../shared/constants/benchmarks'; -import { WITH_STATE_POWER_USER } from '../../utils/constants'; +import { + WITH_STATE_POWER_USER, + POWER_USER_NUM_BROWSER_LOADS, +} from '../../utils/constants'; import { runPageLoadBenchmark, collectWebVitals } from '../../utils'; import type { Metrics, @@ -87,5 +90,8 @@ async function measurePagePowerUser( export async function run( options: PageLoadBenchmarkOptions, ): Promise<BenchmarkResults> { - return runPageLoadBenchmark(measurePagePowerUser, options); + return runPageLoadBenchmark(measurePagePowerUser, { + ...options, + browserLoads: options.browserLoads ?? POWER_USER_NUM_BROWSER_LOADS, + }); } diff --git a/test/e2e/benchmarks/send-to-sentry.ts b/test/e2e/benchmarks/send-to-sentry.ts index 27a962880224..39afde7fd254 100644 --- a/test/e2e/benchmarks/send-to-sentry.ts +++ b/test/e2e/benchmarks/send-to-sentry.ts @@ -30,7 +30,7 @@ import type { } from '../../../shared/constants/benchmarks'; import { getGitBranch, getGitCommitHash } from './utils/git'; import type { UserActionResult } from './utils/types'; -import { aggregateWebVitals } from './utils/statistics'; +import { aggregateWebVitals, assessDataQuality } from './utils/statistics'; const packageJsonPath = path.resolve(__dirname, '../../../package.json'); const { version } = JSON.parse(readFileSync(packageJsonPath, 'utf-8')) as { @@ -241,12 +241,49 @@ async function main() { } } + // Derived reliability metrics: CV, dataQuality, tailRatio + // CV = (stdDev / mean) * 100 — coefficient of variation, directly + // comparable across steps with different magnitudes. + // dataQuality: 'good' (CV<30), 'poor' (30-50), 'unreliable' (>50). + // tailRatio = p95/p75 — tail heaviness; closer to 1.0 = tighter. 
+ const derivedMetrics: Record<string, number | string> = {}; + if (benchmark.mean && benchmark.stdDev) { + for (const [key, meanVal] of Object.entries(benchmark.mean)) { + const stdDevVal = benchmark.stdDev[key]; + if (meanVal > 0 && stdDevVal !== undefined) { + const cv = (stdDevVal / meanVal) * 100; + derivedMetrics[`${type}.cv.${key}`] = cv; + derivedMetrics[`${type}.dataQuality.${key}`] = + assessDataQuality(cv); + } + } + } + if (benchmark.p95 && benchmark.p75) { + for (const [key, p95Val] of Object.entries(benchmark.p95)) { + const p75Val = benchmark.p75[key]; + if (p75Val !== undefined && p75Val > 0) { + derivedMetrics[`${type}.tailRatio.${key}`] = p95Val / p75Val; + } + } + } + if (benchmark.trimmedCount) { + for (const [key, count] of Object.entries(benchmark.trimmedCount)) { + derivedMetrics[`${type}.trimmedCount.${key}`] = count; + } + } + if (benchmark.outliers) { + for (const [key, count] of Object.entries(benchmark.outliers)) { + derivedMetrics[`${type}.outliers.${key}`] = count; + } + } + // Timer data: structured logs (existing path, unchanged) Sentry.logger.info(message, { ...baseCiAttributes, 'ci.persona': benchmark.persona || BENCHMARK_PERSONA.STANDARD, 'ci.testTitle': benchmark.testTitle, ...allMetrics, + ...derivedMetrics, }); // Web vitals: separate reporting path via spans diff --git a/test/e2e/benchmarks/utils/constants.ts b/test/e2e/benchmarks/utils/constants.ts index ddaeaa8da73d..b2e1fc2a95f1 100644 --- a/test/e2e/benchmarks/utils/constants.ts +++ b/test/e2e/benchmarks/utils/constants.ts @@ -50,6 +50,12 @@ export const DEFAULT_NUM_BROWSER_LOADS = DEFAULT_BENCHMARK_BROWSER_LOADS; /** Same as {@link DEFAULT_BENCHMARK_PAGE_LOADS} in `shared/constants/benchmarks`. */ export const DEFAULT_NUM_PAGE_LOADS = DEFAULT_BENCHMARK_PAGE_LOADS; +/** Browser loads for the POWER_USER_HOME preset (higher than default to offset warm-up exclusion). 
*/ +export const POWER_USER_NUM_BROWSER_LOADS = 15; + +/** Number of leading browser-load sessions to discard as warm-up before computing stats. */ +export const WARMUP_RUNS = 1; + export const ALL_METRICS = { uiStartup: 'UI Startup', load: 'navigation[0].load', diff --git a/test/e2e/benchmarks/utils/outlier-trimming.test.ts b/test/e2e/benchmarks/utils/outlier-trimming.test.ts new file mode 100644 index 000000000000..225b9bafb7bc --- /dev/null +++ b/test/e2e/benchmarks/utils/outlier-trimming.test.ts @@ -0,0 +1,131 @@ +import { detectOutliersIQR } from './statistics'; + +describe('detectOutliersIQR (IQR-based)', () => { + describe('small sample edge cases', () => { + it('returns empty array unchanged', () => { + const result = detectOutliersIQR([]); + expect(result.filtered).toEqual([]); + expect(result.outlierCount).toBe(0); + }); + + it('returns array of 1 unchanged', () => { + const result = detectOutliersIQR([500]); + expect(result.filtered).toEqual([500]); + expect(result.outlierCount).toBe(0); + }); + + it('returns array of 2 unchanged', () => { + const result = detectOutliersIQR([100, 200]); + expect(result.filtered).toEqual([100, 200]); + expect(result.outlierCount).toBe(0); + }); + + it('returns array of 3 unchanged (below IQR threshold)', () => { + const result = detectOutliersIQR([100, 200, 9000]); + expect(result.filtered).toEqual([100, 200, 9000]); + expect(result.outlierCount).toBe(0); + }); + }); + + describe('no outliers', () => { + it('returns all values when distribution is tight', () => { + const samples = [100, 101, 99, 102, 98, 100, 101, 99]; + const result = detectOutliersIQR(samples); + expect(result.outlierCount).toBe(0); + expect(result.filtered).toHaveLength(samples.length); + }); + + it('returns all values when all samples are identical', () => { + const samples = [200, 200, 200, 200, 200, 200]; + const result = detectOutliersIQR(samples); + expect(result.outlierCount).toBe(0); + expect(result.filtered).toHaveLength(samples.length); + }); 
+ }); + + describe('outlier removal', () => { + it('removes a single high outlier', () => { + const samples = [10, 11, 12, 10, 11, 12, 11, 1000]; + const result = detectOutliersIQR(samples); + expect(result.filtered).not.toContain(1000); + expect(result.outlierCount).toBe(1); + }); + + it('removes a single low outlier', () => { + const samples = [100, 102, 101, 103, 100, 101, 1]; + const result = detectOutliersIQR(samples); + expect(result.filtered).not.toContain(1); + expect(result.outlierCount).toBe(1); + }); + + it('removes multiple outliers on both ends', () => { + const samples = [1, 100, 101, 102, 100, 101, 103, 9999]; + const result = detectOutliersIQR(samples); + expect(result.filtered).not.toContain(1); + expect(result.filtered).not.toContain(9999); + expect(result.outlierCount).toBe(2); + }); + + it('preserves non-outlier values exactly', () => { + const core = [100, 105, 95, 102, 98, 101]; + const samples = [...core, 5000]; + const result = detectOutliersIQR(samples); + for (const v of core) { + expect(result.filtered).toContain(v); + } + }); + }); + + describe('realistic benchmark scenario (n=15)', () => { + it('removes exactly 2 outliers from a deterministic 15-sample benchmark run', () => { + // Q1=318, Q3=328, IQR=10, upper fence=343 — 850 and 900 both exceed it + const normal = [ + 320, 330, 315, 325, 318, 322, 328, 316, 319, 324, 321, 323, 317, + ]; + const withSpikes = [...normal, 900, 850]; // two cold-start spikes + const result = detectOutliersIQR(withSpikes); + expect(result.outlierCount).toBe(2); + expect(result.filtered).toHaveLength(13); + }); + + it('does not over-trim a low-variance run', () => { + const stable = [ + 300, 302, 298, 301, 299, 303, 300, 301, 302, 298, 300, 301, 299, 302, + 300, + ]; + const result = detectOutliersIQR(stable); + expect(result.outlierCount).toBe(0); + expect(result.filtered).toHaveLength(stable.length); + }); + }); + + describe('input ordering', () => { + it('produces the same outlierCount regardless of input 
order', () => { + const ordered = [10, 11, 12, 13, 14, 15, 1000]; + const shuffled = [1000, 13, 10, 15, 12, 11, 14]; + const r1 = detectOutliersIQR(ordered); + const r2 = detectOutliersIQR(shuffled); + expect(r1.outlierCount).toBe(r2.outlierCount); + expect(r1.filtered.sort((a, b) => a - b)).toEqual( + r2.filtered.sort((a, b) => a - b), + ); + }); + + it('does not mutate the input array (n >= 4, filter path)', () => { + const samples = [10, 11, 1000, 12, 13]; + const copy = [...samples]; + detectOutliersIQR(samples); + expect(samples).toEqual(copy); + }); + + it('does not mutate the input array (n < 4, early-return path)', () => { + const samples = [100, 200, 9000]; + const copy = [...samples]; + const result = detectOutliersIQR(samples); + expect(samples).toEqual(copy); + // returned array must not be the same reference + result.filtered.push(999); + expect(samples).toEqual(copy); + }); + }); +}); diff --git a/test/e2e/benchmarks/utils/runner.ts b/test/e2e/benchmarks/utils/runner.ts index f6a57547c2f4..7d7d330a8c06 100644 --- a/test/e2e/benchmarks/utils/runner.ts +++ b/test/e2e/benchmarks/utils/runner.ts @@ -17,6 +17,7 @@ import { ALL_METRICS, DEFAULT_NUM_BROWSER_LOADS, DEFAULT_NUM_PAGE_LOADS, + WARMUP_RUNS, } from './constants'; import { aggregateWebVitals, @@ -27,6 +28,7 @@ import { calcStdDevResult, calculateTimerStatistics, checkExclusionRate, + detectOutliersIQR, MAX_EXCLUSION_RATE, MAX_TOTAL_DURATION_MS, validateThresholds, @@ -264,6 +266,8 @@ export function convertTimerStatisticsToBenchmarkResults( const stdDev: StatisticalResult = {}; const p75: StatisticalResult = {}; const p95: StatisticalResult = {}; + const trimmedCount: StatisticalResult = {}; + const outliers: StatisticalResult = {}; // timers already includes promoted web vitals from runBenchmarkWithIterations for (const timer of timers) { @@ -273,8 +277,15 @@ export function convertTimerStatisticsToBenchmarkResults( stdDev[timer.id] = timer.stdDev; p75[timer.id] = timer.p75; p95[timer.id] = 
timer.p95; + if (timer.trimmedCount !== undefined) { + trimmedCount[timer.id] = timer.trimmedCount; + } + outliers[timer.id] = timer.outliers; } + const hasTrimmedCounts = Object.keys(trimmedCount).length > 0; + const hasOutliers = Object.keys(outliers).length > 0; + return { testTitle, persona, @@ -287,6 +298,8 @@ export function convertTimerStatisticsToBenchmarkResults( stdDev, p75, p95, + ...(hasTrimmedCounts && { trimmedCount }), + ...(hasOutliers && { outliers }), ...(webVitals && { webVitals }), }; } @@ -321,6 +334,34 @@ export function convertSummaryToResults( ); } +/** + * Run the dApp page-load benchmark and aggregate results. + * + * Uses IQR-only outlier trimming (`trimmedCount`). Z-score is intentionally + * excluded for two reasons: + * + * 1. Serial correlation: page loads run serially within each browser session + * (shared JIT cache, extension state). A slow startup elevates all `pageLoads` + * measurements in a session as a correlated cluster. Z-score assumes i.i.d. + * samples and would flag the cluster as outliers even when the elevation has a + * real underlying cause. IQR is rank-based and distribution-free, making it + * robust to correlated samples. + * + * 2. Multimodal distributions: `longTask*`, `tbt`, and `numNetworkReqs` are + * zero or near-zero most runs with occasional real spikes. Z-score would flag + * those spikes as noise, removing genuine signal. + * + * `calculateTimerStatistics` (iteration-based benchmarks) applies both IQR and + * z-score because each run is an independent browser session. 
+ * + * @param measurePageFn - Function that drives one browser session and returns metrics + * @param options - Benchmark configuration + * @param options.browserLoads - Number of full browser sessions to run + * @param options.pageLoads - Number of page loads per browser session + * @param options.retries - Number of retries per browser session on failure + * @param options.platform - Optional platform label written to results + * @param options.buildType - Optional build label written to results + */ export async function runPageLoadBenchmark( measurePageFn: ( pageName: string, @@ -342,6 +383,12 @@ export async function runPageLoadBenchmark( buildType, } = options; + if (browserLoads <= WARMUP_RUNS) { + throw new Error( + `browserLoads (${browserLoads}) must be greater than WARMUP_RUNS (${WARMUP_RUNS})`, + ); + } + const pageName = 'home'; let runResults: Metrics[] = []; let allWebVitalsRuns: WebVitalsRun[] = []; @@ -366,10 +413,20 @@ export async function runPageLoadBenchmark( resultPersona = persona; } - if (runResults.some((result) => result.navigation.length > 1)) { + // Discard the first WARMUP_RUNS browser-load sessions before computing stats. + // Each session contributes exactly pageLoads metric objects to runResults. + const warmupSize = WARMUP_RUNS * pageLoads; + const measuredResults = runResults.slice(warmupSize); + // Web vitals entries are sparse (collection can fail silently), so filter by + // iteration index rather than slicing by position to avoid off-by-N errors. 
+ const measuredWebVitalsRuns = allWebVitalsRuns.filter( + (wv) => wv.iteration >= warmupSize, + ); + + if (measuredResults.some((result) => result.navigation.length > 1)) { throw new Error(`Multiple navigations not supported`); } - const firstNonNavigate = runResults.find( + const firstNonNavigate = measuredResults.find( (result) => result.navigation[0].type !== 'navigate', ); if (firstNonNavigate !== undefined) { @@ -379,17 +436,19 @@ } const result: Record<string, number[]> = {}; + const trimmedCounts: StatisticalResult = {}; for (const [key, tracePath] of Object.entries(ALL_METRICS)) { - result[key] = runResults - .map((m) => get(m, tracePath) as number) - .sort((a, b) => a - b); + const rawSamples = measuredResults.map((m) => get(m, tracePath) as number); + const { filtered, outlierCount } = detectOutliersIQR(rawSamples); + result[key] = [...filtered].sort((a, b) => a - b); + trimmedCounts[key] = outlierCount; } let webVitals: WebVitalsSummary | undefined; - if (allWebVitalsRuns.length > 0) { + if (measuredWebVitalsRuns.length > 0) { webVitals = { - runs: allWebVitalsRuns, - aggregated: aggregateWebVitals(allWebVitalsRuns), + runs: measuredWebVitalsRuns, + aggregated: aggregateWebVitals(measuredWebVitalsRuns), }; } @@ -423,6 +482,8 @@ stdDev: stdDevResult, p75, p95, + trimmedCount: trimmedCounts, + outliers: { ...trimmedCounts }, ...(webVitals && { webVitals }), }; } diff --git a/test/e2e/benchmarks/utils/statistics.ts b/test/e2e/benchmarks/utils/statistics.ts index f3c201d8df96..c9cbe85bf780 100644 --- a/test/e2e/benchmarks/utils/statistics.ts +++ b/test/e2e/benchmarks/utils/statistics.ts @@ -168,7 +168,7 @@ export const detectOutliersZScore = ( values: number[], threshold: number = Z_SCORE_THRESHOLD, ): { filtered: number[]; outlierCount: number; outliers: number[] } => { if (values.length < 3) { - return { filtered: values, outlierCount: 0, outliers: [] }; + return { filtered: [...values], outlierCount: 0, 
outliers: [] }; } const mean = calculateMean(values); @@ -197,7 +197,7 @@ export const detectOutliersIQR = ( values: number[], ): { filtered: number[]; outlierCount: number; outliers: number[] } => { if (values.length < 4) { - return { filtered: values, outlierCount: 0, outliers: [] }; + return { filtered: [...values], outlierCount: 0, outliers: [] }; } const sorted = [...values].sort((a, b) => a - b); @@ -353,14 +353,18 @@ export const calculateTimerStatistics = ( maxDuration, minDuration, ); - const { filtered, outlierCount } = detectOutliers(sanityResult.filtered); + const iqrResult = detectOutliersIQR(sanityResult.filtered); + const zScoreResult = detectOutliersZScore(iqrResult.filtered); + const { filtered } = zScoreResult; + const totalExcluded = + sanityResult.excludedCount + + iqrResult.outlierCount + + zScoreResult.outlierCount; const sorted = [...filtered].sort((a, b) => a - b); const mean = calculateMean(filtered); const stdDev = calculateStdDev(filtered); const cv = mean > 0 ? (stdDev / mean) * 100 : 0; - const totalExcluded = sanityResult.excludedCount + outlierCount; - return { id: timerId, mean, @@ -374,6 +378,7 @@ export const calculateTimerStatistics = ( p99: calculatePercentile(sorted, 99), samples: filtered.length, outliers: totalExcluded, + trimmedCount: iqrResult.outlierCount, dataQuality: assessDataQuality(cv), }; };