MetaMask · MajorLift · Apr 24, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
@@ -128,7 +128,7 @@ jobs:
       - name: Run the benchmark
         if: ${{ env.BENCHMARK_GATED == 'true' }}
         run: >-
-          ${{ matrix.pageType == 'startupPowerUserHome' && format('yarn test:e2e:benchmark --preset startupPowerUserHome --browserLoads 10 --pageLoads 10 --out test-artifacts/benchmarks/benchmark-{0}-{1}-startupPowerUserHome.json --retries 2', matrix.browser, matrix.buildType) || format('yarn test:e2e:benchmark --preset {2} --out test-artifacts/benchmarks/benchmark-{0}-{1}-{2}.json --retries 2', matrix.browser, matrix.buildType, matrix.pageType) }}
+          ${{ matrix.pageType == 'startupPowerUserHome' && format('yarn test:e2e:benchmark --preset startupPowerUserHome --browserLoads 15 --pageLoads 7 --out test-artifacts/benchmarks/benchmark-{0}-{1}-startupPowerUserHome.json --retries 2', matrix.browser, matrix.buildType) || format('yarn test:e2e:benchmark --preset {2} --out test-artifacts/benchmarks/benchmark-{0}-{1}-{2}.json --retries 2', matrix.browser, matrix.buildType, matrix.pageType) }}
         shell: bash
 
       - name: Send benchmark results to Sentry (main/release only)

diff --git a/shared/constants/benchmarks.ts b/shared/constants/benchmarks.ts
@@ -71,6 +71,7 @@ export type TimerStatistics = {
   p99: number;
   samples: number;
   outliers: number;
+  trimmedCount?: number;
   dataQuality: 'good' | 'poor' | 'unreliable';
 };
 
@@ -108,6 +109,7 @@ export type BenchmarkResults = {
   stdDev: StatisticalResult;
   p75: StatisticalResult;
   p95: StatisticalResult;
+  trimmedCount?: StatisticalResult;
   webVitals?: WebVitalsSummary;
 };
 

diff --git a/test/e2e/benchmarks/flows/startup/power-user-home.ts b/test/e2e/benchmarks/flows/startup/power-user-home.ts
@@ -15,7 +15,10 @@ import {
   type BenchmarkResults,
   type WebVitalsMetrics,
 } from '../../../../../shared/constants/benchmarks';
-import { WITH_STATE_POWER_USER } from '../../utils/constants';
+import {
+  WITH_STATE_POWER_USER,
+  POWER_USER_NUM_BROWSER_LOADS,
+} from '../../utils/constants';
 import { runPageLoadBenchmark, collectWebVitals } from '../../utils';
 import type {
   Metrics,
@@ -87,5 +90,8 @@ async function measurePagePowerUser(
 export async function run(
   options: PageLoadBenchmarkOptions,
 ): Promise<BenchmarkResults> {
-  return runPageLoadBenchmark(measurePagePowerUser, options);
+  return runPageLoadBenchmark(measurePagePowerUser, {
+    ...options,
+    browserLoads: options.browserLoads ?? POWER_USER_NUM_BROWSER_LOADS,
+  });
 }
diff --git a/test/e2e/benchmarks/send-to-sentry.ts b/test/e2e/benchmarks/send-to-sentry.ts
@@ -241,12 +241,51 @@ async function main() {
         }
       }
 
+      // Derived reliability metrics: CV, dataQuality, tailRatio, trimmedCount
+      // CV = (stdDev / mean) * 100 — coefficient of variation, directly
+      // comparable across steps with different magnitudes.
+      // dataQuality: 'good' (CV<30), 'poor' (30<=CV<50), 'unreliable' (CV>=50).
+      // tailRatio = p95/p75 — tail heaviness; closer to 1.0 = tighter.
+      const derivedMetrics: Record<string, number | string> = {};
+      if (benchmark.mean && benchmark.stdDev) {
+        for (const [key, meanVal] of Object.entries(benchmark.mean)) {
+          const stdDevVal = benchmark.stdDev[key];
+          if (meanVal > 0 && stdDevVal !== undefined) {
+            const cv = (stdDevVal / meanVal) * 100;
+            derivedMetrics[`${type}.cv.${key}`] = cv;
+            let dataQuality: string;
+            if (cv < 30) {
+              dataQuality = 'good';
+            } else if (cv < 50) {
+              dataQuality = 'poor';
+            } else {
+              dataQuality = 'unreliable';
+            }
+            derivedMetrics[`${type}.dataQuality.${key}`] = dataQuality;
+          }
+        }
+      }
+      if (benchmark.p95 && benchmark.p75) {
+        for (const [key, p95Val] of Object.entries(benchmark.p95)) {
+          const p75Val = benchmark.p75[key];
+          if (p75Val !== undefined && p75Val > 0) {
+            derivedMetrics[`${type}.tailRatio.${key}`] = p95Val / p75Val;
+          }
+        }
+      }
+      if (benchmark.trimmedCount) {
+        for (const [key, count] of Object.entries(benchmark.trimmedCount)) {
+          derivedMetrics[`${type}.trimmedCount.${key}`] = count;
+        }
+      }
+
       // Timer data: structured logs (existing path, unchanged)
       Sentry.logger.info(message, {
         ...baseCiAttributes,
         'ci.persona': benchmark.persona || BENCHMARK_PERSONA.STANDARD,
         'ci.testTitle': benchmark.testTitle,
         ...allMetrics,
+        ...derivedMetrics,
       });
 
       // Web vitals: separate reporting path via spans

diff --git a/test/e2e/benchmarks/utils/constants.ts b/test/e2e/benchmarks/utils/constants.ts
@@ -50,6 +50,15 @@ export const DEFAULT_NUM_BROWSER_LOADS = DEFAULT_BENCHMARK_BROWSER_LOADS;
 /** Same as {@link DEFAULT_BENCHMARK_PAGE_LOADS} in `shared/constants/benchmarks`. */
 export const DEFAULT_NUM_PAGE_LOADS = DEFAULT_BENCHMARK_PAGE_LOADS;
 
+/** Browser loads for the POWER_USER_HOME preset (higher than default to offset warm-up exclusion). */
+export const POWER_USER_NUM_BROWSER_LOADS = 15;
+
+/** Number of leading browser-load sessions to discard as warm-up before computing stats. */
+export const WARMUP_RUNS = 1;
+
+/** Minimum effective sample count (after warm-up exclusion + IQR trimming) required to emit a Mann-Whitney verdict. */
+export const MIN_SAMPLES_FOR_VERDICT = 5;
+
 export const ALL_METRICS = {
   uiStartup: 'UI Startup',
   load: 'navigation[0].load',

diff --git a/test/e2e/benchmarks/utils/outlier-trimming.test.ts b/test/e2e/benchmarks/utils/outlier-trimming.test.ts
@@ -0,0 +1,122 @@
+import { trimOutliers } from './statistics';
+
+describe('trimOutliers (IQR-based)', () => {
+  describe('small sample edge cases', () => {
+    it('returns empty array unchanged', () => {
+      const result = trimOutliers([]);
+      expect(result.samples).toEqual([]);
+      expect(result.trimmedCount).toBe(0);
+    });
+
+    it('returns array of 1 unchanged', () => {
+      const result = trimOutliers([500]);
+      expect(result.samples).toEqual([500]);
+      expect(result.trimmedCount).toBe(0);
+    });
+
+    it('returns array of 2 unchanged', () => {
+      const result = trimOutliers([100, 200]);
+      expect(result.samples).toEqual([100, 200]);
+      expect(result.trimmedCount).toBe(0);
+    });
+
+    it('returns array of 3 unchanged (below IQR threshold)', () => {
+      const result = trimOutliers([100, 200, 9000]);
+      expect(result.samples).toEqual([100, 200, 9000]);
+      expect(result.trimmedCount).toBe(0);
+    });
+  });
+
+  describe('no outliers', () => {
+    it('returns all values when distribution is tight', () => {
+      const samples = [100, 101, 99, 102, 98, 100, 101, 99];
+      const result = trimOutliers(samples);
+      expect(result.trimmedCount).toBe(0);
+      expect(result.samples).toHaveLength(samples.length);
+    });
+
+    it('returns all values when all samples are identical', () => {
+      const samples = [200, 200, 200, 200, 200, 200];
+      const result = trimOutliers(samples);
+      expect(result.trimmedCount).toBe(0);
+      expect(result.samples).toHaveLength(samples.length);
+    });
+  });
+
+  describe('outlier removal', () => {
+    it('removes a single high outlier', () => {
+      const samples = [10, 11, 12, 10, 11, 12, 11, 1000];
+      const result = trimOutliers(samples);
+      expect(result.samples).not.toContain(1000);
+      expect(result.trimmedCount).toBe(1);
+    });
+
+    it('removes a single low outlier', () => {
+      const samples = [100, 102, 101, 103, 100, 101, 1];
+      const result = trimOutliers(samples);
+      expect(result.samples).not.toContain(1);
+      expect(result.trimmedCount).toBe(1);
+    });
+
+    it('removes multiple outliers on both ends', () => {
+      const samples = [1, 100, 101, 102, 100, 101, 103, 9999];
+      const result = trimOutliers(samples);
+      expect(result.samples).not.toContain(1);
+      expect(result.samples).not.toContain(9999);
+      expect(result.trimmedCount).toBe(2);
+    });
+
+    it('preserves non-outlier values exactly', () => {
+      const core = [100, 105, 95, 102, 98, 101];
+      const samples = [...core, 5000];
+      const result = trimOutliers(samples);
+      for (const v of core) {
+        expect(result.samples).toContain(v);
+      }
+    });
+  });
+
+  describe('realistic benchmark scenario (n=15)', () => {
+    it('removes 0-3 outliers from a 15-sample benchmark run', () => {
+      // Simulates 15 independent browser-load sessions: 13 normal + 2 spikes
+      const normal = [
+        320, 330, 315, 325, 318, 322, 328, 316, 319, 324, 321, 323, 317,
+      ];
+      const withSpikes = [...normal, 900, 850]; // two cold-start spikes
+      const result = trimOutliers(withSpikes);
+      expect(result.trimmedCount).toBeGreaterThanOrEqual(1);
+      expect(result.trimmedCount).toBeLessThanOrEqual(3);
+      expect(result.samples.length).toBeGreaterThanOrEqual(12);
+    });
+
+    it('does not over-trim a low-variance run', () => {
+      const stable = [
+        300, 302, 298, 301, 299, 303, 300, 301, 302, 298, 300, 301, 299, 302,
+        300,
+      ];
+      const result = trimOutliers(stable);
+      expect(result.trimmedCount).toBe(0);
+      expect(result.samples).toHaveLength(stable.length);
+    });
+  });
+
+  describe('input ordering', () => {
+    it('produces the same trimmedCount regardless of input order', () => {
+      const ordered = [10, 11, 12, 13, 14, 15, 1000];
+      const shuffled = [1000, 13, 10, 15, 12, 11, 14];
+      const r1 = trimOutliers(ordered);
+      const r2 = trimOutliers(shuffled);
+      expect(r1.trimmedCount).toBe(r2.trimmedCount);
+      expect(r1.samples.sort((a, b) => a - b)).toEqual(
+        r2.samples.sort((a, b) => a - b),
+      );
+    });
+
+    it('does not mutate the input array', () => {
+      const samples = [10, 11, 1000, 12, 13];
+      const copy = [...samples];
+      trimOutliers(samples);
+      expect(samples).toEqual(copy);
+    });
+  });
+});
diff --git a/test/e2e/benchmarks/utils/runner.ts b/test/e2e/benchmarks/utils/runner.ts
@@ -17,6 +17,7 @@ import {
   ALL_METRICS,
   DEFAULT_NUM_BROWSER_LOADS,
   DEFAULT_NUM_PAGE_LOADS,
+  WARMUP_RUNS,
 } from './constants';
 import {
   aggregateWebVitals,
@@ -27,6 +28,7 @@ import {
   calcStdDevResult,
   calculateTimerStatistics,
   checkExclusionRate,
+  detectOutliersIQR,
   MAX_EXCLUSION_RATE,
   MAX_TOTAL_DURATION_MS,
   validateThresholds,
@@ -264,6 +266,7 @@ export function convertTimerStatisticsToBenchmarkResults(
   const stdDev: StatisticalResult = {};
   const p75: StatisticalResult = {};
   const p95: StatisticalResult = {};
+  const trimmedCount: StatisticalResult = {};
 
   // timers already includes promoted web vitals from runBenchmarkWithIterations
   for (const timer of timers) {
@@ -273,8 +276,13 @@ export function convertTimerStatisticsToBenchmarkResults(
     stdDev[timer.id] = timer.stdDev;
     p75[timer.id] = timer.p75;
     p95[timer.id] = timer.p95;
+    if (timer.trimmedCount !== undefined) {
+      trimmedCount[timer.id] = timer.trimmedCount;
+    }
   }
 
+  const hasTrimmedCounts = Object.keys(trimmedCount).length > 0;
+
   return {
     testTitle,
     persona,
@@ -287,6 +295,7 @@ export function convertTimerStatisticsToBenchmarkResults(
     stdDev,
     p75,
     p95,
+    ...(hasTrimmedCounts && { trimmedCount }),
     ...(webVitals && { webVitals }),
   };
 }
@@ -366,10 +375,16 @@ export async function runPageLoadBenchmark(
     resultPersona = persona;
   }
 
-  if (runResults.some((result) => result.navigation.length > 1)) {
+  // Discard the first WARMUP_RUNS browser-load sessions before computing stats.
+  // Each session contributes exactly pageLoads metric objects to runResults.
+  const warmupSize = WARMUP_RUNS * pageLoads;
+  const measuredResults = runResults.slice(warmupSize);
+  const measuredWebVitalsRuns = allWebVitalsRuns.slice(warmupSize);
+
+  if (measuredResults.some((result) => result.navigation.length > 1)) {
     throw new Error(`Multiple navigations not supported`);
   }
-  const firstNonNavigate = runResults.find(
+  const firstNonNavigate = measuredResults.find(
     (result) => result.navigation[0].type !== 'navigate',
   );
   if (firstNonNavigate !== undefined) {
@@ -379,17 +394,19 @@ export async function runPageLoadBenchmark(
   }
 
   const result: Record<string, number[]> = {};
+  const trimmedCounts: StatisticalResult = {};
   for (const [key, tracePath] of Object.entries(ALL_METRICS)) {
-    result[key] = runResults
-      .map((m) => get(m, tracePath) as number)
-      .sort((a, b) => a - b);
+    const rawSamples = measuredResults.map((m) => get(m, tracePath) as number);
+    const { filtered, outlierCount } = detectOutliersIQR(rawSamples);
+    result[key] = [...filtered].sort((a, b) => a - b);
+    trimmedCounts[key] = outlierCount;
   }
 
   let webVitals: WebVitalsSummary | undefined;
-  if (allWebVitalsRuns.length > 0) {
+  if (measuredWebVitalsRuns.length > 0) {
     webVitals = {
-      runs: allWebVitalsRuns,
-      aggregated: aggregateWebVitals(allWebVitalsRuns),
+      runs: measuredWebVitalsRuns,
+      aggregated: aggregateWebVitals(measuredWebVitalsRuns),
     };
   }
 
@@ -423,6 +440,7 @@ export async function runPageLoadBenchmark(
     stdDev: stdDevResult,
     p75,
     p95,
+    trimmedCount: trimmedCounts,
     ...(webVitals && { webVitals }),
   };
 }

diff --git a/test/e2e/benchmarks/utils/statistics.ts b/test/e2e/benchmarks/utils/statistics.ts
@@ -219,6 +219,23 @@ export const detectOutliersIQR = (
   return { filtered, outlierCount: outliers.length, outliers };
 };
 
+/**
+ * IQR-based outlier trimming.
+ * Thin wrapper over {@link detectOutliersIQR} with a stable public interface:
+ * `filtered` is returned as `samples` and `outlierCount` as `trimmedCount`.
+ * Intended for use before stats computation and before Mann-Whitney U tests.
+ *
+ * @param samples - Raw per-run durations (unsorted)
+ * @returns The retained samples and the number of values removed.
+ */
+export function trimOutliers(samples: number[]): {
+  samples: number[];
+  trimmedCount: number;
+} {
+  const { filtered, outlierCount } = detectOutliersIQR(samples);
+  return { samples: filtered, trimmedCount: outlierCount };
+}
+
 /**
  * Combined outlier detection using both IQR and z-score methods
  * A value is only kept if it passes both methods
@@ -353,14 +370,18 @@ export const calculateTimerStatistics = (
     maxDuration,
     minDuration,
   );
-  const { filtered, outlierCount } = detectOutliers(sanityResult.filtered);
+  const iqrResult = detectOutliersIQR(sanityResult.filtered);
+  const zScoreResult = detectOutliersZScore(iqrResult.filtered);
+  const { filtered } = zScoreResult;
+  const totalExcluded =
+    sanityResult.excludedCount +
+    iqrResult.outlierCount +
+    zScoreResult.outlierCount;
   const sorted = [...filtered].sort((a, b) => a - b);
   const mean = calculateMean(filtered);
   const stdDev = calculateStdDev(filtered);
   const cv = mean > 0 ? (stdDev / mean) * 100 : 0;
 
-  const totalExcluded = sanityResult.excludedCount + outlierCount;
-
   return {
     id: timerId,
     mean,
@@ -374,6 +395,7 @@ export const calculateTimerStatistics = (
     p99: calculatePercentile(sorted, 99),
     samples: filtered.length,
     outliers: totalExcluded,
+    trimmedCount: iqrResult.outlierCount,
     dataQuality: assessDataQuality(cv),
   };
 };