MetaMask · MajorLift · Apr 24, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
@@ -128,7 +128,7 @@ jobs:
       - name: Run the benchmark
         if: ${{ env.BENCHMARK_GATED == 'true' }}
         run: >-
-          ${{ matrix.pageType == 'startupPowerUserHome' && format('yarn test:e2e:benchmark --preset startupPowerUserHome --browserLoads 10 --pageLoads 10 --out test-artifacts/benchmarks/benchmark-{0}-{1}-startupPowerUserHome.json --retries 2', matrix.browser, matrix.buildType) || format('yarn test:e2e:benchmark --preset {2} --out test-artifacts/benchmarks/benchmark-{0}-{1}-{2}.json --retries 2', matrix.browser, matrix.buildType, matrix.pageType) }}
+          ${{ matrix.pageType == 'startupPowerUserHome' && format('yarn test:e2e:benchmark --preset startupPowerUserHome --browserLoads 15 --pageLoads 7 --out test-artifacts/benchmarks/benchmark-{0}-{1}-startupPowerUserHome.json --retries 2', matrix.browser, matrix.buildType) || format('yarn test:e2e:benchmark --preset {2} --out test-artifacts/benchmarks/benchmark-{0}-{1}-{2}.json --retries 2', matrix.browser, matrix.buildType, matrix.pageType) }}
         shell: bash
 
       - name: Send benchmark results to Sentry (main/release only)

diff --git a/shared/constants/benchmarks.ts b/shared/constants/benchmarks.ts
@@ -71,6 +71,7 @@ export type TimerStatistics = {
   p99: number;
   samples: number;
   outliers: number;
+  trimmedCount?: number;
   dataQuality: 'good' | 'poor' | 'unreliable';
 };
 
@@ -108,6 +109,8 @@ export type BenchmarkResults = {
   stdDev: StatisticalResult;
   p75: StatisticalResult;
   p95: StatisticalResult;
+  trimmedCount?: StatisticalResult;
+  outliers?: StatisticalResult;
   webVitals?: WebVitalsSummary;
 };
 

diff --git a/test/e2e/benchmarks/flows/startup/power-user-home.ts b/test/e2e/benchmarks/flows/startup/power-user-home.ts
@@ -15,7 +15,10 @@ import {
   type BenchmarkResults,
   type WebVitalsMetrics,
 } from '../../../../../shared/constants/benchmarks';
-import { WITH_STATE_POWER_USER } from '../../utils/constants';
+import {
+  WITH_STATE_POWER_USER,
+  POWER_USER_NUM_BROWSER_LOADS,
+} from '../../utils/constants';
 import { runPageLoadBenchmark, collectWebVitals } from '../../utils';
 import type {
   Metrics,
@@ -87,5 +90,8 @@ async function measurePagePowerUser(
 export async function run(
   options: PageLoadBenchmarkOptions,
 ): Promise<BenchmarkResults> {
-  return runPageLoadBenchmark(measurePagePowerUser, options);
+  return runPageLoadBenchmark(measurePagePowerUser, {
+    ...options,
+    browserLoads: options.browserLoads ?? POWER_USER_NUM_BROWSER_LOADS,
+  });
 }
diff --git a/test/e2e/benchmarks/send-to-sentry.ts b/test/e2e/benchmarks/send-to-sentry.ts
@@ -30,7 +30,7 @@ import type {
 } from '../../../shared/constants/benchmarks';
 import { getGitBranch, getGitCommitHash } from './utils/git';
 import type { UserActionResult } from './utils/types';
-import { aggregateWebVitals } from './utils/statistics';
+import { aggregateWebVitals, assessDataQuality } from './utils/statistics';
 
 const packageJsonPath = path.resolve(__dirname, '../../../package.json');
 const { version } = JSON.parse(readFileSync(packageJsonPath, 'utf-8')) as {
@@ -241,12 +241,49 @@ async function main() {
         }
       }
 
+      // Reliability metrics: CV, dataQuality, tailRatio, trimmedCount, outliers
+      // CV = (stdDev / mean) * 100 — coefficient of variation, directly
+      // comparable across steps with different magnitudes.
+      // dataQuality: 'good' (CV<30), 'poor' (30-50), 'unreliable' (>50).
+      // tailRatio = p95/p75 — tail heaviness; closer to 1.0 = tighter.
+      const derivedMetrics: Record<string, number | string> = {};
+      if (benchmark.mean && benchmark.stdDev) {
+        for (const [key, meanVal] of Object.entries(benchmark.mean)) {
+          const stdDevVal = benchmark.stdDev[key];
+          if (meanVal > 0 && stdDevVal !== undefined) {
+            const cv = (stdDevVal / meanVal) * 100;
+            derivedMetrics[`${type}.cv.${key}`] = cv;
+            derivedMetrics[`${type}.dataQuality.${key}`] =
+              assessDataQuality(cv);
+          }
+        }
+      }
+      if (benchmark.p95 && benchmark.p75) {
+        for (const [key, p95Val] of Object.entries(benchmark.p95)) {
+          const p75Val = benchmark.p75[key];
+          if (p75Val !== undefined && p75Val > 0) {
+            derivedMetrics[`${type}.tailRatio.${key}`] = p95Val / p75Val;
+          }
+        }
+      }
+      if (benchmark.trimmedCount) {
+        for (const [key, count] of Object.entries(benchmark.trimmedCount)) {
+          derivedMetrics[`${type}.trimmedCount.${key}`] = count;
+        }
+      }
+      if (benchmark.outliers) {
+        for (const [key, count] of Object.entries(benchmark.outliers)) {
+          derivedMetrics[`${type}.outliers.${key}`] = count;
+        }
+      }
+
       // Timer data: structured logs (existing path, unchanged)
       Sentry.logger.info(message, {
         ...baseCiAttributes,
         'ci.persona': benchmark.persona || BENCHMARK_PERSONA.STANDARD,
         'ci.testTitle': benchmark.testTitle,
         ...allMetrics,
+        ...derivedMetrics,
       });
 
       // Web vitals: separate reporting path via spans

diff --git a/test/e2e/benchmarks/utils/constants.ts b/test/e2e/benchmarks/utils/constants.ts
@@ -50,6 +50,12 @@ export const DEFAULT_NUM_BROWSER_LOADS = DEFAULT_BENCHMARK_BROWSER_LOADS;
 /** Same as {@link DEFAULT_BENCHMARK_PAGE_LOADS} in `shared/constants/benchmarks`. */
 export const DEFAULT_NUM_PAGE_LOADS = DEFAULT_BENCHMARK_PAGE_LOADS;
 
+/** Browser loads for the POWER_USER_HOME preset (higher than default to offset warm-up exclusion). */
+export const POWER_USER_NUM_BROWSER_LOADS = 15;
+
+/** Number of leading browser-load sessions to discard as warm-up before computing stats. */
+export const WARMUP_RUNS = 1;
+
 export const ALL_METRICS = {
   uiStartup: 'UI Startup',
   load: 'navigation[0].load',

diff --git a/test/e2e/benchmarks/utils/outlier-trimming.test.ts b/test/e2e/benchmarks/utils/outlier-trimming.test.ts
@@ -0,0 +1,131 @@
+import { detectOutliersIQR } from './statistics';
+
+describe('detectOutliersIQR (IQR-based)', () => {
+  describe('small sample edge cases', () => {
+    it('returns empty array unchanged', () => {
+      const result = detectOutliersIQR([]);
+      expect(result.filtered).toEqual([]);
+      expect(result.outlierCount).toBe(0);
+    });
+
+    it('returns array of 1 unchanged', () => {
+      const result = detectOutliersIQR([500]);
+      expect(result.filtered).toEqual([500]);
+      expect(result.outlierCount).toBe(0);
+    });
+
+    it('returns array of 2 unchanged', () => {
+      const result = detectOutliersIQR([100, 200]);
+      expect(result.filtered).toEqual([100, 200]);
+      expect(result.outlierCount).toBe(0);
+    });
+
+    it('returns array of 3 unchanged (below IQR threshold)', () => {
+      const result = detectOutliersIQR([100, 200, 9000]);
+      expect(result.filtered).toEqual([100, 200, 9000]);
+      expect(result.outlierCount).toBe(0);
+    });
+  });
+
+  describe('no outliers', () => {
+    it('returns all values when distribution is tight', () => {
+      const samples = [100, 101, 99, 102, 98, 100, 101, 99];
+      const result = detectOutliersIQR(samples);
+      expect(result.outlierCount).toBe(0);
+      expect(result.filtered).toHaveLength(samples.length);
+    });
+
+    it('returns all values when all samples are identical', () => {
+      const samples = [200, 200, 200, 200, 200, 200];
+      const result = detectOutliersIQR(samples);
+      expect(result.outlierCount).toBe(0);
+      expect(result.filtered).toHaveLength(samples.length);
+    });
+  });
+
+  describe('outlier removal', () => {
+    it('removes a single high outlier', () => {
+      const samples = [10, 11, 12, 10, 11, 12, 11, 1000];
+      const result = detectOutliersIQR(samples);
+      expect(result.filtered).not.toContain(1000);
+      expect(result.outlierCount).toBe(1);
+    });
+
+    it('removes a single low outlier', () => {
+      const samples = [100, 102, 101, 103, 100, 101, 1];
+      const result = detectOutliersIQR(samples);
+      expect(result.filtered).not.toContain(1);
+      expect(result.outlierCount).toBe(1);
+    });
+
+    it('removes multiple outliers on both ends', () => {
+      const samples = [1, 100, 101, 102, 100, 101, 103, 9999];
+      const result = detectOutliersIQR(samples);
+      expect(result.filtered).not.toContain(1);
+      expect(result.filtered).not.toContain(9999);
+      expect(result.outlierCount).toBe(2);
+    });
+
+    it('preserves non-outlier values exactly', () => {
+      const core = [100, 105, 95, 102, 98, 101];
+      const samples = [...core, 5000];
+      const result = detectOutliersIQR(samples);
+      for (const v of core) {
+        expect(result.filtered).toContain(v);
+      }
+    });
+  });
+
+  describe('realistic benchmark scenario (n=15)', () => {
+    it('removes exactly 2 outliers from a deterministic 15-sample benchmark run', () => {
+      // Q1=318, Q3=328, IQR=10, upper fence=343 — 850 and 900 both exceed it
+      const normal = [
+        320, 330, 315, 325, 318, 322, 328, 316, 319, 324, 321, 323, 317,
+      ];
+      const withSpikes = [...normal, 900, 850]; // two cold-start spikes
+      const result = detectOutliersIQR(withSpikes);
+      expect(result.outlierCount).toBe(2);
+      expect(result.filtered).toHaveLength(13);
+    });
+
+    it('does not over-trim a low-variance run', () => {
+      const stable = [
+        300, 302, 298, 301, 299, 303, 300, 301, 302, 298, 300, 301, 299, 302,
+        300,
+      ];
+      const result = detectOutliersIQR(stable);
+      expect(result.outlierCount).toBe(0);
+      expect(result.filtered).toHaveLength(stable.length);
+    });
+  });
+
+  describe('input ordering', () => {
+    it('produces the same outlierCount regardless of input order', () => {
+      const r1 = detectOutliersIQR([10, 11, 12, 13, 14, 15, 1000]);
+      const r2 = detectOutliersIQR([1000, 13, 10, 15, 12, 11, 14]);
+      expect(r1.outlierCount).toBe(r2.outlierCount);
+      // Sort copies: Array#sort is in-place and must not reorder the results.
+      const ascending = (a: number, b: number) => a - b;
+      expect([...r1.filtered].sort(ascending)).toEqual(
+        [...r2.filtered].sort(ascending),
+      );
+    });
+
+    it('does not mutate the input array (n >= 4, filter path)', () => {
+      const samples = [10, 11, 1000, 12, 13];
+      const copy = [...samples];
+      detectOutliersIQR(samples);
+      expect(samples).toEqual(copy);
+    });
+
+    it('does not mutate the input array (n < 4, early-return path)', () => {
+      const samples = [100, 200, 9000];
+      const copy = [...samples];
+      const result = detectOutliersIQR(samples);
+      expect(samples).toEqual(copy);
+      // returned array must not be the same reference
+      result.filtered.push(999);
+      expect(samples).toEqual(copy);
+    });
+  });
+});
diff --git a/test/e2e/benchmarks/utils/runner.ts b/test/e2e/benchmarks/utils/runner.ts
@@ -17,6 +17,7 @@ import {
   ALL_METRICS,
   DEFAULT_NUM_BROWSER_LOADS,
   DEFAULT_NUM_PAGE_LOADS,
+  WARMUP_RUNS,
 } from './constants';
 import {
   aggregateWebVitals,
@@ -27,6 +28,7 @@ import {
   calcStdDevResult,
   calculateTimerStatistics,
   checkExclusionRate,
+  detectOutliersIQR,
   MAX_EXCLUSION_RATE,
   MAX_TOTAL_DURATION_MS,
   validateThresholds,
@@ -264,6 +266,8 @@ export function convertTimerStatisticsToBenchmarkResults(
   const stdDev: StatisticalResult = {};
   const p75: StatisticalResult = {};
   const p95: StatisticalResult = {};
+  const trimmedCount: StatisticalResult = {};
+  const outliers: StatisticalResult = {};
 
   // timers already includes promoted web vitals from runBenchmarkWithIterations
   for (const timer of timers) {
@@ -273,8 +277,15 @@ export function convertTimerStatisticsToBenchmarkResults(
     stdDev[timer.id] = timer.stdDev;
     p75[timer.id] = timer.p75;
     p95[timer.id] = timer.p95;
+    if (timer.trimmedCount !== undefined) {
+      trimmedCount[timer.id] = timer.trimmedCount;
+    }
+    outliers[timer.id] = timer.outliers;
   }
 
+  const hasTrimmedCounts = Object.keys(trimmedCount).length > 0;
+  const hasOutliers = Object.keys(outliers).length > 0;
+
   return {
     testTitle,
     persona,
@@ -287,6 +298,8 @@ export function convertTimerStatisticsToBenchmarkResults(
     stdDev,
     p75,
     p95,
+    ...(hasTrimmedCounts && { trimmedCount }),
+    ...(hasOutliers && { outliers }),
     ...(webVitals && { webVitals }),
   };
 }
@@ -321,6 +334,34 @@ export function convertSummaryToResults(
   );
 }
 
+/**
+ * Run the dApp page-load benchmark and aggregate post-warm-up results.
+ *
+ * Uses IQR-only outlier trimming (`trimmedCount`). Z-score is intentionally
+ * excluded for two reasons:
+ *
+ * 1. Serial correlation: page loads run serially within each browser session
+ * (shared JIT cache, extension state). A slow startup elevates all `pageLoads`
+ * measurements in a session as a correlated cluster. Z-score assumes i.i.d.
+ * samples and would flag the cluster as outliers even when the elevation has a
+ * real underlying cause. IQR is rank-based and distribution-free, making it
+ * robust to correlated samples.
+ *
+ * 2. Multimodal distributions: `longTask*`, `tbt`, and `numNetworkReqs` are
+ * zero or near-zero most runs with occasional real spikes. Z-score would flag
+ * those spikes as noise, removing genuine signal.
+ *
+ * `calculateTimerStatistics` (iteration-based benchmarks) applies both IQR and
+ * z-score because each run is an independent browser session.
+ *
+ * @param measurePageFn - Function that drives one browser session and returns metrics
+ * @param options - Benchmark configuration
+ * @param options.browserLoads - Number of full browser sessions to run
+ * @param options.pageLoads - Number of page loads per browser session
+ * @param options.retries - Number of retries per browser session on failure
+ * @param options.platform - Optional platform label written to results
+ * @param options.buildType - Optional build label written to results
+ */
 export async function runPageLoadBenchmark(
   measurePageFn: (
     pageName: string,
@@ -366,10 +407,25 @@ export async function runPageLoadBenchmark(
     resultPersona = persona;
   }
 
-  if (runResults.some((result) => result.navigation.length > 1)) {
+  // Discard the first WARMUP_RUNS browser-load sessions before computing stats.
+  // Each session contributes exactly pageLoads metric objects to runResults.
+  if (browserLoads <= WARMUP_RUNS) {
+    throw new Error(
+      `browserLoads (${browserLoads}) must be greater than WARMUP_RUNS (${WARMUP_RUNS})`,
+    );
+  }
+  const warmupSize = WARMUP_RUNS * pageLoads;
+  const measuredResults = runResults.slice(warmupSize);
+  // Web vitals entries are sparse (collection can fail silently), so filter by
+  // iteration index rather than slicing by position to avoid off-by-N errors.
+  const measuredWebVitalsRuns = allWebVitalsRuns.filter(
+    (wv) => wv.iteration >= warmupSize,
+  );
+
+  if (measuredResults.some((result) => result.navigation.length > 1)) {
     throw new Error(`Multiple navigations not supported`);
   }
-  const firstNonNavigate = runResults.find(
+  const firstNonNavigate = measuredResults.find(
     (result) => result.navigation[0].type !== 'navigate',
   );
   if (firstNonNavigate !== undefined) {
@@ -379,17 +435,19 @@ export async function runPageLoadBenchmark(
   }
 
   const result: Record<string, number[]> = {};
+  const trimmedCounts: StatisticalResult = {};
   for (const [key, tracePath] of Object.entries(ALL_METRICS)) {
-    result[key] = runResults
-      .map((m) => get(m, tracePath) as number)
-      .sort((a, b) => a - b);
+    const rawSamples = measuredResults.map((m) => get(m, tracePath) as number);
+    const { filtered, outlierCount } = detectOutliersIQR(rawSamples);
+    result[key] = [...filtered].sort((a, b) => a - b);
+    trimmedCounts[key] = outlierCount;
   }
 
   let webVitals: WebVitalsSummary | undefined;
-  if (allWebVitalsRuns.length > 0) {
+  if (measuredWebVitalsRuns.length > 0) {
     webVitals = {
-      runs: allWebVitalsRuns,
-      aggregated: aggregateWebVitals(allWebVitalsRuns),
+      runs: measuredWebVitalsRuns,
+      aggregated: aggregateWebVitals(measuredWebVitalsRuns),
     };
   }
 
@@ -423,6 +481,8 @@ export async function runPageLoadBenchmark(
     stdDev: stdDevResult,
     p75,
     p95,
+    trimmedCount: trimmedCounts,
+    outliers: { ...trimmedCounts },
     ...(webVitals && { webVitals }),
   };
 }