MetaMask · MajorLift · Apr 24, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
@@ -128,7 +128,7 @@ jobs:
       - name: Run the benchmark
         if: ${{ env.BENCHMARK_GATED == 'true' }}
         run: >-
-          ${{ matrix.pageType == 'startupPowerUserHome' && format('yarn test:e2e:benchmark --preset startupPowerUserHome --browserLoads 10 --pageLoads 10 --out test-artifacts/benchmarks/benchmark-{0}-{1}-startupPowerUserHome.json --retries 2', matrix.browser, matrix.buildType) || format('yarn test:e2e:benchmark --preset {2} --out test-artifacts/benchmarks/benchmark-{0}-{1}-{2}.json --retries 2', matrix.browser, matrix.buildType, matrix.pageType) }}
+          ${{ matrix.pageType == 'startupPowerUserHome' && format('yarn test:e2e:benchmark --preset startupPowerUserHome --browserLoads 15 --pageLoads 7 --out test-artifacts/benchmarks/benchmark-{0}-{1}-startupPowerUserHome.json --retries 2', matrix.browser, matrix.buildType) || format('yarn test:e2e:benchmark --preset {2} --out test-artifacts/benchmarks/benchmark-{0}-{1}-{2}.json --retries 2', matrix.browser, matrix.buildType, matrix.pageType) }}
         shell: bash
 
       - name: Send benchmark results to Sentry (main/release only)

diff --git a/shared/constants/benchmarks.ts b/shared/constants/benchmarks.ts
@@ -71,6 +71,7 @@ export type TimerStatistics = {
   p99: number;
   samples: number;
   outliers: number;
+  trimmedCount?: number;
   dataQuality: 'good' | 'poor' | 'unreliable';
 };
 
@@ -108,6 +109,8 @@ export type BenchmarkResults = {
   stdDev: StatisticalResult;
   p75: StatisticalResult;
   p95: StatisticalResult;
+  trimmedCount?: StatisticalResult;
+  outliers?: StatisticalResult;
   webVitals?: WebVitalsSummary;
 };
 

diff --git a/test/e2e/benchmarks/flows/startup/power-user-home.ts b/test/e2e/benchmarks/flows/startup/power-user-home.ts
@@ -15,7 +15,10 @@ import {
   type BenchmarkResults,
   type WebVitalsMetrics,
 } from '../../../../../shared/constants/benchmarks';
-import { WITH_STATE_POWER_USER } from '../../utils/constants';
+import {
+  WITH_STATE_POWER_USER,
+  POWER_USER_NUM_BROWSER_LOADS,
+} from '../../utils/constants';
 import { runPageLoadBenchmark, collectWebVitals } from '../../utils';
 import type {
   Metrics,
@@ -87,5 +90,8 @@ async function measurePagePowerUser(
 export async function run(
   options: PageLoadBenchmarkOptions,
 ): Promise<BenchmarkResults> {
-  return runPageLoadBenchmark(measurePagePowerUser, options);
+  // A caller-supplied browserLoads wins; otherwise use the power-user default.
+  const browserLoads = options.browserLoads ?? POWER_USER_NUM_BROWSER_LOADS;
+  const effectiveOptions = { ...options, browserLoads };
+  return runPageLoadBenchmark(measurePagePowerUser, effectiveOptions);
 }
diff --git a/test/e2e/benchmarks/send-to-sentry.ts b/test/e2e/benchmarks/send-to-sentry.ts
@@ -30,7 +30,7 @@ import type {
 } from '../../../shared/constants/benchmarks';
 import { getGitBranch, getGitCommitHash } from './utils/git';
 import type { UserActionResult } from './utils/types';
-import { aggregateWebVitals } from './utils/statistics';
+import { aggregateWebVitals, assessDataQuality } from './utils/statistics';
 
 const packageJsonPath = path.resolve(__dirname, '../../../package.json');
 const { version } = JSON.parse(readFileSync(packageJsonPath, 'utf-8')) as {
@@ -241,12 +241,49 @@ async function main() {
         }
       }
 
+      // Reliability metrics: CV, dataQuality, tailRatio, trimmedCount, outliers
+      // CV = (stdDev / mean) * 100 — coefficient of variation, directly
+      // comparable across steps with different magnitudes.
+      // dataQuality: 'good' (CV<30), 'poor' (30-50), 'unreliable' (>50).
+      // tailRatio = p95/p75 — tail heaviness; closer to 1.0 = tighter.
+      const derivedMetrics: Record<string, number | string> = {};
+      if (benchmark.mean && benchmark.stdDev) {
+        for (const [key, meanVal] of Object.entries(benchmark.mean)) {
+          const stdDevVal = benchmark.stdDev[key];
+          if (meanVal > 0 && stdDevVal !== undefined) {
+            const cv = (stdDevVal / meanVal) * 100;
+            derivedMetrics[`${type}.cv.${key}`] = cv;
+            derivedMetrics[`${type}.dataQuality.${key}`] =
+              assessDataQuality(cv);
+          }
+        }
+      }
+      if (benchmark.p95 && benchmark.p75) {
+        for (const [key, p95Val] of Object.entries(benchmark.p95)) {
+          const p75Val = benchmark.p75[key];
+          if (p75Val !== undefined && p75Val > 0) {
+            derivedMetrics[`${type}.tailRatio.${key}`] = p95Val / p75Val;
+          }
+        }
+      }
+      if (benchmark.trimmedCount) {
+        for (const [key, count] of Object.entries(benchmark.trimmedCount)) {
+          derivedMetrics[`${type}.trimmedCount.${key}`] = count;
+        }
+      }
+      if (benchmark.outliers) {
+        for (const [key, count] of Object.entries(benchmark.outliers)) {
+          derivedMetrics[`${type}.outliers.${key}`] = count;
+        }
+      }
+
       // Timer data: structured logs (existing path, unchanged)
       Sentry.logger.info(message, {
         ...baseCiAttributes,
         'ci.persona': benchmark.persona || BENCHMARK_PERSONA.STANDARD,
         'ci.testTitle': benchmark.testTitle,
         ...allMetrics,
+        ...derivedMetrics,
       });
 
       // Web vitals: separate reporting path via spans

diff --git a/test/e2e/benchmarks/utils/constants.ts b/test/e2e/benchmarks/utils/constants.ts
@@ -50,6 +50,12 @@ export const DEFAULT_NUM_BROWSER_LOADS = DEFAULT_BENCHMARK_BROWSER_LOADS;
 /** Same as {@link DEFAULT_BENCHMARK_PAGE_LOADS} in `shared/constants/benchmarks`. */
 export const DEFAULT_NUM_PAGE_LOADS = DEFAULT_BENCHMARK_PAGE_LOADS;
 
+/** Fallback browser loads for the POWER_USER_HOME preset when `browserLoads` is unset (higher than default to offset warm-up exclusion). */
+export const POWER_USER_NUM_BROWSER_LOADS = 15;
+
+/** Number of leading browser-load sessions to discard as warm-up before computing stats. */
+export const WARMUP_RUNS = 1;
+
 export const ALL_METRICS = {
   uiStartup: 'UI Startup',
   load: 'navigation[0].load',

diff --git a/test/e2e/benchmarks/utils/outlier-trimming.test.ts b/test/e2e/benchmarks/utils/outlier-trimming.test.ts
@@ -0,0 +1,131 @@
+import { detectOutliersIQR } from './statistics';
+
+describe('detectOutliersIQR (IQR-based)', () => {
+  describe('small sample edge cases', () => {
+    it('returns empty array unchanged', () => {
+      const result = detectOutliersIQR([]);
+      expect(result.filtered).toEqual([]);
+      expect(result.outlierCount).toBe(0);
+    });
+
+    it('returns array of 1 unchanged', () => {
+      const result = detectOutliersIQR([500]);
+      expect(result.filtered).toEqual([500]);
+      expect(result.outlierCount).toBe(0);
+    });
+
+    it('returns array of 2 unchanged', () => {
+      const result = detectOutliersIQR([100, 200]);
+      expect(result.filtered).toEqual([100, 200]);
+      expect(result.outlierCount).toBe(0);
+    });
+
+    it('returns array of 3 unchanged (below IQR threshold)', () => {
+      const result = detectOutliersIQR([100, 200, 9000]);
+      expect(result.filtered).toEqual([100, 200, 9000]);
+      expect(result.outlierCount).toBe(0);
+    });
+  });
+
+  describe('no outliers', () => {
+    it('returns all values when distribution is tight', () => {
+      const samples = [100, 101, 99, 102, 98, 100, 101, 99];
+      const result = detectOutliersIQR(samples);
+      expect(result.outlierCount).toBe(0);
+      expect(result.filtered).toHaveLength(samples.length);
+    });
+
+    it('returns all values when all samples are identical', () => {
+      const samples = [200, 200, 200, 200, 200, 200];
+      const result = detectOutliersIQR(samples);
+      expect(result.outlierCount).toBe(0);
+      expect(result.filtered).toHaveLength(samples.length);
+    });
+  });
+
+  describe('outlier removal', () => {
+    it('removes a single high outlier', () => {
+      const samples = [10, 11, 12, 10, 11, 12, 11, 1000];
+      const result = detectOutliersIQR(samples);
+      expect(result.filtered).not.toContain(1000);
+      expect(result.outlierCount).toBe(1);
+    });
+
+    it('removes a single low outlier', () => {
+      const samples = [100, 102, 101, 103, 100, 101, 1];
+      const result = detectOutliersIQR(samples);
+      expect(result.filtered).not.toContain(1);
+      expect(result.outlierCount).toBe(1);
+    });
+
+    it('removes multiple outliers on both ends', () => {
+      const samples = [1, 100, 101, 102, 100, 101, 103, 9999];
+      const result = detectOutliersIQR(samples);
+      expect(result.filtered).not.toContain(1);
+      expect(result.filtered).not.toContain(9999);
+      expect(result.outlierCount).toBe(2);
+    });
+
+    it('preserves non-outlier values exactly', () => {
+      const core = [100, 105, 95, 102, 98, 101];
+      const samples = [...core, 5000];
+      const result = detectOutliersIQR(samples);
+      for (const v of core) {
+        expect(result.filtered).toContain(v);
+      }
+    });
+  });
+
+  describe('realistic benchmark scenario (n=15)', () => {
+    it('removes exactly 2 outliers from a deterministic 15-sample benchmark run', () => {
+      // Q1=318, Q3=328, IQR=10, upper fence=Q3+1.5*IQR=343 — 850, 900 exceed it
+      const normal = [
+        320, 330, 315, 325, 318, 322, 328, 316, 319, 324, 321, 323, 317,
+      ];
+      const withSpikes = [...normal, 900, 850]; // two cold-start spikes
+      const result = detectOutliersIQR(withSpikes);
+      expect(result.outlierCount).toBe(2);
+      expect(result.filtered).toHaveLength(13);
+    });
+
+    it('does not over-trim a low-variance run', () => {
+      const stable = [
+        300, 302, 298, 301, 299, 303, 300, 301, 302, 298, 300, 301, 299, 302,
+        300,
+      ];
+      const result = detectOutliersIQR(stable);
+      expect(result.outlierCount).toBe(0);
+      expect(result.filtered).toHaveLength(stable.length);
+    });
+  });
+
+  describe('input ordering', () => {
+    it('produces the same outlierCount regardless of input order', () => {
+      const ordered = [10, 11, 12, 13, 14, 15, 1000];
+      const shuffled = [1000, 13, 10, 15, 12, 11, 14];
+      const r1 = detectOutliersIQR(ordered);
+      const r2 = detectOutliersIQR(shuffled);
+      // Sort copies: Array#sort is in-place and would mutate the results.
+      const asc = (xs: readonly number[]) => [...xs].sort((a, b) => a - b);
+      expect(r1.outlierCount).toBe(r2.outlierCount);
+      expect(asc(r1.filtered)).toEqual(asc(r2.filtered));
+    });
+
+    it('does not mutate the input array (n >= 4, filter path)', () => {
+      const samples = [10, 11, 1000, 12, 13];
+      const copy = [...samples];
+      detectOutliersIQR(samples);
+      expect(samples).toEqual(copy);
+    });
+
+    it('does not mutate the input array (n < 4, early-return path)', () => {
+      const samples = [100, 200, 9000];
+      const copy = [...samples];
+      const result = detectOutliersIQR(samples);
+      expect(samples).toEqual(copy);
+      // returned array must not be the same reference
+      result.filtered.push(999);
+      expect(samples).toEqual(copy);
+    });
+  });
+});