computesdk · kisernl · May 13, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 .env
 node_modules/
-.DS_Store
+.DS_Store
+dist/
diff --git a/db/burst-100k.sql b/db/burst-100k.sql
@@ -0,0 +1,63 @@
+-- Schema for the 100k burst benchmark.
+--
+-- Applied idempotently by scripts/burst-100k-launch.sh on every run, so all
+-- statements use IF NOT EXISTS. Additive changes go in this file as ALTER
+-- TABLE ... ADD COLUMN IF NOT EXISTS; anything else needs a follow-up .sql
+-- file applied once by hand — a migration framework is overkill for two tables.
+
+CREATE TABLE IF NOT EXISTS runs (                   -- one row per benchmark run
+  id                    TEXT PRIMARY KEY,           -- e.g. 20260512T143000Z-a3f8d91-e2b
+  provider              TEXT NOT NULL,              -- provider name, e.g. e2b
+  commit_sha            TEXT NOT NULL,
+  instance_id           TEXT NOT NULL,              -- Namespace instance ID
+  started_at            TIMESTAMPTZ NOT NULL,
+  ended_at              TIMESTAMPTZ,                -- nullable; presumably set when the run finishes — confirm
+  last_heartbeat        TIMESTAMPTZ,                -- nullable; read by the stuck-run query (see runs_stuck)
+  status                TEXT NOT NULL               -- running | done | failed
+                        CHECK (status IN ('running', 'done', 'failed')),
+  sandboxes_attempted   INTEGER,
+  sandboxes_succeeded   INTEGER,
+  timeouts              INTEGER,                    -- count of sandbox_results.status='timeout'
+  http_errors           INTEGER,                    -- count of sandbox_results.status='http_error'
+  network_errors        INTEGER,                    -- count of sandbox_results.status='network_error'
+  p50_latency_ms        INTEGER,                    -- median latency, milliseconds
+  p99_latency_ms        INTEGER,                    -- 99th-percentile latency, milliseconds
+  error_message         TEXT,                       -- populated on status='failed'
+  tigris_prefix         TEXT NOT NULL               -- e.g. s3://<bucket>/<run_id>/
+);
+
+-- Idempotent column additions for tables created before these columns
+-- existed (CREATE TABLE IF NOT EXISTS above only fires on a fresh DB). One
+-- multi-action ALTER so each launch locks runs once and adds all or nothing.
+ALTER TABLE runs ADD COLUMN IF NOT EXISTS timeouts       INTEGER,
+                 ADD COLUMN IF NOT EXISTS http_errors    INTEGER,
+                 ADD COLUMN IF NOT EXISTS network_errors INTEGER;
+
+CREATE INDEX IF NOT EXISTS runs_provider_started  -- keyed by provider, newest started_at first
+  ON runs USING btree (provider, started_at DESC);
+
+-- Partial index for the stuck-run query. NOTE(review): a NULL last_heartbeat never satisfies `<`, so such rows are skipped — confirm it is set at INSERT.
+--   SELECT * FROM runs WHERE status='running' AND last_heartbeat < now() - interval '5 minutes';
+CREATE INDEX IF NOT EXISTS runs_stuck
+  ON runs (last_heartbeat) WHERE status = 'running';
+
+
+CREATE TABLE IF NOT EXISTS sandbox_results (        -- one row per sandbox attempt
+  run_id            TEXT NOT NULL REFERENCES runs(id),  -- FK to runs.id; no ON DELETE action, so a run with results cannot be deleted
+  sandbox_idx       INTEGER NOT NULL,               -- 0 .. concurrencyTarget-1
+  started_at        TIMESTAMPTZ NOT NULL,
+  completed_at      TIMESTAMPTZ,                    -- nullable
+  latency_ms        INTEGER,                        -- milliseconds; nullable
+  status            TEXT NOT NULL                   -- ok | timeout | http_error | network_error
+                    CHECK (status IN ('ok', 'timeout', 'http_error', 'network_error')),
+  http_status       INTEGER,
+  error_code        TEXT,
+  provider_metadata JSONB,                          -- adapter-exposed primitives (sandbox id, region, etc.)
+  PRIMARY KEY (run_id, sandbox_idx)                 -- at most one result row per sandbox index within a run
+);
+
+CREATE INDEX IF NOT EXISTS sandbox_results_run_status  -- keyed by run, then result status
+  ON sandbox_results USING btree (run_id, status);
+
+-- Idempotent column add for tables created before provider_metadata existed; a no-op on fresh DBs, where CREATE TABLE above already defines it.
+ALTER TABLE sandbox_results ADD COLUMN IF NOT EXISTS provider_metadata JSONB;
diff --git a/one-hundred-k-mvp-checklist.md b/one-hundred-k-mvp-checklist.md
@@ -0,0 +1,130 @@
+# 100k Burst — Implementation Checklist
+
+Tracker for the work described in [one-hundred-k-mvp-plan.md](one-hundred-k-mvp-plan.md).
+Check items off as they land.
+
+---
+
+## 0. Prerequisites (external / infra)
+
+- [x] Neon Postgres database provisioned; `PG_URL` (pooler endpoint) tested from a laptop
+- [x] `PG_URL` confirmed reachable from a Namespace VM (one-off `nsc ssh` + `psql` round-trip)
+- [x] R2 bucket created; access key has write + multipart permission
+- [x] R2 reachable from a Namespace VM (one-off `aws s3 cp` round-trip)
+- [x] Namespace auth via static token (`NSC_TOKEN` env secret in `burst-100k` environment); OIDC trust deferred
+- [x] First opt-in provider selected: **e2b** (single env var: `E2B_API_KEY`)
+- [x] GitHub `burst-100k` environment created with reviewer protection
+- [x] Environment secrets present: `TIGRIS_STORAGE_ACCESS_KEY_ID`, `TIGRIS_STORAGE_SECRET_ACCESS_KEY`, `TIGRIS_STORAGE_ENDPOINT`, `NSC_TOKEN`
+- [x] Environment variable present: `TIGRIS_STORAGE_BUCKET`
+- [x] Environment secret present: `PG_URL` (Neon connection string)
+- [x] Environment secret present for chosen provider: `E2B_API_KEY`
+- [ ] Open question #1 resolved with Namespace: dedicated egress IP or shared SNAT pool? *(non-blocking — find out before first 100k run)*
+
+## 1. Schema
+
+- [x] `db/burst-100k.sql` written with `CREATE TABLE IF NOT EXISTS` + `CREATE INDEX IF NOT EXISTS` (+ `ALTER TABLE … ADD COLUMN IF NOT EXISTS` for columns added later)
+- [x] Applied once locally: `psql "$PG_URL" -f db/burst-100k.sql` runs clean
+- [x] Re-applied: second run is a no-op (idempotency confirmed)
+- [x] Sanity insert + select against `runs` and `sandbox_results` works
+
+## 2. Coordinator code (`src/burst-100k/`)
+
+- [x] `types.ts` — `BurstProviderConfig extends ProviderConfig` defined
+- [x] `providers.ts` — entry for e2b, reusing `@computesdk/e2b`
+- [x] `sinks/postgres.ts` — `pg` client, batched 1k inserts, heartbeat `UPDATE`, completion `UPDATE`
+- [x] `sinks/r2.ts` — `@aws-sdk/lib-storage` multipart upload for `raw.jsonl`; `putObject` for `heartbeat.json` and `meta.json`
+- [x] `runner.ts` — `p-limit` concurrency limiter + linear ramp over `rampSeconds` (HTTP agent managed by `@computesdk/e2b` adapter)
+- [x] `coordinator.ts` — wires it all together: bootstraps the `runs` row, validates `requiredEnvVars`, runs burst, heartbeat loop, SIGTERM/SIGINT trap, completion update
+- [x] `package.json`: added deps (`pg`, `p-limit`); `@aws-sdk/client-s3` + `@aws-sdk/lib-storage` already transitively present
+- [x] `package.json`: added dev deps `esbuild`, `@types/pg`
+- [x] `package.json`: added scripts `bundle:burst-100k` and `bench:burst-100k:local`
+- [x] `npm run bundle:burst-100k` produces a working `dist/burst-100k.cjs` (2.7 MB single file; CJS output, see the last item of §3)
+
+## 3. Local smoke (N=100)
+
+- [x] Local env vars set (provider keys, R2, `PG_URL`, `PROVIDER`, `RUN_ID`)
+- [x] `concurrencyTarget` temporarily overridden to 100 (`CONCURRENCY_TARGET=100`)
+- [x] `npm run bench:burst-100k:local` completes without error
+- [x] `runs` row created with `status='done'` on clean exit (p50=148ms, p99=792ms)
+- [x] 100 rows in `sandbox_results` for the run, all `ok`
+- [x] `raw.jsonl` present in R2 at `s3://<bucket>/<run_id>/` (100 lines, first/last span the ~60s ramp)
+- [x] `heartbeat.json` present in R2 and was updated
+- [x] `meta.json` present with final summary
+- [x] SIGTERM mid-run flushes cleanly (`status='failed'` with truthful done count + in-flight rows + raw.jsonl flushed)
+- [x] Bundle output is CJS (`dist/burst-100k.cjs`); repo's `"type": "module"` requires `--format=cjs` and `.cjs` extension
+
+## 4. Launch script
+
+- [x] `scripts/burst-100k-launch.sh` written, executable (`chmod +x`)
+- [x] `esbuild` step in the script produces a working bundle
+- [x] `psql -f db/burst-100k.sql` step runs (idempotent)
+- [x] `nsc create` returns an instance ID (using `--bare --cidfile`)
+- [x] `nsc instance upload` succeeds (coordinator bundle + startup script)
+- [x] `INSERT INTO runs ...` inserts a `running` row (with `ON CONFLICT DO NOTHING`)
+- [x] `nsc ssh ... nohup node coordinator.cjs &` returns immediately (detached); `pgrep node` post-check confirms running
+
+### Notes on what we learned (worth keeping)
+
+- Wolfi `--bare` image has no `node`; install with `apk add -q nodejs` before launch.
+- BusyBox `sh` has no `disown` builtin (`disown: not found`); `nohup ... & </dev/null` alone is sufficient to detach.
+- Passing env via long line-continued `nsc ssh -- env VAR=val \ ...` is fragile — a broken `\` continuation silently truncated the command and caused `env` to print the environment (leaking secrets). The script now writes a `chmod 600` startup script locally with `printf '%q'`-quoted values, uploads it, runs it (which `rm -f`'s itself after detaching node), and confirms with `pgrep -x node`.
+- `nsc destroy` requires `--force` to skip the TTY confirmation in non-interactive contexts (CI).
+
+## 5. Manual Namespace dry-run (N=1000)
+
+Run the launch script from a laptop with `GITHUB_SHA` faked. Cap `concurrencyTarget=1000`.
+
+- [ ] Script completes in under 60s
+- [ ] `nsc ssh <id> tail -f /root/run.log` shows the coordinator working
+- [ ] `runs.last_heartbeat` advances every ~30s
+- [ ] `sandbox_results` row count grows
+- [ ] R2 multipart parts appear under the run prefix
+- [ ] Run reaches completion; `runs.status='done'` with final stats
+- [ ] Instance self-destroys at the duration deadline (or `nsc destroy <id>` works)
+- [ ] `pkill -TERM node` over `nsc ssh` causes a clean flush + `status='failed'` row
+
+## 6. GitHub workflow
+
+- [ ] `.github/workflows/burst-100k.yml` written
+- [ ] Provider env vars passthrough in `env:` block (per-provider, matches existing `src/sandbox/providers.ts`)
+- [ ] Workflow includes `id-token: write` permission for OIDC
+- [ ] `namespacelabs/nscloud-setup@v0` step present
+- [ ] `workflow_dispatch` trigger lists the chosen provider in `inputs.provider.options`
+- [ ] First `workflow_dispatch` run (with `concurrencyTarget=1000`) succeeds end-to-end
+- [ ] Action exits in <1 min; run continues on VM and reaches `status='done'`
+
+## 7. First full 100k run
+
+- [ ] `concurrencyTarget` restored to `100_000` in the provider's entry
+- [ ] `workflow_dispatch` triggers the run
+- [ ] No `EADDRNOTAVAIL` errors in the log (if any → revisit egress IP / shard)
+- [ ] Event loop lag stays under 100ms (if not → upsize to `32x64`)
+- [ ] No OOM (if any → fix coordinator memory; don't just upsize)
+- [ ] Run completes with `status='done'`, final stats populated
+- [ ] `raw.jsonl` in R2 contains ~100k lines
+- [ ] `sandbox_results` row count ≈ `sandboxes_attempted`
+- [ ] Spot-check a handful of R2 raw records vs. their Postgres rows for consistency
+
+## 8. Onboard additional providers
+
+Repeat for each opt-in provider:
+
+- [ ] New entry added to `src/burst-100k/providers.ts`
+- [ ] Provider env vars added to GitHub Secrets (if not already there for daily benchmark)
+- [ ] Provider env vars added to workflow `env:` block and `bash -c` SSH `export` line
+- [ ] Provider name added to `inputs.provider.options`
+- [ ] Low-concurrency `workflow_dispatch` run completes cleanly
+- [ ] Full 100k `workflow_dispatch` run completes cleanly
+
+## 9. Scheduled runs (after a few clean manual runs)
+
+- [ ] Schedule cadence decided (one cron for all, or staggered)
+- [ ] `schedule:` trigger added to workflow
+- [ ] First scheduled run fires and completes
+- [ ] Stuck-run query verified: `SELECT * FROM runs WHERE status='running' AND last_heartbeat < now() - interval '5 minutes';`
+
+## 10. Documentation
+
+- [ ] `README.md` (or a dedicated section) mentions the 100k burst is opt-in, points to the workflow
+- [ ] Operational notes captured in [one-hundred-k-mvp-plan.md](one-hundred-k-mvp-plan.md) match reality after first 100k run (sizing, port exhaustion, etc.)
+- [ ] Open questions from the plan resolved or knowingly deferred
diff --git a/one-hundred-k-mvp-data-inventory.md b/one-hundred-k-mvp-data-inventory.md
@@ -0,0 +1,121 @@
+# Burst-100k Data Inventory
+
+What data the burst-100k benchmark captures today, what's cheap to add, and
+what's harder. Pairs with [one-hundred-k-mvp-plan.md](one-hundred-k-mvp-plan.md)
+and [one-hundred-k-mvp-checklist.md](one-hundred-k-mvp-checklist.md).
+
+---
+
+## Captured right now
+
+| Data | Where | Notes |
+| --- | --- | --- |
+| Per-run summary: provider, commit_sha, instance_id, start/end/heartbeat times, status, attempted/succeeded counts, timeout/http_error/network_error counts, p50/p99 latency, error_message, tigris prefix | Postgres `runs` | One row per run; easy to query |
+| Per-sandbox: started_at, completed_at, latency_ms, status (ok/timeout/http_error/network_error), http_status, error_code, provider_metadata (JSONB) | Postgres `sandbox_results` | One row per sandbox attempt |
+| Same as above + `error_message` (truncated to 500 chars) | Tigris `<run_id>/raw.jsonl` | Source of truth; rebuild Postgres from this if needed |
+| Mid-run progress snapshots: done, in_flight, errors, timestamp | Tigris `<run_id>/heartbeat.json` | Overwritten every 30s |
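+| Final summary (run_id, provider, attempted/succeeded, p50/p99, ended_at) plus the `latency_distribution`, `error_histogram`, `ramp_segments`, `concurrency_summary`/`concurrency_timeline`, and `metrics_summary` objects described below | Tigris `<run_id>/meta.json` | Written once at clean exit |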
+| Structured coordinator log (timestamped, level-tagged lines with phase markers, per-sandbox events, periodic progress milestones, heartbeats, and a completion summary) | VM `/root/run.log` AND Tigris `<run_id>/coordinator.log` | Uploaded by the coordinator at every heartbeat (30s) and on shutdown ✅ |
+
+---
+
+## Easy to add (~minutes of work)
+
+| Data | Approach | Why it's useful |
+| --- | --- | --- |
+| ~~Full latency histogram (p25, p75, p95, p99, p99.9, max)~~ ✅ Landed | `latency_distribution` object in Tigris `meta.json` carries count, min, p10/p25/p50/p75/p90/p95/p99/p999, max, mean. Postgres `runs` stays p50/p99-only — meta.json is the analytical view. | — |
+| ~~Error-type histogram~~ ✅ Landed | New `timeouts`/`http_errors`/`network_errors` columns on Postgres `runs` (+ matching `error_histogram` object in Tigris `meta.json`). Counted live in the coordinator's `onResult`, no JOIN against `sandbox_results` needed for top-line stats. | — |
+| ~~Ramp-phase latency segments~~ ✅ Landed | `ramp_segments` object in Tigris `meta.json` with `first_25pct` / `middle_50pct` / `last_25pct` buckets, each carrying `idx_range`, `count_ok`, p50/p95/p99/max/mean. Bucketed by `sandbox_idx` since the linear ramp maps idx → start-time. | — |
+| ~~Concurrency at each point in time~~ ✅ Landed | `concurrency_summary` (peak_concurrent, peak_t_ms, mean_concurrent, total_run_ms, sample_interval_ms, ramp_seconds_configured) + `concurrency_timeline` (1Hz samples of `{t_ms, active}`) in Tigris `meta.json`. Computed from per-sandbox `started_at`/`completed_at` via an interval-overlap sweep. | — |
+| ~~Sandbox IDs / region~~ ✅ Landed | `provider_metadata` JSONB column on `sandbox_results` (+ same field in Tigris `raw.jsonl`). Runner reflects every primitive property off the adapter's returned sandbox object, skipping anything that matches a credential-looking regex. On e2b: `{ provider, sandboxId }`. | — |
+
+---
+
+## Moderate effort (~hour of work each)
+
+| Data | Approach | Trade-off |
+| --- | --- | --- |
+| ~~VM system metrics over time~~ ✅ Landed | Coordinator samples every 5s into `<run_id>/metrics.jsonl` (uploaded at every 30s heartbeat for partial-result durability + at shutdown). Captures: cumulative CPU user/system µs, RSS/heap/external MB, event-loop p50/p99/max lag (since previous sample), load averages, `/proc/self/fd` count, `/proc/net/sockstat` (TCP inuse/tw/alloc etc.). Headline numbers in `meta.json.metrics_summary` (peak RSS, peak event-loop lag, peak open FDs, peak TCP inuse/tw, total CPU). `/proc/*` fields null on non-Linux. | — |
+| **DNS / TLS / TTFB breakdown per sandbox** | Hook into `undici`/`http` via `diagnostics_channel` to capture phase timings | Useful for "is this provider slow because of DNS or their backend?" — but requires bypassing the adapter abstraction |
+| **Cost estimate per run** | Track sandboxes_created × known provider rate × wall time | Pretty important for a benchmark, currently absent |
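+| ~~**Concurrent-actually-active timeline** (`active_at(t)`)~~ ✅ Landed | Compute from `started_at` / `completed_at` overlaps; sample every 1s and store — this is the `concurrency_timeline` (1Hz `{t_ms, active}` samples) already in Tigris `meta.json` | Verify the ramp profile matches intent |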
+
+---
+
+## Harder / costlier
+
+| Data | Why hard |
+| --- | --- |
+| **Raw HTTP request/response per sandbox** (headers, body) | The `@computesdk/<provider>` adapters don't surface these. Would need to either fork the adapter or use a `dispatcher`/interceptor on `undici` |
+| **Provider-side log capture** | Requires each provider's API for fetching their server-side logs per sandbox (and per-provider auth/quotas) |
+| **VM kernel-level instrumentation** (perf, eBPF, tcpdump) | Would need privileged setup on the Wolfi VM; useful for deep network debugging |
+| **End-user experience replay** (run a workload inside the sandbox, not just measure creation) | Different benchmark concern from "burst" — closer to TTI which the daily benchmark already does |
+
+---
+
+## Recommended additions, in priority order
+
+The highest value-for-cost additions (all three have since landed):
+
+1. ~~**Upload `coordinator.log` to Tigris at shutdown.**~~ ✅ Landed. Coordinator
+   reads `$COORDINATOR_LOG_PATH` (set by launch.sh to `/root/run.log`) and
+   uploads on every heartbeat plus shutdown.
+2. ~~**Full latency histogram in `runs` and `meta.json`.**~~ ✅ Landed in Tigris
+   `meta.json`; Postgres unchanged.
+3. ~~**Error-type histogram column on `runs`**~~ ✅ Landed. `timeouts`,
+   `http_errors`, `network_errors` columns on `runs`; `error_histogram` in
+   `meta.json`.
+
+Everything else is on-demand based on what specific question is hard to
+answer with the current data.
+
+---
+
+## GitHub-Actions-style structured coordinator log ✅ Landed
+
+The coordinator log used to be three or four terse `console.log` lines per
+run (provider, concurrency, heartbeat, completion). It's now a structured,
+timestamped, level-tagged stream similar to GitHub Actions output — useful
+for reading what a run actually did, in order, after the fact.
+
+### Shape
+
+Each line is `<ISO timestamp> [<level>] <message>` with levels
+`info` / `ok` / `warn` / `error` / `stat` / `debug` and dedicated `phase`
+markers rendered as `━━━ … ━━━`. No ANSI colors so the file stored at
+`<run_id>/coordinator.log` in Tigris stays clean.
+
+### What it captures
+
+| Phase | Example lines |
+| --- | --- |
+| **Startup** | `━━━ burst-100k coordinator starting ━━━`, `run_id=…`, `provider=e2b (requires: E2B_API_KEY)`, `concurrency=N ramp=Xs timeout=Yms`, `commit_sha=… instance_id=…`, `tigris_prefix=…` |
+| **Validation** | `━━━ validating environment ━━━`, `all 1 provider env var(s) present` |
+| **Sink setup** | `━━━ opening sinks ━━━`, `Postgres: connecting…` → `connected` → `bootstrapping runs row` → `runs row in place`; `Tigris: opening multipart upload for raw.jsonl` → `sink ready` |
+| **Compute init** | `━━━ initializing compute client ━━━`, `compute client ready for <provider>` |
+| **Burst** | `━━━ burst — firing N requests (ramp Xs) ━━━`, then `[ok] sandbox <idx> created in <ms>ms — sandboxId=…` (sampled at high N) or `[error] sandbox <idx> <status> (http=… code=…): <message>` for failures; periodic `[stat] progress N/total (in_flight=Y errors=Z) rate=R/s eta≈Ts` every ~10% |
+| **Heartbeat** | `[stat] heartbeat done=N/total in_flight=Y errors=Z` every 30s |
+| **Shutdown / completion** | `━━━ flushing sinks and writing summary ━━━`, `Postgres: flushing remaining sandbox_results batch`, `Tigris: closing multipart upload for raw.jsonl`, `Tigris: writing metrics.jsonl`, `Tigris: writing meta.json`, `Postgres: marking run done with final stats`, `━━━ run complete ━━━`, `N/N succeeded (XX.X%)`, `latency p50=…ms p99=…ms`, optional `[warn] errors: …` |
+
+### Volume control
+
+Implemented in [src/burst-100k/logger.ts](src/burst-100k/logger.ts) and
+[src/burst-100k/runner.ts](src/burst-100k/runner.ts). `pickSamplingPeriod()`
+keeps per-sandbox log lines bounded regardless of N:
+
+| Concurrency | Sandbox `[ok]` lines logged | Approx total log size |
+| --- | --- | --- |
+| 25 | 25 (every one) | ~7 KB |
+| 100 | 100 (every one) | ~16 KB |
+| 1,000 | 1,000 (every one) | ~140 KB |
+| 10,000 | ~100 sampled + every error | ~30 KB |
+| 100,000 | ~100 sampled + every error | ~50 KB |
+
+`[error]` lines are always emitted regardless of sampling, so failures are
+never silently dropped. Per-sandbox detail at high N still lives in the
+full `raw.jsonl` — the log is the human-readable timeline.
+
+### Debug verbosity
+
+A `BURST_100K_DEBUG=1` env var enables `log.debug` calls (currently unused
+but available for future verbose diagnostics without recompiling).