diff --git a/.gitignore b/.gitignore index 91e70e1..5098b1f 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,7 @@ vib*.traj # Kubernetes secrets (keep secrets.yaml.template, ignore actual secrets) k8s/secrets.yaml + +# Local private notes / scratchpads — anything matching *.private-local.* stays untracked +*.private-local.* +*.private-local diff --git a/examples/academy/federated-chat/README.md b/examples/academy/federated-chat/README.md new file mode 100644 index 0000000..134588f --- /dev/null +++ b/examples/academy/federated-chat/README.md @@ -0,0 +1,25 @@ +# Federated-Chat + +Two ChemGraph Academy logical agents running on **different HPCs** +(Aurora and Crux), discovering each other through Academy's hosted +HTTP exchange and exchanging messages across the HPC boundary: + +```text +agent-aurora (running on Aurora) +agent-crux (running on Crux) +``` + +The agents play a counter-bouncing game: agent-aurora sends `counter=1` +to agent-crux, agent-crux replies `counter=2`, and so on until the +counter reaches 10. Tiny on purpose — exercises the federated stack +(deterministic peer UIDs, HTTP exchange, multi-site dashboard) without +needing any science tools. + +The campaign assets are packaged under: + +```text +src/chemgraph/academy/campaigns/federated-chat/ +``` + +See [`e2e_guide.md`](e2e_guide.md) for the full four-terminal walkthrough +(dashboard + Aurora compute + Crux compute + bootstrap kickoff). diff --git a/examples/academy/federated-chat/e2e_guide.md b/examples/academy/federated-chat/e2e_guide.md new file mode 100644 index 0000000..1e912b4 --- /dev/null +++ b/examples/academy/federated-chat/e2e_guide.md @@ -0,0 +1,179 @@ +# Federated-Chat E2E Guide + +This guide runs the `federated-chat` ChemGraph Academy campaign across +**two HPCs simultaneously** (Aurora and Crux), with the dashboard on +your laptop merging both sites into one view. 
+ +The campaign is intentionally minimal: two agents bounce a counter back +and forth across the HPC boundary, each incrementing it, until it hits 10. +It exercises every part of the cross-HPC stack (deterministic peer +discovery, HTTP exchange, cross-site send_message, multi-site dashboard) +without needing any science tools. + +```text +agent-aurora agent-crux + ↓ counter=1 ──► receive + receive ◄── counter=2 ↓ + ↓ counter=3 ──► receive + ... ... + receive ◄── counter=10 ↓ + finish_turn finish_turn +``` + +Four terminals: dashboard (Mac), Aurora compute, Crux compute, bootstrap (Mac). + +## Configure Paths + +Set these in every shell (Mac + both HPCs): + +```bash +export ALCF_PROJECT=ChemGraph +export ALCF_USER= # e.g. jinchu +export ALCF_SSH_USER= # may differ, e.g. jinchuli +export ARGO_USER= # e.g. jinchu.li +export LOCAL_CHEMGRAPH= +``` + +`ALCF_USER` is the shared-filesystem path component (`/flare/$ALCF_PROJECT/$ALCF_USER`). +`ALCF_SSH_USER` is the SSH login. They may differ; the loader defaults +`ALCF_SSH_USER` to `ALCF_USER` if you don't set it. + +## One-Time Setup + +You need the same setup as `example-002-mace-ensemble-screening` (sync +ChemGraph, install `[academy]` extra, build Redis once) on **both** Aurora +and Crux. Plus one extra step: log in to Academy's hosted exchange so the +Globus token is cached on both compute environments: + +```bash +# On Aurora compute (inside an interactive allocation): +python -c "from academy.exchange.cloud import HttpExchangeFactory; HttpExchangeFactory()" +# Follow the device-flow URL printed in the terminal. Same on Crux. +``` + +The token is written to `~/.local/share/academy/storage.db` and is +shared across runs. 
+ +## Terminal 1: Dashboard (Mac) + +```bash +cd "$LOCAL_CHEMGRAPH" + +export RUN_ID=federated-chat-001 + +chemgraph academy dashboard -- "$RUN_ID" \ + --system aurora,crux \ + --campaign federated-chat \ + --reverse-port 18190 \ + --overwrite-run +``` + +This brings up: + +- one SSH ControlMaster + UAN relay + rsync mirror **per site** (`aurora` and `crux`), +- a single merged dashboard server at `http://127.0.0.1:8765`. + +Wait for both relays to print `... relay ready at ...` before continuing. + +## Terminal 2: Aurora compute (inside Aurora PBS allocation) + +```bash +module load frameworks +source /flare/$ALCF_PROJECT/$ALCF_USER/venvs/academy-swarm/bin/activate +export PATH=/flare/$ALCF_PROJECT/$ALCF_USER/bin:$PATH + +# HTTP exchange must reach exchange.academy-agents.org through the ALCF proxy. +export http_proxy=http://proxy.alcf.anl.gov:3128 +export https_proxy=http://proxy.alcf.anl.gov:3128 +export HTTP_PROXY=$http_proxy +export HTTPS_PROXY=$https_proxy +export no_proxy=127.0.0.1,localhost,.alcf.anl.gov +export NO_PROXY=$no_proxy + +chemgraph academy spawn-site -- \ + --system aurora \ + --run-id "$RUN_ID" \ + --campaign federated-chat \ + --agents agent-aurora \ + --exchange-type http +``` + +Look for the lifecycle landmarks: + +```text +[daemon] rank0 registered 'agent-aurora' on the exchange (uid=...) +[daemon] rank0 waiting for peers ['agent-crux'] to come online (timeout 600s)... +[daemon] rank0 all 1 peer(s) are alive: ['agent-crux'] +[daemon] rank0 agent 'agent-aurora' is now running inside Academy Runtime +[daemon] rank0 skipping inline bootstrap (federated mode); waiting for 'chemgraph academy bootstrap'... 
+``` + +## Terminal 3: Crux compute (inside Crux PBS allocation) + +```bash +source /eagle/$ALCF_PROJECT/$ALCF_USER/venvs/academy-swarm-crux/bin/activate +export PATH=/eagle/$ALCF_PROJECT/$ALCF_USER/bin:$PATH + +export http_proxy=http://proxy.alcf.anl.gov:3128 +export https_proxy=http://proxy.alcf.anl.gov:3128 +export HTTP_PROXY=$http_proxy +export HTTPS_PROXY=$https_proxy +export no_proxy=127.0.0.1,localhost,.alcf.anl.gov +export NO_PROXY=$no_proxy + +chemgraph academy spawn-site -- \ + --system crux \ + --run-id "$RUN_ID" \ + --campaign federated-chat \ + --agents agent-crux \ + --exchange-type http +``` + +Same landmarks. Both daemons will block at `waiting for 'chemgraph academy +bootstrap'` once they've discovered each other. + +## Terminal 4: Bootstrap kickoff (Mac, once both sites are waiting) + +```bash +chemgraph academy bootstrap -- \ + --campaign federated-chat \ + --run-id "$RUN_ID" \ + --exchange-type http +``` + +Prints `ok: sent bootstrap to agent-aurora (message_id=...)`. + +## What You Should See + +- **Aurora terminal**: `[agent agent-aurora] first message arrived from + 'campaign' ...`, then decisions firing, then `message_sent` to agent-crux. +- **Crux terminal**: `[agent agent-crux] first message arrived from + 'agent-aurora' ...`, then back-and-forth. +- **Dashboard**: agent nodes appear in the graph, metrics tick up, the + cross-site message-flow edge between aurora and crux fills in, counter + messages climb in the activity log from 1 → 10. + +## Troubleshooting + +**Both sides stuck at `waiting for peers` past ~60s** → one site isn't +actually registered. Check each compute terminal for the `registered` line. +If one is missing, the daemon hit an exception before registration; scroll +up. + +**`Address already in use` on relay startup** → a prior crashed launch +left an orphan. 
The new self-cleaning relay should handle it +automatically; if it doesn't, the local relay log matching +`/tmp/chemgraph-academy-*-relay.log` will have a full `set +-x` trace showing exactly which step failed. + +**Bootstrap times out** → both sites must already be at `waiting for +'chemgraph academy bootstrap'`. If only one is up, bootstrap can't find +the recipient. + +**Argo `ARGO_USER` placeholder error** → you didn't export `ARGO_USER` before +launching spawn-site. The launcher refuses to ship a config with the +template placeholder; the error message names the fix. + +**`Could not validate Globus token`** → the device-flow login expired. +Re-run the `python -c "from academy.exchange.cloud ..."` snippet from +the one-time setup section. diff --git a/src/chemgraph/academy/campaigns/__init__.py b/src/chemgraph/academy/campaigns/__init__.py index 8c3f5cd..f3a003a 100644 --- a/src/chemgraph/academy/campaigns/__init__.py +++ b/src/chemgraph/academy/campaigns/__init__.py @@ -6,13 +6,16 @@ EXAMPLE_002 = 'example-002-mace-ensemble-screening' +FEDERATED_CHAT = 'federated-chat' CAMPAIGNS = { 'mace-ensemble-screening-20': f'{EXAMPLE_002}/campaign.jsonc', + 'federated-chat': f'{FEDERATED_CHAT}/campaign.jsonc', } LM_CONFIG_TEMPLATES = { 'argo-gpt54-mace-template': f'{EXAMPLE_002}/lm_config.json', + 'argo-gpt5mini-federated-chat': f'{FEDERATED_CHAT}/lm_config.json', } @@ -33,6 +36,16 @@ class CampaignLaunchDefaults: agents_per_node=1, max_decisions=24, ), + # Multi-turn cross-HPC counter chat. 10 one-way messages (~5 send/receive round-trips) + # so the dashboard has actual material to render. Each agent runs + # ~6 reasoning rounds (send, receive, send, ..., reach 10, + # finish_turn). max_decisions=20 gives slack for retries. 
+ 'federated-chat': CampaignLaunchDefaults( + lm_config_template='argo-gpt5mini-federated-chat', + agent_count=2, + agents_per_node=1, + max_decisions=20, + ), } diff --git a/src/chemgraph/academy/campaigns/federated-chat/campaign.jsonc b/src/chemgraph/academy/campaigns/federated-chat/campaign.jsonc new file mode 100644 index 0000000..622510e --- /dev/null +++ b/src/chemgraph/academy/campaigns/federated-chat/campaign.jsonc @@ -0,0 +1,45 @@ +{ + // --------------------------------------------------------------------- + // Federated-chat: a multi-turn cross-HPC conversation. Two agents + // bounce a counter back and forth N times, each incrementing it, + // until the counter hits a threshold and they finish_turn. Designed + // for demos -- produces ~40 events on the dashboard (decisions, + // send_message, message_received) so the federated UI (Sites + // header, message-flow edge, cross-site Route label) has real + // material to render. + // + // Run as: + // chemgraph academy dashboard -- federated-chat-XXX \ + // --system aurora,crux --campaign federated-chat --overwrite-run + // chemgraph academy spawn-site -- --system aurora --run-id ... \ + // --campaign federated-chat --agents agent-aurora --exchange-type http + // chemgraph academy spawn-site -- --system crux --run-id ... \ + // --campaign federated-chat --agents agent-crux --exchange-type http + // chemgraph academy bootstrap -- --campaign federated-chat \ + // --run-id ... --exchange-type http + // --------------------------------------------------------------------- + "run_id": "federated-chat", + "user_task": "Federated counter chat: bounce an integer counter between agent-aurora and agent-crux, each incrementing it by 1, until it reaches 10. 
Then both finish_turn.", + "prompt_profile": "prompt_profiles/default.json", + "initial_agent": "agent-aurora", + "resources": {}, + "mcp_servers": [], + "agents": [ + { + "name": "agent-aurora", + "role": "FederatedCounterInitiator", + "mission": "You are agent-aurora, running on the Aurora HPC. You are playing a counter-bouncing game with agent-crux across the HPC boundary. Rules: (1) On the bootstrap round, send EXACTLY ONE message to agent-crux with content 'counter=1' and tldr 'counter=1'. Set reply_requested=true. Then call finish_turn. (2) On every subsequent round where you receive a message from agent-crux containing 'counter=N', if N < 10 then send EXACTLY ONE reply to agent-crux with content 'counter=N+1' (you compute N+1 yourself, e.g. counter=3 if you received counter=2) and tldr 'counter=N+1', reply_requested=true, then finish_turn. If N >= 10, send NOTHING and just call finish_turn -- the game is over. (3) NEVER send more than one message per round. (4) NEVER initiate a new chain; only reply when a peer message arrives.", + "allowed_peers": ["agent-crux"], + "mcp_servers": [], + "resources": [] + }, + { + "name": "agent-crux", + "role": "FederatedCounterResponder", + "mission": "You are agent-crux, running on the Crux HPC. You are playing a counter-bouncing game with agent-aurora across the HPC boundary. Rules: (1) You NEVER initiate a message; you only ever reply. (2) On every round where you receive a message from agent-aurora containing 'counter=N', if N < 10 then send EXACTLY ONE reply to agent-aurora with content 'counter=N+1' (you compute N+1 yourself) and tldr 'counter=N+1', reply_requested=true, then finish_turn. If N >= 10, send NOTHING and just call finish_turn -- the game is over. 
(3) NEVER send more than one message per round.", + "allowed_peers": ["agent-aurora"], + "mcp_servers": [], + "resources": [] + } + ] +} diff --git a/src/chemgraph/academy/campaigns/federated-chat/lm_config.json b/src/chemgraph/academy/campaigns/federated-chat/lm_config.json new file mode 100644 index 0000000..20c4b7f --- /dev/null +++ b/src/chemgraph/academy/campaigns/federated-chat/lm_config.json @@ -0,0 +1,11 @@ +{ + "provider": "openai_compatible_tools", + "base_url": "http://:18186/argoapi/v1", + "model": "GPT-5-mini", + "api_key": "dummy", + "user": "", + "timeout_s": 180, + "max_tokens": 4096, + "max_retries": 3, + "retry_delay_s": 2 +} diff --git a/src/chemgraph/academy/campaigns/federated-chat/prompt_profiles/default.json b/src/chemgraph/academy/campaigns/federated-chat/prompt_profiles/default.json new file mode 100644 index 0000000..ccbe79f --- /dev/null +++ b/src/chemgraph/academy/campaigns/federated-chat/prompt_profiles/default.json @@ -0,0 +1,12 @@ +{ + "prompt_version": "federated-chat-v1", + "prompt_style": "json_state", + "system_prompt": "You are a persistent ChemGraph-style LM agent hosted inside an Academy daemon on HPC. You communicate with peers ONLY through send_message. This campaign has NO science tools; your only useful actions are send_message and finish_turn. Follow your mission literally; do not invent additional work. The campaign has a clear termination condition (counter reaches 10); when reached, call finish_turn and STOP.", + "protocol_prompt": "Return one or more tool calls. If no action is useful, call finish_turn. Every send_message call must include tldr: one short line for the dashboard. Set reply_requested=true when the peer should answer, otherwise false. Keep arguments concise. Per your mission: send AT MOST ONE message per round. The counter you receive looks like 'counter=N'; parse N, compute N+1, send 'counter=N+1' as both content and tldr. 
@action
async def receive_message(self, message: dict[str, Any]) -> None:
    """Record one inbound message and wake the agent loop.

    The message is passed through ``validate_message``, appended to
    ``received_message_history``, traced as ``message_received``, and the
    wake event (when one exists) is set. The first message ever recorded
    on this agent additionally prints a one-line lifecycle landmark to
    stdout.

    Args:
        message: Message mapping. ``sender``, ``kind``, ``tldr`` and
            ``content`` are read (all optional here) to build the
            landmark line.
    """
    validate_message(message)
    # Sample emptiness BEFORE appending, otherwise the history is never
    # empty by the time we decide whether this is the first message.
    is_first = not self.received_message_history
    self.received_message_history.append(message)
    self._trace('message_received', message)
    if self._wake_event is not None:
        self._wake_event.set()
    if not is_first:
        return

    # Operator-visible lifecycle landmark: the FIRST message to land on
    # this agent (almost always the campaign bootstrap for
    # initial_agent, or a peer's reply on everyone else) is the
    # canonical "kickoff arrived" signal. Use print so it surfaces on
    # stdout regardless of log level configuration on the rank.
    sender = message.get('sender', '?')
    kind = message.get('kind', '?')
    # ``dict.get`` only uses its default when the key is ABSENT, so a
    # present-but-None (or otherwise non-sliceable) ``content`` used to
    # raise here -- after the message had already been recorded and the
    # agent woken. A diagnostic line must never fail delivery, so coerce
    # to text before truncating.
    # NOTE(review): validate_message may already guarantee a str
    # ``content``; the coercion is harmless if so.
    tldr = message.get('tldr') or str(message.get('content') or '')[:60]
    print(
        f"[agent {self.spec.name}] first message arrived from "
        f"{sender!r} (kind={kind}): {tldr}",
        flush=True,
    )
``run-compute`` keeps the default (False) for backward + # compatibility with single-machine campaigns. + skip_bootstrap: bool = False def namespace_for_run(run_dir: pathlib.Path) -> str: @@ -380,7 +401,24 @@ def _resolve_campaign_relative_path( return path.resolve() -def validate_campaign(campaign: ChemGraphCampaign, agent_count: int) -> None: +def validate_campaign( + campaign: ChemGraphCampaign, + agent_count: int, + *, + federated: bool = False, +) -> None: + """Validate a campaign before the daemon constructs agents from it. + + ``federated=True`` loosens two single-machine assumptions that don't + hold for federated spawn-site launches: + * ``initial_agent`` may name an agent hosted on another site + (this site only has a slice). + * each agent's ``allowed_peers`` may reference agents on other + sites that aren't in this slice. Those are looked up via the + exchange at runtime; the validator can't know about them. + The intra-slice checks (no duplicate names, no self-peer, MCP + server / resource references all resolvable) still run. 
def parse_agents_selection(raw: str | None) -> tuple[str, ...]:
    """Turn a comma-separated ``--agents`` value into a tuple of names.

    ``None`` or an empty string yields ``()``. Each comma-separated
    segment is stripped of surrounding whitespace, and segments that are
    empty after stripping (for example from a trailing comma) are dropped.
    Duplicates are NOT rejected here.
    """
    names: list[str] = []
    for segment in (raw or '').split(','):
        trimmed = segment.strip()
        if trimmed:
            names.append(trimmed)
    return tuple(names)


def filter_agents(
    campaign: ChemGraphCampaign,
    agent_names: Sequence[str],
) -> ChemGraphCampaign:
    """Build a copy of ``campaign`` restricted to ``agent_names``.

    The returned ``agents`` tuple follows the order of ``agent_names``,
    not the campaign's declaration order. Every other field, including
    ``initial_agent``, is carried over untouched by
    ``dataclasses.replace``.

    Raises:
        RuntimeError: when ``agent_names`` is empty, contains a repeated
            name, or names an agent the campaign does not declare.
    """
    if not agent_names:
        raise RuntimeError('filter_agents requires at least one agent name')

    distinct = set(agent_names)
    if len(distinct) != len(agent_names):
        raise RuntimeError(f'duplicate agent names in selection: {list(agent_names)}')

    # Index the declared agents once so both the membership check and the
    # final ordered lookup are plain dict operations.
    declared_specs = {}
    for spec in campaign.agents:
        declared_specs[spec.name] = spec

    unknown = sorted(name for name in distinct if name not in declared_specs)
    if unknown:
        declared = sorted(declared_specs)
        raise RuntimeError(
            f'agents not declared on campaign: {unknown} (campaign declares {declared})',
        )

    picked = tuple(declared_specs[name] for name in agent_names)
    return dataclasses.replace(campaign, agents=picked)
dict[str, bytes] = {} +def _iter_site_dirs(run_dir: Path) -> list[tuple[str | None, Path]]: + """Return ``[(site_name, site_dir)]`` for the dashboard to read from. + + Single-site mode (the legacy layout): ``run_dir/events.jsonl`` + exists at the top level. Returns ``[(None, run_dir)]`` and the + dashboard behaves exactly as it did pre-federation. + + Multi-site mode (federated, per-site subdirs): ``run_dir`` does + NOT contain ``events.jsonl`` itself; instead it contains one + subdir per site, each with its own ``events.jsonl``. Returns + ``[(name, subdir), ...]`` for every subdir that looks like a + site mirror. The ``site_name`` is used to tag events and + namespace per-site status / placement / summary in the merged + payload. + + Detection heuristic: ``events.jsonl`` at the top level wins + (single-site, even if subdirs exist for some reason). Otherwise + every immediate subdir whose own ``events.jsonl`` exists OR + which carries a ``dashboard_metadata.json`` (written per-site by + the launcher) counts as a site. The metadata check catches the + pre-startup window where a site is up but no events have been + written yet, so federated dashboards don't briefly look like + "empty single-site" while waiting on the first event. + """ + if (run_dir / "events.jsonl").exists(): + return [(None, run_dir)] + sites: list[tuple[str | None, Path]] = [] + if run_dir.is_dir(): + for child in sorted(run_dir.iterdir()): + if not child.is_dir(): + continue + if (child / "events.jsonl").exists() or (child / "dashboard_metadata.json").exists(): + sites.append((child.name, child)) + if not sites: + # Neither single-site events nor any recognizable site subdirs. + # Fall back to treating the dir as single-site so the empty-run + # case (just-created dir, no events yet) doesn't break. 
+ return [(None, run_dir)] + return sites + + def _static_file(name: str, content_type: str) -> tuple[bytes, str]: if name not in _STATIC_CACHE: resource = files('chemgraph.academy.dashboard').joinpath( @@ -79,17 +120,17 @@ def snapshot(handler: DashboardHandler) -> dict[str, Any]: return data -def status_payload(handler: DashboardHandler) -> dict[str, Any]: - run_dir = handler.run_dir - status_path = run_dir / "status.json" +def _site_status(site_dir: Path) -> dict[str, Any]: + """Compose one site's ``status`` slice (status.json + placement + summary).""" + status_path = site_dir / "status.json" status: dict[str, Any] = {} if status_path.exists(): try: status = json.loads(status_path.read_text(encoding="utf-8")) except json.JSONDecodeError: status = {} - artifacts = write_run_artifacts(run_dir) - manifest = read_json_file(run_dir / "manifest.json", default={}) + artifacts = write_run_artifacts(site_dir) + manifest = read_json_file(site_dir / "manifest.json", default={}) updated = status.get("updated") or status.get("timestamp") schema = ( status.get("mode") @@ -97,7 +138,6 @@ def status_payload(handler: DashboardHandler) -> dict[str, Any]: or "canonical_events" ) return { - "run_dir": str(run_dir), "updated": updated, "schema": schema, "status": status, @@ -106,13 +146,77 @@ def status_payload(handler: DashboardHandler) -> dict[str, Any]: } +def status_payload(handler: DashboardHandler) -> dict[str, Any]: + run_dir = handler.run_dir + sites = _iter_site_dirs(run_dir) + + if len(sites) == 1 and sites[0][0] is None: + # Single-site / legacy layout -- preserve exact pre-federation + # payload shape so existing JS clients keep working. + site_data = _site_status(run_dir) + return { + "run_dir": str(run_dir), + **site_data, + } + + # Federated layout: nest per-site status under ``sites`` and add a + # top-level ``updated`` reflecting the most recent per-site update + # so the dashboard header has something to display. 
def events_payload(run_dir: Path) -> dict[str, Any]:
    """Build the events payload for the dashboard.

    Single-site layout (``_iter_site_dirs`` returns one entry whose name
    is ``None``): events from ``run_dir/events.jsonl`` are dumped as-is,
    with no per-event ``site`` key.

    Federated layout: events from every site's ``events.jsonl`` are
    tagged with ``site`` and merged into one list sorted by timestamp.

    Returns:
        ``{"run_dir": str(run_dir), "events": [...]}``.
    """
    sites = _iter_site_dirs(run_dir)

    if len(sites) == 1 and sites[0][0] is None:
        # Single-site / legacy layout -- preserve exact event payload
        # shape (no per-event ``site`` tag).
        events = [
            event.model_dump(mode="json")
            for event in read_events(run_dir / "events.jsonl")
        ]
        return {
            "run_dir": str(run_dir),
            "events": events,
        }

    # Federated: tag each event with its site and merge in timestamp
    # order so the dashboard can render a single interleaved stream.
    merged: list[dict[str, Any]] = []
    for site_name, site_dir in sites:
        for event in read_events(site_dir / "events.jsonl"):
            payload = event.model_dump(mode="json")
            payload["site"] = site_name
            merged.append(payload)

    unknown = float("inf")

    def _ts(e: dict[str, Any]) -> float:
        """Sort key: numeric timestamp, or +inf when none is usable."""
        v = e.get("timestamp")
        # Only a MISSING/empty timestamp falls back to ``time``. A plain
        # ``or`` would also discard a legitimate 0 / 0.0 timestamp.
        if v is None or v == "":
            v = e.get("time")
        if v is None:
            return unknown
        try:
            ts = float(v)
        except (TypeError, ValueError):
            return unknown
        # NaN compares false against everything, which makes the sort
        # order of the whole list unreliable; sink it with the other
        # unusable values instead.
        return unknown if ts != ts else ts

    # NOTE(review): if events carry non-numeric (e.g. ISO-8601 string)
    # timestamps, every key becomes +inf and the stable sort leaves the
    # stream grouped by site rather than interleaved -- confirm against
    # the event schema.
    merged.sort(key=_ts)
    return {
        "run_dir": str(run_dir),
        "events": merged,
    }
+ const mergedAgents = []; + const mergedPlacements = {}; + const seenAgentIds = new Set(); + let mergedSchema = null; + const mergedStatusExtras = {}; + for (const [siteName, siteData] of Object.entries(nextSnapshot.sites)) { + const agents = (siteData?.status?.agents) || []; + agents.forEach(spec => { + const agentId = spec.agent_id || spec.agent_name || spec.name; + if (agentId) { + nextSnapshot.sitesByAgent[agentId] = siteName; + if (!seenAgentIds.has(agentId)) { + seenAgentIds.add(agentId); + mergedAgents.push({...spec, site: siteName}); + } + } + }); + const placements = (siteData?.placement?.agents) || {}; + Object.entries(placements).forEach(([agentId, placement]) => { + if (!(agentId in nextSnapshot.sitesByAgent)) { + nextSnapshot.sitesByAgent[agentId] = siteName; + } + if (!(agentId in mergedPlacements)) { + mergedPlacements[agentId] = placement; + } + }); + // Carry a representative schema + common status scalars so + // isWorkflowMode() and "campaign / mode" lookups behave the + // same as single-site. Last-write-wins is fine here -- all + // sites in a federated run share the same campaign/mode. + if (siteData?.schema) mergedSchema = siteData.schema; + const siteStatus = siteData?.status || {}; + ['campaign', 'campaign_kind', 'mode', 'converged', 'query', + 'workflow_type', 'model_name'].forEach(key => { + if (siteStatus[key] !== undefined && mergedStatusExtras[key] === undefined) { + mergedStatusExtras[key] = siteStatus[key]; + } + }); + } + // Backfill from events too, since the per-event ``site`` tag + // is authoritative for the agent that emitted each event. 
function agentSite(agent) {
  // Site name for an agent in a federated run, read from the
  // snapshot.sitesByAgent index. Returns null when the snapshot is not
  // federated, and 'pending' when the agent has no entry in the index
  // yet, so not-yet-registered agents still render.
  if (!snapshot?.federated) return null;
  const agentId = agent?.agent_id || agent?.agent_name || agent?.name;
  return snapshot.sitesByAgent[agentId] || 'pending';
}

function agentGroup(agent) {
  // Single source of truth for "what bucket does this agent belong to
  // in the current view": the site when agentSite() yields one
  // (federated), otherwise whatever agentHost() returns. Renderers that
  // need a grouping key go through here so federated vs single-site
  // rendering diverges in exactly one place.
  return agentSite(agent) || agentHost(agent);
}

function renderSitesBadge() {
  // Header-bar federation indicator. Does nothing when the
  // #sitesBadge element is absent. Hidden (and cleared) when the
  // snapshot is not federated or has no site names. Otherwise it shows
  // one entry per site in snapshot.siteNames, formatted as
  //   Sites: aurora (2🤖 / 14📨) · crux (1🤖 / 9📨)
  // where the first number counts sitesByAgent entries mapped to that
  // site and the second counts events tagged with that site.
  const badge = document.getElementById('sitesBadge');
  if (!badge) return;
  if (!snapshot?.federated || !snapshot.siteNames.length) {
    badge.style.display = 'none';
    badge.textContent = '';
    return;
  }
  // Events without a ``site`` tag are not counted for any site.
  const eventCounts = {};
  (snapshot.events || []).forEach(e => {
    if (e.site) eventCounts[e.site] = (eventCounts[e.site] || 0) + 1;
  });
  const agentCounts = {};
  Object.values(snapshot.sitesByAgent || {}).forEach(site => {
    agentCounts[site] = (agentCounts[site] || 0) + 1;
  });
  const parts = snapshot.siteNames.map(site => {
    const a = agentCounts[site] || 0;
    const e = eventCounts[site] || 0;
    return `${site} (${a}🤖 / ${e}📨)`;
  });
  badge.textContent = 'Sites: ' + parts.join(' · ');
  // Empty string removes the inline ``display: none`` set above.
  badge.style.display = '';
}
The + // "cross-node messages" metric below stays meaningful either + // way -- in federated mode it becomes "messages that crossed + // the HPC boundary," which is exactly what we want to surface. + const hostByAgent = new Map(currentAgents.map(agent => [agent.agent_id, agentGroup(agent)])); + const hosts = new Set(startedAgents.map(agentGroup).filter(host => host && host !== 'pending')); const finish = latestEventOf('campaign_finished')?.payload || {}; const messageEvents = events.filter(event => event.event === 'message_sent'); const crossNodeMessages = messageEvents.filter(event => { @@ -1251,9 +1379,14 @@ return; } + // Group by site (federated) or hostname (single-site). The + // graph layout is unchanged either way -- only the swimlane + // labels and the band per-agent-group differ. In federated + // mode you see "aurora" and "crux" swimlanes instead of + // "x4708..." and "x1000...". Same nodes, clearer story. const byHost = new Map(); currentAgents.forEach(agent => { - const host = agentHost(agent); + const host = agentGroup(agent); if (!byHost.has(host)) byHost.set(host, []); byHost.get(host).push(agent); }); @@ -1944,21 +2077,28 @@ const currentAgents = agents(); const senderAgent = currentAgents.find(agent => agent.agent_id === sender); const recipientAgent = currentAgents.find(agent => agent.agent_id === recipient); - const senderHost = agentHost(senderAgent); - const recipientHost = agentHost(recipientAgent); + // "Group" = site in federated mode, host in single-machine. + // Route label becomes "cross-site" or "cross-node" accordingly, + // which is what the operator actually wants to see at a glance. + const senderGroup = agentGroup(senderAgent); + const recipientGroup = agentGroup(recipientAgent); + const groupLabel = snapshot?.federated ? 'site' : 'host'; const messages = eventsOf('message_sent').filter(e => { const p = e.payload || {}; return p.sender === sender && p.recipient === recipient; }); const latest = messages.length ? 
messages[messages.length - 1] : null; const latestPayload = latest?.payload || {}; - const route = senderHost && recipientHost && senderHost !== recipientHost ? 'cross-node' : 'same-node'; + const crossGroup = senderGroup && recipientGroup && senderGroup !== recipientGroup; + const route = crossGroup + ? (snapshot?.federated ? 'cross-site' : 'cross-node') + : 'same-' + groupLabel; document.getElementById('detailTitle').textContent = `${sender} -> ${recipient}`; document.getElementById('detailCards').innerHTML = detailCards([ ['Route', route], ['Messages', messages.length], - ['From host', senderHost], - ['To host', recipientHost], + [`From ${groupLabel}`, senderGroup], + [`To ${groupLabel}`, recipientGroup], ]); setDetailHtmlBlock('detailPrimaryTitle', 'Latest Message', 'detailPrimary', latest ? messageDetailHtml(latest) @@ -2256,16 +2396,20 @@ const currentAgents = agents(); const sender = currentAgents.find(agent => agent.agent_id === p.sender); const recipient = currentAgents.find(agent => agent.agent_id === p.recipient); - const senderHost = agentHost(sender); - const recipientHost = agentHost(recipient); - const route = senderHost && recipientHost && senderHost !== recipientHost ? 'cross-node' : 'same-node'; + const senderGroup = agentGroup(sender); + const recipientGroup = agentGroup(recipient); + const groupLabel = snapshot?.federated ? 'site' : 'host'; + const crossGroup = senderGroup && recipientGroup && senderGroup !== recipientGroup; + const route = crossGroup + ? (snapshot?.federated ? 'cross-site' : 'cross-node') + : 'same-' + groupLabel; return detailRich( detailSection('Route', detailKvGrid([ ['Type', route], - ['Sender host', senderHost], - ['Recipient host', recipientHost], + [`Sender ${groupLabel}`, senderGroup], + [`Recipient ${groupLabel}`, recipientGroup], ['Message id', p.message_id || '-', 'mono'], - ]), route === 'cross-node' ? 'ok' : 'info'), + ]), crossGroup ? 
'ok' : 'info'), ); } @@ -2510,13 +2654,17 @@ const currentAgents = agents(); const sender = currentAgents.find(agent => agent.agent_id === p.sender); const recipient = currentAgents.find(agent => agent.agent_id === p.recipient); - const senderHost = agentHost(sender); - const recipientHost = agentHost(recipient); - const route = senderHost && recipientHost && senderHost !== recipientHost ? 'cross-node' : 'same-node'; + const senderGroup = agentGroup(sender); + const recipientGroup = agentGroup(recipient); + const groupLabel = snapshot?.federated ? 'site' : 'host'; + const crossGroup = senderGroup && recipientGroup && senderGroup !== recipientGroup; + const route = crossGroup + ? (snapshot?.federated ? 'cross-site' : 'cross-node') + : 'same-' + groupLabel; return [ `Route: ${route}`, - `Sender host: ${senderHost}`, - `Recipient host: ${recipientHost}`, + `Sender ${groupLabel}: ${senderGroup}`, + `Recipient ${groupLabel}: ${recipientGroup}`, `Message id: ${p.message_id || '-'}`, ].join('\n'); } diff --git a/src/chemgraph/academy/dashboard/static/index.html b/src/chemgraph/academy/dashboard/static/index.html index f26c106..bdd2ab4 100644 --- a/src/chemgraph/academy/dashboard/static/index.html +++ b/src/chemgraph/academy/dashboard/static/index.html @@ -590,6 +590,11 @@

ChemGraph Academy Dashboard

+ +
diff --git a/src/chemgraph/academy/observability/event_log.py b/src/chemgraph/academy/observability/event_log.py index c42a41c..a61ef72 100644 --- a/src/chemgraph/academy/observability/event_log.py +++ b/src/chemgraph/academy/observability/event_log.py @@ -45,6 +45,7 @@ "daemon_started", "daemon_stopped", "bootstrap_message_dispatched", + "bootstrap_message_skipped", "llm_tool_calls", "turn_finished_without_external_action", "chemgraph_reasoning_turn_started", diff --git a/src/chemgraph/academy/observability/run_artifacts.py b/src/chemgraph/academy/observability/run_artifacts.py index 11fa8b4..083d2e2 100644 --- a/src/chemgraph/academy/observability/run_artifacts.py +++ b/src/chemgraph/academy/observability/run_artifacts.py @@ -265,7 +265,6 @@ async def wait_for_agent_statuses_finished( def clear_run_outputs(run_dir: pathlib.Path) -> None: for name in ( - 'academy_registrations.json', 'messages.jsonl', 'events.jsonl', 'placement.json', diff --git a/src/chemgraph/academy/runtime/bootstrap.py b/src/chemgraph/academy/runtime/bootstrap.py new file mode 100644 index 0000000..c2a95b4 --- /dev/null +++ b/src/chemgraph/academy/runtime/bootstrap.py @@ -0,0 +1,244 @@ +"""Standalone campaign-bootstrap dispatch for federated runs. + +In single-machine campaigns rank 0 of the daemon dispatches the +``campaign`` -> ``initial_agent`` bootstrap message in-process as the +last step of startup. The federated ``spawn-site`` flow can't do +that: at startup time the agent that owns ``initial_agent`` may live +on a different machine that hasn't come up yet, so each site skips +the inline dispatch (``--no-bootstrap``) and the operator triggers +kickoff once every site is up by running ``chemgraph academy +bootstrap`` from anywhere with the cached Globus token. + +This module is intentionally light: it does not load a system profile, +does not need a run-dir, and does not invoke ``mpiexec``. It just +opens an exchange user-client, discovers the recipient by name, and +sends one message. 
+""" +from __future__ import annotations + +import argparse +import asyncio +import logging +import sys +from collections.abc import Sequence +from pathlib import Path +from typing import Any + +from academy.handle import Handle + +from chemgraph.academy.campaigns import resolve_campaign +from chemgraph.academy.core.agent import ChemGraphLogicalAgent +from chemgraph.academy.core.campaign import campaign_bootstrap_text +from chemgraph.academy.core.campaign import ChemGraphCampaign +from chemgraph.academy.core.campaign import ChemGraphDaemonConfig +from chemgraph.academy.core.campaign import load_campaign +from chemgraph.academy.core.campaign import namespace_for_run +from chemgraph.academy.core.peer_protocol import build_message +from chemgraph.academy.runtime.exchange import build_exchange_factory +from chemgraph.academy.runtime.exchange import SUPPORTED_EXCHANGE_TYPES +from chemgraph.academy.runtime.registration import deterministic_agent_id +from chemgraph.academy.runtime.registration import wait_for_peers_alive + +logger = logging.getLogger(__name__) + + +def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog='chemgraph academy bootstrap', + description=( + 'Dispatch the campaign bootstrap message to the initial agent. ' + 'Run this once every site of a federated campaign is up; the ' + 'recipient is looked up by name on the exchange.' + ), + ) + parser.add_argument( + '--campaign', required=True, + help='Campaign config (packaged name or path to campaign.jsonc).', + ) + parser.add_argument( + '--run-id', required=True, + help=( + "The run-id used by the spawn-site invocations. The bootstrap " + "recipient's mailbox UID is derived deterministically from " + "(run-id, agent-name); the same run-id must be passed here " + "and to every spawn-site in the campaign." + ), + ) + parser.add_argument( + '--recipient', + help=( + "Name of the agent that should receive the bootstrap " + "message. 
Defaults to the campaign's ``initial_agent``." + ), + ) + parser.add_argument( + '--exchange-type', + choices=SUPPORTED_EXCHANGE_TYPES, + default='http', + help=( + "Academy exchange backend. Defaults to 'http' since " + "federated bootstrap is the main use case; pass 'local' / " + "'redis' / 'hybrid' if you're re-bootstrapping a single-" + "machine campaign for some reason." + ), + ) + parser.add_argument( + '--http-exchange-url', + help='Override URL for --exchange-type=http (defaults to Academy-hosted).', + ) + parser.add_argument( + '--redis-host', default='127.0.0.1', + help='Redis host (only used for redis / hybrid exchanges).', + ) + parser.add_argument( + '--redis-port', type=int, default=6379, + help='Redis port (only used for redis / hybrid exchanges).', + ) + parser.add_argument( + '--redis-namespace', + help='Redis namespace (only used for hybrid; defaults from run-id).', + ) + parser.add_argument( + '--discover-timeout-s', type=float, default=600.0, + help=( + "How long to wait for the recipient agent's mailbox to be " + "visible on the exchange. Defaults to 10 minutes to match " + "spawn-site's startup_timeout_s; bump it higher if a " + "federated site is unusually slow to come up." + ), + ) + return parser.parse_args(argv) + + +def _config_for_factory(args: argparse.Namespace) -> ChemGraphDaemonConfig: + """Build the minimal DaemonConfig that ``build_exchange_factory`` reads. + + Most fields are unused for bootstrap and get throwaway values; what + matters is ``exchange_type``, ``http_exchange_url``, and the redis + triple. ``run_dir`` is a placeholder because the factory builder + only consults a couple of fields. 
+ """ + run_dir = Path.cwd() / '.bootstrap-tmp' + return ChemGraphDaemonConfig( + run_dir=run_dir, + run_token='bootstrap', + agent_count=0, + campaign_config=Path(args.campaign), + lm_config=run_dir / 'lm.json', + max_decisions=0, + poll_timeout_s=1.0, + idle_timeout_s=1.0, + startup_timeout_s=args.discover_timeout_s, + completion_timeout_s=1.0, + status_interval_s=1.0, + redis_host=args.redis_host, + redis_port=args.redis_port, + redis_namespace=args.redis_namespace or namespace_for_run(run_dir), + rank=0, + local_rank=0, + chemgraph_repo_root=Path.cwd(), + exchange_type=args.exchange_type, + http_exchange_url=args.http_exchange_url, + ) + + +async def dispatch_bootstrap( + *, + campaign: ChemGraphCampaign, + run_id: str, + recipient: str, + exchange_factory: Any, + discover_timeout_s: float, +) -> str: + """Send the campaign bootstrap message to ``recipient`` over the exchange. + + Returns the dispatched message_id so the operator can correlate it + with what shows up on the recipient site's event log. + + The recipient's AgentId is constructed deterministically from + ``(run_id, recipient_name)`` -- same scheme spawn-site uses on + the daemon side -- so no name-based discovery is needed (the + hosted exchange strips names from discover() responses, which + made the old discover-by-name approach silently fail). + """ + client = await exchange_factory.create_user_client( + name='chemgraph-bootstrap', + start_listener=False, + ) + # ``Handle.action`` reads its outbound exchange from a + # ``ContextVar`` that ``UserExchangeClient.__aenter__`` sets to + # self. Without entering the client as an async-context-manager + # the contextvar stays unset and Handle.action raises + # ``ExchangeClientNotFoundError``. The daemon-side path gets this + # for free because Academy's Runtime enters the client; the + # standalone bootstrap command has to do it explicitly. 
+ async with client: + try: + recipient_id = deterministic_agent_id( + run_id=run_id, agent_name=recipient, + ) + # Liveness probe: wait for the recipient's mailbox to + # actually be registered on the exchange before sending. + # Without this we'd happily POST a message to a mailbox + # that doesn't exist yet -- the exchange would reject it. + await wait_for_peers_alive( + client._transport, + [recipient_id], + agent_class=ChemGraphLogicalAgent, + timeout_s=discover_timeout_s, + ) + + message = build_message( + sender='campaign', + recipient=recipient, + content=campaign_bootstrap_text(campaign), + kind='message', + tldr='Campaign bootstrap', + reason='Initial campaign task dispatch (operator-triggered).', + confidence=1.0, + ) + handle: Handle[Any] = Handle(recipient_id) + await handle.action('receive_message', message) + logger.info( + 'Bootstrap message dispatched: recipient=%s message_id=%s', + recipient, message['message_id'], + ) + return message['message_id'] + finally: + # __aexit__ does close + clear the contextvar; close() + # here would double-close. The async with handles it. 
+ pass + + +def main(argv: Sequence[str] | None = None) -> int: + logging.basicConfig(level=logging.INFO, format='%(message)s') + args = parse_args(argv) + + campaign_path = resolve_campaign(args.campaign) + if not campaign_path.exists(): + campaign_path = Path(args.campaign).resolve() + campaign = load_campaign(campaign_path) + + recipient = args.recipient or campaign.initial_agent + config = _config_for_factory(args) + factory = build_exchange_factory(config) + + try: + message_id = asyncio.run( + dispatch_bootstrap( + campaign=campaign, + run_id=args.run_id, + recipient=recipient, + exchange_factory=factory, + discover_timeout_s=args.discover_timeout_s, + ), + ) + except TimeoutError as exc: + print(f'bootstrap failed: {exc}', file=sys.stderr) + return 2 + print(f'ok: sent bootstrap to {recipient} (message_id={message_id})') + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/chemgraph/academy/runtime/compute_launcher.py b/src/chemgraph/academy/runtime/compute_launcher.py index 3ba9ad4..6f0a0cc 100644 --- a/src/chemgraph/academy/runtime/compute_launcher.py +++ b/src/chemgraph/academy/runtime/compute_launcher.py @@ -15,6 +15,9 @@ from chemgraph.academy.campaigns import campaign_launch_defaults from chemgraph.academy.campaigns import resolve_campaign from chemgraph.academy.campaigns import resolve_lm_config_template +from chemgraph.academy.core.campaign import parse_agents_selection +from chemgraph.academy.runtime.exchange import SUPPORTED_EXCHANGE_TYPES +from chemgraph.academy.runtime.exchange import exchange_uses_redis from chemgraph.academy.runtime.profiles import list_builtin_system_profiles from chemgraph.academy.runtime.profiles import load_system_profile from chemgraph.academy.runtime.profiles.system import SystemProfile @@ -48,6 +51,13 @@ class AllocationPlan: mpiexec: str chemgraph_repo_root: Path exchange_type: str = "redis" + http_exchange_url: str | None = None + # Federated spawn-site fields. 
Empty ``agents`` = single-machine + # run-compute flow (every agent on the campaign is launched). + # Non-empty = this allocation only owns the listed agents; the + # other sites are presumed running elsewhere on the same exchange. + agents: tuple[str, ...] = () + skip_bootstrap: bool = False def parse_args(argv: list[str] | None = None) -> argparse.Namespace: @@ -79,8 +89,55 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: parser.add_argument("--redis-port", type=int) parser.add_argument( "--exchange-type", - choices=("redis", "local", "hybrid"), + choices=SUPPORTED_EXCHANGE_TYPES, default="redis", + help=( + "Academy exchange backend. 'http' targets an HTTP exchange " + "server (Academy-hosted default unless --http-exchange-url " + "is given) and reaches across HPC sites; requires a cached " + "Globus token (run scripts/academy_login_globus.py once) " + "and on Aurora compute nodes the ALCF http(s)_proxy env " + "vars must be set before launch." + ), + ) + parser.add_argument( + "--http-exchange-url", + help=( + "Override URL for --exchange-type=http. Omit to use the " + "Academy-hosted default." + ), + ) + parser.add_argument( + "--agents", + default=None, + help=( + "Comma-separated subset of agent names this allocation owns " + "(federated spawn-site mode). When given, --agent-count is " + "derived from the slice length and the daemon receives " + "--agents so it filters the campaign down to the slice. " + "Omit to launch every declared agent (single-machine mode)." + ), + ) + parser.add_argument( + "--no-bootstrap", + action="store_true", + help=( + "Skip rank-0's in-process bootstrap dispatch. Set by " + "spawn-site so kickoff is deferred to the separate " + "'chemgraph academy bootstrap' subcommand." + ), + ) + parser.add_argument( + "--startup-timeout-s", + type=float, + default=None, + help=( + "How long the daemon's cross-site peer-discovery loop " + "waits before giving up. Default 600s. 
Bump higher for " + "federated launches where HPC queue waits + Python " + "imports + cold-cache rsyncs can push one site's startup " + "well past the other site's discovery patience." + ), ) parser.add_argument("--no-start-redis", action="store_true") return parser.parse_args(argv) @@ -94,8 +151,29 @@ def _prepend_path(name: str, entries: list[str]) -> None: os.environ[name] = os.pathsep.join(values) -def _prepare_environment(profile: SystemProfile) -> None: +_PROXY_ENV_NAMES = frozenset({ + "http_proxy", "HTTP_PROXY", + "https_proxy", "HTTPS_PROXY", + "all_proxy", "ALL_PROXY", +}) + + +def _prepare_environment( + profile: SystemProfile, + *, + exchange_type: str = "redis", +) -> None: + # Aurora's profile lists http(s)_proxy in unset_env so that LM traffic + # going through the local UAN relay (127.0.0.1:) doesn't pick up + # a stray site proxy. That's correct for Redis-based campaigns, but for + # --exchange-type=http the ranks MUST reach exchange.academy-agents.org + # over the public internet, which on Aurora compute nodes only works + # through the ALCF HTTP proxy. Keep the proxy vars when the exchange + # needs them. The no_proxy list set below still excludes 127.0.0.1 + + # .alcf.anl.gov so the LM relay continues to bypass the proxy. for name in profile.unset_env: + if exchange_type == "http" and name in _PROXY_ENV_NAMES: + continue os.environ.pop(name, None) _prepend_path("PATH", profile.path_entries) _prepend_path("PYTHONPATH", profile.pythonpath_entries) @@ -166,6 +244,18 @@ def _write_lm_config( if max_tokens is not None: data["max_tokens"] = max_tokens + # Refuse to ship a config whose `user` is still the template + # placeholder. Argo rejects requests with an unknown user, but + # only at first-call time on the compute node, after the whole + # daemon + relay stack is already running -- expensive to debug. + # Fail here instead, with a message pointing at the fix. 
+ if data.get("user") in (None, "", ""): + raise RuntimeError( + f"lm_config.json was written with user={data.get('user')!r}. " + "Pass --lm-user or export ARGO_USER " + "before launching spawn-site / run-compute." + ) + path = run_dir / "lm_config.json" path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8") return path @@ -192,7 +282,7 @@ def _run_token() -> str: def prepare_compute_launch(args: argparse.Namespace) -> AllocationPlan: """Resolve a system profile and dashboard metadata into an allocation plan.""" profile = load_system_profile(args.system) - _prepare_environment(profile) + _prepare_environment(profile, exchange_type=args.exchange_type) defaults = campaign_launch_defaults(args.campaign) run_dir = Path(args.run_dir or Path(profile.run_root) / args.run_id).resolve() @@ -210,16 +300,35 @@ def prepare_compute_launch(args: argparse.Namespace) -> AllocationPlan: profile=profile, metadata=metadata, ) + # Default lm-user from $ARGO_USER so HPC users who already export it + # for their normal ChemGraph workflow don't have to pass --lm-user + # again. The packaged template ships with a literal "" + # placeholder so it can't accidentally point at someone else's + # Argo account; _write_lm_config refuses to keep that placeholder. + lm_user = args.lm_user or os.environ.get("ARGO_USER") lm_config = _write_lm_config( run_dir=run_dir, template_name=defaults.lm_config_template, base_url=lm_base_url, lm_model=args.lm_model, - lm_user=args.lm_user, + lm_user=lm_user, max_tokens=args.max_tokens, ) _export_workflow_lm_environment(lm_config) - agent_count = args.agent_count or defaults.agent_count + # When --agents is given the slice length is authoritative. Otherwise + # use the CLI / packaged default. We refuse to mix --agents with an + # explicit --agent-count that disagrees, to avoid silent surprises. 
+ agents_slice = parse_agents_selection(getattr(args, "agents", None)) + if agents_slice: + derived_count = len(agents_slice) + if args.agent_count and args.agent_count != derived_count: + raise RuntimeError( + f"--agent-count={args.agent_count} contradicts --agents " + f"(which implies {derived_count}). Pass --agents alone." + ) + agent_count = derived_count + else: + agent_count = args.agent_count or defaults.agent_count agents_per_node = args.agents_per_node or defaults.agents_per_node max_decisions = args.max_decisions or defaults.max_decisions redis_port = args.redis_port or profile.redis_port @@ -238,7 +347,14 @@ def prepare_compute_launch(args: argparse.Namespace) -> AllocationPlan: max_decisions=max_decisions, poll_timeout_s=2.0, idle_timeout_s=600.0, - startup_timeout_s=120.0, + # Default 600s (was 120s). Single-machine runs reach this + # codepath in seconds; the realistic worst case is federated + # launches where one site's HPC queue wait + Python imports + # outpaces the other site's peer-discovery patience. 10 min + # comfortably covers debug-scaling / workq scheduling delays + # without making single-machine failures slow to surface + # (the daemon prints a clear "missing=..." message regardless). 
+ startup_timeout_s=(getattr(args, "startup_timeout_s", None) or 600.0), completion_timeout_s=60.0, status_interval_s=5.0, redis_host=socket.getfqdn(), @@ -250,6 +366,9 @@ def prepare_compute_launch(args: argparse.Namespace) -> AllocationPlan: mpiexec=profile.mpiexec, chemgraph_repo_root=Path(profile.repo_root).resolve(), exchange_type=args.exchange_type, + http_exchange_url=args.http_exchange_url, + agents=agents_slice, + skip_bootstrap=bool(getattr(args, "no_bootstrap", False)), ) @@ -274,7 +393,7 @@ def run_allocation(plan: AllocationPlan) -> int: """Start Redis if requested and run per-rank daemons under mpiexec.""" plan.run_dir.mkdir(parents=True, exist_ok=True) redis_proc: subprocess.Popen[bytes] | None = None - uses_redis = plan.exchange_type in {"redis", "hybrid"} + uses_redis = exchange_uses_redis(plan.exchange_type) if plan.start_redis and uses_redis: redis_server = shutil.which("redis-server") if redis_server is None: @@ -324,10 +443,37 @@ def run_allocation(plan: AllocationPlan) -> int: "--exchange-type", plan.exchange_type, "--chemgraph-repo-root", str(plan.chemgraph_repo_root), ] + if plan.http_exchange_url: + daemon_args += ["--http-exchange-url", plan.http_exchange_url] + # Federated spawn-site forwarding. Both flags are omitted on the + # single-machine run-compute flow so the daemon's argparse + # defaults (full campaign, bootstrap enabled) keep prior behavior. + if plan.agents: + daemon_args += ["--agents", ",".join(plan.agents)] + if plan.skip_bootstrap: + daemon_args += ["--no-bootstrap"] + # When using the HTTP exchange on HPC compute nodes (e.g. Aurora), + # ranks must reach exchange.academy-agents.org through the site's + # outbound HTTP proxy. PALS/MPICH mpiexec strips most parent-shell + # env vars from spawned ranks, so http_proxy / https_proxy don't + # propagate automatically -- force forwarding via --genvall and + # set the proxy vars in our own environ first so they are part + # of the parent env that --genvall snapshots. 
+ genv_flags: list[str] = [] + if plan.exchange_type == "http": + for name in ("http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY"): + value = os.environ.get(name) + if value: + genv_flags += ["--genv", f"{name}={value}"] + # --genvall snapshots the whole parent env as belt-and-braces; + # the explicit per-var entries surface in launch_command.txt for + # post-hoc debugging when a rank can't reach the exchange. + genv_flags = ["--genvall", *genv_flags] cmd = [ plan.mpiexec, "-n", str(plan.agent_count), "--ppn", str(plan.agents_per_node), + *genv_flags, sys.executable, "-m", "chemgraph.cli.main", "academy", "mpi-daemon", "--", *daemon_args, ] diff --git a/src/chemgraph/academy/runtime/daemon.py b/src/chemgraph/academy/runtime/daemon.py index e6cb05b..4361640 100644 --- a/src/chemgraph/academy/runtime/daemon.py +++ b/src/chemgraph/academy/runtime/daemon.py @@ -4,16 +4,19 @@ import asyncio import pathlib import signal +from typing import Any +from academy.exchange.cloud.client import HttpAgentRegistration from academy.handle import Handle from academy.runtime import Runtime from academy.runtime import RuntimeConfig from chemgraph.academy.core.peer_protocol import build_message from chemgraph.academy.runtime.exchange import build_exchange_factory -from chemgraph.academy.runtime.registration import load_academy_registrations -from chemgraph.academy.runtime.registration import wait_academy_registrations -from chemgraph.academy.runtime.registration import write_academy_registrations +from chemgraph.academy.runtime.exchange import SUPPORTED_EXCHANGE_TYPES +from chemgraph.academy.runtime.registration import deterministic_agent_id +from chemgraph.academy.runtime.registration import register_agent_with_uid +from chemgraph.academy.runtime.registration import wait_for_peers_alive from chemgraph.academy.observability.run_artifacts import initialize_run_files from chemgraph.academy.observability.run_artifacts import ( wait_for_agent_statuses_finished, @@ -21,8 +24,10 @@ from 
chemgraph.academy.observability.run_artifacts import write_status_snapshot from chemgraph.academy.core.campaign import campaign_bootstrap_text from chemgraph.academy.core.campaign import ChemGraphDaemonConfig +from chemgraph.academy.core.campaign import filter_agents from chemgraph.academy.core.campaign import load_campaign from chemgraph.academy.core.campaign import namespace_for_run +from chemgraph.academy.core.campaign import parse_agents_selection from chemgraph.academy.core.campaign import resolve_campaign_resources from chemgraph.academy.core.campaign import selected_agent from chemgraph.academy.core.campaign import validate_campaign @@ -45,7 +50,20 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: config.run_dir, ) prompt_profile = load_prompt_profile(campaign.prompt_profile) - validate_campaign(campaign, config.agent_count) + # When this site only owns a slice (federated spawn-site flow), + # filter the campaign down to that slice BEFORE validation so the + # daemon's downstream rank-indexing (selected_agent, mpiexec -n) + # all agree on the same agent ordering. + if config.agents: + campaign = filter_agents(campaign, config.agents) + # Loosen cross-site peer / initial_agent checks for federated + # slices -- those names may legitimately reference agents this + # site doesn't own (they're discovered through the exchange). + validate_campaign( + campaign, + config.agent_count, + federated=bool(config.agents), + ) agent_spec = selected_agent(campaign, config.rank) placement = placement_payload(config, agent_spec.name) supervisor = MCPServerSupervisor( @@ -68,55 +86,84 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: academy_factory = build_exchange_factory(config) if config.rank == 0: + # Rank 0 still owns the one-shot init-files dance, but it + # is NO LONGER special for registration -- every rank + # registers its own agent independently and discovers peers + # via the exchange. 
initialize_run_files( run_dir=config.run_dir, campaign=campaign, config=config, llm_settings=llm_settings, ) - registrar = await academy_factory.create_user_client( - name=f'{config.run_dir.name}-registrar', - start_listener=False, - ) - try: - registered = await registrar.register_agents( - [ - (ChemGraphLogicalAgent, spec.name) - for spec in campaign.agents - ], - ) - finally: - await registrar.close() - registrations = dict( - zip( - (spec.name for spec in campaign.agents), - registered, - strict=True, - ), + + # Each rank registers ONLY its own agent on the exchange and + # discovers cross-rank / cross-site peers by polling + # ``transport.discover()``. This works identically whether the + # peers are on the same node (LocalExchange), same allocation + # (Redis), or a different HPC entirely (HttpExchange against + # the hosted Academy server) -- the discovery protocol is the + # same. There is no longer a shared-filesystem dependency. + registrar = await academy_factory.create_user_client( + name=f'{config.run_dir.name}-rank{config.rank}-registrar', + start_listener=False, + ) + try: + # Register THIS rank's agent with a deterministic UID + # derived from (run_id, agent_name). The hosted exchange + # strips AgentId.name from discover() responses, so the + # name-based filter we used to do never matched on the + # public HttpExchange. Deterministic UIDs let every site + # construct the same AgentId for the same (run, agent) + # without ever needing discover() to echo the name back. 
+ my_agent_id = deterministic_agent_id( + run_id=config.run_dir.name, agent_name=agent_spec.name, ) - write_academy_registrations( - run_dir=config.run_dir, - run_token=config.run_token, - registrations=registrations, + await register_agent_with_uid( + registrar._transport, ChemGraphLogicalAgent, my_agent_id, ) - else: - registrations = await wait_academy_registrations( - config.run_dir, - run_token=config.run_token, - timeout_s=config.startup_timeout_s, + registration = HttpAgentRegistration(agent_id=my_agent_id) + print( + f"[daemon] rank{config.rank} registered " + f"{agent_spec.name!r} on the exchange " + f"(uid={my_agent_id.uid})", + flush=True, ) - if config.rank == 0: - registrations = load_academy_registrations( - config.run_dir, - run_token=config.run_token, - ) - registration = registrations[agent_spec.name] - peer_agent_ids = { - peer: registrations[peer].agent_id - for peer in agent_spec.allowed_peers - if peer in registrations - } + wanted_peers = [ + p for p in agent_spec.allowed_peers if p != agent_spec.name + ] + # Compute peer AgentIds locally -- no discovery polling + # needed for the identity itself, since every site agrees + # on the deterministic UID. We still poll discover() as a + # LIVENESS PROBE so we don't proceed to bootstrap before + # the other side's mailbox is actually up. 
+ peer_agent_ids: dict[str, Any] = { + peer: deterministic_agent_id( + run_id=config.run_dir.name, agent_name=peer, + ) + for peer in wanted_peers + } + if wanted_peers: + print( + f"[daemon] rank{config.rank} waiting for peers " + f"{wanted_peers} to come online " + f"(timeout {config.startup_timeout_s:.0f}s)...", + flush=True, + ) + await wait_for_peers_alive( + registrar._transport, + peer_agent_ids.values(), + agent_class=ChemGraphLogicalAgent, + timeout_s=config.startup_timeout_s, + ) + print( + f"[daemon] rank{config.rank} all {len(peer_agent_ids)} " + f"peer(s) are alive: {sorted(peer_agent_ids.keys())}", + flush=True, + ) + finally: + await registrar.close() agent = ChemGraphLogicalAgent( agent_spec, @@ -144,8 +191,27 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: ) async with runtime: await agent.write_runtime_status() + print( + f"[daemon] rank{config.rank} agent {agent_spec.name!r} " + "is now running inside Academy Runtime", + flush=True, + ) - if config.rank == 0: + # Rank 0 normally dispatches the campaign bootstrap message + # to ``initial_agent``. Two conditions skip it: + # * ``--no-bootstrap`` was set (spawn-site flow -- kickoff + # happens via the separate ``bootstrap`` subcommand once + # every federated site has come up). + # * ``initial_agent`` is not this rank's own agent (it + # lives on another rank / site; that owner's rank 0 + # handles it, or the operator triggers bootstrap once + # every site is up). 
+ initial_is_local = campaign.initial_agent == agent_spec.name + if ( + config.rank == 0 + and not config.skip_bootstrap + and initial_is_local + ): bootstrap = build_message( sender='campaign', recipient=campaign.initial_agent, @@ -155,9 +221,7 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: reason='Initial campaign task dispatch.', confidence=1.0, ) - initial_handle: Handle[Any] = Handle( - registrations[campaign.initial_agent].agent_id, - ) + initial_handle: Handle[Any] = Handle(registration.agent_id) await initial_handle.action( 'receive_message', bootstrap, @@ -171,6 +235,39 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: 'via': 'academy_action', }, ) + print( + f"[daemon] rank{config.rank} dispatched inline " + f"bootstrap to {campaign.initial_agent!r}", + flush=True, + ) + elif config.rank == 0: + # Record the reason for skipping so investigators can + # tell "deferred to operator" apart from "silently + # forgot". + append_system_trace( + config.run_dir, + 'bootstrap_message_skipped', + { + 'initial_agent': campaign.initial_agent, + 'skip_bootstrap_flag': bool(config.skip_bootstrap), + 'initial_is_local': bool(initial_is_local), + }, + ) + if config.skip_bootstrap: + print( + f"[daemon] rank{config.rank} skipping inline " + f"bootstrap (federated mode); waiting for " + f"'chemgraph academy bootstrap' to deliver " + f"the kickoff message...", + flush=True, + ) + else: + print( + f"[daemon] rank{config.rank} initial_agent " + f"{campaign.initial_agent!r} is not on this " + f"site; another site owns the bootstrap", + flush=True, + ) await runtime.wait_shutdown() @@ -216,9 +313,35 @@ def parse_args() -> argparse.Namespace: parser.add_argument('--redis-namespace') parser.add_argument( '--exchange-type', - choices=('redis', 'local', 'hybrid'), + choices=SUPPORTED_EXCHANGE_TYPES, default='redis', ) + parser.add_argument( + '--http-exchange-url', + default=None, + help=( + "Override URL for --exchange-type=http. 
Omit to use the " + "Academy-hosted default. Ignored for other exchange types." + ), + ) + parser.add_argument( + '--agents', + default=None, + help=( + "Comma-separated subset of agent names to launch (federated " + "spawn-site mode). Omit to launch every agent declared on the " + "campaign (single-machine run-compute mode)." + ), + ) + parser.add_argument( + '--no-bootstrap', + action='store_true', + help=( + "Skip the rank-0 in-process bootstrap dispatch. Used by " + "spawn-site so kickoff can be triggered separately via the " + "'chemgraph academy bootstrap' subcommand once every site is up." + ), + ) parser.add_argument('--chemgraph-repo-root') return parser.parse_args() @@ -247,6 +370,9 @@ def config_from_args(args: argparse.Namespace) -> ChemGraphDaemonConfig: redis_port=args.redis_port, redis_namespace=args.redis_namespace or namespace_for_run(run_dir), exchange_type=args.exchange_type, + http_exchange_url=args.http_exchange_url, + agents=parse_agents_selection(args.agents), + skip_bootstrap=bool(args.no_bootstrap), rank=rank_from_env(), local_rank=local_rank_from_env(), chemgraph_repo_root=( diff --git a/src/chemgraph/academy/runtime/dashboard_launcher.py b/src/chemgraph/academy/runtime/dashboard_launcher.py index 0116176..6a28d17 100644 --- a/src/chemgraph/academy/runtime/dashboard_launcher.py +++ b/src/chemgraph/academy/runtime/dashboard_launcher.py @@ -1,13 +1,20 @@ from __future__ import annotations import argparse +import dataclasses import json -import os, shlex, shutil, signal, subprocess, threading +import os +import shlex +import shutil +import signal +import subprocess +import threading import time import urllib.error import urllib.request from importlib.resources import files from pathlib import Path +from typing import Any from chemgraph.academy.dashboard import serve_dashboard from chemgraph.academy.campaigns import campaign_launch_defaults @@ -15,11 +22,63 @@ from chemgraph.academy.runtime.profiles import load_system_profile from 
chemgraph.academy.runtime.profiles.system import SystemProfile + +@dataclasses.dataclass +class _SiteHandle: + """Per-site state held by the launcher's main loop. + + One of these per ``--system`` value when launching a federated + dashboard. Single-site invocations build exactly one. The fields + track everything the cleanup ``finally`` block needs to tear down + (relay subprocess, ControlMaster ownership) plus the values the + rsync loop and dashboard server need (local mirror dir, the + composed ``lm_base_url`` for the site's compute nodes). + """ + + profile: SystemProfile + remote_host: str + control_path: str + local_mirror_dir: Path # the per-site dir (multi) or top-level (single) + relay_port: int + relay_host: str | None = None + lm_base_url: str | None = None + relay_process: subprocess.Popen[str] | None = None + started_master: bool = False + + +def _parse_systems_list(raw: str) -> tuple[str, ...]: + """Parse a comma-list of system profile names ('aurora,crux'). + + Whitespace-tolerant; trailing commas dropped. Empty input is a + user error and surfaces a clean message at argparse-resolve time + rather than later in the setup loop. + """ + names = tuple(name.strip() for name in raw.split(',') if name.strip()) + if not names: + raise argparse.ArgumentTypeError( + "--system requires at least one profile name", + ) + if len(set(names)) != len(names): + raise argparse.ArgumentTypeError( + f"--system has duplicate profile names: {names}", + ) + return names + def parse_args() -> argparse.Namespace: p = argparse.ArgumentParser(prog="chemgraph academy dashboard") a = p.add_argument a("run_id") - a("--system", default="aurora", help="Built-ins: " + ", ".join(list_builtin_system_profiles())) + a( + "--system", + type=_parse_systems_list, + default=("aurora",), + help=( + "One profile name for a single-site campaign, or a comma " + "list ('aurora,crux') for a federated dashboard that brings " + "up per-site relays + rsync mirrors and serves a merged " + "view. 
Built-ins: " + ", ".join(list_builtin_system_profiles()) + ), + ) a("--campaign", default="mace-ensemble-screening-20") a("--lm-connect", choices=("mac-argo-relay", "direct"), default="mac-argo-relay") a("--lm-base-url") @@ -28,7 +87,13 @@ def parse_args() -> argparse.Namespace: a("--keep-ssh-master", action="store_true") a("--local-argo-host", default="127.0.0.1") a("--local-argo-port", type=int, default=18085) - a("--reverse-port", type=int, default=18185) + a( + "--reverse-port", type=int, default=18185, + help=( + "Reverse-tunnel local port. In multi-site mode each site " + "gets reverse_port + offset (offset = i for the i-th system)." + ), + ) a("--relay-port", type=int) a("--relay-python") a("--rsync-interval-s", type=float, default=2.0) @@ -39,7 +104,31 @@ def parse_args() -> argparse.Namespace: a("--local", action="store_true", help="Only serve an already mirrored local run.") a("--no-dashboard", action="store_true") a("--overwrite-run", action="store_true") - return p.parse_args() + args = p.parse_args() + # Per-site override flags only make sense in single-site mode -- + # in multi-site they'd silently apply to all sites and almost + # always be wrong (e.g. one Aurora remote_host doesn't fit Crux). + # Force operators to encode site-specific quirks in the profile JSON. 
+ if len(args.system) > 1: + forbidden = [ + (name, getattr(args, attr)) + for name, attr in ( + ("--remote-host", "remote_host"), + ("--ssh-control-path", "ssh_control_path"), + ("--relay-port", "relay_port"), + ("--lm-base-url", "lm_base_url"), + ("--local-run-dir", "local_run_dir"), + ) + if getattr(args, attr) + ] + if forbidden: + names = ", ".join(flag for flag, _ in forbidden) + p.error( + f"multi-site --system rejects single-site overrides {names}; " + f"encode per-site differences in the system profile JSON " + f"instead.", + ) + return args def template(name: str) -> str: return files("chemgraph.academy.runtime.templates").joinpath(name).read_text() @@ -131,16 +220,187 @@ def compute_lines(profile: SystemProfile, wrapper_path: str, run_id: str, campai lines = [" module use /soft/modulefiles", " module load conda", " conda activate base"] if profile.name == "polaris" else [" module load frameworks"] return lines + [f" source {profile.remote_root}/venvs/academy-swarm/bin/activate", f" export PATH={profile.remote_root}/bin:$PATH", " chemgraph academy run-compute \\", f" --system {profile.name} \\", f" --run-id {run_id} \\", f" --campaign {campaign}", "", "If PATH is not configured, use:", f" {wrapper_path} \\", f" --system {profile.name} \\", f" --run-id {run_id} \\", f" --campaign {campaign}"] +def _resolve_local_run_root(args: argparse.Namespace) -> Path: + """Top-level dashboard dir on the Mac. + + Single-site mode: ``<local_mirror_root>/<run_id>/`` -- byte-identical to the + pre-multi-site layout, so existing dashboard URLs / mirror paths + keep working unchanged. + + Multi-site mode: ``<local_mirror_root>/<run_id>/`` is a PARENT containing + per-site subdirs (``<local_mirror_root>/<run_id>/aurora/``, ``.../crux/``). + The dashboard server walks that tree and merges per-site event + streams into one view.
+ """ + if args.local_run_dir: + return Path(args.local_run_dir).expanduser() + return (Path(args.local_mirror_root) / args.run_id).expanduser() + + +def _site_mirror_dir( + local_run_root: Path, + profile_name: str, + *, + multi_site: bool, +) -> Path: + return local_run_root / profile_name if multi_site else local_run_root + + +def _setup_site( + *, + profile_name: str, + args: argparse.Namespace, + local_run_root: Path, + multi_site: bool, + site_index: int, + stop: threading.Event, +) -> _SiteHandle: + """Bring up one site's ControlMaster + UAN relay + rsync mirror. + + Pulled out of ``main`` so the multi-site loop has one place to call. + The single-site path also goes through this function (with + ``multi_site=False`` so the mirror dir + reverse-port stay + backward-compatible). Returns a ``_SiteHandle`` carrying everything + the cleanup ``finally`` needs. + """ + profile = load_system_profile(profile_name) + remote_host = args.remote_host or profile.remote_host + control_path = ( + args.ssh_control_path + or str(Path.home() / f".ssh/{profile.name}-dashboard-%r@%h:%p") + ) + relay_port = args.relay_port or profile.relay_port + remote_run_dir = f"{profile.run_root}/{args.run_id}" + local_mirror_dir = _site_mirror_dir(local_run_root, profile.name, multi_site=multi_site) + site = _SiteHandle( + profile=profile, + remote_host=remote_host, + control_path=control_path, + local_mirror_dir=local_mirror_dir, + relay_port=relay_port, + ) + + Path(control_path).expanduser().parent.mkdir(parents=True, exist_ok=True) + if ssh(remote_host, None, control_path=control_path, extra=["-O", "check"], check=False, batch_mode=False).returncode != 0: + print(f"[{profile.name}] Starting SSH ControlMaster for {remote_host}...", flush=True) + ssh(remote_host, None, control_path=control_path, extra=["-M", "-N", "-f", "-o", "ControlMaster=yes"], batch_mode=False) + site.started_master = True + + if args.overwrite_run: + if not args.run_id or "/" in args.run_id or args.run_id in {".", 
".."}: + raise RuntimeError(f"Refusing to overwrite unsafe run id: {args.run_id!r}") + print(f"[{profile.name}] Deleting existing run artifacts (--overwrite-run):", flush=True) + print(f" remote: {remote_host}:{remote_run_dir}", flush=True) + print(f" local: {local_mirror_dir}", flush=True) + delete = f"set -euo pipefail; run_root={shlex.quote(profile.run_root)}; run_id={shlex.quote(args.run_id)}; case \"$run_id\" in \"\"|.|..|*/*) echo \"unsafe run id\" >&2; exit 2;; esac; run_dir=\"$run_root/$run_id\"; trash_root=\"$run_root/.deleted-runs\"; if [ -e \"$run_dir\" ]; then mkdir -p \"$trash_root\"; trash_dir=\"$trash_root/${{run_id}}.$(date +%Y%m%d%H%M%S).$$\"; mv -- \"$run_dir\" \"$trash_dir\"; for delay in 0 1 2 5 10; do sleep \"$delay\"; if rm -rf -- \"$trash_dir\" 2>/dev/null; then break; fi; done; fi; mkdir -p \"$run_dir\"" + ssh(remote_host, delete, control_path=control_path) + if local_mirror_dir.exists(): + shutil.rmtree(local_mirror_dir) + + wrapper_path = f"{profile.remote_root}/bin/chemgraph-academy-run" + print(f"[{profile.name}] Installing compute wrapper at {wrapper_path}...", flush=True) + ssh(remote_host, f"mkdir -p {shlex.quote(profile.remote_root + '/bin')} && cat > {shlex.quote(wrapper_path)} && chmod +x {shlex.quote(wrapper_path)}", control_path=control_path, input_text=wrapper(profile)) + + relay_host = None + if args.lm_connect == "mac-argo-relay": + # Each site gets its own reverse port (base + site_index) so two + # SSH -R tunnels don't fight over the same local port. The remote + # relay always listens on the profile's relay_port; only the SSH + # tunneling end on the Mac shifts. 
+ per_site_args = argparse.Namespace(**vars(args)) + per_site_args.reverse_port = args.reverse_port + site_index + print(f"[{profile.name}] Staging UAN relay script...", flush=True) + relay_script = stage_relay_script(profile, remote_host, control_path) + print(f"[{profile.name}] Starting UAN relay through {remote_host} (reverse port {per_site_args.reverse_port})...", flush=True) + relay_log = Path(f"/tmp/chemgraph-academy-{args.run_id}-{profile.name}-relay.log") + site.relay_process = start_relay( + profile, remote_host, control_path, per_site_args, + relay_port, args.relay_python or profile.venv_python, + relay_log, relay_script, + ) + relay_host = wait_relay(profile, remote_host, control_path, relay_port, site.relay_process, relay_log) + site.relay_host = relay_host + + lm_base_url = ( + f"http://{relay_host}:{relay_port}/argoapi/v1" + if relay_host else str(args.lm_base_url) + ) + site.lm_base_url = lm_base_url + print(f"[{profile.name}] Compute-node LM URL: {lm_base_url}", flush=True) + + metadata: dict[str, Any] = { + "created_at": time.time(), + "created_by": "chemgraph academy dashboard", + "run_id": args.run_id, + "system": profile.name, + "campaign": args.campaign, + "remote_run_dir": remote_run_dir, + "remote_host": remote_host, + "lm_connect": args.lm_connect, + "lm_base_url": lm_base_url, + "workspace_root": profile.remote_root, + "chemgraph_repo_root": profile.repo_root, + } + if relay_host: + metadata.update({"relay_host": relay_host, "relay_port": relay_port}) + print(f"[{profile.name}] Writing run metadata: {remote_host}:{remote_run_dir}/dashboard_metadata.json", flush=True) + ssh(remote_host, f"mkdir -p {shlex.quote(remote_run_dir)} && cat > {shlex.quote(remote_run_dir + '/dashboard_metadata.json')}", control_path=control_path, input_text=json.dumps(metadata, indent=2) + "\n") + + print(f"[{profile.name}] Starting rsync mirror:", flush=True) + print(f" {remote_host}:{remote_run_dir}/", flush=True) + print(f" {local_mirror_dir}/", flush=True) + 
start_rsync(remote_host, control_path, remote_run_dir, local_mirror_dir, args.rsync_interval_s, stop) + + print(f"\n[{profile.name}] Compute-node command:", flush=True) + print("\n".join(compute_lines(profile, wrapper_path, args.run_id, args.campaign)), flush=True) + + return site + + +def _teardown_site(site: _SiteHandle, *, keep_ssh_master: bool) -> None: + if site.relay_process is not None and site.relay_process.poll() is None: + site.relay_process.terminate() + try: + site.relay_process.wait(timeout=5) + except subprocess.TimeoutExpired: + site.relay_process.kill() + if site.started_master and not keep_ssh_master: + ssh( + site.remote_host, None, + control_path=site.control_path, + extra=["-O", "exit"], check=False, batch_mode=False, + ) + + +# Note about local-argo reachability: we only check the local argo-shim +# once at the top of main(), even in multi-site mode -- all sites share +# the same Mac shim, so one check covers them all. + + def main() -> int: args = parse_args() - profile = load_system_profile(args.system) + # Tolerate args.system being a plain string (legacy single-site + # callers / older tests) as well as the tuple form produced by the + # new --system parser. Without this, "aurora" would iterate + # character-by-character. + systems: tuple[str, ...] 
= ( + (args.system,) if isinstance(args.system, str) else tuple(args.system) + ) + multi_site = len(systems) > 1 campaign_launch_defaults(args.campaign) - local_run_dir = Path(args.local_run_dir or Path(args.local_mirror_root) / args.run_id).expanduser() - local_run_dir.mkdir(parents=True, exist_ok=True) + local_run_root = _resolve_local_run_root(args) + local_run_root.mkdir(parents=True, exist_ok=True) + if args.local: if args.overwrite_run: raise RuntimeError("--overwrite-run cannot be used with --local") - return 0 if args.no_dashboard else serve_dashboard(run_dir=local_run_dir, host=args.dashboard_host, port=args.dashboard_port) + # Dashboard server walks the tree either way -- single-site + # mirror dir or multi-site parent both work as inputs. + return 0 if args.no_dashboard else serve_dashboard( + run_dir=local_run_root, + host=args.dashboard_host, port=args.dashboard_port, + ) + if args.lm_connect == "direct" and not args.lm_base_url: raise RuntimeError("--lm-connect direct requires --lm-base-url") if args.lm_connect == "mac-argo-relay": @@ -151,68 +411,38 @@ def main() -> int: except (OSError, urllib.error.URLError, urllib.error.HTTPError) as exc: raise RuntimeError("Local argo-shim is not reachable. 
Start it before using --lm-connect mac-argo-relay.") from exc - remote_host = args.remote_host or profile.remote_host - control_path = args.ssh_control_path or str(Path.home() / f".ssh/{profile.name}-dashboard-%r@%h:%p") - relay_port = args.relay_port or profile.relay_port - remote_run_dir = f"{profile.run_root}/{args.run_id}" - relay_process: subprocess.Popen[str] | None = None stop = threading.Event() - started_master = False + sites: list[_SiteHandle] = [] try: - Path(control_path).expanduser().parent.mkdir(parents=True, exist_ok=True) - if ssh(remote_host, None, control_path=control_path, extra=["-O", "check"], check=False, batch_mode=False).returncode != 0: - print(f"Starting SSH ControlMaster for {remote_host}...", flush=True) - ssh(remote_host, None, control_path=control_path, extra=["-M", "-N", "-f", "-o", "ControlMaster=yes"], batch_mode=False) - started_master = True - if args.overwrite_run: - if not args.run_id or "/" in args.run_id or args.run_id in {".", ".."}: - raise RuntimeError(f"Refusing to overwrite unsafe run id: {args.run_id!r}") - print("Deleting existing run artifacts because --overwrite-run was set:", flush=True) - print(f" remote: {remote_host}:{remote_run_dir}", flush=True) - print(f" local: {local_run_dir}", flush=True) - delete = f"set -euo pipefail; run_root={shlex.quote(profile.run_root)}; run_id={shlex.quote(args.run_id)}; case \"$run_id\" in \"\"|.|..|*/*) echo \"unsafe run id\" >&2; exit 2;; esac; run_dir=\"$run_root/$run_id\"; trash_root=\"$run_root/.deleted-runs\"; if [ -e \"$run_dir\" ]; then mkdir -p \"$trash_root\"; trash_dir=\"$trash_root/${{run_id}}.$(date +%Y%m%d%H%M%S).$$\"; mv -- \"$run_dir\" \"$trash_dir\"; for delay in 0 1 2 5 10; do sleep \"$delay\"; if rm -rf -- \"$trash_dir\" 2>/dev/null; then break; fi; done; fi; mkdir -p \"$run_dir\"" - ssh(remote_host, delete, control_path=control_path) - if local_run_dir.exists(): - shutil.rmtree(local_run_dir) - wrapper_path = f"{profile.remote_root}/bin/chemgraph-academy-run" - 
print(f"Installing compute wrapper at {wrapper_path}...", flush=True) - ssh(remote_host, f"mkdir -p {shlex.quote(profile.remote_root + '/bin')} && cat > {shlex.quote(wrapper_path)} && chmod +x {shlex.quote(wrapper_path)}", control_path=control_path, input_text=wrapper(profile)) - relay_host = None - if args.lm_connect == "mac-argo-relay": - print(f"Staging UAN relay script under {profile.remote_root}/{REMOTE_RELAY_SUBPATH}...", flush=True) - relay_script = stage_relay_script(profile, remote_host, control_path) - print(f"Starting {profile.name} UAN relay through {remote_host}...", flush=True) - relay_process = start_relay(profile, remote_host, control_path, args, relay_port, args.relay_python or profile.venv_python, Path(f"/tmp/chemgraph-academy-{args.run_id}-relay.log"), relay_script) - relay_host = wait_relay(profile, remote_host, control_path, relay_port, relay_process, Path(f"/tmp/chemgraph-academy-{args.run_id}-relay.log")) - lm_base_url = f"http://{relay_host}:{relay_port}/argoapi/v1" if relay_host else str(args.lm_base_url) - print(f"Compute-node LM URL: {lm_base_url}", flush=True) - metadata = {"created_at": time.time(), "created_by": "chemgraph academy dashboard", "run_id": args.run_id, "system": profile.name, "campaign": args.campaign, "remote_run_dir": remote_run_dir, "remote_host": remote_host, "lm_connect": args.lm_connect, "lm_base_url": lm_base_url, "workspace_root": profile.remote_root, "chemgraph_repo_root": profile.repo_root} - if relay_host: - metadata.update({"relay_host": relay_host, "relay_port": relay_port}) - print(f"Writing run metadata: {remote_host}:{remote_run_dir}/dashboard_metadata.json", flush=True) - ssh(remote_host, f"mkdir -p {shlex.quote(remote_run_dir)} && cat > {shlex.quote(remote_run_dir + '/dashboard_metadata.json')}", control_path=control_path, input_text=json.dumps(metadata, indent=2) + "\n") - print("Starting rsync mirror:", flush=True) - print(f" {remote_host}:{remote_run_dir}/", flush=True) - print(f" {local_run_dir}/", 
flush=True) - start_rsync(remote_host, control_path, remote_run_dir, local_run_dir, args.rsync_interval_s, stop) - print("\nDashboard launcher is ready.\n", flush=True) - print(f"On the {profile.name} compute node, use:", flush=True) - print("\n".join(compute_lines(profile, wrapper_path, args.run_id, args.campaign)), flush=True) + for index, profile_name in enumerate(systems): + site = _setup_site( + profile_name=profile_name, + args=args, + local_run_root=local_run_root, + multi_site=multi_site, + site_index=index, + stop=stop, + ) + sites.append(site) + + print("\nDashboard launcher is ready.", flush=True) + if multi_site: + print(f"Federated mirror tree: {local_run_root}//", flush=True) + if args.no_dashboard: return 0 + print(f"\nStarting dashboard at http://{args.dashboard_host}:{args.dashboard_port}", flush=True) - print("Ctrl-C stops the local dashboard, rsync loop, and relay tunnel.", flush=True) - return serve_dashboard(run_dir=local_run_dir, host=args.dashboard_host, port=args.dashboard_port) + print("Ctrl-C stops the local dashboard, rsync loops, and relay tunnels.", flush=True) + return serve_dashboard( + run_dir=local_run_root, + host=args.dashboard_host, port=args.dashboard_port, + ) finally: stop.set() - if relay_process is not None and relay_process.poll() is None: - relay_process.terminate() - try: - relay_process.wait(timeout=5) - except subprocess.TimeoutExpired: - relay_process.kill() - if started_master and not args.keep_ssh_master: - ssh(remote_host, None, control_path=control_path, extra=["-O", "exit"], check=False, batch_mode=False) + for site in sites: + _teardown_site(site, keep_ssh_master=args.keep_ssh_master) + if __name__ == "__main__": raise SystemExit(main()) diff --git a/src/chemgraph/academy/runtime/exchange.py b/src/chemgraph/academy/runtime/exchange.py index 6a8b2b2..bc7ed37 100644 --- a/src/chemgraph/academy/runtime/exchange.py +++ b/src/chemgraph/academy/runtime/exchange.py @@ -7,6 +7,26 @@ from chemgraph.academy.core.campaign 
import ChemGraphDaemonConfig +SUPPORTED_EXCHANGE_TYPES: tuple[str, ...] = ('redis', 'local', 'hybrid', 'http') +"""All exchange types this module knows how to build. + +Used by the CLI to enforce ``--exchange-type`` choices and by tests +to assert the supported set stays in sync with the dispatch table +below. +""" + + +def exchange_uses_redis(exchange_type: str) -> bool: + """Return True when the exchange type requires a running Redis server. + + The compute launcher uses this to decide whether to start a Redis + subprocess on rank 0. Exchanges that don't talk to Redis (``local``, + ``http``) don't need one and skipping the subprocess avoids a port- + binding failure when Redis isn't installed on the compute node. + """ + return exchange_type in {'redis', 'hybrid'} + + def build_exchange_factory(config: ChemGraphDaemonConfig) -> Any: """Return the Academy exchange factory matching ``config.exchange_type``.""" exchange_type = config.exchange_type @@ -33,7 +53,25 @@ def build_exchange_factory(config: ChemGraphDaemonConfig) -> Any: namespace=config.redis_namespace, ) + if exchange_type == 'http': + # Academy's HTTP exchange. Passing url=None selects the + # hosted default (https://exchange.academy-agents.org/v1) + # with Globus Auth. The bearer token is read from + # $XDG_DATA_HOME/academy/storage.db -- the user (or the + # launcher's env-prep step) must have logged in already + # via the device flow before any agent constructs this. + # On Aurora compute nodes, http_proxy / https_proxy must be + # set to the ALCF proxy (http://proxy.alcf.anl.gov:3128) + # before the daemon starts; otherwise the first PUT will + # hang at the connection-timeout boundary. 
+ from academy.exchange.cloud import HttpExchangeFactory + + kwargs: dict[str, Any] = {} + if config.http_exchange_url: + kwargs['url'] = config.http_exchange_url + return HttpExchangeFactory(**kwargs) + raise ValueError( f"Unsupported exchange type {exchange_type!r}; expected one of " - "'redis', 'local', 'hybrid'.", + f"{sorted(SUPPORTED_EXCHANGE_TYPES)}.", ) diff --git a/src/chemgraph/academy/runtime/mpi.py b/src/chemgraph/academy/runtime/mpi.py index 7439f58..2ad3531 100644 --- a/src/chemgraph/academy/runtime/mpi.py +++ b/src/chemgraph/academy/runtime/mpi.py @@ -8,7 +8,6 @@ from typing import Any from chemgraph.academy.observability.event_log import EventLog -from chemgraph.academy.observability.run_files import write_json_atomic MPI_RANK_ENV = ( 'PMI_RANK', @@ -96,6 +95,7 @@ def placement_payload(config: Any, agent_name: str) -> dict[str, Any]: 'rank': config.rank, 'local_rank': config.local_rank, 'exchange_type': config.exchange_type, + 'http_exchange_url': config.http_exchange_url, 'redis_host': config.redis_host, 'redis_port': config.redis_port, 'redis_namespace': config.redis_namespace, diff --git a/src/chemgraph/academy/runtime/profiles/__init__.py b/src/chemgraph/academy/runtime/profiles/__init__.py index 2ead8a2..740d00e 100644 --- a/src/chemgraph/academy/runtime/profiles/__init__.py +++ b/src/chemgraph/academy/runtime/profiles/__init__.py @@ -7,6 +7,7 @@ BUILTIN_SYSTEM_PROFILES = { "aurora": "aurora.template.json", "polaris": "polaris.template.json", + "crux": "crux.template.json", } diff --git a/src/chemgraph/academy/runtime/profiles/aurora.template.json b/src/chemgraph/academy/runtime/profiles/aurora.template.json index 1e3e40a..921155a 100644 --- a/src/chemgraph/academy/runtime/profiles/aurora.template.json +++ b/src/chemgraph/academy/runtime/profiles/aurora.template.json @@ -1,6 +1,6 @@ { "name": "aurora", - "remote_host": "${ALCF_USER}@aurora.alcf.anl.gov", + "remote_host": "${ALCF_SSH_USER}@aurora.alcf.anl.gov", "remote_root": 
"/flare/${ALCF_PROJECT}/${ALCF_USER}", "repo_root": "/flare/${ALCF_PROJECT}/${ALCF_USER}/ChemGraph", "run_root": "/flare/${ALCF_PROJECT}/${ALCF_USER}/runs", diff --git a/src/chemgraph/academy/runtime/profiles/crux.template.json b/src/chemgraph/academy/runtime/profiles/crux.template.json new file mode 100644 index 0000000..810d2c8 --- /dev/null +++ b/src/chemgraph/academy/runtime/profiles/crux.template.json @@ -0,0 +1,36 @@ +{ + "name": "crux", + "remote_host": "${ALCF_SSH_USER}@crux.alcf.anl.gov", + "remote_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}", + "repo_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/ChemGraph", + "run_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/runs", + "relay_host_file": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/uan-relay-18187-crux.host", + "relay_port": 18187, + "venv_python": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/venvs/academy-swarm-crux/bin/python", + "redis_bin_dir": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/tools/redis-crux/bin", + "redis_port": 6392, + "redis_bind": "0.0.0.0", + "redis_protected_mode": "no", + "mpiexec": "mpiexec", + "pythonpath_entries": [ + "/eagle/${ALCF_PROJECT}/${ALCF_USER}/ChemGraph/src" + ], + "path_entries": [ + "/eagle/${ALCF_PROJECT}/${ALCF_USER}/tools/redis-crux/bin", + "/eagle/${ALCF_PROJECT}/${ALCF_USER}/bin" + ], + "env": { + "NUMEXPR_MAX_THREADS": "256", + "NUMEXPR_NUM_THREADS": "64", + "SETUPTOOLS_SCM_PRETEND_VERSION_FOR_ACADEMY_PY": "0.0.0+crux" + }, + "unset_env": [ + "http_proxy", + "HTTP_PROXY", + "https_proxy", + "HTTPS_PROXY", + "all_proxy", + "ALL_PROXY" + ], + "no_proxy": "127.0.0.1,localhost,.alcf.anl.gov,*.alcf.anl.gov" +} diff --git a/src/chemgraph/academy/runtime/profiles/polaris.template.json b/src/chemgraph/academy/runtime/profiles/polaris.template.json index 7be57c9..2af485c 100644 --- a/src/chemgraph/academy/runtime/profiles/polaris.template.json +++ b/src/chemgraph/academy/runtime/profiles/polaris.template.json @@ -1,6 +1,6 @@ { "name": "polaris", - "remote_host": 
"${ALCF_USER}@polaris.alcf.anl.gov", + "remote_host": "${ALCF_SSH_USER}@polaris.alcf.anl.gov", "remote_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}", "repo_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/ChemGraph", "run_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/runs", diff --git a/src/chemgraph/academy/runtime/profiles/system.py b/src/chemgraph/academy/runtime/profiles/system.py index 02ed6dc..0de3cd7 100644 --- a/src/chemgraph/academy/runtime/profiles/system.py +++ b/src/chemgraph/academy/runtime/profiles/system.py @@ -39,7 +39,16 @@ class SystemProfile(BaseModel): def load_system_profile(path_or_name: str | Path) -> SystemProfile: profile_path = resolve_builtin_system_profile(path_or_name) - text = os.path.expandvars(profile_path.read_text(encoding="utf-8")) + # Default ALCF_SSH_USER to ALCF_USER when unset. This separates the + # *SSH login* (used in ``remote_host``) from the *path component* + # (used everywhere else), which matters for accounts whose login + # differs from their workspace dir name -- e.g. login ``jinchuli`` + # but workspace under ``/flare/.../jinchu/``. Most users have one + # equal to the other and the default keeps their setup unchanged. + env = os.environ.copy() + if "ALCF_USER" in env and not env.get("ALCF_SSH_USER"): + env["ALCF_SSH_USER"] = env["ALCF_USER"] + text = _expand_with(profile_path.read_text(encoding="utf-8"), env) unresolved = sorted(set(re.findall(r"\$\{([^}]+)\}", text))) if unresolved: raise ValueError( @@ -48,3 +57,19 @@ def load_system_profile(path_or_name: str | Path) -> SystemProfile: ) data = json.loads(text) return SystemProfile.model_validate(data) + + +def _expand_with(text: str, env: dict[str, str]) -> str: + """``os.path.expandvars`` but reading from a caller-supplied env dict. + + The stdlib's ``expandvars`` always reads ``os.environ`` directly, + which means ``ALCF_SSH_USER`` defaulted to ``ALCF_USER`` only by + mutating the process environment. That'd leak the default into + every subsequent caller. 
Substituting via regex keeps the change + local. + """ + return re.sub( + r"\$\{([^}]+)\}", + lambda m: env.get(m.group(1), m.group(0)), + text, + ) diff --git a/src/chemgraph/academy/runtime/registration.py b/src/chemgraph/academy/runtime/registration.py index ef8823d..638a24c 100644 --- a/src/chemgraph/academy/runtime/registration.py +++ b/src/chemgraph/academy/runtime/registration.py @@ -1,122 +1,143 @@ +"""Peer-agent identity + readiness for federated ChemGraph campaigns. + +Cross-site peer rendezvous needs each rank to be able to address every +other site's agents without a shared filesystem. The first cut polled +``transport.discover(ChemGraphLogicalAgent)`` and filtered by +``AgentId.name``, which works on the local/redis/hybrid exchanges but +breaks on Academy's hosted HTTP exchange: ``discover()`` returns +``AgentId`` objects with ``name=None`` and ``role='agent'`` only. +Names round-trip through the server's mailbox state but are not +echoed back in the discovery response. + +The replacement: agree on a **deterministic UID** for every (run-id, +agent-name) pair. Both sites compute the same UID from the same +inputs, so each side knows the recipient's UID before either rank +boots. ``discover()`` is still useful as a liveness probe (matching +on UID, which IS preserved across the server round-trip) so a rank +can wait until its peers have actually registered before proceeding. + +Side effect of this scheme: agent names become campaign-scoped. Two +operators running the SAME ``federated-chat`` campaign concurrently +would clash on the same UIDs and crash the registration POST with +"mailbox already exists". The run-id is part of the UID namespace, +so as long as operators bump ``--run-id`` (federated-chat-001 vs +federated-chat-002) the UIDs differ and the campaigns don't see +each other. 
+""" from __future__ import annotations import asyncio -import json -import pathlib +import logging import time -from collections.abc import Mapping +import uuid +from collections.abc import Iterable from typing import Any -from academy.exchange.hybrid import HybridAgentRegistration -from academy.exchange.local import LocalAgentRegistration -from academy.exchange.redis import RedisAgentRegistration -from academy.exchange.transport import AgentRegistration +from academy.exchange.transport import ExchangeTransportT from academy.identifier import AgentId -from pydantic import BaseModel -from chemgraph.academy.observability.run_files import write_json_atomic +logger = logging.getLogger(__name__) -_REGISTRATION_TYPES: dict[str, type[BaseModel]] = { - 'local': LocalAgentRegistration, - 'hybrid': HybridAgentRegistration, - 'redis': RedisAgentRegistration, -} +# Stable namespace UUID used as the seed for uuid5 derivation. The +# value itself doesn't matter -- only that every site computes the +# same UID for the same (run_id, agent_name) pair. Bumping this +# constant would invalidate every running deployment, so don't. +_PEER_UID_NAMESPACE = uuid.UUID('1e7eda44-1b34-4f5a-b2a1-cf5ca5db8e8b') -def academy_registration_path(run_dir: pathlib.Path) -> pathlib.Path: - return run_dir / 'academy_registrations.json' +def deterministic_agent_uid(*, run_id: str, agent_name: str) -> uuid.UUID: + """Derive the AgentId.uid that every site will use for ``agent_name``. + Same inputs on Aurora and Crux ⇒ same UID. The recipient side + registers with this UID; the sender side constructs an + ``AgentId`` with the same UID locally and uses it to build a + ``Handle`` without ever calling ``discover()``. 
+ """ + return uuid.uuid5(_PEER_UID_NAMESPACE, f"{run_id}/{agent_name}") -def _exchange_type_of(registration: AgentRegistration[Any]) -> str: - value = getattr(registration, 'exchange_type', None) - if not isinstance(value, str): - raise TypeError( - f'Registration {type(registration).__name__} has no string ' - '`exchange_type` field; cannot persist.', - ) - return value - - -def registration_payload( - *, - run_token: str, - registrations: Mapping[str, AgentRegistration[Any]], -) -> dict[str, Any]: - if not registrations: - raise ValueError('at least one registration is required') - exchange_types = {_exchange_type_of(r) for r in registrations.values()} - if len(exchange_types) > 1: - raise ValueError( - f'mixed exchange types in one campaign: {sorted(exchange_types)}', - ) - (exchange_type,) = exchange_types - return { - 'run_token': run_token, - 'exchange_type': exchange_type, - 'agents': { - name: registration.agent_id.model_dump(mode='json') - for name, registration in registrations.items() - }, - } - -def write_academy_registrations( - *, - run_dir: pathlib.Path, - run_token: str, - registrations: Mapping[str, AgentRegistration[Any]], -) -> None: - write_json_atomic( - academy_registration_path(run_dir), - registration_payload(run_token=run_token, registrations=registrations), +def deterministic_agent_id(*, run_id: str, agent_name: str) -> AgentId[Any]: + """Construct the ``AgentId`` peers can reconstruct from name alone.""" + return AgentId( + uid=deterministic_agent_uid(run_id=run_id, agent_name=agent_name), + name=agent_name, + role='agent', ) -def load_academy_registrations( - run_dir: pathlib.Path, - *, - run_token: str, -) -> dict[str, AgentRegistration[Any]]: - path = academy_registration_path(run_dir) - data = json.loads(path.read_text(encoding='utf-8')) - if data.get('run_token') != run_token: - raise RuntimeError( - f'Academy registration file {path} belongs to a different run', - ) - exchange_type = data.get('exchange_type') - if exchange_type not 
in _REGISTRATION_TYPES: - raise RuntimeError( - f'Academy registration file has unsupported exchange_type ' - f'{exchange_type!r}; expected one of ' - f'{sorted(_REGISTRATION_TYPES)}', - ) - cls = _REGISTRATION_TYPES[exchange_type] - agents = data.get('agents') - if not isinstance(agents, dict): - raise RuntimeError(f'Academy registration file is malformed: {path}') - return { - name: cls(agent_id=AgentId[Any].model_validate(agent_id)) - for name, agent_id in agents.items() +async def register_agent_with_uid( + transport: ExchangeTransportT, + agent_class: type, + agent_id: AgentId[Any], +) -> AgentId[Any]: + """Register ``agent_id`` on the exchange, reusing the supplied UID. + + Bypasses ``transport.register_agent`` (which always calls + ``AgentId.new`` and generates a random UID) by POSTing directly + to the same mailbox endpoint with our pre-built AgentId. Returns + the same AgentId on success so callers can hand it to Runtime. + """ + # Reach into the transport for the same session + URL the SDK uses. + # The shape mirrors HttpExchangeTransport.register_agent exactly, + # we just swap the auto-generated AgentId for the deterministic one. + session = transport._session + mailbox_url = transport._mailbox_url + payload = { + 'mailbox': agent_id.model_dump_json(), + 'agent': ','.join(agent_class._agent_mro()), } + async with session.post(mailbox_url, json=payload) as response: + # _raise_for_status is what the SDK uses; reach in for it too. 
+ from academy.exchange.cloud.client import _raise_for_status + _raise_for_status(response, transport.mailbox_id, agent_id) + return agent_id -async def wait_academy_registrations( - run_dir: pathlib.Path, +async def wait_for_peers_alive( + transport: ExchangeTransportT, + peer_ids: Iterable[AgentId[Any]], *, - run_token: str, + agent_class: type, timeout_s: float, -) -> dict[str, AgentRegistration[Any]]: - path = academy_registration_path(run_dir) + poll_interval_s: float = 1.0, +) -> None: + """Block until every peer in ``peer_ids`` is visible to ``discover()``. + + UID-based matching: ``discover()`` strips names but preserves + UIDs, so we filter the discover response by UID and wait until + every expected peer's mailbox shows up. If ``peer_ids`` is empty + (single-agent or self-only slice), return immediately. + + Raises ``TimeoutError`` listing the missing peers' UIDs after + ``timeout_s`` so the operator can correlate with their other + site's launch logs. + """ + wanted = {peer.uid: peer for peer in peer_ids} + if not wanted: + return + seen: set[uuid.UUID] = set() deadline = time.monotonic() + timeout_s while True: - if path.exists(): - return load_academy_registrations( - run_dir, - run_token=run_token, - ) - if time.monotonic() > deadline: + agent_ids = await transport.discover(agent_class) + for aid in agent_ids: + if aid.uid in wanted: + seen.add(aid.uid) + missing = set(wanted).difference(seen) + if not missing: + return + if time.monotonic() >= deadline: + missing_descs = sorted(f'{wanted[u].name}({u})' for u in missing) raise TimeoutError( - f'Timed out waiting for Academy registrations at {path}', + f'Timed out after {timeout_s:.1f}s waiting for peer agents ' + f'to register on the exchange: missing={missing_descs}. 
' + f'Confirm every site of the federated campaign has started ' + f'and that all sites are using the same --run-id (the run-id ' + f'is part of the UID namespace; mismatches make the peers ' + f'invisible to each other).', ) - await asyncio.sleep(0.25) + logger.debug( + 'wait_for_peers_alive: missing %d (%s); sleeping %.1fs', + len(missing), sorted(missing), poll_interval_s, + ) + await asyncio.sleep(poll_interval_s) diff --git a/src/chemgraph/academy/runtime/templates/start_relay.sh b/src/chemgraph/academy/runtime/templates/start_relay.sh index 1bb9e5f..2e93356 100644 --- a/src/chemgraph/academy/runtime/templates/start_relay.sh +++ b/src/chemgraph/academy/runtime/templates/start_relay.sh @@ -1,5 +1,5 @@ #!/bin/bash -set -euo pipefail +set -uo pipefail REMOTE_ROOT="$1" RELAY_SCRIPT="$2" @@ -10,16 +10,97 @@ RELAY_PORT="$6" REVERSE_PORT="$7" RELAY_PYTHON="$8" +# Trace every line so the local relay log captures exactly which step +# fails. Without xtrace, a bash failure (e.g. cd, redirect) produces +# zero diagnostics and the launcher just reports "Local relay log: +# ". The output goes through the SSH stdout pipe so it lands +# in the Mac-side relay log -- no remote-side tail needed. +exec 2>&1 +echo "[start_relay] uan=$(hostname -f) port=${RELAY_PORT} reverse=${REVERSE_PORT} pid=$$" +set -x + cd "${REMOTE_ROOT}" UAN_HOST="$(hostname -f)" +UAN_SHORT="$(hostname -s)" printf '%s\n' "${UAN_HOST}" > "${RELAY_HOST_FILE}" -if [ -f "${RELAY_PID_FILE}" ]; then - OLD_PID="$(cat "${RELAY_PID_FILE}" 2>/dev/null || true)" - case "${OLD_PID}" in - ''|*[!0-9]*) ;; - *) kill "${OLD_PID}" 2>/dev/null || true ;; - esac +# Pid bookkeeping has to be per-UAN. Aurora's login alias round-robins +# across uan-0001..uan-0010, the shared filesystem is the same, but a +# pid only means something on the UAN that holds the process. Without +# per-host scoping, the launcher would happily kill the wrong pid (or +# none at all) on a sibling UAN and leave an orphan holding the port. 
+PER_HOST_PID_FILE="${RELAY_PID_FILE%.pid}.${UAN_SHORT}.pid" + +find_orphan_pids() { + # Match python processes that carry the relay script path as one + # of their argv elements. Using `comm=python` + an exact argv match is far + # more precise than `pgrep -f "${RELAY_SCRIPT}"`, which would also match this + # very bash script (because the path is in OUR argv too) and the + # subsequent `pgrep` invocation itself -- killing them all and + # taking the whole start_relay session down with them. That bug + # produced the silent "log shows kill then nothing" + # failure mode. + local self_pid="$$" + local parent_pid="${PPID:-0}" + ps -u "${USER}" -o pid=,comm=,args= 2>/dev/null \ + | awk -v rs="${RELAY_SCRIPT}" -v me="${self_pid}" -v pp="${parent_pid}" ' + $1 == me || $1 == pp { next } + $2 ~ /python/ { + for (i = 3; i <= NF; i++) if ($i == rs) { print $1; next } + } + ' +} + +kill_local_orphans() { + # Kill prior relay processes on THIS UAN. Scope: only python + # processes that have the relay script as an argv element, owned + # by us, excluding our own pid/ppid. + local pids + pids="$(find_orphan_pids)" + if [ -n "${pids}" ]; then + echo "[start_relay] killing prior relay pids on $(hostname -s): ${pids}" + # shellcheck disable=SC2086 + kill ${pids} 2>/dev/null || true + sleep 1 + pids="$(find_orphan_pids)" + if [ -n "${pids}" ]; then + echo "[start_relay] forcing kill -9 on stubborn pids: ${pids}" + # shellcheck disable=SC2086 + kill -9 ${pids} 2>/dev/null || true + sleep 1 + fi + fi + # Also try the per-host pid file in case the process was renamed or + # something matched a previous launch's bookkeeping that the ps + # scan didn't see. Best-effort. + if [ -f "${PER_HOST_PID_FILE}" ]; then + local old_pid + old_pid="$(cat "${PER_HOST_PID_FILE}" 2>/dev/null || true)" + case "${old_pid}" in + ''|*[!0-9]*) ;; + *) + # Don't kill ourselves or our parent even if a stale file + # happens to record our pid (shouldn't happen, but cheap).
+ if [ "${old_pid}" != "$$" ] && [ "${old_pid}" != "${PPID:-0}" ]; then + kill "${old_pid}" 2>/dev/null || true + fi + ;; + esac + fi +} + +kill_local_orphans + +# After reaping local orphans, fail fast and clearly if the port is +# still held -- it means another user (or another UAN's process via +# some unusual route) owns it and we can't take it over. +if command -v ss >/dev/null 2>&1; then + if ss -tln 2>/dev/null | awk '{print $4}' | grep -qE "[:.]${RELAY_PORT}\$"; then + echo "ERROR: port ${RELAY_PORT} on ${UAN_SHORT} is still in use after" >&2 + echo " cleaning up our own relays. Inspect with: ss -tlnp | grep ${RELAY_PORT}" >&2 + ss -tlnp 2>/dev/null | grep -E "[:.]${RELAY_PORT}\\b" >&2 || true + exit 1 + fi fi "${RELAY_PYTHON}" "${RELAY_SCRIPT}" \ @@ -29,10 +110,14 @@ fi --target-port "${REVERSE_PORT}" \ > "${RELAY_LOG_FILE}" 2>&1 & RELAY_PID="$!" +printf '%s\n' "${RELAY_PID}" > "${PER_HOST_PID_FILE}" +# Also write the legacy pid path so older launchers / debugging scripts +# that look for the bare (not per-host) ${RELAY_PID_FILE} see *something* sensible.
printf '%s\n' "${RELAY_PID}" > "${RELAY_PID_FILE}" cleanup_remote() { kill "${RELAY_PID}" 2>/dev/null || true + rm -f "${PER_HOST_PID_FILE}" 2>/dev/null || true } trap cleanup_remote EXIT diff --git a/src/chemgraph/academy/runtime/templates/uan_http_relay.py b/src/chemgraph/academy/runtime/templates/uan_http_relay.py index 8ce424f..442492e 100644 --- a/src/chemgraph/academy/runtime/templates/uan_http_relay.py +++ b/src/chemgraph/academy/runtime/templates/uan_http_relay.py @@ -19,7 +19,9 @@ from __future__ import annotations import argparse +import signal import socket +import sys import threading @@ -72,17 +74,36 @@ def parse_args() -> argparse.Namespace: def main() -> int: args = parse_args() - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server: - server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server.bind((args.listen_host, args.listen_port)) - server.listen(128) - print( - f'relay listening on {args.listen_host}:{args.listen_port} ' - f'-> {args.target_host}:{args.target_port}', - flush=True, - ) + server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server.bind((args.listen_host, args.listen_port)) + server.listen(128) + print( + f'relay listening on {args.listen_host}:{args.listen_port} ' + f'-> {args.target_host}:{args.target_port}', + flush=True, + ) + + def shutdown(_signo: int, _frame: object) -> None: + # Closing the listen socket inside the handler interrupts + # server.accept() with EBADF / OSError, which we catch below + # to fall through to a clean exit. Without this the relay + # ignores SIGTERM (default action) and orphans the port. 
+ try: + server.close() + except OSError: + pass + sys.exit(0) + + signal.signal(signal.SIGTERM, shutdown) + signal.signal(signal.SIGINT, shutdown) + + try: while True: - client, addr = server.accept() + try: + client, addr = server.accept() + except OSError: + break print(f'accepted connection from {addr[0]}:{addr[1]}', flush=True) thread = threading.Thread( target=handle_client, @@ -90,6 +111,12 @@ def main() -> int: daemon=True, ) thread.start() + finally: + try: + server.close() + except OSError: + pass + return 0 if __name__ == '__main__': diff --git a/src/chemgraph/agent/turn.py b/src/chemgraph/agent/turn.py index e565213..61a98b5 100644 --- a/src/chemgraph/agent/turn.py +++ b/src/chemgraph/agent/turn.py @@ -161,6 +161,14 @@ def _custom_openai_compatible_kwargs( user = argo_user or os.getenv("ARGO_USER") if base_url and "argoapi" in base_url and user: kwargs["model_kwargs"] = {"user": user} + # GPT-5* / o-series reject any non-default temperature + sampling + # knobs. Drop them so the request payload matches what the model + # accepts. Import is local to avoid an import cycle with + # chemgraph.models.openai which itself imports langchain_openai. + from chemgraph.models.openai import is_reasoning_model + if is_reasoning_model(model_name): + for k in ("temperature", "top_p", "frequency_penalty", "presence_penalty"): + kwargs.pop(k, None) return kwargs diff --git a/src/chemgraph/cli/main.py b/src/chemgraph/cli/main.py index 788d5aa..0c97c29 100644 --- a/src/chemgraph/cli/main.py +++ b/src/chemgraph/cli/main.py @@ -284,6 +284,38 @@ def create_argument_parser() -> argparse.ArgumentParser: help="Arguments forwarded to chemgraph.academy.runtime.compute_launcher.", ) + spawn_site_parser = academy_sub.add_parser( + "spawn-site", + help=( + "Launch one site of a federated ChemGraph Academy campaign. " + "Like run-compute but only spawns the agent slice given via " + "--agents and skips internal bootstrap (use the 'bootstrap' " + "subcommand once every site is up)." 
+ ), + ) + spawn_site_parser.add_argument( + "spawn_site_args", + nargs=argparse.REMAINDER, + help=( + "Arguments forwarded to chemgraph.academy.runtime.compute_launcher. " + "--agents is required; --no-bootstrap is auto-added." + ), + ) + + bootstrap_parser = academy_sub.add_parser( + "bootstrap", + help=( + "Dispatch the campaign bootstrap message to the initial agent " + "via the exchange. Run after every site of a federated " + "campaign is up; the recipient is discovered by name." + ), + ) + bootstrap_parser.add_argument( + "bootstrap_args", + nargs=argparse.REMAINDER, + help="Arguments forwarded to chemgraph.academy.runtime.bootstrap.", + ) + dashboard_parser = academy_sub.add_parser( "dashboard", help="Start the local dashboard launcher for a ChemGraph Academy run.", @@ -625,6 +657,27 @@ def _handle_academy(args: argparse.Namespace) -> None: if code: sys.exit(code) return + if command == "spawn-site": + from chemgraph.academy.runtime.compute_launcher import main as compute_main + + # spawn-site = run-compute with --no-bootstrap forced on. The + # compute_launcher's argparse tolerates a repeated --no-bootstrap, + # but we still prepend it only when the operator has not already + # passed it, so the launcher never adds a duplicate flag.
+ forwarded = _strip_remainder_separator(args.spawn_site_args) + if "--no-bootstrap" not in forwarded: + forwarded = ["--no-bootstrap", *forwarded] + code = compute_main(forwarded) + if code: + sys.exit(code) + return + if command == "bootstrap": + from chemgraph.academy.runtime.bootstrap import main as bootstrap_main + + code = bootstrap_main(_strip_remainder_separator(args.bootstrap_args)) + if code: + sys.exit(code) + return if command == "campaigns": from chemgraph.academy.campaigns import list_campaigns @@ -633,7 +686,7 @@ def _handle_academy(args: argparse.Namespace) -> None: return console.print( "Usage: chemgraph academy " - "{mpi-daemon,run-compute,dashboard,campaigns}.", + "{mpi-daemon,run-compute,spawn-site,bootstrap,dashboard,campaigns}.", ) diff --git a/src/chemgraph/models/openai.py b/src/chemgraph/models/openai.py index bb33d38..230ffb2 100644 --- a/src/chemgraph/models/openai.py +++ b/src/chemgraph/models/openai.py @@ -157,6 +157,36 @@ def _is_local_http_endpoint(base_url: str | None) -> bool: } +# Reasoning-model detection. These models (GPT-5*, o1/o3/o4*) reject +# any non-default ``temperature`` (only 1 is supported) and similarly +# reject ``top_p`` / ``frequency_penalty`` / ``presence_penalty``. +# The Argo shim passes these through to OpenAI with the same +# constraint, so the LLM construction sites must drop the field +# entirely from the request payload. Match is case-insensitive and +# covers "GPT-5", "GPT-5-mini", "GPT-5.1" ... "GPT-5.5", "o1", "o3", +# "o3-mini", "o4-mini", their argo: variants, and the hosted-wire +# short forms ("gpt5", "gpt5mini", "gpto3mini"). +_REASONING_MODEL_PREFIXES: tuple[str, ...] = ("gpt-5", "o1", "o3", "o4") + + +def is_reasoning_model(model_name: str | None) -> bool: + """Return True when ``model_name`` is an OpenAI reasoning model. + + See module-level note above for rationale. 
+ """ + if not model_name: + return False + name = model_name.strip().lower().removeprefix("argo:") + if name.startswith("gpto"): + return True # gpto1, gpto3mini, gpto4mini hosted-wire forms + if name.startswith("gpt5"): + return True # gpt5, gpt5mini, gpt5nano hosted-wire forms + return any( + name == p or name.startswith(p + "-") or name.startswith(p + ".") + for p in _REASONING_MODEL_PREFIXES + ) + + def load_openai_model( model_name: str, temperature: float, @@ -264,15 +294,25 @@ def load_openai_model( logger.info( "Using Argo user from config/ARGO_USER/default: %s", argo_user ) + if is_reasoning_model(model_name): + # Reasoning models (GPT-5*, o-series) reject any non- + # default temperature + sampling knobs. Drop them at + # construction time so the request payload matches what + # the model accepts. + for k in ("temperature", "top_p", "frequency_penalty", "presence_penalty"): + llm_kwargs.pop(k, None) llm = ChatOpenAI(**llm_kwargs) else: logger.info(f"Loading OpenAI model: {model_name}") - llm = ChatOpenAI( + openai_kwargs = dict( model=model_name, temperature=temperature, api_key=api_key, max_tokens=6000, ) + if is_reasoning_model(model_name): + openai_kwargs.pop("temperature", None) + llm = ChatOpenAI(**openai_kwargs) # Authentication happens only during invocation. 
logger.info(f"Requested model: {model_name}") logger.info("OpenAI model loaded successfully") diff --git a/tests/test_academy_bootstrap.py b/tests/test_academy_bootstrap.py new file mode 100644 index 0000000..3074b1b --- /dev/null +++ b/tests/test_academy_bootstrap.py @@ -0,0 +1,243 @@ +from __future__ import annotations + +import asyncio +from typing import Any + +import pytest +from academy.identifier import AgentId + +from chemgraph.academy.runtime import bootstrap +from chemgraph.academy.runtime.registration import deterministic_agent_id + + +# --------------------------------------------------------------------------- +# parse_args -- CLI surface +# --------------------------------------------------------------------------- + + +def test_parse_args_requires_campaign() -> None: + """``--campaign`` is one of the two required fields. Bootstrap is + useless without knowing which campaign's user_task to send.""" + with pytest.raises(SystemExit): + bootstrap.parse_args(['--run-id', 'r-001']) + + +def test_parse_args_requires_run_id() -> None: + """``--run-id`` is required because the recipient's mailbox UID + is derived deterministically from (run_id, agent_name). Without + it the bootstrap would address a different mailbox than the + spawn-site invocations registered.""" + with pytest.raises(SystemExit): + bootstrap.parse_args(['--campaign', 'mace-ensemble-screening-20']) + + +def test_parse_args_defaults_exchange_type_to_http() -> None: + """Federated bootstrap is the main use case so http is the right + default. Operators on single-machine runs can override but they + rarely need this command at all (run-compute auto-bootstraps).""" + args = bootstrap.parse_args([ + '--campaign', 'mace-ensemble-screening-20', + '--run-id', 'r-001', + ]) + assert args.exchange_type == 'http' + assert args.recipient is None # defaults to campaign.initial_agent + # discover-timeout-s default now matches spawn-site's 600s. 
+ assert args.discover_timeout_s == pytest.approx(600.0) + + +def test_parse_args_accepts_recipient_override() -> None: + """Operator should be able to bootstrap a non-initial agent for + e.g. partial re-runs or debugging.""" + args = bootstrap.parse_args([ + '--campaign', 'foo.jsonc', + '--run-id', 'r-001', + '--recipient', 'worker-a', + ]) + assert args.recipient == 'worker-a' + + +# --------------------------------------------------------------------------- +# dispatch_bootstrap -- the core async path +# +# The hosted HttpExchange strips AgentId.name from discover() responses, +# so our fake transport returns UID-only AgentIds to mirror that. The +# bootstrap path constructs the recipient AgentId deterministically +# from (run_id, recipient_name) -- no name lookup happens. +# --------------------------------------------------------------------------- + + +class _FakeTransport: + """``transport.discover()`` returns a fixed agent list.""" + def __init__(self, agents): + self._agents = tuple(agents) + + async def discover(self, agent_class): # noqa: ARG002 - sig match only + return self._agents + + +class _FakeClient: + """Async-context-manager stand-in for academy's UserExchangeClient. + + The real client's __aenter__ sets a contextvar that Handle.action + reads to find the outbound exchange. We don't reproduce that + contextvar plumbing in the fake -- the test monkeypatches + ``bootstrap.Handle`` with a recording stand-in that bypasses the + contextvar lookup entirely -- but we DO support the + async-with shape so the bootstrap code path runs unchanged. 
+ """ + def __init__(self, transport): + self._transport = transport + self.enter_count = 0 + self.exit_count = 0 + + async def __aenter__(self): + self.enter_count += 1 + return self + + async def __aexit__(self, exc_type, exc, tb): + self.exit_count += 1 + return False + + +class _FakeFactory: + def __init__(self, client): + self._client = client + + async def create_user_client(self, *, name, start_listener): # noqa: ARG002 + return self._client + + +class _FakeCampaign: + """Minimal stand-in for ChemGraphCampaign with just what + ``campaign_bootstrap_text`` reads. Avoids the full file-load path.""" + def __init__(self, user_task: str = 'do the thing'): + self.user_task = user_task + self.initial_agent = 'coordinator-agent' + self.agents = () + self.resources = {} + + +def _seen_agent_id(name: str, run_id: str) -> AgentId[Any]: + """Mirror what the hosted exchange returns from discover(): + deterministic UID, but with the name stripped to None.""" + aid = deterministic_agent_id(run_id=run_id, agent_name=name) + return AgentId(uid=aid.uid, name=None, role='agent') + + +def test_dispatch_bootstrap_sends_one_message_to_deterministic_recipient( + monkeypatch, +) -> None: + """Happy path: recipient's mailbox is visible on the exchange, + the wait succeeds, one Handle.action call gets made. 
The + recipient AgentId is built deterministically from (run_id, + recipient_name), NOT discovered by name.""" + run_id = 'demo-001' + seen = _seen_agent_id('coordinator-agent', run_id) + transport = _FakeTransport(agents=[ + seen, + _seen_agent_id('some-other-campaign-agent', 'unrelated'), + ]) + client = _FakeClient(transport) + factory = _FakeFactory(client) + + sent: list[tuple[Any, str, dict]] = [] + + class _RecordingHandle: + def __init__(self, agent_id): + self._agent_id = agent_id + + async def action(self, name, message): + sent.append((self._agent_id, name, message)) + + monkeypatch.setattr(bootstrap, 'Handle', _RecordingHandle) + + message_id = asyncio.run( + bootstrap.dispatch_bootstrap( + campaign=_FakeCampaign(), + run_id=run_id, + recipient='coordinator-agent', + exchange_factory=factory, + discover_timeout_s=1.0, + ), + ) + + assert len(sent) == 1 + agent_id, action_name, message = sent[0] + # Handle is built with the DETERMINISTIC AgentId -- same UID as + # what the recipient daemon registered, so the message routes to + # the right mailbox. + expected_uid = deterministic_agent_id( + run_id=run_id, agent_name='coordinator-agent', + ).uid + assert agent_id.uid == expected_uid + assert action_name == 'receive_message' + assert message['recipient'] == 'coordinator-agent' + assert message['sender'] == 'campaign' + assert message['message_id'] == message_id + assert 'do the thing' in message['content'] + # Client entered as async-context-manager (which sets the exchange + # context the Handle needs) and exited cleanly. The __aexit__ + # closes the aiohttp session backing the http exchange transport. 
+ assert client.enter_count == 1 + assert client.exit_count == 1 + + +def test_dispatch_bootstrap_closes_client_on_recipient_timeout( + monkeypatch, +) -> None: + """If the recipient's mailbox never appears on the exchange the + helper must raise TimeoutError -- AND the client must still be + closed so we don't leak the underlying network resources.""" + # Transport returns SOME unrelated agent but not our recipient. + transport = _FakeTransport(agents=[ + _seen_agent_id('not-our-recipient', 'unrelated-run'), + ]) + client = _FakeClient(transport) + factory = _FakeFactory(client) + + monkeypatch.setattr( + bootstrap, 'Handle', + lambda agent_id: pytest.fail("Handle must not be built on timeout"), + ) + + with pytest.raises(TimeoutError): + asyncio.run( + bootstrap.dispatch_bootstrap( + campaign=_FakeCampaign(), + run_id='demo-001', + recipient='coordinator-agent', + exchange_factory=factory, + discover_timeout_s=0.05, + ), + ) + # async-with __aexit__ runs even on the error path -- the aiohttp + # session is released regardless of whether dispatch succeeded. + assert client.enter_count == 1 + assert client.exit_count == 1 + + +# --------------------------------------------------------------------------- +# main() -- end-to-end exit codes +# --------------------------------------------------------------------------- + + +def test_main_returns_2_on_recipient_timeout(monkeypatch, capsys) -> None: + """Operators need a non-zero exit so wrapping shell scripts know + bootstrap didn't actually happen. The stderr message should be the + TimeoutError's text (which names the missing recipient).""" + async def _raise(*args, **kwargs): + raise TimeoutError("Timed out ... 
missing=['coordinator-agent']") + monkeypatch.setattr(bootstrap, 'dispatch_bootstrap', _raise) + monkeypatch.setattr(bootstrap, 'load_campaign', + lambda path: _FakeCampaign()) + monkeypatch.setattr(bootstrap, 'build_exchange_factory', + lambda config: None) + + code = bootstrap.main([ + '--campaign', 'mace-ensemble-screening-20', + '--run-id', 'demo-001', + '--exchange-type', 'local', + ]) + assert code == 2 + err = capsys.readouterr().err + assert 'coordinator-agent' in err diff --git a/tests/test_academy_campaign.py b/tests/test_academy_campaign.py index 1fa2956..af1218c 100644 --- a/tests/test_academy_campaign.py +++ b/tests/test_academy_campaign.py @@ -358,3 +358,107 @@ def test_validate_campaign_rejects_allowed_tools_without_servers(tmp_path) -> No match="allowed_tools but no mcp_servers", ): validate_campaign(campaign, 1) + + +# --------------------------------------------------------------------------- +# Phase B.1: filter_agents + parse_agents_selection (federated spawn-site) +# --------------------------------------------------------------------------- + + +def test_parse_agents_selection_handles_trimming_and_empty_segments() -> None: + """The CLI's ``--agents`` value passes through this helper before + reaching the daemon. Tolerate whitespace + trailing commas so + operators don't get bitten by shell-quoting quirks.""" + from chemgraph.academy.core.campaign import parse_agents_selection + assert parse_agents_selection(None) == () + assert parse_agents_selection("") == () + assert parse_agents_selection("worker-a") == ("worker-a",) + assert parse_agents_selection(" worker-a , worker-b ") == ("worker-a", "worker-b") + assert parse_agents_selection("worker-a,,worker-b,") == ("worker-a", "worker-b") + + +def test_filter_agents_returns_slice_in_caller_order() -> None: + """MPI rank-to-agent mapping must match the order the operator + asked for. 
Don't accidentally re-sort or use the campaign's + declaration order.""" + from chemgraph.academy.core.campaign import filter_agents, load_campaign + campaign = load_campaign("mace-ensemble-screening-20") + selected = filter_agents(campaign, ["mace-agent", "structure-agent-a"]) + assert [a.name for a in selected.agents] == ["mace-agent", "structure-agent-a"] + # initial_agent is intentionally NOT rewritten -- in the federated + # flow it may name an agent hosted on another site. + assert selected.initial_agent == campaign.initial_agent + # Untouched campaign fields stay intact. + assert selected.resources == campaign.resources + assert selected.mcp_servers == campaign.mcp_servers + + +def test_filter_agents_rejects_unknown_names() -> None: + """An unknown name almost certainly indicates an operator typo or + a campaign-file/CLI drift. Fail closed.""" + from chemgraph.academy.core.campaign import filter_agents, load_campaign + campaign = load_campaign("mace-ensemble-screening-20") + with pytest.raises(RuntimeError, match="not declared on campaign"): + filter_agents(campaign, ["mace-agent", "no-such-agent"]) + + +def test_filter_agents_rejects_empty_selection() -> None: + """A zero-length slice means 'launch nothing,' which is never what + the operator means. The launcher should never even construct an + empty selection (parse_agents_selection returns () on no input, + and the launcher short-circuits on empty), but the helper itself + must still fail closed if reached.""" + from chemgraph.academy.core.campaign import filter_agents, load_campaign + campaign = load_campaign("mace-ensemble-screening-20") + with pytest.raises(RuntimeError, match="at least one agent"): + filter_agents(campaign, []) + + +def test_filter_agents_rejects_duplicate_names() -> None: + """Duplicates would shadow each other in the post-filter campaign + and silently confuse the rank-to-agent mapping. 
Fail closed.""" + from chemgraph.academy.core.campaign import filter_agents, load_campaign + campaign = load_campaign("mace-ensemble-screening-20") + with pytest.raises(RuntimeError, match="duplicate agent names"): + filter_agents(campaign, ["mace-agent", "mace-agent"]) + + +def test_validate_campaign_federated_loosens_cross_site_peer_check() -> None: + """In a federated spawn-site slice, allowed_peers / initial_agent + may legitimately reference agents owned by another site. Strict + validation (the default) rejects those; ``federated=True`` lets + them through because the daemon will discover those peers via the + exchange at runtime instead of from this slice's agent list.""" + from chemgraph.academy.core.campaign import ( + filter_agents, load_campaign, validate_campaign, + ) + campaign = load_campaign("federated-chat") + slice_aurora = filter_agents(campaign, ["agent-aurora"]) + + # Strict validation rejects the cross-site peer reference. + with pytest.raises(RuntimeError, match="unknown allowed peers"): + validate_campaign(slice_aurora, agent_count=1) + + # federated=True accepts it. 
+ validate_campaign(slice_aurora, agent_count=1, federated=True) + + +def test_validate_campaign_federated_still_rejects_self_peer() -> None: + """The 'agent must not list itself as a peer' invariant is local + to the slice and stays a hard error even in federated mode -- + self-peering would loop messages back to the sender, regardless + of how many sites the campaign spans.""" + from chemgraph.academy.core.campaign import ( + ChemGraphAgentSpec, ChemGraphCampaign, validate_campaign, + ) + import pathlib + bad = ChemGraphCampaign( + run_id="r", user_task="t", initial_agent="a", + prompt_profile=pathlib.Path("p"), + agents=(ChemGraphAgentSpec( + name="a", role="r", mission="m", + allowed_peers=("a",), # <-- self-peer + ),), + ) + with pytest.raises(RuntimeError, match="must not list itself as a peer"): + validate_campaign(bad, agent_count=1, federated=True) diff --git a/tests/test_academy_compute_launcher.py b/tests/test_academy_compute_launcher.py index d73098f..bae9540 100644 --- a/tests/test_academy_compute_launcher.py +++ b/tests/test_academy_compute_launcher.py @@ -62,3 +62,307 @@ def test_run_allocation_builds_single_mpiexec_command(tmp_path, monkeypatch) -> assert "--exchange-type" in cmd assert "--chemgraph-repo-root" in cmd assert (tmp_path / "launch_command.txt").exists() + + +# --------------------------------------------------------------------------- +# Phase B.1: --exchange-type http + cross-HPC plumbing +# --------------------------------------------------------------------------- + + +def _plan_http(tmp_path: Path, *, http_exchange_url: str | None = None) -> AllocationPlan: + base = _plan(tmp_path) + import dataclasses + return dataclasses.replace( + base, + exchange_type="http", + http_exchange_url=http_exchange_url, + ) + + +def test_run_allocation_with_http_exchange_does_not_start_redis( + tmp_path, monkeypatch, +) -> None: + """When the exchange doesn't talk to Redis (``http``, ``local``), + rank 0 must NOT start a redis-server subprocess. 
Otherwise compute + nodes without redis-server installed fail at launch, and nodes with + it pointlessly bind a port we never use.""" + started_subprocess: list[list[str]] = [] + + def fake_popen(cmd, **kwargs): # pragma: no cover - exercised via assert below + started_subprocess.append(list(cmd)) + raise AssertionError( + f"Popen should not be called for http exchange; got {cmd!r}", + ) + + monkeypatch.setattr(compute_launcher.subprocess, "Popen", fake_popen) + # wait_redis is the other Redis-touching site; assert it's not called. + def boom(*args, **kwargs): + raise AssertionError("wait_redis should not run for http exchange") + monkeypatch.setattr(compute_launcher, "wait_redis", boom) + monkeypatch.setattr( + compute_launcher.subprocess, + "call", + lambda cmd: 0, + ) + + plan = _plan_http(tmp_path) + # start_redis is True by default; verify the http-exchange code path + # still skips Redis. This is the "operator forgot --no-start-redis" + # case, which used to fail loudly on nodes without redis-server. + import dataclasses + plan = dataclasses.replace(plan, start_redis=True) + assert compute_launcher.run_allocation(plan) == 0 + assert started_subprocess == [] + + +def test_run_allocation_forwards_http_exchange_url_when_set( + tmp_path, monkeypatch, +) -> None: + """``--http-exchange-url`` (operator override for a self-hosted + exchange) must flow into the daemon's argv. Otherwise the daemon + silently falls back to the hosted default.""" + calls: list[list[str]] = [] + monkeypatch.setattr(compute_launcher, "wait_redis", lambda *a, **k: None) + monkeypatch.setattr( + compute_launcher.subprocess, + "call", + lambda cmd: calls.append(cmd) or 0, + ) + + custom = "https://my-private-exchange.example.com/v1" + plan = _plan_http(tmp_path, http_exchange_url=custom) + assert compute_launcher.run_allocation(plan) == 0 + + cmd = calls[0] + assert "--http-exchange-url" in cmd + assert custom in cmd + # Sanity: also confirm --exchange-type http rode along. 
+ type_idx = cmd.index("--exchange-type") + assert cmd[type_idx + 1] == "http" + + +def test_run_allocation_omits_http_exchange_url_flag_when_unset( + tmp_path, monkeypatch, +) -> None: + """When no override is given, the daemon argv must NOT carry an + empty ``--http-exchange-url`` (which argparse would happily parse + as a literal empty-string URL and pass to HttpExchangeFactory).""" + calls: list[list[str]] = [] + monkeypatch.setattr(compute_launcher, "wait_redis", lambda *a, **k: None) + monkeypatch.setattr( + compute_launcher.subprocess, + "call", + lambda cmd: calls.append(cmd) or 0, + ) + + plan = _plan_http(tmp_path, http_exchange_url=None) + assert compute_launcher.run_allocation(plan) == 0 + + cmd = calls[0] + assert "--http-exchange-url" not in cmd + + +# --------------------------------------------------------------------------- +# Phase B.1: agent subsetting + spawn-site --no-bootstrap forwarding +# --------------------------------------------------------------------------- + + +def _plan_subset( + tmp_path: Path, + *, + agents: tuple[str, ...], + skip_bootstrap: bool = True, +) -> AllocationPlan: + """An AllocationPlan that mimics what ``spawn-site`` would build.""" + import dataclasses + base = _plan(tmp_path) + return dataclasses.replace( + base, + agent_count=len(agents), # spawn-site derives count from slice + agents=agents, + skip_bootstrap=skip_bootstrap, + ) + + +def test_run_allocation_forwards_agents_flag_when_slice_given( + tmp_path, monkeypatch, +) -> None: + """When ``plan.agents`` is non-empty the daemon must receive + ``--agents worker-a,worker-b``, otherwise it would launch the + full campaign on every rank index and the rank-to-agent mapping + would diverge across sites.""" + calls: list[list[str]] = [] + monkeypatch.setattr(compute_launcher, "wait_redis", lambda *a, **k: None) + monkeypatch.setattr( + compute_launcher.subprocess, + "call", + lambda cmd: calls.append(cmd) or 0, + ) + + plan = _plan_subset(tmp_path, agents=("worker-a", 
"worker-b")) + assert compute_launcher.run_allocation(plan) == 0 + + cmd = calls[0] + assert "--agents" in cmd + idx = cmd.index("--agents") + assert cmd[idx + 1] == "worker-a,worker-b" + # Slice length must drive mpiexec -n so rank ordering matches the + # daemon's post-filter view of campaign.agents. + assert cmd[: cmd.index("--ppn") + 2] == ["mpiexec", "-n", "2", "--ppn", "1"] + + +def test_run_allocation_omits_agents_flag_for_single_machine_runs( + tmp_path, monkeypatch, +) -> None: + """The single-machine ``run-compute`` flow leaves ``plan.agents`` + empty so the daemon falls back to its launch-everything default. + A spurious ``--agents`` flag here would cause subsetting to fail + closed (``filter_agents`` rejects unknown names).""" + calls: list[list[str]] = [] + monkeypatch.setattr(compute_launcher, "wait_redis", lambda *a, **k: None) + monkeypatch.setattr( + compute_launcher.subprocess, + "call", + lambda cmd: calls.append(cmd) or 0, + ) + + assert compute_launcher.run_allocation(_plan(tmp_path)) == 0 + + cmd = calls[0] + assert "--agents" not in cmd + + +def test_run_allocation_forwards_no_bootstrap_when_requested( + tmp_path, monkeypatch, +) -> None: + """``spawn-site`` sets ``plan.skip_bootstrap=True`` because kickoff + must be deferred until every federated site is up. 
The launcher + must propagate this -- otherwise rank 0 dispatches the bootstrap + locally and the campaign starts before remote agents have + registered on the exchange.""" + calls: list[list[str]] = [] + monkeypatch.setattr(compute_launcher, "wait_redis", lambda *a, **k: None) + monkeypatch.setattr( + compute_launcher.subprocess, + "call", + lambda cmd: calls.append(cmd) or 0, + ) + + plan = _plan_subset(tmp_path, agents=("worker-a",), skip_bootstrap=True) + assert compute_launcher.run_allocation(plan) == 0 + assert "--no-bootstrap" in calls[0] + + +def test_run_allocation_omits_no_bootstrap_for_single_machine_runs( + tmp_path, monkeypatch, +) -> None: + """``run-compute`` keeps its inline bootstrap so the + single-machine UX doesn't regress -- the flag must be absent + when ``plan.skip_bootstrap`` is False.""" + calls: list[list[str]] = [] + monkeypatch.setattr(compute_launcher, "wait_redis", lambda *a, **k: None) + monkeypatch.setattr( + compute_launcher.subprocess, + "call", + lambda cmd: calls.append(cmd) or 0, + ) + + assert compute_launcher.run_allocation(_plan(tmp_path)) == 0 + assert "--no-bootstrap" not in calls[0] + + +def test_prepare_compute_launch_derives_agent_count_from_agents( + tmp_path, monkeypatch, +) -> None: + """When ``--agents worker-a,worker-b`` is given the launcher must + derive agent_count=2 from the slice length. 
An operator who also + passes ``--agent-count`` that disagrees should hit a loud error + -- silent precedence would let the two values drift, and the + daemon's MPI -n would not match its post-filter agent count.""" + import argparse + args = argparse.Namespace( + run_id="r", campaign="mace-ensemble-screening-20", run_dir=None, + lm_base_url="http://stub:0/v1", relay_host=None, lm_model=None, lm_user=None, + max_tokens=None, agents_per_node=None, max_decisions=None, + redis_port=None, exchange_type="local", http_exchange_url=None, + no_start_redis=True, system="aurora", + agents="structure-agent-a,mace-agent", + no_bootstrap=True, + agent_count=None, + ) + # Avoid the actual aurora profile load (we'd need ALCF_USER set, + # the campaign template, etc). Stub the prep helpers that touch + # the filesystem. + monkeypatch.setattr(compute_launcher, "load_system_profile", + lambda name: _stub_profile(tmp_path)) + monkeypatch.setattr(compute_launcher, "_prepare_environment", + lambda profile, *, exchange_type: None) + monkeypatch.setattr(compute_launcher, "_load_dashboard_metadata", + lambda run_dir: {}) + monkeypatch.setattr(compute_launcher, "_write_lm_config", + lambda **kw: tmp_path / "lm.json") + monkeypatch.setattr(compute_launcher, "_export_workflow_lm_environment", + lambda lm_config: None) + + plan = compute_launcher.prepare_compute_launch(args) + assert plan.agent_count == 2 + assert plan.agents == ("structure-agent-a", "mace-agent") + assert plan.skip_bootstrap is True + + +def test_prepare_compute_launch_rejects_disagreeing_agent_count( + tmp_path, monkeypatch, +) -> None: + """Disagreeing ``--agent-count`` + ``--agents`` is a footgun: + silent precedence would let the operator think they were + launching 3 agents when only 2 ranks actually fire. 
Refuse loudly.""" + import argparse + import pytest + args = argparse.Namespace( + run_id="r", campaign="mace-ensemble-screening-20", run_dir=None, + lm_base_url="http://stub:0/v1", relay_host=None, lm_model=None, lm_user=None, + max_tokens=None, agents_per_node=None, max_decisions=None, + redis_port=None, exchange_type="local", http_exchange_url=None, + no_start_redis=True, system="aurora", + agents="structure-agent-a,mace-agent", + no_bootstrap=True, + agent_count=3, # mismatched -- 2 names but operator says 3 + ) + monkeypatch.setattr(compute_launcher, "load_system_profile", + lambda name: _stub_profile(tmp_path)) + monkeypatch.setattr(compute_launcher, "_prepare_environment", + lambda profile, *, exchange_type: None) + monkeypatch.setattr(compute_launcher, "_load_dashboard_metadata", + lambda run_dir: {}) + monkeypatch.setattr(compute_launcher, "_write_lm_config", + lambda **kw: tmp_path / "lm.json") + monkeypatch.setattr(compute_launcher, "_export_workflow_lm_environment", + lambda lm_config: None) + + with pytest.raises(RuntimeError, match="contradicts --agents"): + compute_launcher.prepare_compute_launch(args) + + +def _stub_profile(tmp_path: Path): + """Minimal SystemProfile-shaped stub for prepare_compute_launch tests.""" + from chemgraph.academy.runtime.profiles.system import SystemProfile + return SystemProfile( + name="aurora", + remote_host="jinchuli@aurora", + remote_root=str(tmp_path), + repo_root=str(tmp_path / "ChemGraph"), + run_root=str(tmp_path / "runs"), + relay_host_file=str(tmp_path / "relay.host"), + relay_port=18186, + venv_python=str(tmp_path / "venv/bin/python"), + redis_bin_dir=str(tmp_path / "redis/bin"), + redis_port=6392, + redis_bind="0.0.0.0", + redis_protected_mode="no", + mpiexec="mpiexec", + pythonpath_entries=[], + path_entries=[], + env={}, + unset_env=[], + no_proxy="127.0.0.1,localhost", + ) diff --git a/tests/test_academy_dashboard.py b/tests/test_academy_dashboard.py index 0abec32..be4259d 100644 --- 
a/tests/test_academy_dashboard.py +++ b/tests/test_academy_dashboard.py @@ -144,3 +144,228 @@ def test_dashboard_ignores_legacy_trace_jsonl(tmp_path) -> None: ) assert dashboard.events_payload(run_dir)["events"] == [] + + +# --------------------------------------------------------------------------- +# B.4c: federated dashboard (multi-site subdir layout) +# --------------------------------------------------------------------------- + + +def _seed_site(site_dir, *, status_timestamp, events) -> None: + """Write a minimal per-site mirror: status.json + events.jsonl. + + Touches an empty events.jsonl even when ``events`` is empty so the + federated-dashboard detector recognizes the dir as a real site + (EventLog itself only creates the file on first emit, which is + too late for the iterator's existence check). + """ + site_dir.mkdir(parents=True) + (site_dir / "status.json").write_text( + json.dumps({"mode": "mpi_daemon", "timestamp": status_timestamp, "agents": []}) + + "\n", + encoding="utf-8", + ) + (site_dir / "events.jsonl").touch() + log = EventLog(site_dir / "events.jsonl") + for event_name, payload in events: + log.emit( + event_name, + agent_id=payload["agent_id"], + role="observer", + payload=payload, + ) + + +def test_events_payload_merges_sites_and_tags_each_event(tmp_path) -> None: + """Federated dashboard's core promise: pointed at a parent dir with + per-site subdirs, ``events_payload`` returns a single timestamp- + sorted stream where each event carries a ``site`` field. 
UI uses + that field to color-code per-site events in the merged view.""" + root = tmp_path / "federated-run" + _seed_site( + root / "aurora", + status_timestamp=10.0, + events=[ + ("agent_started", { + "agent_id": "agent-00", "role": "observer", + "placement": {"hostname": "aur1", "short_hostname": "aur1"}, + "hostname": "aur1", "short_hostname": "aur1", + }), + ], + ) + _seed_site( + root / "crux", + status_timestamp=20.0, + events=[ + ("agent_started", { + "agent_id": "agent-01", "role": "observer", + "placement": {"hostname": "crux1", "short_hostname": "crux1"}, + "hostname": "crux1", "short_hostname": "crux1", + }), + ], + ) + + payload = dashboard.events_payload(root) + + assert {e["site"] for e in payload["events"]} == {"aurora", "crux"} + # Both sites' agents are visible in the merged stream. + agents_by_site = {e["site"]: e["agent_id"] for e in payload["events"]} + assert agents_by_site == {"aurora": "agent-00", "crux": "agent-01"} + + +def test_events_payload_sorts_merged_stream_by_timestamp(tmp_path) -> None: + """Per-site clocks don't have to agree, but the merged dashboard + view must be readable -- order by event timestamp regardless of + which site emitted each.""" + root = tmp_path / "federated-run" + + # Sites are seeded in reverse-time order; the merge must still + # produce timestamp-ascending output. 
+ aurora_dir = root / "aurora" + aurora_dir.mkdir(parents=True) + (aurora_dir / "status.json").write_text("{}", encoding="utf-8") + aurora_log = EventLog(aurora_dir / "events.jsonl") + aurora_log.emit("agent_started", agent_id="ag-aur", role="r", payload={ + "agent_id": "ag-aur", "role": "r", + "placement": {"hostname": "h"}, "hostname": "h", + }) + + crux_dir = root / "crux" + crux_dir.mkdir(parents=True) + (crux_dir / "status.json").write_text("{}", encoding="utf-8") + crux_log = EventLog(crux_dir / "events.jsonl") + crux_log.emit("agent_started", agent_id="ag-crux", role="r", payload={ + "agent_id": "ag-crux", "role": "r", + "placement": {"hostname": "h"}, "hostname": "h", + }) + + events = dashboard.events_payload(root)["events"] + timestamps = [e["timestamp"] for e in events] + assert timestamps == sorted(timestamps) + + +def test_status_payload_nests_under_sites_for_federated_layout(tmp_path) -> None: + """Single-site clients use ``payload['status']`` / + ``payload['summary']`` etc. directly. Federated clients want a + ``sites: {<site>: {...}}`` shape so the UI can render per-site + sub-panels. Pin the structural difference so a future "make them + uniform" refactor must be a conscious choice.""" + root = tmp_path / "federated-run" + _seed_site( + root / "aurora", + status_timestamp=10.0, + events=[], + ) + _seed_site( + root / "crux", + status_timestamp=15.0, + events=[], + ) + + class Handler: + pass + handler = Handler() + handler.run_dir = root + + payload = dashboard.status_payload(handler) + assert "sites" in payload + assert set(payload["sites"]) == {"aurora", "crux"} + for site_name, site_payload in payload["sites"].items(): + assert "status" in site_payload + assert "summary" in site_payload + assert "placement" in site_payload + # Top-level 'updated' reflects the latest per-site update so the + # dashboard header has a meaningful timestamp. 
+ assert payload["updated"] == 15.0 + + +def test_status_payload_keeps_legacy_shape_for_single_site(tmp_path) -> None: + """Existing single-site dashboard clients must see exactly the + pre-federation payload shape. The federated nesting only kicks in + when ``events.jsonl`` is absent at the top level.""" + root = tmp_path / "single-run" + _seed_site( + root, + status_timestamp=10.0, + events=[], + ) + + class Handler: + pass + handler = Handler() + handler.run_dir = root + + payload = dashboard.status_payload(handler) + # Single-site keys, no ``sites`` nesting. + assert "sites" not in payload + assert set(payload) == { + "placement", "run_dir", "schema", "status", "summary", "updated", + } + + +def test_iter_site_dirs_recognizes_metadata_only_sites(tmp_path) -> None: + """A site that's started but hasn't emitted any events yet still + has a ``dashboard_metadata.json`` written by the launcher. The + iterator must recognize it so a federated dashboard doesn't + briefly look like 'empty single-site' during startup.""" + from chemgraph.academy.dashboard.server import _iter_site_dirs + + root = tmp_path / "early-startup" + (root / "aurora").mkdir(parents=True) + (root / "aurora" / "dashboard_metadata.json").write_text("{}", encoding="utf-8") + (root / "crux").mkdir(parents=True) + (root / "crux" / "dashboard_metadata.json").write_text("{}", encoding="utf-8") + + sites = _iter_site_dirs(root) + assert {name for name, _ in sites} == {"aurora", "crux"} + + +def test_iter_site_dirs_falls_back_to_single_site_when_empty(tmp_path) -> None: + """Just-created run dir with neither events.jsonl nor recognizable + subdirs: behave as single-site so the dashboard renders an + empty-but-valid view instead of erroring out.""" + from chemgraph.academy.dashboard.server import _iter_site_dirs + + root = tmp_path / "brand-new" + root.mkdir() + sites = _iter_site_dirs(root) + assert sites == [(None, root)] + + +# --------------------------------------------------------------------------- 
+# Multi-site launcher argument parsing +# --------------------------------------------------------------------------- + + +def test_parse_systems_list_accepts_single_name() -> None: + """Single-site invocation is the legacy case; tuple-of-one keeps + the rest of the launcher uniform.""" + from chemgraph.academy.runtime.dashboard_launcher import _parse_systems_list + assert _parse_systems_list("aurora") == ("aurora",) + + +def test_parse_systems_list_accepts_comma_list_and_trims() -> None: + """The federated UX. Whitespace-tolerant for paste-from-doc.""" + from chemgraph.academy.runtime.dashboard_launcher import _parse_systems_list + assert _parse_systems_list(" aurora , crux ") == ("aurora", "crux") + assert _parse_systems_list("aurora,crux,") == ("aurora", "crux") + + +def test_parse_systems_list_rejects_empty() -> None: + """Operator typo or unexpected expansion -- fail at argparse-resolve + time with a clean message.""" + import argparse + from chemgraph.academy.runtime.dashboard_launcher import _parse_systems_list + with pytest.raises(argparse.ArgumentTypeError, match="at least one"): + _parse_systems_list("") + with pytest.raises(argparse.ArgumentTypeError, match="at least one"): + _parse_systems_list(",") + + +def test_parse_systems_list_rejects_duplicates() -> None: + """Listing the same site twice would set up duplicate tunnels + + rsync threads racing on the same mirror dir. 
Fail closed.""" + import argparse + from chemgraph.academy.runtime.dashboard_launcher import _parse_systems_list + with pytest.raises(argparse.ArgumentTypeError, match="duplicate"): + _parse_systems_list("aurora,crux,aurora") diff --git a/tests/test_academy_exchange_registration.py b/tests/test_academy_exchange_registration.py index 0c1f95c..582706a 100644 --- a/tests/test_academy_exchange_registration.py +++ b/tests/test_academy_exchange_registration.py @@ -1,26 +1,34 @@ from __future__ import annotations +import asyncio from pathlib import Path +from types import SimpleNamespace +from typing import Any import pytest # Skip when the optional 'academy' extra is absent; this module -# imports academy.exchange.* directly at top level. +# imports academy.* directly at top level. pytest.importorskip("academy") -from academy.exchange.hybrid import HybridAgentRegistration -from academy.exchange.local import LocalAgentRegistration -from academy.exchange.redis import RedisAgentRegistration from academy.identifier import AgentId +from academy.exchange.cloud.client import DEFAULT_EXCHANGE_URL from chemgraph.academy.core.campaign import ChemGraphDaemonConfig from chemgraph.academy.runtime.exchange import build_exchange_factory -from chemgraph.academy.runtime.registration import load_academy_registrations -from chemgraph.academy.runtime.registration import registration_payload -from chemgraph.academy.runtime.registration import write_academy_registrations +from chemgraph.academy.runtime.exchange import exchange_uses_redis +from chemgraph.academy.runtime.exchange import SUPPORTED_EXCHANGE_TYPES +from chemgraph.academy.runtime.registration import deterministic_agent_id +from chemgraph.academy.runtime.registration import deterministic_agent_uid +from chemgraph.academy.runtime.registration import wait_for_peers_alive -def _config(tmp_path: Path, exchange_type: str) -> ChemGraphDaemonConfig: +def _config( + tmp_path: Path, + exchange_type: str, + *, + http_exchange_url: str | None 
= None, +) -> ChemGraphDaemonConfig: return ChemGraphDaemonConfig( run_dir=tmp_path, run_token='token-1', @@ -40,22 +48,45 @@ def _config(tmp_path: Path, exchange_type: str) -> ChemGraphDaemonConfig: local_rank=0, chemgraph_repo_root=tmp_path, exchange_type=exchange_type, + http_exchange_url=http_exchange_url, ) +class HttpExchangeFactory: + def __init__(self, url: str = DEFAULT_EXCHANGE_URL, **kwargs: Any) -> None: + self._info = SimpleNamespace(url=url) + self.kwargs = kwargs + + +def _stub_http_exchange_factory(monkeypatch: pytest.MonkeyPatch) -> None: + import academy.exchange.cloud as cloud + + monkeypatch.setattr(cloud, 'HttpExchangeFactory', HttpExchangeFactory) + + +# --------------------------------------------------------------------------- +# Exchange factory dispatch +# --------------------------------------------------------------------------- + + @pytest.mark.parametrize( ('exchange_type', 'expected_class'), [ ('redis', 'RedisExchangeFactory'), ('local', 'LocalExchangeFactory'), ('hybrid', 'HybridExchangeFactory'), + ('http', 'HttpExchangeFactory'), ], ) def test_build_exchange_factory_dispatches_by_config( tmp_path, exchange_type, expected_class, + monkeypatch, ) -> None: + if exchange_type == 'http': + _stub_http_exchange_factory(monkeypatch) + factory = build_exchange_factory(_config(tmp_path, exchange_type)) assert type(factory).__name__ == expected_class @@ -66,46 +97,214 @@ def test_build_exchange_factory_rejects_unknown_exchange(tmp_path) -> None: build_exchange_factory(_config(tmp_path, 'bad')) -@pytest.mark.parametrize( - 'registration_cls', - [ - RedisAgentRegistration, - LocalAgentRegistration, - HybridAgentRegistration, - ], -) -def test_academy_registration_round_trips_by_exchange_type( +def test_supported_exchange_types_exposes_full_dispatch_table() -> None: + """SUPPORTED_EXCHANGE_TYPES drives both the CLI ``choices`` argument + on ``compute_launcher.parse_args`` and ``daemon.parse_args``. 
If the + set drifts from what ``build_exchange_factory`` actually handles, + the CLI happily accepts a value that then raises at runtime.""" + assert set(SUPPORTED_EXCHANGE_TYPES) == {'redis', 'local', 'hybrid', 'http'} + + +def test_exchange_uses_redis_helper_matches_dispatch_table() -> None: + """The compute launcher uses this helper to decide whether to start a + Redis subprocess on rank 0. Pin the answers for every supported type + so adding a new exchange forces a conscious yes/no decision here.""" + assert exchange_uses_redis('redis') is True + assert exchange_uses_redis('hybrid') is True + assert exchange_uses_redis('local') is False + assert exchange_uses_redis('http') is False + + +def test_http_exchange_factory_uses_hosted_default_when_url_omitted( tmp_path, - registration_cls, + monkeypatch, ) -> None: - registration = registration_cls(agent_id=AgentId.new('agent-a')) - write_academy_registrations( - run_dir=tmp_path, - run_token='token-1', - registrations={'agent-a': registration}, + """A ``None`` ``http_exchange_url`` must select Academy's hosted + default (https://exchange.academy-agents.org/v1). This is the path + every cross-HPC campaign takes unless the operator stands up a + self-hosted exchange.""" + _stub_http_exchange_factory(monkeypatch) + factory = build_exchange_factory(_config(tmp_path, 'http')) + + # Upstream stores connection details on factory._info; reach into + # it to make sure we hand off the URL we mean to. + assert factory._info.url == DEFAULT_EXCHANGE_URL + + +def test_http_exchange_factory_honors_custom_url(tmp_path, monkeypatch) -> None: + """Operators must be able to point at a self-hosted HTTP exchange + server (``python -m academy.exchange.cloud``). 
This is the escape + hatch when the public Academy server is unavailable or undesired.""" + custom = 'https://my-private-exchange.example.com/v1' + _stub_http_exchange_factory(monkeypatch) + factory = build_exchange_factory( + _config(tmp_path, 'http', http_exchange_url=custom), + ) + + assert factory._info.url == custom + + +# --------------------------------------------------------------------------- +# Deterministic peer identity + wait_for_peers_alive +# +# The hosted HttpExchange strips AgentId.name from discover() responses +# (only ``uid`` and ``role`` round-trip). Name-based discovery was +# silently never finding any peer across sites. The replacement: derive +# each peer's mailbox UID deterministically from (run_id, agent_name) +# so every site can construct the same AgentId locally without needing +# discover() to echo the name back. discover() stays useful as a +# liveness probe (matching on UID, which IS preserved). +# --------------------------------------------------------------------------- + + +class _FakeTransport: + """Minimal ``transport.discover()`` stand-in for the discovery tests. + + Configure with a list of "rounds"; each call to ``discover()`` + returns (and consumes) one round. After the configured rounds run + out the last one keeps being returned, so timeout tests can assert + 'never converged'. + """ + + def __init__(self, rounds: list[list[AgentId[Any]]]) -> None: + self._rounds = rounds + self._calls = 0 + + async def discover(self, agent_class): # noqa: ARG002 - sig match only + index = min(self._calls, len(self._rounds) - 1) + self._calls += 1 + return tuple(self._rounds[index]) + + +def test_deterministic_agent_uid_is_stable() -> None: + """Same (run_id, agent_name) inputs must produce the same UID, + every call, on every machine. 
This is the load-bearing invariant + of the federated rendezvous: Aurora and Crux compute the same + UID locally and addressing works without any shared lookup.""" + a = deterministic_agent_uid(run_id='r-001', agent_name='worker') + b = deterministic_agent_uid(run_id='r-001', agent_name='worker') + assert a == b + + +def test_deterministic_agent_uid_differs_by_run_id() -> None: + """Different run-ids must yield different UIDs so two operators + running the SAME campaign with different --run-ids don't collide + on the same mailboxes.""" + a = deterministic_agent_uid(run_id='r-001', agent_name='worker') + b = deterministic_agent_uid(run_id='r-002', agent_name='worker') + assert a != b + + +def test_deterministic_agent_uid_differs_by_agent_name() -> None: + """Different agent names within the same run must yield different + UIDs so per-agent mailboxes don't collide.""" + a = deterministic_agent_uid(run_id='r-001', agent_name='worker-a') + b = deterministic_agent_uid(run_id='r-001', agent_name='worker-b') + assert a != b + + +def test_deterministic_agent_id_preserves_name_locally() -> None: + """The AgentId we build for our OWN registration carries the + name so it shows up in trace events; the name is only stripped + when the AgentId is round-tripped through the hosted exchange's + discover() response.""" + aid = deterministic_agent_id(run_id='r-001', agent_name='worker-a') + assert aid.name == 'worker-a' + assert aid.uid == deterministic_agent_uid( + run_id='r-001', agent_name='worker-a', ) - loaded = load_academy_registrations(tmp_path, run_token='token-1') - assert isinstance(loaded['agent-a'], registration_cls) - assert loaded['agent-a'].agent_id == registration.agent_id +def test_wait_for_peers_alive_returns_immediately_for_empty_list() -> None: + """When the local agent has no allowed_peers the helper short- + circuits -- it must not poll the exchange unnecessarily.""" + transport = _FakeTransport(rounds=[[]]) + asyncio.run( + wait_for_peers_alive( + transport, 
[], agent_class=object, timeout_s=0.01, + ), + ) + assert transport._calls == 0 + +def test_wait_for_peers_alive_succeeds_when_all_uids_present() -> None: + """Happy path: every expected peer's mailbox is on the exchange. + Match by UID (the field discover() preserves), not name.""" + a = deterministic_agent_id(run_id='r-001', agent_name='worker-a') + b = deterministic_agent_id(run_id='r-001', agent_name='worker-b') + # Simulate what the hosted exchange actually returns: AgentIds + # with the right UID but name stripped. Tests would have caught + # the original bug if they'd matched this shape. + a_seen = AgentId(uid=a.uid, name=None, role='agent') + b_seen = AgentId(uid=b.uid, name=None, role='agent') + transport = _FakeTransport(rounds=[[a_seen, b_seen]]) + asyncio.run( + wait_for_peers_alive( + transport, [a, b], agent_class=object, timeout_s=1.0, + ), + ) -def test_registration_payload_rejects_mixed_exchange_types() -> None: - with pytest.raises(ValueError, match='mixed exchange types'): - registration_payload( - run_token='token-1', - registrations={ - 'redis-agent': RedisAgentRegistration( - agent_id=AgentId.new('redis-agent'), - ), - 'local-agent': LocalAgentRegistration( - agent_id=AgentId.new('local-agent'), - ), - }, + +def test_wait_for_peers_alive_waits_across_polls_for_late_peer() -> None: + """The federated convergence story: bring sites up in any order; + the wait keeps polling and unblocks the moment all UIDs are seen.""" + a = deterministic_agent_id(run_id='r-001', agent_name='worker-a') + b = deterministic_agent_id(run_id='r-001', agent_name='worker-b') + a_seen = AgentId(uid=a.uid, name=None, role='agent') + b_seen = AgentId(uid=b.uid, name=None, role='agent') + rounds = [ + [a_seen], # poll 1: only A visible + [a_seen], # poll 2: still waiting for B + [a_seen, b_seen], # poll 3: B comes up + ] + transport = _FakeTransport(rounds=rounds) + asyncio.run( + wait_for_peers_alive( + transport, [a, b], + agent_class=object, + timeout_s=2.0, + 
poll_interval_s=0.01, + ), + ) + + +def test_wait_for_peers_alive_times_out_naming_missing_uids() -> None: + """When a remote site never registers, raise with a message + naming the missing peers (name + uid). Operators reading the + log can correlate with the missing site's launch logs.""" + a = deterministic_agent_id(run_id='r-001', agent_name='worker-a') + missing = deterministic_agent_id(run_id='r-001', agent_name='no-such-peer') + a_seen = AgentId(uid=a.uid, name=None, role='agent') + transport = _FakeTransport(rounds=[[a_seen]]) + with pytest.raises(TimeoutError, match='no-such-peer'): + asyncio.run( + wait_for_peers_alive( + transport, [a, missing], + agent_class=object, + timeout_s=0.05, + poll_interval_s=0.01, + ), ) -def test_registration_payload_rejects_empty_registrations() -> None: - with pytest.raises(ValueError, match='at least one registration'): - registration_payload(run_token='token-1', registrations={}) +def test_wait_for_peers_alive_ignores_unrelated_agents_with_same_class() -> None: + """The hosted exchange returns every ChemGraphLogicalAgent registered + across all operators / campaigns. The wait must filter strictly by + UID and not get confused by other operators' agents.""" + a = deterministic_agent_id(run_id='r-001', agent_name='worker-a') + a_seen = AgentId(uid=a.uid, name=None, role='agent') + # Lots of noise from other operators / runs: + noise = [ + AgentId.new('stranger-1'), + AgentId.new('stranger-2'), + AgentId.new('stranger-3'), + ] + transport = _FakeTransport(rounds=[noise + [a_seen]]) + asyncio.run( + wait_for_peers_alive( + transport, [a], + agent_class=object, + timeout_s=1.0, + poll_interval_s=0.01, + ), + )