From 103d7168689820b05f8ddd43f86d6e79b9d98b17 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Thu, 18 Jun 2026 15:03:57 -0500 Subject: [PATCH 01/25] chore(.gitignore): ignore *.private-local.* (local scratchpads) Mirrors the pattern already used on academy-synth-topology. Allows local journals (e.g. symlinked from ~/.config/chemgraph-journals/) to coexist in the repo without ever being staged. --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 91e70e1..5098b1f 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,7 @@ vib*.traj # Kubernetes secrets (keep secrets.yaml.template, ignore actual secrets) k8s/secrets.yaml + +# Local private notes / scratchpads — anything matching *.private-local.* stays untracked +*.private-local.* +*.private-local From 2bffa2a6c6d1f864d88f4f1d4452656c264f4ec2 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 08:08:30 -0500 Subject: [PATCH 02/25] feat(academy/runtime): add http exchange for cross-HPC messaging Wire Academy's HTTP exchange (default URL: Academy-hosted https://exchange.academy-agents.org/v1, Globus-Auth gated) as a fourth exchange type alongside redis/local/hybrid. Validated end-to-end on an Aurora compute node running example-002: 5 agents register against the hosted exchange, coordinator receives bootstrap, LM traffic flows through the existing UAN relay. This is the first time a ChemGraph Academy campaign has run on Aurora without Redis as the messaging substrate, and the technical groundwork for cross-HPC (e.g. Mac<->Aurora<->Polaris) campaigns. Plumbing - runtime/exchange.py: SUPPORTED_EXCHANGE_TYPES constant covers ('redis', 'local', 'hybrid', 'http') so CLI choices and dispatch table can't drift. New 'http' branch constructs HttpExchangeFactory with optional override URL. exchange_uses_redis() helper lets the launcher gate the rank-0 Redis subprocess without inlining the set. 
- core/campaign.py: ChemGraphDaemonConfig.http_exchange_url field (None = use Academy-hosted default). - runtime/registration.py: HttpAgentRegistration added to the _REGISTRATION_TYPES dispatch so per-rank registration files can round-trip through disk for the http exchange. - runtime/daemon.py, runtime/mpi.py: matching --exchange-type choices, --http-exchange-url flag, observability snapshot. Aurora-specific compute_launcher.py fixes - _prepare_environment: do NOT strip http_proxy/https_proxy from os.environ when exchange_type=='http'. Aurora's profile lists those in unset_env for the LM relay path (loopback 127.0.0.1) which is correct for redis runs but breaks http exchange. Without this fix the parent Python had no proxy vars so the --genv flags never got populated, and ranks couldn't reach the public internet. - mpiexec cmd: append --genvall plus explicit --genv KEY=VAL pairs for proxy vars when exchange_type=='http'. PALS's documented --genvall default empirically did not forward our parent env; explicit per-var flags were required. - run_allocation: skip rank-0 redis-server subprocess for any exchange that doesn't need Redis (was inline 'in {redis,hybrid}', now uses exchange_uses_redis helper). Tests (19 passing across the two suites) - exchange dispatch parametrized over all four types - SUPPORTED_EXCHANGE_TYPES integrity vs the dispatch table - exchange_uses_redis answers pinned per type - HttpExchangeFactory built with hosted default when url is None, with custom URL when provided - HttpAgentRegistration round-trips through write/load - run_allocation skips Redis subprocess for http exchange - --http-exchange-url forwarded to daemon argv when set, omitted when None - compute_launcher tests pass with the new env-prep signature Operator prerequisites for --exchange-type http on Aurora - Globus token cached at ~/.local/share/academy/storage.db (run any HttpExchangeFactory() once interactively to log in via Globus). 
- http_proxy / https_proxy set to the ALCF proxy (http://proxy.alcf.anl.gov:3128) before invoking 'chemgraph academy run-compute'. - ALCF_USER set to the *workspace* username (e.g. jinchu), which may differ from the SSH login (e.g. jinchuli). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/core/campaign.py | 7 ++ .../academy/runtime/compute_launcher.py | 68 ++++++++++++- src/chemgraph/academy/runtime/daemon.py | 12 ++- src/chemgraph/academy/runtime/exchange.py | 40 +++++++- src/chemgraph/academy/runtime/mpi.py | 1 + src/chemgraph/academy/runtime/registration.py | 2 + tests/test_academy_compute_launcher.py | 98 +++++++++++++++++++ tests/test_academy_exchange_registration.py | 58 ++++++++++- 8 files changed, 279 insertions(+), 7 deletions(-) diff --git a/src/chemgraph/academy/core/campaign.py b/src/chemgraph/academy/core/campaign.py index b87a80d..87299e6 100644 --- a/src/chemgraph/academy/core/campaign.py +++ b/src/chemgraph/academy/core/campaign.py @@ -153,6 +153,13 @@ class ChemGraphDaemonConfig: local_rank: int | None chemgraph_repo_root: pathlib.Path exchange_type: str = 'redis' + # URL of an HTTP exchange server when exchange_type == 'http'. + # ``None`` selects the Academy-hosted default + # (https://exchange.academy-agents.org/v1), which is gated by + # Globus Auth and uses the bearer token cached at + # ``$XDG_DATA_HOME/academy/storage.db``. Set this for a + # self-hosted ``python -m academy.exchange.cloud`` server. 
+ http_exchange_url: str | None = None def namespace_for_run(run_dir: pathlib.Path) -> str: diff --git a/src/chemgraph/academy/runtime/compute_launcher.py b/src/chemgraph/academy/runtime/compute_launcher.py index 3ba9ad4..1647b7c 100644 --- a/src/chemgraph/academy/runtime/compute_launcher.py +++ b/src/chemgraph/academy/runtime/compute_launcher.py @@ -15,6 +15,8 @@ from chemgraph.academy.campaigns import campaign_launch_defaults from chemgraph.academy.campaigns import resolve_campaign from chemgraph.academy.campaigns import resolve_lm_config_template +from chemgraph.academy.runtime.exchange import SUPPORTED_EXCHANGE_TYPES +from chemgraph.academy.runtime.exchange import exchange_uses_redis from chemgraph.academy.runtime.profiles import list_builtin_system_profiles from chemgraph.academy.runtime.profiles import load_system_profile from chemgraph.academy.runtime.profiles.system import SystemProfile @@ -48,6 +50,7 @@ class AllocationPlan: mpiexec: str chemgraph_repo_root: Path exchange_type: str = "redis" + http_exchange_url: str | None = None def parse_args(argv: list[str] | None = None) -> argparse.Namespace: @@ -79,8 +82,23 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: parser.add_argument("--redis-port", type=int) parser.add_argument( "--exchange-type", - choices=("redis", "local", "hybrid"), + choices=SUPPORTED_EXCHANGE_TYPES, default="redis", + help=( + "Academy exchange backend. 'http' targets an HTTP exchange " + "server (Academy-hosted default unless --http-exchange-url " + "is given) and reaches across HPC sites; requires a cached " + "Globus token (run scripts/academy_login_globus.py once) " + "and on Aurora compute nodes the ALCF http(s)_proxy env " + "vars must be set before launch." + ), + ) + parser.add_argument( + "--http-exchange-url", + help=( + "Override URL for --exchange-type=http. Omit to use the " + "Academy-hosted default." 
+ ), ) parser.add_argument("--no-start-redis", action="store_true") return parser.parse_args(argv) @@ -94,8 +112,29 @@ def _prepend_path(name: str, entries: list[str]) -> None: os.environ[name] = os.pathsep.join(values) -def _prepare_environment(profile: SystemProfile) -> None: +_PROXY_ENV_NAMES = frozenset({ + "http_proxy", "HTTP_PROXY", + "https_proxy", "HTTPS_PROXY", + "all_proxy", "ALL_PROXY", +}) + + +def _prepare_environment( + profile: SystemProfile, + *, + exchange_type: str = "redis", +) -> None: + # Aurora's profile lists http(s)_proxy in unset_env so that LM traffic + # going through the local UAN relay (127.0.0.1:<port>) doesn't pick up + # a stray site proxy. That's correct for Redis-based campaigns, but for + # --exchange-type=http the ranks MUST reach exchange.academy-agents.org + # over the public internet, which on Aurora compute nodes only works + # through the ALCF HTTP proxy. Keep the proxy vars when the exchange + # needs them. The no_proxy list set below still excludes 127.0.0.1 + + # .alcf.anl.gov so the LM relay continues to bypass the proxy. 
for name in profile.unset_env: + if exchange_type == "http" and name in _PROXY_ENV_NAMES: + continue os.environ.pop(name, None) _prepend_path("PATH", profile.path_entries) _prepend_path("PYTHONPATH", profile.pythonpath_entries) @@ -192,7 +231,7 @@ def _run_token() -> str: def prepare_compute_launch(args: argparse.Namespace) -> AllocationPlan: """Resolve a system profile and dashboard metadata into an allocation plan.""" profile = load_system_profile(args.system) - _prepare_environment(profile) + _prepare_environment(profile, exchange_type=args.exchange_type) defaults = campaign_launch_defaults(args.campaign) run_dir = Path(args.run_dir or Path(profile.run_root) / args.run_id).resolve() @@ -250,6 +289,7 @@ def prepare_compute_launch(args: argparse.Namespace) -> AllocationPlan: mpiexec=profile.mpiexec, chemgraph_repo_root=Path(profile.repo_root).resolve(), exchange_type=args.exchange_type, + http_exchange_url=args.http_exchange_url, ) @@ -274,7 +314,7 @@ def run_allocation(plan: AllocationPlan) -> int: """Start Redis if requested and run per-rank daemons under mpiexec.""" plan.run_dir.mkdir(parents=True, exist_ok=True) redis_proc: subprocess.Popen[bytes] | None = None - uses_redis = plan.exchange_type in {"redis", "hybrid"} + uses_redis = exchange_uses_redis(plan.exchange_type) if plan.start_redis and uses_redis: redis_server = shutil.which("redis-server") if redis_server is None: @@ -324,10 +364,30 @@ def run_allocation(plan: AllocationPlan) -> int: "--exchange-type", plan.exchange_type, "--chemgraph-repo-root", str(plan.chemgraph_repo_root), ] + if plan.http_exchange_url: + daemon_args += ["--http-exchange-url", plan.http_exchange_url] + # When using the HTTP exchange on HPC compute nodes (e.g. Aurora), + # ranks must reach exchange.academy-agents.org through the site's + # outbound HTTP proxy. 
PALS/MPICH mpiexec strips most parent-shell + # env vars from spawned ranks, so http_proxy / https_proxy don't + # propagate automatically -- force forwarding via --genvall and + # set the proxy vars in our own environ first so they are part + # of the parent env that --genvall snapshots. + genv_flags: list[str] = [] + if plan.exchange_type == "http": + for name in ("http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY"): + value = os.environ.get(name) + if value: + genv_flags += ["--genv", f"{name}={value}"] + # --genvall snapshots the whole parent env as belt-and-braces; + # the explicit per-var entries surface in launch_command.txt for + # post-hoc debugging when a rank can't reach the exchange. + genv_flags = ["--genvall", *genv_flags] cmd = [ plan.mpiexec, "-n", str(plan.agent_count), "--ppn", str(plan.agents_per_node), + *genv_flags, sys.executable, "-m", "chemgraph.cli.main", "academy", "mpi-daemon", "--", *daemon_args, ] diff --git a/src/chemgraph/academy/runtime/daemon.py b/src/chemgraph/academy/runtime/daemon.py index e6cb05b..16a112d 100644 --- a/src/chemgraph/academy/runtime/daemon.py +++ b/src/chemgraph/academy/runtime/daemon.py @@ -11,6 +11,7 @@ from chemgraph.academy.core.peer_protocol import build_message from chemgraph.academy.runtime.exchange import build_exchange_factory +from chemgraph.academy.runtime.exchange import SUPPORTED_EXCHANGE_TYPES from chemgraph.academy.runtime.registration import load_academy_registrations from chemgraph.academy.runtime.registration import wait_academy_registrations from chemgraph.academy.runtime.registration import write_academy_registrations @@ -216,9 +217,17 @@ def parse_args() -> argparse.Namespace: parser.add_argument('--redis-namespace') parser.add_argument( '--exchange-type', - choices=('redis', 'local', 'hybrid'), + choices=SUPPORTED_EXCHANGE_TYPES, default='redis', ) + parser.add_argument( + '--http-exchange-url', + default=None, + help=( + "Override URL for --exchange-type=http. 
Omit to use the " + "Academy-hosted default. Ignored for other exchange types." + ), + ) parser.add_argument('--chemgraph-repo-root') return parser.parse_args() @@ -247,6 +256,7 @@ def config_from_args(args: argparse.Namespace) -> ChemGraphDaemonConfig: redis_port=args.redis_port, redis_namespace=args.redis_namespace or namespace_for_run(run_dir), exchange_type=args.exchange_type, + http_exchange_url=args.http_exchange_url, rank=rank_from_env(), local_rank=local_rank_from_env(), chemgraph_repo_root=( diff --git a/src/chemgraph/academy/runtime/exchange.py b/src/chemgraph/academy/runtime/exchange.py index 6a8b2b2..bc7ed37 100644 --- a/src/chemgraph/academy/runtime/exchange.py +++ b/src/chemgraph/academy/runtime/exchange.py @@ -7,6 +7,26 @@ from chemgraph.academy.core.campaign import ChemGraphDaemonConfig +SUPPORTED_EXCHANGE_TYPES: tuple[str, ...] = ('redis', 'local', 'hybrid', 'http') +"""All exchange types this module knows how to build. + +Used by the CLI to enforce ``--exchange-type`` choices and by tests +to assert the supported set stays in sync with the dispatch table +below. +""" + + +def exchange_uses_redis(exchange_type: str) -> bool: + """Return True when the exchange type requires a running Redis server. + + The compute launcher uses this to decide whether to start a Redis + subprocess on rank 0. Exchanges that don't talk to Redis (``local``, + ``http``) don't need one and skipping the subprocess avoids a port- + binding failure when Redis isn't installed on the compute node. + """ + return exchange_type in {'redis', 'hybrid'} + + def build_exchange_factory(config: ChemGraphDaemonConfig) -> Any: """Return the Academy exchange factory matching ``config.exchange_type``.""" exchange_type = config.exchange_type @@ -33,7 +53,25 @@ def build_exchange_factory(config: ChemGraphDaemonConfig) -> Any: namespace=config.redis_namespace, ) + if exchange_type == 'http': + # Academy's HTTP exchange. 
Passing url=None selects the + # hosted default (https://exchange.academy-agents.org/v1) + # with Globus Auth. The bearer token is read from + # $XDG_DATA_HOME/academy/storage.db -- the user (or the + # launcher's env-prep step) must have logged in already + # via the device flow before any agent constructs this. + # On Aurora compute nodes, http_proxy / https_proxy must be + # set to the ALCF proxy (http://proxy.alcf.anl.gov:3128) + # before the daemon starts; otherwise the first PUT will + # hang at the connection-timeout boundary. + from academy.exchange.cloud import HttpExchangeFactory + + kwargs: dict[str, Any] = {} + if config.http_exchange_url: + kwargs['url'] = config.http_exchange_url + return HttpExchangeFactory(**kwargs) + raise ValueError( f"Unsupported exchange type {exchange_type!r}; expected one of " - "'redis', 'local', 'hybrid'.", + f"{sorted(SUPPORTED_EXCHANGE_TYPES)}.", ) diff --git a/src/chemgraph/academy/runtime/mpi.py b/src/chemgraph/academy/runtime/mpi.py index 7439f58..b0439ab 100644 --- a/src/chemgraph/academy/runtime/mpi.py +++ b/src/chemgraph/academy/runtime/mpi.py @@ -96,6 +96,7 @@ def placement_payload(config: Any, agent_name: str) -> dict[str, Any]: 'rank': config.rank, 'local_rank': config.local_rank, 'exchange_type': config.exchange_type, + 'http_exchange_url': config.http_exchange_url, 'redis_host': config.redis_host, 'redis_port': config.redis_port, 'redis_namespace': config.redis_namespace, diff --git a/src/chemgraph/academy/runtime/registration.py b/src/chemgraph/academy/runtime/registration.py index ef8823d..855b248 100644 --- a/src/chemgraph/academy/runtime/registration.py +++ b/src/chemgraph/academy/runtime/registration.py @@ -7,6 +7,7 @@ from collections.abc import Mapping from typing import Any +from academy.exchange.cloud.client import HttpAgentRegistration from academy.exchange.hybrid import HybridAgentRegistration from academy.exchange.local import LocalAgentRegistration from academy.exchange.redis import 
RedisAgentRegistration @@ -21,6 +22,7 @@ 'local': LocalAgentRegistration, 'hybrid': HybridAgentRegistration, 'redis': RedisAgentRegistration, + 'http': HttpAgentRegistration, } diff --git a/tests/test_academy_compute_launcher.py b/tests/test_academy_compute_launcher.py index 20b04ea..f684652 100644 --- a/tests/test_academy_compute_launcher.py +++ b/tests/test_academy_compute_launcher.py @@ -56,3 +56,101 @@ def test_run_allocation_builds_single_mpiexec_command(tmp_path, monkeypatch) -> assert "--exchange-type" in cmd assert "--chemgraph-repo-root" in cmd assert (tmp_path / "launch_command.txt").exists() + + +# --------------------------------------------------------------------------- +# Phase B.1: --exchange-type http + cross-HPC plumbing +# --------------------------------------------------------------------------- + + +def _plan_http(tmp_path: Path, *, http_exchange_url: str | None = None) -> AllocationPlan: + base = _plan(tmp_path) + import dataclasses + return dataclasses.replace( + base, + exchange_type="http", + http_exchange_url=http_exchange_url, + ) + + +def test_run_allocation_with_http_exchange_does_not_start_redis( + tmp_path, monkeypatch, +) -> None: + """When the exchange doesn't talk to Redis (``http``, ``local``), + rank 0 must NOT start a redis-server subprocess. Otherwise compute + nodes without redis-server installed fail at launch, and nodes with + it pointlessly bind a port we never use.""" + started_subprocess: list[list[str]] = [] + + def fake_popen(cmd, **kwargs): # pragma: no cover - exercised via assert below + started_subprocess.append(list(cmd)) + raise AssertionError( + f"Popen should not be called for http exchange; got {cmd!r}", + ) + + monkeypatch.setattr(compute_launcher.subprocess, "Popen", fake_popen) + # wait_redis is the other Redis-touching site; assert it's not called. 
+ def boom(*args, **kwargs): + raise AssertionError("wait_redis should not run for http exchange") + monkeypatch.setattr(compute_launcher, "wait_redis", boom) + monkeypatch.setattr( + compute_launcher.subprocess, + "call", + lambda cmd: 0, + ) + + plan = _plan_http(tmp_path) + # start_redis is True by default; verify the http-exchange code path + # still skips Redis. This is the "operator forgot --no-start-redis" + # case, which used to fail loudly on nodes without redis-server. + import dataclasses + plan = dataclasses.replace(plan, start_redis=True) + assert compute_launcher.run_allocation(plan) == 0 + assert started_subprocess == [] + + +def test_run_allocation_forwards_http_exchange_url_when_set( + tmp_path, monkeypatch, +) -> None: + """``--http-exchange-url`` (operator override for a self-hosted + exchange) must flow into the daemon's argv. Otherwise the daemon + silently falls back to the hosted default.""" + calls: list[list[str]] = [] + monkeypatch.setattr(compute_launcher, "wait_redis", lambda *a, **k: None) + monkeypatch.setattr( + compute_launcher.subprocess, + "call", + lambda cmd: calls.append(cmd) or 0, + ) + + custom = "https://my-private-exchange.example.com/v1" + plan = _plan_http(tmp_path, http_exchange_url=custom) + assert compute_launcher.run_allocation(plan) == 0 + + cmd = calls[0] + assert "--http-exchange-url" in cmd + assert custom in cmd + # Sanity: also confirm --exchange-type http rode along. 
+ type_idx = cmd.index("--exchange-type") + assert cmd[type_idx + 1] == "http" + + +def test_run_allocation_omits_http_exchange_url_flag_when_unset( + tmp_path, monkeypatch, +) -> None: + """When no override is given, the daemon argv must NOT carry an + empty ``--http-exchange-url`` (which argparse would happily parse + as a literal empty-string URL and pass to HttpExchangeFactory).""" + calls: list[list[str]] = [] + monkeypatch.setattr(compute_launcher, "wait_redis", lambda *a, **k: None) + monkeypatch.setattr( + compute_launcher.subprocess, + "call", + lambda cmd: calls.append(cmd) or 0, + ) + + plan = _plan_http(tmp_path, http_exchange_url=None) + assert compute_launcher.run_allocation(plan) == 0 + + cmd = calls[0] + assert "--http-exchange-url" not in cmd diff --git a/tests/test_academy_exchange_registration.py b/tests/test_academy_exchange_registration.py index 39aa150..fa9cc4b 100644 --- a/tests/test_academy_exchange_registration.py +++ b/tests/test_academy_exchange_registration.py @@ -3,6 +3,7 @@ from pathlib import Path import pytest +from academy.exchange.cloud.client import HttpAgentRegistration from academy.exchange.hybrid import HybridAgentRegistration from academy.exchange.local import LocalAgentRegistration from academy.exchange.redis import RedisAgentRegistration @@ -10,12 +11,19 @@ from chemgraph.academy.core.campaign import ChemGraphDaemonConfig from chemgraph.academy.runtime.exchange import build_exchange_factory +from chemgraph.academy.runtime.exchange import exchange_uses_redis +from chemgraph.academy.runtime.exchange import SUPPORTED_EXCHANGE_TYPES from chemgraph.academy.runtime.registration import load_academy_registrations from chemgraph.academy.runtime.registration import registration_payload from chemgraph.academy.runtime.registration import write_academy_registrations -def _config(tmp_path: Path, exchange_type: str) -> ChemGraphDaemonConfig: +def _config( + tmp_path: Path, + exchange_type: str, + *, + http_exchange_url: str | None = None, 
+) -> ChemGraphDaemonConfig: return ChemGraphDaemonConfig( run_dir=tmp_path, run_token='token-1', @@ -35,6 +43,7 @@ def _config(tmp_path: Path, exchange_type: str) -> ChemGraphDaemonConfig: local_rank=0, chemgraph_repo_root=tmp_path, exchange_type=exchange_type, + http_exchange_url=http_exchange_url, ) @@ -44,6 +53,7 @@ def _config(tmp_path: Path, exchange_type: str) -> ChemGraphDaemonConfig: ('redis', 'RedisExchangeFactory'), ('local', 'LocalExchangeFactory'), ('hybrid', 'HybridExchangeFactory'), + ('http', 'HttpExchangeFactory'), ], ) def test_build_exchange_factory_dispatches_by_config( @@ -61,12 +71,58 @@ def test_build_exchange_factory_rejects_unknown_exchange(tmp_path) -> None: build_exchange_factory(_config(tmp_path, 'bad')) +def test_supported_exchange_types_exposes_full_dispatch_table() -> None: + """SUPPORTED_EXCHANGE_TYPES drives both the CLI ``choices`` argument + on ``compute_launcher.parse_args`` and ``daemon.parse_args``. If the + set drifts from what ``build_exchange_factory`` actually handles, + the CLI happily accepts a value that then raises at runtime.""" + assert set(SUPPORTED_EXCHANGE_TYPES) == {'redis', 'local', 'hybrid', 'http'} + + +def test_exchange_uses_redis_helper_matches_dispatch_table() -> None: + """The compute launcher uses this helper to decide whether to start a + Redis subprocess on rank 0. Pin the answers for every supported type + so adding a new exchange forces a conscious yes/no decision here.""" + assert exchange_uses_redis('redis') is True + assert exchange_uses_redis('hybrid') is True + assert exchange_uses_redis('local') is False + assert exchange_uses_redis('http') is False + + +def test_http_exchange_factory_uses_hosted_default_when_url_omitted( + tmp_path, +) -> None: + """A ``None`` ``http_exchange_url`` must select Academy's hosted + default (https://exchange.academy-agents.org/v1). 
This is the path + every cross-HPC campaign takes unless the operator stands up a + self-hosted exchange.""" + from academy.exchange.cloud.client import DEFAULT_EXCHANGE_URL + factory = build_exchange_factory(_config(tmp_path, 'http')) + + # Upstream stores connection details on factory._info; reach into + # it to make sure we hand off the URL we mean to. + assert factory._info.url == DEFAULT_EXCHANGE_URL + + +def test_http_exchange_factory_honors_custom_url(tmp_path) -> None: + """Operators must be able to point at a self-hosted HTTP exchange + server (``python -m academy.exchange.cloud``). This is the escape + hatch when the public Academy server is unavailable or undesired.""" + custom = 'https://my-private-exchange.example.com/v1' + factory = build_exchange_factory( + _config(tmp_path, 'http', http_exchange_url=custom), + ) + + assert factory._info.url == custom + + @pytest.mark.parametrize( 'registration_cls', [ RedisAgentRegistration, LocalAgentRegistration, HybridAgentRegistration, + HttpAgentRegistration, ], ) def test_academy_registration_round_trips_by_exchange_type( From d5dd340cc58acb6afd41255d8a97110444af1801 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 08:29:48 -0500 Subject: [PATCH 03/25] feat(academy/core+runtime): agent subsetting + skip-bootstrap support Foundation for the federated ``spawn-site`` flow. The daemon can now launch a named subset of a campaign's agents instead of the whole roster, and rank 0 can skip its in-process bootstrap dispatch so kickoff is deferred to a separate operator-driven step. Both behaviors are opt-in; existing ``run-compute`` single-machine campaigns are untouched. core/campaign.py - ``filter_agents(campaign, names)`` returns a new ``ChemGraphCampaign`` with only the named agents, preserving order so MPI rank-to-agent mapping stays deterministic. Rejects empty selections, duplicate names, and names not declared on the campaign. 
Deliberately does NOT rewrite ``initial_agent`` -- in the federated flow that name may refer to an agent hosted on another site. - ``ChemGraphDaemonConfig`` gains two fields with backward-compatible defaults: ``agents: tuple[str, ...] = ()`` (empty = launch every declared agent) and ``skip_bootstrap: bool = False``. runtime/daemon.py - ``--agents <name,name,...>`` CLI flag, parsed by ``_parse_agents_arg`` (whitespace-trimmed, empty-segment-tolerant). When set, the daemon applies ``filter_agents`` BEFORE ``validate_campaign`` so the downstream ``selected_agent(campaign, rank)`` and ``wait_for_agent_statuses_finished(campaign=...)`` both see the local slice only. - ``--no-bootstrap`` flag. Rank 0's bootstrap dispatch is now gated by ``not skip_bootstrap AND initial_agent in registrations``; the second clause naturally handles the case where ``initial_agent`` lives on another site. The skipped path emits a new ``bootstrap_message_skipped`` system trace recording the reason (flag vs. non-local agent) so investigators can tell "deferred to operator" apart from "silently forgot". Tests: 30/30 existing academy tests pass with the new defaults. Focused tests for filter_agents + slicing arrive with the ``spawn-site`` CLI in the follow-up commit. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/core/campaign.py | 54 ++++++++++++++++++- src/chemgraph/academy/runtime/daemon.py | 71 ++++++++++++++++++++++++- 2 files changed, 123 insertions(+), 2 deletions(-) diff --git a/src/chemgraph/academy/core/campaign.py b/src/chemgraph/academy/core/campaign.py index 87299e6..1bd8bc0 100644 --- a/src/chemgraph/academy/core/campaign.py +++ b/src/chemgraph/academy/core/campaign.py @@ -3,7 +3,7 @@ import dataclasses import json import pathlib -from collections.abc import Mapping +from collections.abc import Mapping, Sequence from typing import Any from chemgraph.academy.campaigns import resolve_campaign @@ -160,6 +160,20 @@ class ChemGraphDaemonConfig: # ``$XDG_DATA_HOME/academy/storage.db``. Set this for a # self-hosted ``python -m academy.exchange.cloud`` server. http_exchange_url: str | None = None + # Optional explicit agent slice for this launch. Empty tuple + # (default) means launch every agent declared on the campaign -- + # i.e. the single-machine ``run-compute`` flow. Non-empty means + # the federated ``spawn-site`` flow: this daemon only owns the + # listed agents, the rest are presumed running elsewhere and + # reachable through the exchange. Order is preserved so MPI ranks + # map to ``agents[rank]`` deterministically. + agents: tuple[str, ...] = () + # When True the rank-0 in-process bootstrap dispatch is skipped. + # Set by ``spawn-site`` to defer kickoff to the standalone + # ``bootstrap`` subcommand the operator runs once all sites are + # up. ``run-compute`` keeps the default (False) for backward + # compatibility with single-machine campaigns. 
+ skip_bootstrap: bool = False def namespace_for_run(run_dir: pathlib.Path) -> str: @@ -443,6 +457,44 @@ def selected_agent(campaign: ChemGraphCampaign, rank: int) -> ChemGraphAgentSpec return campaign.agents[rank] +def filter_agents( + campaign: ChemGraphCampaign, + agent_names: Sequence[str], +) -> ChemGraphCampaign: + """Return a copy of ``campaign`` with only the named agents. + + Order in the returned ``agents`` tuple matches ``agent_names`` so MPI + rank-to-agent mapping stays deterministic across launches. + + Raises: + RuntimeError: if any name in ``agent_names`` is not declared on + the campaign, or if ``agent_names`` is empty / has duplicates. + + Note: + Subsetting deliberately does NOT rewrite ``initial_agent`` -- in + the federated ``spawn-site`` flow that name may still refer to an + agent hosted on another site. Validation against the subset is + loosened accordingly (callers must not pass the subsetted campaign + through ``validate_campaign`` with the strict ``initial_agent`` + check; use it for per-site daemon launch only). 
+ """ + if not agent_names: + raise RuntimeError('filter_agents requires at least one agent name') + if len(set(agent_names)) != len(agent_names): + raise RuntimeError(f'duplicate agent names in selection: {list(agent_names)}') + + by_name = {agent.name: agent for agent in campaign.agents} + unknown = sorted(set(agent_names).difference(by_name)) + if unknown: + declared = sorted(by_name) + raise RuntimeError( + f'agents not declared on campaign: {unknown} (campaign declares {declared})', + ) + + selected = tuple(by_name[name] for name in agent_names) + return dataclasses.replace(campaign, agents=selected) + + def campaign_bootstrap_text(campaign: ChemGraphCampaign) -> str: initial_agent = next( (agent for agent in campaign.agents if agent.name == campaign.initial_agent), diff --git a/src/chemgraph/academy/runtime/daemon.py b/src/chemgraph/academy/runtime/daemon.py index 16a112d..e3d8e14 100644 --- a/src/chemgraph/academy/runtime/daemon.py +++ b/src/chemgraph/academy/runtime/daemon.py @@ -22,6 +22,7 @@ from chemgraph.academy.observability.run_artifacts import write_status_snapshot from chemgraph.academy.core.campaign import campaign_bootstrap_text from chemgraph.academy.core.campaign import ChemGraphDaemonConfig +from chemgraph.academy.core.campaign import filter_agents from chemgraph.academy.core.campaign import load_campaign from chemgraph.academy.core.campaign import namespace_for_run from chemgraph.academy.core.campaign import resolve_campaign_resources @@ -38,6 +39,21 @@ from chemgraph.models.settings import load_lm_settings +def _parse_agents_arg(raw: str | None) -> tuple[str, ...]: + """Parse the comma-separated ``--agents`` flag into a name tuple. + + Returns an empty tuple when the flag is omitted (single-machine + ``run-compute`` flow, every agent on the campaign is launched). + Whitespace around individual names is trimmed; empty segments + (e.g. trailing comma) are dropped silently. 
Duplicate-name + detection lives in ``filter_agents`` so the user-facing error + surfaces in one place regardless of where the list came from. + """ + if not raw: + return () + return tuple(name.strip() for name in raw.split(',') if name.strip()) + + async def run_daemon(config: ChemGraphDaemonConfig) -> int: config.run_dir.mkdir(parents=True, exist_ok=True) llm_settings = load_lm_settings(config.lm_config) @@ -46,6 +62,12 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: config.run_dir, ) prompt_profile = load_prompt_profile(campaign.prompt_profile) + # When this site only owns a slice (federated spawn-site flow), + # filter the campaign down to that slice BEFORE validation so the + # daemon's downstream rank-indexing (selected_agent, mpiexec -n) + # all agree on the same agent ordering. + if config.agents: + campaign = filter_agents(campaign, config.agents) validate_campaign(campaign, config.agent_count) agent_spec = selected_agent(campaign, config.rank) placement = placement_payload(config, agent_spec.name) @@ -146,7 +168,21 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: async with runtime: await agent.write_runtime_status() - if config.rank == 0: + # Rank 0 normally dispatches the campaign bootstrap message + # to ``initial_agent``. Two conditions skip it: + # * ``--no-bootstrap`` was set (spawn-site flow -- kickoff + # happens via the separate ``bootstrap`` subcommand once + # every federated site has come up). + # * ``initial_agent`` is not in this site's slice (it + # lives on another site reachable through the exchange; + # a non-owning site must not pretend to deliver the + # bootstrap locally). 
+ initial_is_local = campaign.initial_agent in registrations + if ( + config.rank == 0 + and not config.skip_bootstrap + and initial_is_local + ): bootstrap = build_message( sender='campaign', recipient=campaign.initial_agent, @@ -172,6 +208,19 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: 'via': 'academy_action', }, ) + elif config.rank == 0: + # Record the reason for skipping so investigators can + # tell "deferred to operator" apart from "silently + # forgot". + append_system_trace( + config.run_dir, + 'bootstrap_message_skipped', + { + 'initial_agent': campaign.initial_agent, + 'skip_bootstrap_flag': bool(config.skip_bootstrap), + 'initial_is_local': bool(initial_is_local), + }, + ) await runtime.wait_shutdown() @@ -228,6 +277,24 @@ def parse_args() -> argparse.Namespace: "Academy-hosted default. Ignored for other exchange types." ), ) + parser.add_argument( + '--agents', + default=None, + help=( + "Comma-separated subset of agent names to launch (federated " + "spawn-site mode). Omit to launch every agent declared on the " + "campaign (single-machine run-compute mode)." + ), + ) + parser.add_argument( + '--no-bootstrap', + action='store_true', + help=( + "Skip the rank-0 in-process bootstrap dispatch. Used by " + "spawn-site so kickoff can be triggered separately via the " + "'chemgraph academy bootstrap' subcommand once every site is up." 
+ ), + ) parser.add_argument('--chemgraph-repo-root') return parser.parse_args() @@ -257,6 +324,8 @@ def config_from_args(args: argparse.Namespace) -> ChemGraphDaemonConfig: redis_namespace=args.redis_namespace or namespace_for_run(run_dir), exchange_type=args.exchange_type, http_exchange_url=args.http_exchange_url, + agents=_parse_agents_arg(args.agents), + skip_bootstrap=bool(args.no_bootstrap), rank=rank_from_env(), local_rank=local_rank_from_env(), chemgraph_repo_root=( From fcb8de20f9c5aa4e9eb0e56c0df4f829ef9ebe60 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 08:37:07 -0500 Subject: [PATCH 04/25] feat(academy): spawn-site CLI subcommand for federated launches Operator-facing piece of the federated flow: ``chemgraph academy spawn-site`` launches one site of a multi-site campaign. Same arguments as ``run-compute`` plus the slice selector ``--agents worker-a,worker-b``; internal bootstrap is always skipped (the operator triggers kickoff once every site is up, via the dedicated ``bootstrap`` subcommand that lands in a follow-up commit). UX target (Aurora + Crux + Mac dashboard): # Aurora compute node chemgraph academy spawn-site -- \\ --system aurora --campaign federated-demo.jsonc \\ --agents coordinator-agent --exchange-type http # Crux compute node chemgraph academy spawn-site -- \\ --system crux --campaign federated-demo.jsonc \\ --agents worker-a,worker-b --exchange-type http # Mac (later, after both sides are up) chemgraph academy bootstrap -- ... core/campaign.py - ``parse_agents_selection(raw)`` promotes the comma-list parser to a public helper so launcher and daemon agree on whitespace / empty- segment handling. Duplicate detection lives in ``filter_agents`` so the user-facing error appears in one place regardless of the input path. runtime/compute_launcher.py - ``--agents`` + ``--no-bootstrap`` flags. 
``AllocationPlan`` gains matching ``agents: tuple[str, ...]`` and ``skip_bootstrap: bool`` fields, both with backward-compatible defaults so the existing ``run-compute`` flow is unchanged. - ``prepare_compute_launch`` derives ``agent_count`` from the slice length when ``--agents`` is given; refuses to mix a contradicting explicit ``--agent-count`` rather than silently picking one. Mpi ``-n`` therefore always matches the daemon's post-filter agent ordering. - ``run_allocation`` forwards ``--agents`` and ``--no-bootstrap`` into the daemon argv only when set. runtime/daemon.py - Drops the private ``_parse_agents_arg`` helper in favor of the shared ``parse_agents_selection`` import. cli/main.py - ``academy spawn-site`` subcommand registered. Implementation is a thin shell over ``compute_main`` that prepends ``--no-bootstrap`` if the operator didn't already include it -- ``spawn-site`` is semantically ``run-compute`` with bootstrap disabled and an agent slice required. Tests (+11, 41 -> 41 in the two touched suites; 63/63 across full academy sweep) - parse_agents_selection: trimming, empty segments, None / "" input - filter_agents: order preservation, unknown-name rejection, empty-selection rejection, duplicate-name rejection - prepare_compute_launch: derives agent_count from --agents, rejects contradicting --agent-count - run_allocation: --agents and --no-bootstrap are forwarded when set, omitted when default (so single-machine flow is byte- identical) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/core/campaign.py | 16 ++ .../academy/runtime/compute_launcher.py | 51 ++++- src/chemgraph/academy/runtime/daemon.py | 18 +- src/chemgraph/cli/main.py | 34 ++- tests/test_academy_campaign.py | 63 ++++++ tests/test_academy_compute_launcher.py | 206 ++++++++++++++++++ 6 files changed, 370 insertions(+), 18 deletions(-) diff --git a/src/chemgraph/academy/core/campaign.py b/src/chemgraph/academy/core/campaign.py index 1bd8bc0..3a3704a 100644 --- 
a/src/chemgraph/academy/core/campaign.py +++ b/src/chemgraph/academy/core/campaign.py @@ -457,6 +457,22 @@ def selected_agent(campaign: ChemGraphCampaign, rank: int) -> ChemGraphAgentSpec return campaign.agents[rank] +def parse_agents_selection(raw: str | None) -> tuple[str, ...]: + """Parse a comma-separated ``--agents`` flag into a name tuple. + + Returns an empty tuple when ``raw`` is None or empty (the + single-machine flow where every declared agent is launched). + Whitespace around individual names is trimmed; empty segments + (e.g. trailing comma) are dropped silently. Duplicate-name + detection lives in :func:`filter_agents` so the user-facing + error surfaces in one place regardless of where the list + originated. + """ + if not raw: + return () + return tuple(name.strip() for name in raw.split(',') if name.strip()) + + def filter_agents( campaign: ChemGraphCampaign, agent_names: Sequence[str], diff --git a/src/chemgraph/academy/runtime/compute_launcher.py b/src/chemgraph/academy/runtime/compute_launcher.py index 1647b7c..82ca761 100644 --- a/src/chemgraph/academy/runtime/compute_launcher.py +++ b/src/chemgraph/academy/runtime/compute_launcher.py @@ -15,6 +15,7 @@ from chemgraph.academy.campaigns import campaign_launch_defaults from chemgraph.academy.campaigns import resolve_campaign from chemgraph.academy.campaigns import resolve_lm_config_template +from chemgraph.academy.core.campaign import parse_agents_selection from chemgraph.academy.runtime.exchange import SUPPORTED_EXCHANGE_TYPES from chemgraph.academy.runtime.exchange import exchange_uses_redis from chemgraph.academy.runtime.profiles import list_builtin_system_profiles @@ -51,6 +52,12 @@ class AllocationPlan: chemgraph_repo_root: Path exchange_type: str = "redis" http_exchange_url: str | None = None + # Federated spawn-site fields. Empty ``agents`` = single-machine + # run-compute flow (every agent on the campaign is launched). 
+ # Non-empty = this allocation only owns the listed agents; the + # other sites are presumed running elsewhere on the same exchange. + agents: tuple[str, ...] = () + skip_bootstrap: bool = False def parse_args(argv: list[str] | None = None) -> argparse.Namespace: @@ -100,6 +107,26 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: "Academy-hosted default." ), ) + parser.add_argument( + "--agents", + default=None, + help=( + "Comma-separated subset of agent names this allocation owns " + "(federated spawn-site mode). When given, --agent-count is " + "derived from the slice length and the daemon receives " + "--agents so it filters the campaign down to the slice. " + "Omit to launch every declared agent (single-machine mode)." + ), + ) + parser.add_argument( + "--no-bootstrap", + action="store_true", + help=( + "Skip rank-0's in-process bootstrap dispatch. Set by " + "spawn-site so kickoff is deferred to the separate " + "'chemgraph academy bootstrap' subcommand." + ), + ) parser.add_argument("--no-start-redis", action="store_true") return parser.parse_args(argv) @@ -258,7 +285,20 @@ def prepare_compute_launch(args: argparse.Namespace) -> AllocationPlan: max_tokens=args.max_tokens, ) _export_workflow_lm_environment(lm_config) - agent_count = args.agent_count or defaults.agent_count + # When --agents is given the slice length is authoritative. Otherwise + # use the CLI / packaged default. We refuse to mix --agents with an + # explicit --agent-count that disagrees, to avoid silent surprises. + agents_slice = parse_agents_selection(getattr(args, "agents", None)) + if agents_slice: + derived_count = len(agents_slice) + if args.agent_count and args.agent_count != derived_count: + raise RuntimeError( + f"--agent-count={args.agent_count} contradicts --agents " + f"(which implies {derived_count}). Pass --agents alone." 
+ ) + agent_count = derived_count + else: + agent_count = args.agent_count or defaults.agent_count agents_per_node = args.agents_per_node or defaults.agents_per_node max_decisions = args.max_decisions or defaults.max_decisions redis_port = args.redis_port or profile.redis_port @@ -290,6 +330,8 @@ def prepare_compute_launch(args: argparse.Namespace) -> AllocationPlan: chemgraph_repo_root=Path(profile.repo_root).resolve(), exchange_type=args.exchange_type, http_exchange_url=args.http_exchange_url, + agents=agents_slice, + skip_bootstrap=bool(getattr(args, "no_bootstrap", False)), ) @@ -366,6 +408,13 @@ def run_allocation(plan: AllocationPlan) -> int: ] if plan.http_exchange_url: daemon_args += ["--http-exchange-url", plan.http_exchange_url] + # Federated spawn-site forwarding. Both flags are omitted on the + # single-machine run-compute flow so the daemon's argparse + # defaults (full campaign, bootstrap enabled) keep prior behavior. + if plan.agents: + daemon_args += ["--agents", ",".join(plan.agents)] + if plan.skip_bootstrap: + daemon_args += ["--no-bootstrap"] # When using the HTTP exchange on HPC compute nodes (e.g. Aurora), # ranks must reach exchange.academy-agents.org through the site's # outbound HTTP proxy. 
PALS/MPICH mpiexec strips most parent-shell diff --git a/src/chemgraph/academy/runtime/daemon.py b/src/chemgraph/academy/runtime/daemon.py index e3d8e14..173e03d 100644 --- a/src/chemgraph/academy/runtime/daemon.py +++ b/src/chemgraph/academy/runtime/daemon.py @@ -25,6 +25,7 @@ from chemgraph.academy.core.campaign import filter_agents from chemgraph.academy.core.campaign import load_campaign from chemgraph.academy.core.campaign import namespace_for_run +from chemgraph.academy.core.campaign import parse_agents_selection from chemgraph.academy.core.campaign import resolve_campaign_resources from chemgraph.academy.core.campaign import selected_agent from chemgraph.academy.core.campaign import validate_campaign @@ -39,21 +40,6 @@ from chemgraph.models.settings import load_lm_settings -def _parse_agents_arg(raw: str | None) -> tuple[str, ...]: - """Parse the comma-separated ``--agents`` flag into a name tuple. - - Returns an empty tuple when the flag is omitted (single-machine - ``run-compute`` flow, every agent on the campaign is launched). - Whitespace around individual names is trimmed; empty segments - (e.g. trailing comma) are dropped silently. Duplicate-name - detection lives in ``filter_agents`` so the user-facing error - surfaces in one place regardless of where the list came from. 
- """ - if not raw: - return () - return tuple(name.strip() for name in raw.split(',') if name.strip()) - - async def run_daemon(config: ChemGraphDaemonConfig) -> int: config.run_dir.mkdir(parents=True, exist_ok=True) llm_settings = load_lm_settings(config.lm_config) @@ -324,7 +310,7 @@ def config_from_args(args: argparse.Namespace) -> ChemGraphDaemonConfig: redis_namespace=args.redis_namespace or namespace_for_run(run_dir), exchange_type=args.exchange_type, http_exchange_url=args.http_exchange_url, - agents=_parse_agents_arg(args.agents), + agents=parse_agents_selection(args.agents), skip_bootstrap=bool(args.no_bootstrap), rank=rank_from_env(), local_rank=local_rank_from_env(), diff --git a/src/chemgraph/cli/main.py b/src/chemgraph/cli/main.py index 788d5aa..32e3680 100644 --- a/src/chemgraph/cli/main.py +++ b/src/chemgraph/cli/main.py @@ -284,6 +284,24 @@ def create_argument_parser() -> argparse.ArgumentParser: help="Arguments forwarded to chemgraph.academy.runtime.compute_launcher.", ) + spawn_site_parser = academy_sub.add_parser( + "spawn-site", + help=( + "Launch one site of a federated ChemGraph Academy campaign. " + "Like run-compute but only spawns the agent slice given via " + "--agents and skips internal bootstrap (use the 'bootstrap' " + "subcommand once every site is up)." + ), + ) + spawn_site_parser.add_argument( + "spawn_site_args", + nargs=argparse.REMAINDER, + help=( + "Arguments forwarded to chemgraph.academy.runtime.compute_launcher. " + "--agents is required; --no-bootstrap is auto-added." + ), + ) + dashboard_parser = academy_sub.add_parser( "dashboard", help="Start the local dashboard launcher for a ChemGraph Academy run.", @@ -625,6 +643,20 @@ def _handle_academy(args: argparse.Namespace) -> None: if code: sys.exit(code) return + if command == "spawn-site": + from chemgraph.academy.runtime.compute_launcher import main as compute_main + + # spawn-site = run-compute with --no-bootstrap forced on. 
The + # compute_launcher's argparse tolerates a repeated --no-bootstrap, + # but we only prepend it when the operator has not already passed + # it, so the forwarded argv never carries a duplicate flag. + forwarded = _strip_remainder_separator(args.spawn_site_args) + if "--no-bootstrap" not in forwarded: + forwarded = ["--no-bootstrap", *forwarded] + code = compute_main(forwarded) + if code: + sys.exit(code) + return if command == "campaigns": from chemgraph.academy.campaigns import list_campaigns @@ -633,7 +665,7 @@ def _handle_academy(args: argparse.Namespace) -> None: return console.print( "Usage: chemgraph academy " - "{mpi-daemon,run-compute,dashboard,campaigns}.", + "{mpi-daemon,run-compute,spawn-site,dashboard,campaigns}.", ) diff --git a/tests/test_academy_campaign.py b/tests/test_academy_campaign.py index ec102fb..17726e1 100644 --- a/tests/test_academy_campaign.py +++ b/tests/test_academy_campaign.py @@ -351,3 +351,66 @@ def test_validate_campaign_rejects_allowed_tools_without_servers(tmp_path) -> No match="allowed_tools but no mcp_servers", ): validate_campaign(campaign, 1) + + +# --------------------------------------------------------------------------- +# Phase B.1: filter_agents + parse_agents_selection (federated spawn-site) +# --------------------------------------------------------------------------- + + +def test_parse_agents_selection_handles_trimming_and_empty_segments() -> None: + """The CLI's ``--agents`` value passes through this helper before + reaching the daemon.
Tolerate whitespace + trailing commas so + operators don't get bitten by shell-quoting quirks.""" + from chemgraph.academy.core.campaign import parse_agents_selection + assert parse_agents_selection(None) == () + assert parse_agents_selection("") == () + assert parse_agents_selection("worker-a") == ("worker-a",) + assert parse_agents_selection(" worker-a , worker-b ") == ("worker-a", "worker-b") + assert parse_agents_selection("worker-a,,worker-b,") == ("worker-a", "worker-b") + + +def test_filter_agents_returns_slice_in_caller_order() -> None: + """MPI rank-to-agent mapping must match the order the operator + asked for. Don't accidentally re-sort or use the campaign's + declaration order.""" + from chemgraph.academy.core.campaign import filter_agents, load_campaign + campaign = load_campaign("mace-ensemble-screening-20") + selected = filter_agents(campaign, ["mace-agent", "structure-agent-a"]) + assert [a.name for a in selected.agents] == ["mace-agent", "structure-agent-a"] + # initial_agent is intentionally NOT rewritten -- in the federated + # flow it may name an agent hosted on another site. + assert selected.initial_agent == campaign.initial_agent + # Untouched campaign fields stay intact. + assert selected.resources == campaign.resources + assert selected.mcp_servers == campaign.mcp_servers + + +def test_filter_agents_rejects_unknown_names() -> None: + """An unknown name almost certainly indicates an operator typo or + a campaign-file/CLI drift. Fail closed.""" + from chemgraph.academy.core.campaign import filter_agents, load_campaign + campaign = load_campaign("mace-ensemble-screening-20") + with pytest.raises(RuntimeError, match="not declared on campaign"): + filter_agents(campaign, ["mace-agent", "no-such-agent"]) + + +def test_filter_agents_rejects_empty_selection() -> None: + """A zero-length slice means 'launch nothing,' which is never what + the operator means. 
The launcher should never even construct an + empty selection (parse_agents_selection returns () on no input, + and the launcher short-circuits on empty), but the helper itself + must still fail closed if reached.""" + from chemgraph.academy.core.campaign import filter_agents, load_campaign + campaign = load_campaign("mace-ensemble-screening-20") + with pytest.raises(RuntimeError, match="at least one agent"): + filter_agents(campaign, []) + + +def test_filter_agents_rejects_duplicate_names() -> None: + """Duplicates would shadow each other in the post-filter campaign + and silently confuse the rank-to-agent mapping. Fail closed.""" + from chemgraph.academy.core.campaign import filter_agents, load_campaign + campaign = load_campaign("mace-ensemble-screening-20") + with pytest.raises(RuntimeError, match="duplicate agent names"): + filter_agents(campaign, ["mace-agent", "mace-agent"]) diff --git a/tests/test_academy_compute_launcher.py b/tests/test_academy_compute_launcher.py index f684652..9690d59 100644 --- a/tests/test_academy_compute_launcher.py +++ b/tests/test_academy_compute_launcher.py @@ -154,3 +154,209 @@ def test_run_allocation_omits_http_exchange_url_flag_when_unset( cmd = calls[0] assert "--http-exchange-url" not in cmd + + +# --------------------------------------------------------------------------- +# Phase B.1: agent subsetting + spawn-site --no-bootstrap forwarding +# --------------------------------------------------------------------------- + + +def _plan_subset( + tmp_path: Path, + *, + agents: tuple[str, ...], + skip_bootstrap: bool = True, +) -> AllocationPlan: + """An AllocationPlan that mimics what ``spawn-site`` would build.""" + import dataclasses + base = _plan(tmp_path) + return dataclasses.replace( + base, + agent_count=len(agents), # spawn-site derives count from slice + agents=agents, + skip_bootstrap=skip_bootstrap, + ) + + +def test_run_allocation_forwards_agents_flag_when_slice_given( + tmp_path, monkeypatch, +) -> None: + """When 
``plan.agents`` is non-empty the daemon must receive + ``--agents worker-a,worker-b``, otherwise it would launch the + full campaign on every rank index and the rank-to-agent mapping + would diverge across sites.""" + calls: list[list[str]] = [] + monkeypatch.setattr(compute_launcher, "wait_redis", lambda *a, **k: None) + monkeypatch.setattr( + compute_launcher.subprocess, + "call", + lambda cmd: calls.append(cmd) or 0, + ) + + plan = _plan_subset(tmp_path, agents=("worker-a", "worker-b")) + assert compute_launcher.run_allocation(plan) == 0 + + cmd = calls[0] + assert "--agents" in cmd + idx = cmd.index("--agents") + assert cmd[idx + 1] == "worker-a,worker-b" + # Slice length must drive mpiexec -n so rank ordering matches the + # daemon's post-filter view of campaign.agents. + assert cmd[: cmd.index("--ppn") + 2] == ["mpiexec", "-n", "2", "--ppn", "1"] + + +def test_run_allocation_omits_agents_flag_for_single_machine_runs( + tmp_path, monkeypatch, +) -> None: + """The single-machine ``run-compute`` flow leaves ``plan.agents`` + empty so the daemon falls back to its launch-everything default. + A spurious ``--agents`` flag here would cause subsetting to fail + closed (``filter_agents`` rejects unknown names).""" + calls: list[list[str]] = [] + monkeypatch.setattr(compute_launcher, "wait_redis", lambda *a, **k: None) + monkeypatch.setattr( + compute_launcher.subprocess, + "call", + lambda cmd: calls.append(cmd) or 0, + ) + + assert compute_launcher.run_allocation(_plan(tmp_path)) == 0 + + cmd = calls[0] + assert "--agents" not in cmd + + +def test_run_allocation_forwards_no_bootstrap_when_requested( + tmp_path, monkeypatch, +) -> None: + """``spawn-site`` sets ``plan.skip_bootstrap=True`` because kickoff + must be deferred until every federated site is up. 
The launcher + must propagate this -- otherwise rank 0 dispatches the bootstrap + locally and the campaign starts before remote agents have + registered on the exchange.""" + calls: list[list[str]] = [] + monkeypatch.setattr(compute_launcher, "wait_redis", lambda *a, **k: None) + monkeypatch.setattr( + compute_launcher.subprocess, + "call", + lambda cmd: calls.append(cmd) or 0, + ) + + plan = _plan_subset(tmp_path, agents=("worker-a",), skip_bootstrap=True) + assert compute_launcher.run_allocation(plan) == 0 + assert "--no-bootstrap" in calls[0] + + +def test_run_allocation_omits_no_bootstrap_for_single_machine_runs( + tmp_path, monkeypatch, +) -> None: + """``run-compute`` keeps its inline bootstrap so the + single-machine UX doesn't regress -- the flag must be absent + when ``plan.skip_bootstrap`` is False.""" + calls: list[list[str]] = [] + monkeypatch.setattr(compute_launcher, "wait_redis", lambda *a, **k: None) + monkeypatch.setattr( + compute_launcher.subprocess, + "call", + lambda cmd: calls.append(cmd) or 0, + ) + + assert compute_launcher.run_allocation(_plan(tmp_path)) == 0 + assert "--no-bootstrap" not in calls[0] + + +def test_prepare_compute_launch_derives_agent_count_from_agents( + tmp_path, monkeypatch, +) -> None: + """When ``--agents worker-a,worker-b`` is given the launcher must + derive agent_count=2 from the slice length. 
An operator who also + passes ``--agent-count`` that disagrees should hit a loud error + -- silent precedence would let the two values drift, and the + daemon's MPI -n would not equal its post-filter agent ordering.""" + import argparse + args = argparse.Namespace( + run_id="r", campaign="mace-ensemble-screening-20", run_dir=None, + lm_base_url="http://stub:0/v1", relay_host=None, lm_model=None, lm_user=None, + max_tokens=None, agents_per_node=None, max_decisions=None, + redis_port=None, exchange_type="local", http_exchange_url=None, + no_start_redis=True, system="aurora", + agents="structure-agent-a,mace-agent", + no_bootstrap=True, + agent_count=None, + ) + # Avoid the actual aurora profile load (we'd need ALCF_USER set, + # the campaign template, etc). Stub the prep helpers that touch + # the filesystem. + monkeypatch.setattr(compute_launcher, "load_system_profile", + lambda name: _stub_profile(tmp_path)) + monkeypatch.setattr(compute_launcher, "_prepare_environment", + lambda profile, *, exchange_type: None) + monkeypatch.setattr(compute_launcher, "_load_dashboard_metadata", + lambda run_dir: {}) + monkeypatch.setattr(compute_launcher, "_write_lm_config", + lambda **kw: tmp_path / "lm.json") + monkeypatch.setattr(compute_launcher, "_export_workflow_lm_environment", + lambda lm_config: None) + + plan = compute_launcher.prepare_compute_launch(args) + assert plan.agent_count == 2 + assert plan.agents == ("structure-agent-a", "mace-agent") + assert plan.skip_bootstrap is True + + +def test_prepare_compute_launch_rejects_disagreeing_agent_count( + tmp_path, monkeypatch, +) -> None: + """Disagreeing ``--agent-count`` + ``--agents`` is a footgun: + silent precedence would let the operator think they were + launching 3 agents when only 2 ranks actually fire. 
Refuse loudly.""" + import argparse + import pytest + args = argparse.Namespace( + run_id="r", campaign="mace-ensemble-screening-20", run_dir=None, + lm_base_url="http://stub:0/v1", relay_host=None, lm_model=None, lm_user=None, + max_tokens=None, agents_per_node=None, max_decisions=None, + redis_port=None, exchange_type="local", http_exchange_url=None, + no_start_redis=True, system="aurora", + agents="structure-agent-a,mace-agent", + no_bootstrap=True, + agent_count=3, # mismatched -- 2 names but operator says 3 + ) + monkeypatch.setattr(compute_launcher, "load_system_profile", + lambda name: _stub_profile(tmp_path)) + monkeypatch.setattr(compute_launcher, "_prepare_environment", + lambda profile, *, exchange_type: None) + monkeypatch.setattr(compute_launcher, "_load_dashboard_metadata", + lambda run_dir: {}) + monkeypatch.setattr(compute_launcher, "_write_lm_config", + lambda **kw: tmp_path / "lm.json") + monkeypatch.setattr(compute_launcher, "_export_workflow_lm_environment", + lambda lm_config: None) + + with pytest.raises(RuntimeError, match="contradicts --agents"): + compute_launcher.prepare_compute_launch(args) + + +def _stub_profile(tmp_path: Path): + """Minimal SystemProfile-shaped stub for prepare_compute_launch tests.""" + from chemgraph.academy.runtime.profiles.system import SystemProfile + return SystemProfile( + name="aurora", + remote_host="jinchuli@aurora", + remote_root=str(tmp_path), + repo_root=str(tmp_path / "ChemGraph"), + run_root=str(tmp_path / "runs"), + relay_host_file=str(tmp_path / "relay.host"), + relay_port=18186, + venv_python=str(tmp_path / "venv/bin/python"), + redis_bin_dir=str(tmp_path / "redis/bin"), + redis_port=6392, + redis_bind="0.0.0.0", + redis_protected_mode="no", + mpiexec="mpiexec", + pythonpath_entries=[], + path_entries=[], + env={}, + unset_env=[], + no_proxy="127.0.0.1,localhost", + ) From da3ae6ff1f2256eb0af4cb44b5bba75d832c71cc Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 09:01:41 -0500 Subject: 
[PATCH 05/25] refactor(academy/runtime): exchange-based peer discovery, drop file registration Replace the shared-filesystem JSON file (``academy_registrations.json`` under ``run_dir``) with exchange-mediated discovery. The old mechanism required rank 0 to register every agent on the campaign and write the resulting registrations to disk for the other ranks to pick up. That works for a single allocation on a single FS; it cannot span machines, which blocks federated ``spawn-site`` campaigns spread across Aurora + Crux + ... New flow - Each rank registers ONLY its own local agent via ``transport.register_agent(ChemGraphLogicalAgent, name=...)``. - Each rank discovers cross-rank / cross-site peers by polling ``transport.discover()`` with a wall-clock timeout, filtering the returned ``AgentId`` tuple client-side by ``AgentId.name``. - No rank-0 special role for registration. Convergence is per-site: each rank exits the discovery loop as soon as its own ``allowed_peers`` are all visible on the exchange, regardless of what other ranks / sites are doing. - ``bootstrap_message_dispatched`` rule simplified to ``initial_agent == agent_spec.name`` (instead of "name in registrations dict"); semantically identical for single-machine runs, correct for federated runs. runtime/registration.py: gutted and rewritten. Old surface area (``load_academy_registrations``, ``write_academy_registrations``, ``wait_academy_registrations``, ``registration_payload``, ``academy_registration_path``, ``_REGISTRATION_TYPES``, ``_exchange_type_of``) deleted in favor of a single async helper ``discover_peer_agent_ids(transport, peer_names, *, agent_class, timeout_s, poll_interval_s)``. Returns ``dict[name, AgentId]`` for ``Handle`` construction. Times out with a message listing the missing peer names so operators can immediately tell which site failed to register. runtime/daemon.py: registration block + bootstrap dispatch reworked to match the new flow.
Code shrinks: the rank-0 / rank-N branch is gone; the post-block "if rank == 0: reload registrations" hack is gone; ``registrations`` dict and its key lookups replaced with ``registration`` (own) plus ``peer_agent_ids`` (discovered). observability/run_artifacts.py: ``clear_run_outputs`` no longer deletes the dead ``academy_registrations.json`` filename. tests/test_academy_exchange_registration.py: file-based round-trip tests removed (their target functions no longer exist). Replaced with discovery-helper tests against a ``_FakeTransport`` whose ``discover()`` returns pre-configured rounds: * empty peer list short-circuits without any discover() calls * happy path returns name -> AgentId for requested peers only, even when discover() also returns other agents (cross-operator isolation depends on this filter) * waits across multiple polls for late peers (the federated convergence story) * times out with the missing peer names in the message * first-found-wins for a re-seen peer name across polls Run-id name-prefixing remains an operator-runbook convention until auto-prefixing lands; without it, two operators running concurrent demos against the same hosted exchange would see each other's agents in their ``discover()`` results. Tests: 62/62 academy sweep (was 63; net -1 because the parametrized file-round-trip test was 4 cases and the replacement is 4 helpers + 1 short-circuit case). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../academy/observability/run_artifacts.py | 1 - src/chemgraph/academy/runtime/daemon.py | 87 ++++---- src/chemgraph/academy/runtime/registration.py | 193 ++++++++---------- tests/test_academy_exchange_registration.py | 180 ++++++++++++---- 4 files changed, 257 insertions(+), 204 deletions(-) diff --git a/src/chemgraph/academy/observability/run_artifacts.py b/src/chemgraph/academy/observability/run_artifacts.py index 11fa8b4..083d2e2 100644 --- a/src/chemgraph/academy/observability/run_artifacts.py +++ b/src/chemgraph/academy/observability/run_artifacts.py @@ -265,7 +265,6 @@ async def wait_for_agent_statuses_finished( def clear_run_outputs(run_dir: pathlib.Path) -> None: for name in ( - 'academy_registrations.json', 'messages.jsonl', 'events.jsonl', 'placement.json', diff --git a/src/chemgraph/academy/runtime/daemon.py b/src/chemgraph/academy/runtime/daemon.py index 173e03d..fbde139 100644 --- a/src/chemgraph/academy/runtime/daemon.py +++ b/src/chemgraph/academy/runtime/daemon.py @@ -12,9 +12,7 @@ from chemgraph.academy.core.peer_protocol import build_message from chemgraph.academy.runtime.exchange import build_exchange_factory from chemgraph.academy.runtime.exchange import SUPPORTED_EXCHANGE_TYPES -from chemgraph.academy.runtime.registration import load_academy_registrations -from chemgraph.academy.runtime.registration import wait_academy_registrations -from chemgraph.academy.runtime.registration import write_academy_registrations +from chemgraph.academy.runtime.registration import discover_peer_agent_ids from chemgraph.academy.observability.run_artifacts import initialize_run_files from chemgraph.academy.observability.run_artifacts import ( wait_for_agent_statuses_finished, @@ -77,55 +75,44 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: academy_factory = build_exchange_factory(config) if config.rank == 0: + # Rank 0 still owns the one-shot init-files dance, but it + # is NO LONGER special for 
registration -- every rank + # registers its own agent independently and discovers peers + # via the exchange. initialize_run_files( run_dir=config.run_dir, campaign=campaign, config=config, llm_settings=llm_settings, ) - registrar = await academy_factory.create_user_client( - name=f'{config.run_dir.name}-registrar', - start_listener=False, - ) - try: - registered = await registrar.register_agents( - [ - (ChemGraphLogicalAgent, spec.name) - for spec in campaign.agents - ], - ) - finally: - await registrar.close() - registrations = dict( - zip( - (spec.name for spec in campaign.agents), - registered, - strict=True, - ), - ) - write_academy_registrations( - run_dir=config.run_dir, - run_token=config.run_token, - registrations=registrations, + + # Each rank registers ONLY its own agent on the exchange and + # discovers cross-rank / cross-site peers by polling + # ``transport.discover()``. This works identically whether the + # peers are on the same node (LocalExchange), same allocation + # (Redis), or a different HPC entirely (HttpExchange against + # the hosted Academy server) -- the discovery protocol is the + # same. There is no longer a shared-filesystem dependency. + registrar = await academy_factory.create_user_client( + name=f'{config.run_dir.name}-rank{config.rank}-registrar', + start_listener=False, + ) + try: + registration = await registrar.register_agent( + ChemGraphLogicalAgent, + name=agent_spec.name, ) - else: - registrations = await wait_academy_registrations( - config.run_dir, - run_token=config.run_token, + peer_agent_ids = await discover_peer_agent_ids( + registrar._transport, + # Skip self if the campaign mistakenly lists own name + # as a peer (validate_campaign rejects this, but + # defense-in-depth costs nothing). 
+ [p for p in agent_spec.allowed_peers if p != agent_spec.name], + agent_class=ChemGraphLogicalAgent, timeout_s=config.startup_timeout_s, ) - - if config.rank == 0: - registrations = load_academy_registrations( - config.run_dir, - run_token=config.run_token, - ) - registration = registrations[agent_spec.name] - peer_agent_ids = { - peer: registrations[peer].agent_id - for peer in agent_spec.allowed_peers - if peer in registrations - } + finally: + await registrar.close() agent = ChemGraphLogicalAgent( agent_spec, @@ -159,11 +146,11 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: # * ``--no-bootstrap`` was set (spawn-site flow -- kickoff # happens via the separate ``bootstrap`` subcommand once # every federated site has come up). - # * ``initial_agent`` is not in this site's slice (it - # lives on another site reachable through the exchange; - # a non-owning site must not pretend to deliver the - # bootstrap locally). - initial_is_local = campaign.initial_agent in registrations + # * ``initial_agent`` is not this rank's own agent (it + # lives on another rank / site; that owner's rank 0 + # handles it, or the operator triggers bootstrap once + # every site is up). + initial_is_local = campaign.initial_agent == agent_spec.name if ( config.rank == 0 and not config.skip_bootstrap @@ -178,9 +165,7 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: reason='Initial campaign task dispatch.', confidence=1.0, ) - initial_handle: Handle[Any] = Handle( - registrations[campaign.initial_agent].agent_id, - ) + initial_handle: Handle[Any] = Handle(registration.agent_id) await initial_handle.action( 'receive_message', bootstrap, diff --git a/src/chemgraph/academy/runtime/registration.py b/src/chemgraph/academy/runtime/registration.py index 855b248..01e37d5 100644 --- a/src/chemgraph/academy/runtime/registration.py +++ b/src/chemgraph/academy/runtime/registration.py @@ -1,124 +1,101 @@ +"""Peer-agent discovery against the Academy exchange. 
+ +The runtime used to coordinate per-rank registrations via a +shared-filesystem JSON file (``/academy_registrations.json``): +rank 0 registered every agent on the campaign and wrote the resulting +``AgentRegistration`` objects to disk; other ranks polled the file. That +mechanism cannot span machines with separate filesystems, which blocks +the federated ``spawn-site`` flow. + +The replacement uses the exchange itself as the lookup service. Each +rank registers ONLY its own local agent (returning an +``AgentRegistration`` that goes straight into ``Runtime``), and looks +up the ``AgentId`` of every cross-site peer by polling +``transport.discover(ChemGraphLogicalAgent)`` until the expected names +appear. There is no rank-0 special-casing for registration anymore: any +rank can come up in any order, on any host, as long as eventually +every peer's mailbox is reachable through the exchange before +``startup_timeout_s`` elapses. + +Name collisions: ``discover()`` returns every agent of the given class +registered against the exchange, across all operators and campaigns. To +keep federated campaigns from accidentally seeing each other's +agents, operators should prefix agent names with a campaign-unique +run-id (e.g. ``demo-001-coordinator-agent``). Auto-prefixing is a +future ergonomic improvement; for now it's an operator-runbook +convention. 
+""" from __future__ import annotations import asyncio -import json -import pathlib +import logging import time -from collections.abc import Mapping +from collections.abc import Iterable from typing import Any -from academy.exchange.cloud.client import HttpAgentRegistration -from academy.exchange.hybrid import HybridAgentRegistration -from academy.exchange.local import LocalAgentRegistration -from academy.exchange.redis import RedisAgentRegistration -from academy.exchange.transport import AgentRegistration +from academy.exchange.transport import ExchangeTransportT from academy.identifier import AgentId -from pydantic import BaseModel -from chemgraph.academy.observability.run_files import write_json_atomic +logger = logging.getLogger(__name__) -_REGISTRATION_TYPES: dict[str, type[BaseModel]] = { - 'local': LocalAgentRegistration, - 'hybrid': HybridAgentRegistration, - 'redis': RedisAgentRegistration, - 'http': HttpAgentRegistration, -} - - -def academy_registration_path(run_dir: pathlib.Path) -> pathlib.Path: - return run_dir / 'academy_registrations.json' - - -def _exchange_type_of(registration: AgentRegistration[Any]) -> str: - value = getattr(registration, 'exchange_type', None) - if not isinstance(value, str): - raise TypeError( - f'Registration {type(registration).__name__} has no string ' - '`exchange_type` field; cannot persist.', - ) - return value - - -def registration_payload( - *, - run_token: str, - registrations: Mapping[str, AgentRegistration[Any]], -) -> dict[str, Any]: - if not registrations: - raise ValueError('at least one registration is required') - exchange_types = {_exchange_type_of(r) for r in registrations.values()} - if len(exchange_types) > 1: - raise ValueError( - f'mixed exchange types in one campaign: {sorted(exchange_types)}', - ) - (exchange_type,) = exchange_types - return { - 'run_token': run_token, - 'exchange_type': exchange_type, - 'agents': { - name: registration.agent_id.model_dump(mode='json') - for name, registration in 
registrations.items() - }, - } - - -def write_academy_registrations( +async def discover_peer_agent_ids( + transport: ExchangeTransportT, + peer_names: Iterable[str], *, - run_dir: pathlib.Path, - run_token: str, - registrations: Mapping[str, AgentRegistration[Any]], -) -> None: - write_json_atomic( - academy_registration_path(run_dir), - registration_payload(run_token=run_token, registrations=registrations), - ) - - -def load_academy_registrations( - run_dir: pathlib.Path, - *, - run_token: str, -) -> dict[str, AgentRegistration[Any]]: - path = academy_registration_path(run_dir) - data = json.loads(path.read_text(encoding='utf-8')) - if data.get('run_token') != run_token: - raise RuntimeError( - f'Academy registration file {path} belongs to a different run', - ) - exchange_type = data.get('exchange_type') - if exchange_type not in _REGISTRATION_TYPES: - raise RuntimeError( - f'Academy registration file has unsupported exchange_type ' - f'{exchange_type!r}; expected one of ' - f'{sorted(_REGISTRATION_TYPES)}', - ) - cls = _REGISTRATION_TYPES[exchange_type] - agents = data.get('agents') - if not isinstance(agents, dict): - raise RuntimeError(f'Academy registration file is malformed: {path}') - return { - name: cls(agent_id=AgentId[Any].model_validate(agent_id)) - for name, agent_id in agents.items() - } - - -async def wait_academy_registrations( - run_dir: pathlib.Path, - *, - run_token: str, + agent_class: type, timeout_s: float, -) -> dict[str, AgentRegistration[Any]]: - path = academy_registration_path(run_dir) + poll_interval_s: float = 1.0, +) -> dict[str, AgentId[Any]]: + """Poll ``transport.discover()`` until every named peer is found. + + Args: + transport: An open exchange transport already registered for the + local rank's own agent. Discovery is read-only from this + rank's perspective -- it does not create or mutate mailboxes. + peer_names: Names of agents this rank intends to message. 
Each + name must match the ``AgentId.name`` of an agent previously + registered against the same exchange (potentially by a + different process on a different host). + agent_class: Concrete ``Agent`` subclass to scope the discovery + query (``transport.discover`` is class-typed). All ChemGraph + agents are ``ChemGraphLogicalAgent``, so callers pass that. + timeout_s: Wall-clock budget. On expiry a ``TimeoutError`` is + raised whose message lists the peers we never saw, so + operators can immediately tell which remote site is missing + or whose agent failed to register. + poll_interval_s: Backoff between ``discover()`` retries. The + default keeps startup snappy without hammering the exchange. + + Returns: + Mapping from peer name to the discovered ``AgentId``. Empty + ``peer_names`` short-circuits to an empty dict. + """ + wanted = set(peer_names) + if not wanted: + return {} + found: dict[str, AgentId[Any]] = {} deadline = time.monotonic() + timeout_s while True: - if path.exists(): - return load_academy_registrations( - run_dir, - run_token=run_token, - ) - if time.monotonic() > deadline: + agent_ids = await transport.discover(agent_class) + for agent_id in agent_ids: + name = getattr(agent_id, 'name', None) + if isinstance(name, str) and name in wanted and name not in found: + found[name] = agent_id + missing = wanted.difference(found) + if not missing: + return found + if time.monotonic() >= deadline: raise TimeoutError( - f'Timed out waiting for Academy registrations at {path}', + f'Timed out after {timeout_s:.1f}s waiting to discover ' + f'peer agents on the exchange: missing={sorted(missing)}. 
' + f'Confirm every site of the federated campaign has ' + f'started and registered its agents under the expected ' + f'names (run-id-prefixed names are required when the ' + f'hosted exchange is shared across operators).', ) - await asyncio.sleep(0.25) + logger.debug( + 'discover() missing %d peers (%s); sleeping %.1fs', + len(missing), sorted(missing), poll_interval_s, + ) + await asyncio.sleep(poll_interval_s) diff --git a/tests/test_academy_exchange_registration.py b/tests/test_academy_exchange_registration.py index fa9cc4b..6ee1246 100644 --- a/tests/test_academy_exchange_registration.py +++ b/tests/test_academy_exchange_registration.py @@ -1,21 +1,17 @@ from __future__ import annotations +import asyncio from pathlib import Path +from typing import Any import pytest -from academy.exchange.cloud.client import HttpAgentRegistration -from academy.exchange.hybrid import HybridAgentRegistration -from academy.exchange.local import LocalAgentRegistration -from academy.exchange.redis import RedisAgentRegistration from academy.identifier import AgentId from chemgraph.academy.core.campaign import ChemGraphDaemonConfig from chemgraph.academy.runtime.exchange import build_exchange_factory from chemgraph.academy.runtime.exchange import exchange_uses_redis from chemgraph.academy.runtime.exchange import SUPPORTED_EXCHANGE_TYPES -from chemgraph.academy.runtime.registration import load_academy_registrations -from chemgraph.academy.runtime.registration import registration_payload -from chemgraph.academy.runtime.registration import write_academy_registrations +from chemgraph.academy.runtime.registration import discover_peer_agent_ids def _config( @@ -47,6 +43,11 @@ def _config( ) +# --------------------------------------------------------------------------- +# Exchange factory dispatch +# --------------------------------------------------------------------------- + + @pytest.mark.parametrize( ('exchange_type', 'expected_class'), [ @@ -116,47 +117,138 @@ def 
test_http_exchange_factory_honors_custom_url(tmp_path) -> None: assert factory._info.url == custom -@pytest.mark.parametrize( - 'registration_cls', - [ - RedisAgentRegistration, - LocalAgentRegistration, - HybridAgentRegistration, - HttpAgentRegistration, - ], -) -def test_academy_registration_round_trips_by_exchange_type( - tmp_path, - registration_cls, -) -> None: - registration = registration_cls(agent_id=AgentId.new('agent-a')) - write_academy_registrations( - run_dir=tmp_path, - run_token='token-1', - registrations={'agent-a': registration}, - ) +# --------------------------------------------------------------------------- +# discover_peer_agent_ids (federated peer discovery) +# +# Real exchange transports require a running broker. We exercise the +# discovery helper against a fake transport whose ``discover`` returns +# a sequence we control across successive polls. This keeps the tests +# fast and deterministic while still pinning the behavior that matters: +# wait-for-peers, success when all are present, timeout listing the +# missing ones. +# --------------------------------------------------------------------------- + + +class _FakeTransport: + """Minimal ``transport.discover()`` stand-in for the discovery tests. + + Configure with a list of "rounds"; each call to ``discover()`` + returns (and consumes) one round. After the configured rounds run + out the last one keeps being returned, so timeout tests can assert + 'never converged'. 
+ """ + + def __init__(self, rounds: list[list[AgentId[Any]]]) -> None: + self._rounds = rounds + self._calls = 0 - loaded = load_academy_registrations(tmp_path, run_token='token-1') + async def discover(self, agent_class): # noqa: ARG002 - sig match only + index = min(self._calls, len(self._rounds) - 1) + self._calls += 1 + return tuple(self._rounds[index]) - assert isinstance(loaded['agent-a'], registration_cls) - assert loaded['agent-a'].agent_id == registration.agent_id +def _agent_id(name: str) -> AgentId[Any]: + return AgentId.new(name) -def test_registration_payload_rejects_mixed_exchange_types() -> None: - with pytest.raises(ValueError, match='mixed exchange types'): - registration_payload( - run_token='token-1', - registrations={ - 'redis-agent': RedisAgentRegistration( - agent_id=AgentId.new('redis-agent'), - ), - 'local-agent': LocalAgentRegistration( - agent_id=AgentId.new('local-agent'), - ), - }, + +def test_discover_peer_agent_ids_returns_empty_for_empty_peer_list() -> None: + """When the local agent has no allowed_peers the helper short-circuits + -- it must not poll the exchange unnecessarily (the network round-trip + would block daemon startup for nothing).""" + transport = _FakeTransport(rounds=[[_agent_id('anyone')]]) + result = asyncio.run( + discover_peer_agent_ids( + transport, [], agent_class=object, timeout_s=0.01, + ), + ) + assert result == {} + assert transport._calls == 0 + + +def test_discover_peer_agent_ids_finds_all_peers_on_first_poll() -> None: + """Happy path: every peer is already on the exchange. 
The helper + must return promptly with a name->AgentId mapping in the same + direction the daemon will use it (peer name -> AgentId for Handle + construction).""" + a = _agent_id('worker-a') + b = _agent_id('worker-b') + c = _agent_id('coordinator') + transport = _FakeTransport(rounds=[[a, b, c]]) + result = asyncio.run( + discover_peer_agent_ids( + transport, ['worker-a', 'worker-b'], + agent_class=object, timeout_s=1.0, + ), + ) + assert result == {'worker-a': a, 'worker-b': b} + # Did NOT include the un-requested coordinator -- filtering by name + # is what keeps cross-operator agents on the shared hosted exchange + # from leaking into each other's peer dicts. + assert 'coordinator' not in result + + +def test_discover_peer_agent_ids_waits_for_late_peers() -> None: + """The federated convergence story: site A registers at t=0 and + polls; site B doesn't register its agent until poll #3; the helper + must keep polling and succeed once B is visible. This is the + behavior that lets operators bring sites up in any order.""" + a = _agent_id('worker-a') + b = _agent_id('worker-b') + rounds = [ + [a], # poll 1: only A visible + [a], # poll 2: still waiting for B + [a, b], # poll 3: B comes up + ] + transport = _FakeTransport(rounds=rounds) + result = asyncio.run( + discover_peer_agent_ids( + transport, ['worker-a', 'worker-b'], + agent_class=object, + timeout_s=2.0, + poll_interval_s=0.01, # keep the test fast + ), + ) + assert result == {'worker-a': a, 'worker-b': b} + + +def test_discover_peer_agent_ids_times_out_naming_missing_peers() -> None: + """When a remote site never shows up, the helper must raise with a + message that names the missing peers. 
Operators reading the log + should immediately know which site to bring up / debug.""" + transport = _FakeTransport(rounds=[[_agent_id('worker-a')]]) + with pytest.raises(TimeoutError, match=r"missing=\['coordinator', 'worker-b'\]"): + asyncio.run( + discover_peer_agent_ids( + transport, ['worker-a', 'worker-b', 'coordinator'], + agent_class=object, + timeout_s=0.05, + poll_interval_s=0.01, + ), ) -def test_registration_payload_rejects_empty_registrations() -> None: - with pytest.raises(ValueError, match='at least one registration'): - registration_payload(run_token='token-1', registrations={}) +def test_discover_peer_agent_ids_skips_already_found_peers_on_re_poll() -> None: + """If poll N saw peer A, poll N+1 must not overwrite A's AgentId + even if discover returns a fresh AgentId object with the same name + (which the hosted exchange doesn't actually do, but the helper's + behavior shouldn't depend on that). Pin the 'first found wins' + invariant explicitly.""" + a_first = _agent_id('worker-a') + a_again = _agent_id('worker-a') # different uuid, same name + b = _agent_id('worker-b') + rounds = [ + [a_first], + [a_again, b], + ] + transport = _FakeTransport(rounds=rounds) + result = asyncio.run( + discover_peer_agent_ids( + transport, ['worker-a', 'worker-b'], + agent_class=object, + timeout_s=2.0, + poll_interval_s=0.01, + ), + ) + assert result['worker-a'] is a_first + assert result['worker-b'] is b From f6e492bd1de5839b2e8d42922d50a76fe3a92f41 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 09:04:59 -0500 Subject: [PATCH 06/25] feat(academy): bootstrap subcommand for federated campaign kickoff The federated piece the operator runs after every site is up. In a single-machine campaign, rank 0 of the daemon dispatches the ``campaign`` -> ``initial_agent`` bootstrap message in-process; in a federated campaign that's impossible because the agent that owns ``initial_agent`` may live on a different machine that hasn't even come up yet. 
``spawn-site`` already skips the inline dispatch (``--no-bootstrap``). This commit adds the matching standalone command that triggers the kickoff at the right moment from anywhere with the cached Globus token. UX: chemgraph academy bootstrap -- \ --campaign federated-demo.jsonc \ --exchange-type http # or override the recipient for partial re-runs / debugging chemgraph academy bootstrap -- \ --campaign federated-demo.jsonc \ --recipient worker-a \ --exchange-type http runtime/bootstrap.py (new) - ``parse_args``: --campaign (required), --recipient (defaults to campaign.initial_agent), --exchange-type (defaults to 'http' since that's the main use case), --http-exchange-url override, redis triple for the local-broker case, --discover-timeout-s. - ``dispatch_bootstrap``: opens a user client on the configured exchange, discovers the recipient by name via the shared ``discover_peer_agent_ids`` helper, sends one ``receive_message`` action, closes the client (also on error so the aiohttp session backing the http transport doesn't leak). - ``main``: returns exit code 2 with a stderr message when the recipient never appears on the exchange, so wrapping shell scripts can branch on "bootstrap didn't actually happen." cli/main.py - ``academy bootstrap`` subparser + dispatch in ``_handle_academy``. - Usage hint updated to include the new command.
Tests (6 new, 68/68 academy sweep) - parse_args: --campaign required, exchange-type default, recipient override - dispatch_bootstrap: happy-path discovery + handle action, sender / recipient / message_id consistency, campaign user_task embedded in the dispatched content, client closed on success - dispatch_bootstrap: client closed on timeout too (no Handle construction attempted when discovery fails) - main: returns 2 on TimeoutError and writes the missing recipient name to stderr Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/runtime/bootstrap.py | 209 +++++++++++++++++++++ src/chemgraph/cli/main.py | 23 ++- tests/test_academy_bootstrap.py | 179 ++++++++++++++++++ 3 files changed, 410 insertions(+), 1 deletion(-) create mode 100644 src/chemgraph/academy/runtime/bootstrap.py create mode 100644 tests/test_academy_bootstrap.py diff --git a/src/chemgraph/academy/runtime/bootstrap.py b/src/chemgraph/academy/runtime/bootstrap.py new file mode 100644 index 0000000..f18a810 --- /dev/null +++ b/src/chemgraph/academy/runtime/bootstrap.py @@ -0,0 +1,209 @@ +"""Standalone campaign-bootstrap dispatch for federated runs. + +In single-machine campaigns rank 0 of the daemon dispatches the +``campaign`` -> ``initial_agent`` bootstrap message in-process as the +last step of startup. The federated ``spawn-site`` flow can't do +that: at startup time the agent that owns ``initial_agent`` may live +on a different machine that hasn't come up yet, so each site skips +the inline dispatch (``--no-bootstrap``) and the operator triggers +kickoff once every site is up by running ``chemgraph academy +bootstrap`` from anywhere with the cached Globus token. + +This module is intentionally light: it does not load a system profile, +does not need a run-dir, and does not invoke ``mpiexec``. It just +opens an exchange user-client, discovers the recipient by name, and +sends one message. 
+""" +from __future__ import annotations + +import argparse +import asyncio +import logging +import sys +from collections.abc import Sequence +from pathlib import Path +from typing import Any + +from academy.handle import Handle + +from chemgraph.academy.campaigns import resolve_campaign +from chemgraph.academy.core.agent import ChemGraphLogicalAgent +from chemgraph.academy.core.campaign import campaign_bootstrap_text +from chemgraph.academy.core.campaign import ChemGraphCampaign +from chemgraph.academy.core.campaign import ChemGraphDaemonConfig +from chemgraph.academy.core.campaign import load_campaign +from chemgraph.academy.core.campaign import namespace_for_run +from chemgraph.academy.core.peer_protocol import build_message +from chemgraph.academy.runtime.exchange import build_exchange_factory +from chemgraph.academy.runtime.exchange import SUPPORTED_EXCHANGE_TYPES +from chemgraph.academy.runtime.registration import discover_peer_agent_ids + +logger = logging.getLogger(__name__) + + +def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog='chemgraph academy bootstrap', + description=( + 'Dispatch the campaign bootstrap message to the initial agent. ' + 'Run this once every site of a federated campaign is up; the ' + 'recipient is looked up by name on the exchange.' + ), + ) + parser.add_argument( + '--campaign', required=True, + help='Campaign config (packaged name or path to campaign.jsonc).', + ) + parser.add_argument( + '--recipient', + help=( + "Name of the agent that should receive the bootstrap " + "message. Defaults to the campaign's ``initial_agent``." + ), + ) + parser.add_argument( + '--exchange-type', + choices=SUPPORTED_EXCHANGE_TYPES, + default='http', + help=( + "Academy exchange backend. Defaults to 'http' since " + "federated bootstrap is the main use case; pass 'local' / " + "'redis' / 'hybrid' if you're re-bootstrapping a single-" + "machine campaign for some reason." 
+ ), + ) + parser.add_argument( + '--http-exchange-url', + help='Override URL for --exchange-type=http (defaults to Academy-hosted).', + ) + parser.add_argument( + '--redis-host', default='127.0.0.1', + help='Redis host (only used for redis / hybrid exchanges).', + ) + parser.add_argument( + '--redis-port', type=int, default=6379, + help='Redis port (only used for redis / hybrid exchanges).', + ) + parser.add_argument( + '--redis-namespace', + help='Redis namespace (only used for hybrid; defaults from run-id).', + ) + parser.add_argument( + '--discover-timeout-s', type=float, default=120.0, + help=( + "How long to wait for the recipient agent to appear on the " + "exchange. Defaults to 2 minutes; bump it if a federated " + "site is slow to come up." + ), + ) + return parser.parse_args(argv) + + +def _config_for_factory(args: argparse.Namespace) -> ChemGraphDaemonConfig: + """Build the minimal DaemonConfig that ``build_exchange_factory`` reads. + + Most fields are unused for bootstrap and get throwaway values; what + matters is ``exchange_type``, ``http_exchange_url``, and the redis + triple. ``run_dir`` is a placeholder because the factory builder + only consults a couple of fields. 
+ """ + run_dir = Path.cwd() / '.bootstrap-tmp' + return ChemGraphDaemonConfig( + run_dir=run_dir, + run_token='bootstrap', + agent_count=0, + campaign_config=Path(args.campaign), + lm_config=run_dir / 'lm.json', + max_decisions=0, + poll_timeout_s=1.0, + idle_timeout_s=1.0, + startup_timeout_s=args.discover_timeout_s, + completion_timeout_s=1.0, + status_interval_s=1.0, + redis_host=args.redis_host, + redis_port=args.redis_port, + redis_namespace=args.redis_namespace or namespace_for_run(run_dir), + rank=0, + local_rank=0, + chemgraph_repo_root=Path.cwd(), + exchange_type=args.exchange_type, + http_exchange_url=args.http_exchange_url, + ) + + +async def dispatch_bootstrap( + *, + campaign: ChemGraphCampaign, + recipient: str, + exchange_factory: Any, + discover_timeout_s: float, +) -> str: + """Send the campaign bootstrap message to ``recipient`` over the exchange. + + Returns the dispatched message_id so the operator can correlate it + with what shows up on the recipient site's event log. 
+ """ + client = await exchange_factory.create_user_client( + name='chemgraph-bootstrap', + start_listener=False, + ) + try: + recipient_ids = await discover_peer_agent_ids( + client._transport, + [recipient], + agent_class=ChemGraphLogicalAgent, + timeout_s=discover_timeout_s, + ) + recipient_id = recipient_ids[recipient] + + message = build_message( + sender='campaign', + recipient=recipient, + content=campaign_bootstrap_text(campaign), + kind='message', + tldr='Campaign bootstrap', + reason='Initial campaign task dispatch (operator-triggered).', + confidence=1.0, + ) + handle: Handle[Any] = Handle(recipient_id) + await handle.action('receive_message', message) + logger.info( + 'Bootstrap message dispatched: recipient=%s message_id=%s', + recipient, message['message_id'], + ) + return message['message_id'] + finally: + await client.close() + + +def main(argv: Sequence[str] | None = None) -> int: + logging.basicConfig(level=logging.INFO, format='%(message)s') + args = parse_args(argv) + + campaign_path = resolve_campaign(args.campaign) + if not campaign_path.exists(): + campaign_path = Path(args.campaign).resolve() + campaign = load_campaign(campaign_path) + + recipient = args.recipient or campaign.initial_agent + config = _config_for_factory(args) + factory = build_exchange_factory(config) + + try: + message_id = asyncio.run( + dispatch_bootstrap( + campaign=campaign, + recipient=recipient, + exchange_factory=factory, + discover_timeout_s=args.discover_timeout_s, + ), + ) + except TimeoutError as exc: + print(f'bootstrap failed: {exc}', file=sys.stderr) + return 2 + print(f'ok: sent bootstrap to {recipient} (message_id={message_id})') + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/chemgraph/cli/main.py b/src/chemgraph/cli/main.py index 32e3680..0c97c29 100644 --- a/src/chemgraph/cli/main.py +++ b/src/chemgraph/cli/main.py @@ -302,6 +302,20 @@ def create_argument_parser() -> argparse.ArgumentParser: ), ) + bootstrap_parser = 
academy_sub.add_parser( + "bootstrap", + help=( + "Dispatch the campaign bootstrap message to the initial agent " + "via the exchange. Run after every site of a federated " + "campaign is up; the recipient is discovered by name." + ), + ) + bootstrap_parser.add_argument( + "bootstrap_args", + nargs=argparse.REMAINDER, + help="Arguments forwarded to chemgraph.academy.runtime.bootstrap.", + ) + dashboard_parser = academy_sub.add_parser( "dashboard", help="Start the local dashboard launcher for a ChemGraph Academy run.", @@ -657,6 +671,13 @@ def _handle_academy(args: argparse.Namespace) -> None: if code: sys.exit(code) return + if command == "bootstrap": + from chemgraph.academy.runtime.bootstrap import main as bootstrap_main + + code = bootstrap_main(_strip_remainder_separator(args.bootstrap_args)) + if code: + sys.exit(code) + return if command == "campaigns": from chemgraph.academy.campaigns import list_campaigns @@ -665,7 +686,7 @@ def _handle_academy(args: argparse.Namespace) -> None: return console.print( "Usage: chemgraph academy " - "{mpi-daemon,run-compute,spawn-site,dashboard,campaigns}.", + "{mpi-daemon,run-compute,spawn-site,bootstrap,dashboard,campaigns}.", ) diff --git a/tests/test_academy_bootstrap.py b/tests/test_academy_bootstrap.py new file mode 100644 index 0000000..2b9e70a --- /dev/null +++ b/tests/test_academy_bootstrap.py @@ -0,0 +1,179 @@ +from __future__ import annotations + +import asyncio +from typing import Any +from unittest.mock import AsyncMock + +import pytest +from academy.identifier import AgentId + +from chemgraph.academy.runtime import bootstrap + + +# --------------------------------------------------------------------------- +# parse_args -- CLI surface +# --------------------------------------------------------------------------- + + +def test_parse_args_requires_campaign() -> None: + """``--campaign`` is the only field that doesn't have a default -- + bootstrap is useless without knowing which campaign's + ``user_task`` to 
send.""" + with pytest.raises(SystemExit): + bootstrap.parse_args([]) + + +def test_parse_args_defaults_exchange_type_to_http() -> None: + """Federated bootstrap is the main use case so http is the right + default. Operators on single-machine runs can override but they + rarely need this command at all (run-compute auto-bootstraps).""" + args = bootstrap.parse_args(['--campaign', 'mace-ensemble-screening-20']) + assert args.exchange_type == 'http' + assert args.recipient is None # defaults to campaign.initial_agent + assert args.discover_timeout_s == pytest.approx(120.0) + + +def test_parse_args_accepts_recipient_override() -> None: + """Operator should be able to bootstrap a non-initial agent for + e.g. partial re-runs or debugging.""" + args = bootstrap.parse_args([ + '--campaign', 'foo.jsonc', + '--recipient', 'worker-a', + ]) + assert args.recipient == 'worker-a' + + +# --------------------------------------------------------------------------- +# dispatch_bootstrap -- the core async path +# --------------------------------------------------------------------------- + + +class _FakeTransport: + """``transport.discover()`` returns a fixed agent list.""" + def __init__(self, agents): + self._agents = tuple(agents) + + async def discover(self, agent_class): # noqa: ARG002 - sig match only + return self._agents + + +class _FakeClient: + def __init__(self, transport): + self._transport = transport + self.close = AsyncMock() + + +class _FakeFactory: + def __init__(self, client): + self._client = client + + async def create_user_client(self, *, name, start_listener): # noqa: ARG002 + return self._client + + +class _FakeCampaign: + """Minimal stand-in for ChemGraphCampaign with just what + ``campaign_bootstrap_text`` reads. 
def test_dispatch_bootstrap_sends_one_message_to_discovered_recipient(
    monkeypatch,
) -> None:
    """Happy path for ``dispatch_bootstrap``.

    The recipient is discoverable on the (fake) exchange, so exactly one
    ``receive_message`` action is issued against it, the returned
    message id matches the one that was sent, and the client is closed.
    """
    coordinator = AgentId.new('coordinator-agent')
    bystander = AgentId.new('other-agent')
    fake_client = _FakeClient(_FakeTransport(agents=[coordinator, bystander]))
    fake_factory = _FakeFactory(fake_client)

    recorded: list[tuple[Any, str, dict]] = []

    class _RecordingHandle:
        """Stand-in for ``Handle`` that records every action call."""

        def __init__(self, agent_id):
            self._agent_id = agent_id

        async def action(self, name, message):
            recorded.append((self._agent_id, name, message))

    monkeypatch.setattr(bootstrap, 'Handle', _RecordingHandle)

    returned_id = asyncio.run(
        bootstrap.dispatch_bootstrap(
            campaign=_FakeCampaign(),
            recipient='coordinator-agent',
            exchange_factory=fake_factory,
            discover_timeout_s=1.0,
        ),
    )

    assert len(recorded) == 1
    handle_target, action_name, envelope = recorded[0]
    assert handle_target is coordinator
    assert action_name == 'receive_message'
    assert envelope['recipient'] == 'coordinator-agent'
    assert envelope['sender'] == 'campaign'
    assert envelope['message_id'] == returned_id
    # The bootstrap content carries the campaign's user_task text, which
    # is what the recipient agent parses on its first round.
    assert 'do the thing' in envelope['content']
    # Closing on success matters: the http exchange transport is backed
    # by an aiohttp session that would otherwise leak.
    fake_client.close.assert_awaited_once()


def test_dispatch_bootstrap_closes_client_on_discover_timeout(
    monkeypatch,
) -> None:
    """A recipient that never shows up must produce ``TimeoutError``.

    The client still has to be closed on that path so the underlying
    network resources are not leaked.
    """
    fake_client = _FakeClient(
        _FakeTransport(agents=[AgentId.new('other-agent')]),
    )
    fake_factory = _FakeFactory(fake_client)

    def _forbid_handle(agent_id):
        # No Handle may be constructed when discovery times out.
        return pytest.fail("Handle must not be built on timeout")

    monkeypatch.setattr(bootstrap, 'Handle', _forbid_handle)

    with pytest.raises(TimeoutError):
        asyncio.run(
            bootstrap.dispatch_bootstrap(
                campaign=_FakeCampaign(),
                recipient='coordinator-agent',
                exchange_factory=fake_factory,
                discover_timeout_s=0.05,
            ),
        )
    fake_client.close.assert_awaited_once()
+ monkeypatch.setattr(bootstrap, 'load_campaign', + lambda path: _FakeCampaign()) + monkeypatch.setattr(bootstrap, 'build_exchange_factory', + lambda config: None) + + code = bootstrap.main([ + '--campaign', 'mace-ensemble-screening-20', + '--exchange-type', 'local', + ]) + assert code == 2 + err = capsys.readouterr().err + assert 'coordinator-agent' in err From e1492c6ec731df3b15e4f56ff8f13bed90fa28b0 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 09:20:32 -0500 Subject: [PATCH 07/25] feat(academy/dashboard): multi-site launcher + merged event view MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The piece that lets one Mac terminal serve a federated campaign running across multiple HPC sites. Operator runs ONE dashboard command with ``--system aurora,crux``; the launcher spins up per-site SSH tunnels + UAN relays + rsync mirrors, and the server walks the merged subdir tree to render one timeline tagged by site. runtime/dashboard_launcher.py - ``--system`` now takes a comma-list ('aurora,crux'). Single-site invocations are unchanged (the value resolves to a 1-tuple and the rest of the launcher uses the same per-site setup helper). - Per-site setup extracted into ``_setup_site`` returning a ``_SiteHandle`` carrying everything the cleanup finally needs. ``main`` loops over the resolved tuple; failure on any site triggers teardown of the partially-set-up sites. - Each site gets its own reverse-port (base + site_index) so two SSH ``-R`` tunnels don't collide on the Mac side. - Multi-site mode rejects scalar overrides that can't sensibly apply to every site (--remote-host, --ssh-control-path, --relay-port, --lm-base-url, --local-run-dir). Operators encode site differences in the profile JSON instead. - Single-site mirror layout unchanged (``//``); multi-site mirrors under ``///``. 
dashboard/server.py - ``_iter_site_dirs`` detects layout: if ``events.jsonl`` is at the top level it's legacy single-site; otherwise walk subdirs and treat each as a site if it has ``events.jsonl`` OR ``dashboard_metadata.json``. The metadata check covers the early-startup window where a site is up but no events have been written yet, so federated dashboards don't briefly look like "empty single-site". - ``events_payload``: legacy shape preserved for single-site; federated merges sites in timestamp order with a ``site`` tag on each event so the UI can color/group per-site. - ``status_payload``: legacy keys preserved for single-site; federated nests per-site status/placement/summary under ``sites: {: ...}`` with a top-level ``updated`` reflecting the latest per-site update. Tests (+10, 78/78 academy sweep) - _iter_site_dirs: recognizes metadata-only sites; falls back to single-site for empty dirs - events_payload: merges + tags by site; timestamp-sorted output even when sites are seeded reverse-order - status_payload: nests under ``sites`` for federated, preserves legacy keys for single-site (regression guard against an accidental "make them uniform" refactor) - _parse_systems_list: single name, comma-list with whitespace, rejects empty, rejects duplicates Aurora ⇄ Crux demo runbook (operator runs once both sites have a system profile in the repo): # Mac terminal A chemgraph academy dashboard -- federated-demo-001 \\ --system aurora,crux --campaign federated-demo.jsonc # Aurora compute chemgraph academy spawn-site -- \\ --system aurora --campaign federated-demo.jsonc \\ --agents coordinator-agent --exchange-type http # Crux compute chemgraph academy spawn-site -- \\ --system crux --campaign federated-demo.jsonc \\ --agents worker-a --exchange-type http # Mac terminal B chemgraph academy bootstrap -- \\ --campaign federated-demo.jsonc --exchange-type http (Crux profile JSON is still TODO -- pre-requisite for the actual demo, not for the dashboard code.) 
def _iter_site_dirs(run_dir: Path) -> list[tuple[str | None, Path]]:
    """List the ``(site_name, site_dir)`` pairs the dashboard reads from.

    Legacy single-site layout: ``run_dir/events.jsonl`` exists, so the
    result is ``[(None, run_dir)]`` (this wins even if subdirs exist).

    Federated layout: ``run_dir`` has no top-level ``events.jsonl``.
    Every immediate subdirectory holding either ``events.jsonl`` or
    ``dashboard_metadata.json`` is reported as ``(subdir.name, subdir)``,
    in sorted order. Accepting the metadata file alone lets a site that
    is up but has not emitted an event yet still be recognised.

    When nothing matches (missing dir, empty dir, unrelated subdirs) the
    directory is treated as single-site so a freshly created run does
    not break the dashboard.
    """
    legacy_layout: list[tuple[str | None, Path]] = [(None, run_dir)]
    if (run_dir / "events.jsonl").exists():
        return legacy_layout
    if not run_dir.is_dir():
        return legacy_layout

    site_markers = ("events.jsonl", "dashboard_metadata.json")
    discovered: list[tuple[str | None, Path]] = [
        (entry.name, entry)
        for entry in sorted(run_dir.iterdir())
        if entry.is_dir()
        and any((entry / marker).exists() for marker in site_markers)
    ]
    return discovered or legacy_layout
def _event_sort_key(event: dict[str, Any]) -> float:
    """Return the numeric sort key for one serialized event.

    Reads ``timestamp`` first and falls back to ``time`` only when the
    former is absent (``None``) or an empty string. A value of ``0`` is
    a real timestamp and is kept -- a plain ``a or b`` fallback would
    wrongly treat it as missing. Missing, unparseable, or NaN values map
    to ``+inf`` so those events sink to the end of the merged stream
    instead of disturbing the order of well-formed ones.
    """
    for field in ("timestamp", "time"):
        raw = event.get(field)
        if raw is None or raw == "":
            continue
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return float("inf")
        # NaN compares false with everything and would make the sort
        # order ill-defined; treat it like a missing timestamp.
        return float("inf") if value != value else value
    return float("inf")


def events_payload(run_dir: Path) -> dict[str, Any]:
    """Build the ``/events`` payload for ``run_dir``.

    Single-site / legacy layout: returns the events from
    ``run_dir/events.jsonl`` unchanged (no per-event ``site`` key), so
    the pre-federation payload shape is preserved.

    Federated layout: reads every site's ``events.jsonl``, tags each
    event with ``site`` and returns one stream sorted by timestamp. The
    sort is stable, so events with equal (or missing) timestamps keep
    their per-site read order.
    """
    sites = _iter_site_dirs(run_dir)

    if len(sites) == 1 and sites[0][0] is None:
        events = [
            event.model_dump(mode="json")
            for event in read_events(run_dir / "events.jsonl")
        ]
        return {
            "run_dir": str(run_dir),
            "events": events,
        }

    merged: list[dict[str, Any]] = []
    for site_name, site_dir in sites:
        for event in read_events(site_dir / "events.jsonl"):
            payload = event.model_dump(mode="json")
            payload["site"] = site_name
            merged.append(payload)
    merged.sort(key=_event_sort_key)
    return {
        "run_dir": str(run_dir),
        "events": merged,
    }
def _parse_systems_list(raw: str) -> tuple[str, ...]:
    """Split a comma-separated ``--system`` value into profile names.

    Surrounding whitespace is stripped and empty items (for example from
    a trailing comma) are dropped. Raises ``argparse.ArgumentTypeError``
    when no name remains or when a name is repeated, so the problem is
    reported while argparse resolves the option rather than later in the
    per-site setup loop.
    """
    names = tuple(filter(None, map(str.strip, raw.split(','))))
    if not names:
        raise argparse.ArgumentTypeError(
            "--system requires at least one profile name",
        )
    has_duplicates = len(names) != len(set(names))
    if has_duplicates:
        raise argparse.ArgumentTypeError(
            f"--system has duplicate profile names: {names}",
        )
    return names
In multi-site mode each site " + "gets reverse_port + offset (offset = i for the i-th system)." + ), + ) a("--relay-port", type=int) a("--relay-python") a("--rsync-interval-s", type=float, default=2.0) @@ -39,7 +99,31 @@ def parse_args() -> argparse.Namespace: a("--local", action="store_true", help="Only serve an already mirrored local run.") a("--no-dashboard", action="store_true") a("--overwrite-run", action="store_true") - return p.parse_args() + args = p.parse_args() + # Per-site override flags only make sense in single-site mode -- + # in multi-site they'd silently apply to all sites and almost + # always be wrong (e.g. one Aurora remote_host doesn't fit Crux). + # Force operators to encode site-specific quirks in the profile JSON. + if len(args.system) > 1: + forbidden = [ + (name, getattr(args, attr)) + for name, attr in ( + ("--remote-host", "remote_host"), + ("--ssh-control-path", "ssh_control_path"), + ("--relay-port", "relay_port"), + ("--lm-base-url", "lm_base_url"), + ("--local-run-dir", "local_run_dir"), + ) + if getattr(args, attr) + ] + if forbidden: + names = ", ".join(flag for flag, _ in forbidden) + p.error( + f"multi-site --system rejects single-site overrides {names}; " + f"encode per-site differences in the system profile JSON " + f"instead.", + ) + return args def template(name: str) -> str: return files("chemgraph.academy.runtime.templates").joinpath(name).read_text() @@ -131,16 +215,187 @@ def compute_lines(profile: SystemProfile, wrapper_path: str, run_id: str, campai lines = [" module use /soft/modulefiles", " module load conda", " conda activate base"] if profile.name == "polaris" else [" module load frameworks"] return lines + [f" source {profile.remote_root}/venvs/academy-swarm/bin/activate", f" export PATH={profile.remote_root}/bin:$PATH", " chemgraph academy run-compute \\", f" --system {profile.name} \\", f" --run-id {run_id} \\", f" --campaign {campaign}", "", "If PATH is not configured, use:", f" {wrapper_path} \\", f" --system 
def _resolve_local_run_root(args: argparse.Namespace) -> Path:
    """Return the top-level dashboard directory on the local machine.

    ``--local-run-dir`` wins when given; otherwise the root is
    ``<local_mirror_root>/<run_id>/``.

    Single-site mode mirrors straight into that directory (the
    pre-multi-site layout). In multi-site mode the same directory is a
    PARENT holding one subdirectory per site
    (``<local_mirror_root>/<run_id>/<system>/``), which the dashboard
    server walks and merges.
    """
    if args.local_run_dir:
        return Path(args.local_run_dir).expanduser()
    return (Path(args.local_mirror_root) / args.run_id).expanduser()


def _site_mirror_dir(
    local_run_root: Path,
    profile_name: str,
    *,
    multi_site: bool,
) -> Path:
    """Return the rsync target for one site.

    Multi-site runs mirror into ``local_run_root/<profile_name>``;
    single-site runs mirror directly into ``local_run_root``.
    """
    return local_run_root / profile_name if multi_site else local_run_root


def _setup_site(
    *,
    profile_name: str,
    args: argparse.Namespace,
    local_run_root: Path,
    multi_site: bool,
    site_index: int,
    stop: threading.Event,
) -> _SiteHandle:
    """Bring up one site's ControlMaster, UAN relay and rsync mirror.

    Used for both single-site (``multi_site=False``) and federated
    launches. ``site_index`` offsets ``--reverse-port`` so concurrent
    SSH ``-R`` tunnels do not collide on the local side. ``stop`` is the
    shared event that ends the rsync loop.

    Returns a ``_SiteHandle`` with everything ``_teardown_site`` needs.

    If any step fails after the ControlMaster or relay has been started,
    the partially set-up site is torn down here before the exception is
    re-raised: the caller only learns about a site through the returned
    handle, so it could not clean this one up itself.
    """
    profile = load_system_profile(profile_name)
    remote_host = args.remote_host or profile.remote_host
    control_path = (
        args.ssh_control_path
        or str(Path.home() / f".ssh/{profile.name}-dashboard-%r@%h:%p")
    )
    relay_port = args.relay_port or profile.relay_port
    remote_run_dir = f"{profile.run_root}/{args.run_id}"
    local_mirror_dir = _site_mirror_dir(local_run_root, profile.name, multi_site=multi_site)
    site = _SiteHandle(
        profile=profile,
        remote_host=remote_host,
        control_path=control_path,
        local_mirror_dir=local_mirror_dir,
        relay_port=relay_port,
    )

    try:
        Path(control_path).expanduser().parent.mkdir(parents=True, exist_ok=True)
        # Only start (and later own) a ControlMaster when none is alive.
        if ssh(remote_host, None, control_path=control_path, extra=["-O", "check"], check=False, batch_mode=False).returncode != 0:
            print(f"[{profile.name}] Starting SSH ControlMaster for {remote_host}...", flush=True)
            ssh(remote_host, None, control_path=control_path, extra=["-M", "-N", "-f", "-o", "ControlMaster=yes"], batch_mode=False)
            site.started_master = True

        if args.overwrite_run:
            if not args.run_id or "/" in args.run_id or args.run_id in {".", ".."}:
                raise RuntimeError(f"Refusing to overwrite unsafe run id: {args.run_id!r}")
            print(f"[{profile.name}] Deleting existing run artifacts (--overwrite-run):", flush=True)
            print(f" remote: {remote_host}:{remote_run_dir}", flush=True)
            print(f" local: {local_mirror_dir}", flush=True)
            # Remote side: move the run dir into a trash dir first, then
            # retry the rm with back-off, and finally recreate an empty
            # run dir.
            delete = f"set -euo pipefail; run_root={shlex.quote(profile.run_root)}; run_id={shlex.quote(args.run_id)}; case \"$run_id\" in \"\"|.|..|*/*) echo \"unsafe run id\" >&2; exit 2;; esac; run_dir=\"$run_root/$run_id\"; trash_root=\"$run_root/.deleted-runs\"; if [ -e \"$run_dir\" ]; then mkdir -p \"$trash_root\"; trash_dir=\"$trash_root/${{run_id}}.$(date +%Y%m%d%H%M%S).$$\"; mv -- \"$run_dir\" \"$trash_dir\"; for delay in 0 1 2 5 10; do sleep \"$delay\"; if rm -rf -- \"$trash_dir\" 2>/dev/null; then break; fi; done; fi; mkdir -p \"$run_dir\""
            ssh(remote_host, delete, control_path=control_path)
            if local_mirror_dir.exists():
                shutil.rmtree(local_mirror_dir)

        wrapper_path = f"{profile.remote_root}/bin/chemgraph-academy-run"
        print(f"[{profile.name}] Installing compute wrapper at {wrapper_path}...", flush=True)
        ssh(remote_host, f"mkdir -p {shlex.quote(profile.remote_root + '/bin')} && cat > {shlex.quote(wrapper_path)} && chmod +x {shlex.quote(wrapper_path)}", control_path=control_path, input_text=wrapper(profile))

        relay_host = None
        if args.lm_connect == "mac-argo-relay":
            # Each site gets its own reverse port (base + site_index) so
            # two SSH -R tunnels don't fight over the same local port.
            # The relay port passed to start_relay is unchanged.
            per_site_args = argparse.Namespace(**vars(args))
            per_site_args.reverse_port = args.reverse_port + site_index
            print(f"[{profile.name}] Staging UAN relay script...", flush=True)
            relay_script = stage_relay_script(profile, remote_host, control_path)
            print(f"[{profile.name}] Starting UAN relay through {remote_host} (reverse port {per_site_args.reverse_port})...", flush=True)
            relay_log = Path(f"/tmp/chemgraph-academy-{args.run_id}-{profile.name}-relay.log")
            site.relay_process = start_relay(
                profile, remote_host, control_path, per_site_args,
                relay_port, args.relay_python or profile.venv_python,
                relay_log, relay_script,
            )
            relay_host = wait_relay(profile, remote_host, control_path, relay_port, site.relay_process, relay_log)
            site.relay_host = relay_host

        lm_base_url = (
            f"http://{relay_host}:{relay_port}/argoapi/v1"
            if relay_host else str(args.lm_base_url)
        )
        site.lm_base_url = lm_base_url
        print(f"[{profile.name}] Compute-node LM URL: {lm_base_url}", flush=True)

        metadata: dict[str, Any] = {
            "created_at": time.time(),
            "created_by": "chemgraph academy dashboard",
            "run_id": args.run_id,
            "system": profile.name,
            "campaign": args.campaign,
            "remote_run_dir": remote_run_dir,
            "remote_host": remote_host,
            "lm_connect": args.lm_connect,
            "lm_base_url": lm_base_url,
            "workspace_root": profile.remote_root,
            "chemgraph_repo_root": profile.repo_root,
        }
        if relay_host:
            metadata.update({"relay_host": relay_host, "relay_port": relay_port})
        print(f"[{profile.name}] Writing run metadata: {remote_host}:{remote_run_dir}/dashboard_metadata.json", flush=True)
        ssh(remote_host, f"mkdir -p {shlex.quote(remote_run_dir)} && cat > {shlex.quote(remote_run_dir + '/dashboard_metadata.json')}", control_path=control_path, input_text=json.dumps(metadata, indent=2) + "\n")

        print(f"[{profile.name}] Starting rsync mirror:", flush=True)
        print(f" {remote_host}:{remote_run_dir}/", flush=True)
        print(f" {local_mirror_dir}/", flush=True)
        start_rsync(remote_host, control_path, remote_run_dir, local_mirror_dir, args.rsync_interval_s, stop)

        print(f"\n[{profile.name}] Compute-node command:", flush=True)
        print("\n".join(compute_lines(profile, wrapper_path, args.run_id, args.campaign)), flush=True)
    except BaseException:
        # BaseException so Ctrl-C during relay start-up is covered too.
        # Without this, a ControlMaster backgrounded with ``-f`` and the
        # relay subprocess would outlive the launcher.
        _teardown_site(site, keep_ssh_master=args.keep_ssh_master)
        raise

    return site


def _teardown_site(site: _SiteHandle, *, keep_ssh_master: bool) -> None:
    """Stop one site's relay process and, if owned, its ControlMaster.

    The relay is terminated and given 5 seconds before being killed.
    The SSH master is closed only when this launcher started it and
    ``keep_ssh_master`` is false.
    """
    if site.relay_process is not None and site.relay_process.poll() is None:
        site.relay_process.terminate()
        try:
            site.relay_process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            site.relay_process.kill()
    if site.started_master and not keep_ssh_master:
        ssh(
            site.remote_host, None,
            control_path=site.control_path,
            extra=["-O", "exit"], check=False, batch_mode=False,
        )
Without this, "aurora" would iterate + # character-by-character. + systems: tuple[str, ...] = ( + (args.system,) if isinstance(args.system, str) else tuple(args.system) + ) + multi_site = len(systems) > 1 campaign_launch_defaults(args.campaign) - local_run_dir = Path(args.local_run_dir or Path(args.local_mirror_root) / args.run_id).expanduser() - local_run_dir.mkdir(parents=True, exist_ok=True) + local_run_root = _resolve_local_run_root(args) + local_run_root.mkdir(parents=True, exist_ok=True) + if args.local: if args.overwrite_run: raise RuntimeError("--overwrite-run cannot be used with --local") - return 0 if args.no_dashboard else serve_dashboard(run_dir=local_run_dir, host=args.dashboard_host, port=args.dashboard_port) + # Dashboard server walks the tree either way -- single-site + # mirror dir or multi-site parent both work as inputs. + return 0 if args.no_dashboard else serve_dashboard( + run_dir=local_run_root, + host=args.dashboard_host, port=args.dashboard_port, + ) + if args.lm_connect == "direct" and not args.lm_base_url: raise RuntimeError("--lm-connect direct requires --lm-base-url") if args.lm_connect == "mac-argo-relay": @@ -151,68 +406,38 @@ def main() -> int: except (OSError, urllib.error.URLError, urllib.error.HTTPError) as exc: raise RuntimeError("Local argo-shim is not reachable. 
Start it before using --lm-connect mac-argo-relay.") from exc - remote_host = args.remote_host or profile.remote_host - control_path = args.ssh_control_path or str(Path.home() / f".ssh/{profile.name}-dashboard-%r@%h:%p") - relay_port = args.relay_port or profile.relay_port - remote_run_dir = f"{profile.run_root}/{args.run_id}" - relay_process: subprocess.Popen[str] | None = None stop = threading.Event() - started_master = False + sites: list[_SiteHandle] = [] try: - Path(control_path).expanduser().parent.mkdir(parents=True, exist_ok=True) - if ssh(remote_host, None, control_path=control_path, extra=["-O", "check"], check=False, batch_mode=False).returncode != 0: - print(f"Starting SSH ControlMaster for {remote_host}...", flush=True) - ssh(remote_host, None, control_path=control_path, extra=["-M", "-N", "-f", "-o", "ControlMaster=yes"], batch_mode=False) - started_master = True - if args.overwrite_run: - if not args.run_id or "/" in args.run_id or args.run_id in {".", ".."}: - raise RuntimeError(f"Refusing to overwrite unsafe run id: {args.run_id!r}") - print("Deleting existing run artifacts because --overwrite-run was set:", flush=True) - print(f" remote: {remote_host}:{remote_run_dir}", flush=True) - print(f" local: {local_run_dir}", flush=True) - delete = f"set -euo pipefail; run_root={shlex.quote(profile.run_root)}; run_id={shlex.quote(args.run_id)}; case \"$run_id\" in \"\"|.|..|*/*) echo \"unsafe run id\" >&2; exit 2;; esac; run_dir=\"$run_root/$run_id\"; trash_root=\"$run_root/.deleted-runs\"; if [ -e \"$run_dir\" ]; then mkdir -p \"$trash_root\"; trash_dir=\"$trash_root/${{run_id}}.$(date +%Y%m%d%H%M%S).$$\"; mv -- \"$run_dir\" \"$trash_dir\"; for delay in 0 1 2 5 10; do sleep \"$delay\"; if rm -rf -- \"$trash_dir\" 2>/dev/null; then break; fi; done; fi; mkdir -p \"$run_dir\"" - ssh(remote_host, delete, control_path=control_path) - if local_run_dir.exists(): - shutil.rmtree(local_run_dir) - wrapper_path = f"{profile.remote_root}/bin/chemgraph-academy-run" - 
print(f"Installing compute wrapper at {wrapper_path}...", flush=True) - ssh(remote_host, f"mkdir -p {shlex.quote(profile.remote_root + '/bin')} && cat > {shlex.quote(wrapper_path)} && chmod +x {shlex.quote(wrapper_path)}", control_path=control_path, input_text=wrapper(profile)) - relay_host = None - if args.lm_connect == "mac-argo-relay": - print(f"Staging UAN relay script under {profile.remote_root}/{REMOTE_RELAY_SUBPATH}...", flush=True) - relay_script = stage_relay_script(profile, remote_host, control_path) - print(f"Starting {profile.name} UAN relay through {remote_host}...", flush=True) - relay_process = start_relay(profile, remote_host, control_path, args, relay_port, args.relay_python or profile.venv_python, Path(f"/tmp/chemgraph-academy-{args.run_id}-relay.log"), relay_script) - relay_host = wait_relay(profile, remote_host, control_path, relay_port, relay_process, Path(f"/tmp/chemgraph-academy-{args.run_id}-relay.log")) - lm_base_url = f"http://{relay_host}:{relay_port}/argoapi/v1" if relay_host else str(args.lm_base_url) - print(f"Compute-node LM URL: {lm_base_url}", flush=True) - metadata = {"created_at": time.time(), "created_by": "chemgraph academy dashboard", "run_id": args.run_id, "system": profile.name, "campaign": args.campaign, "remote_run_dir": remote_run_dir, "remote_host": remote_host, "lm_connect": args.lm_connect, "lm_base_url": lm_base_url, "workspace_root": profile.remote_root, "chemgraph_repo_root": profile.repo_root} - if relay_host: - metadata.update({"relay_host": relay_host, "relay_port": relay_port}) - print(f"Writing run metadata: {remote_host}:{remote_run_dir}/dashboard_metadata.json", flush=True) - ssh(remote_host, f"mkdir -p {shlex.quote(remote_run_dir)} && cat > {shlex.quote(remote_run_dir + '/dashboard_metadata.json')}", control_path=control_path, input_text=json.dumps(metadata, indent=2) + "\n") - print("Starting rsync mirror:", flush=True) - print(f" {remote_host}:{remote_run_dir}/", flush=True) - print(f" {local_run_dir}/", 
flush=True) - start_rsync(remote_host, control_path, remote_run_dir, local_run_dir, args.rsync_interval_s, stop) - print("\nDashboard launcher is ready.\n", flush=True) - print(f"On the {profile.name} compute node, use:", flush=True) - print("\n".join(compute_lines(profile, wrapper_path, args.run_id, args.campaign)), flush=True) + for index, profile_name in enumerate(systems): + site = _setup_site( + profile_name=profile_name, + args=args, + local_run_root=local_run_root, + multi_site=multi_site, + site_index=index, + stop=stop, + ) + sites.append(site) + + print("\nDashboard launcher is ready.", flush=True) + if multi_site: + print(f"Federated mirror tree: {local_run_root}//", flush=True) + if args.no_dashboard: return 0 + print(f"\nStarting dashboard at http://{args.dashboard_host}:{args.dashboard_port}", flush=True) - print("Ctrl-C stops the local dashboard, rsync loop, and relay tunnel.", flush=True) - return serve_dashboard(run_dir=local_run_dir, host=args.dashboard_host, port=args.dashboard_port) + print("Ctrl-C stops the local dashboard, rsync loops, and relay tunnels.", flush=True) + return serve_dashboard( + run_dir=local_run_root, + host=args.dashboard_host, port=args.dashboard_port, + ) finally: stop.set() - if relay_process is not None and relay_process.poll() is None: - relay_process.terminate() - try: - relay_process.wait(timeout=5) - except subprocess.TimeoutExpired: - relay_process.kill() - if started_master and not args.keep_ssh_master: - ssh(remote_host, None, control_path=control_path, extra=["-O", "exit"], check=False, batch_mode=False) + for site in sites: + _teardown_site(site, keep_ssh_master=args.keep_ssh_master) + if __name__ == "__main__": raise SystemExit(main()) diff --git a/tests/test_academy_dashboard.py b/tests/test_academy_dashboard.py index 6f62f37..e60e4e4 100644 --- a/tests/test_academy_dashboard.py +++ b/tests/test_academy_dashboard.py @@ -2,6 +2,8 @@ import json +import pytest + import chemgraph.academy.dashboard as dashboard 
from chemgraph.academy.observability.event_log import EventLog @@ -137,3 +139,228 @@ def test_dashboard_ignores_legacy_trace_jsonl(tmp_path) -> None: ) assert dashboard.events_payload(run_dir)["events"] == [] + + +# --------------------------------------------------------------------------- +# B.4c: federated dashboard (multi-site subdir layout) +# --------------------------------------------------------------------------- + + +def _seed_site(site_dir, *, status_timestamp, events) -> None: + """Write a minimal per-site mirror: status.json + events.jsonl. + + Touches an empty events.jsonl even when ``events`` is empty so the + federated-dashboard detector recognizes the dir as a real site + (EventLog itself only creates the file on first emit, which is + too late for the iterator's existence check). + """ + site_dir.mkdir(parents=True) + (site_dir / "status.json").write_text( + json.dumps({"mode": "mpi_daemon", "timestamp": status_timestamp, "agents": []}) + + "\n", + encoding="utf-8", + ) + (site_dir / "events.jsonl").touch() + log = EventLog(site_dir / "events.jsonl") + for event_name, payload in events: + log.emit( + event_name, + agent_id=payload["agent_id"], + role="observer", + payload=payload, + ) + + +def test_events_payload_merges_sites_and_tags_each_event(tmp_path) -> None: + """Federated dashboard's core promise: pointed at a parent dir with + per-site subdirs, ``events_payload`` returns a single timestamp- + sorted stream where each event carries a ``site`` field. 
UI uses + that field to color-code per-site events in the merged view.""" + root = tmp_path / "federated-run" + _seed_site( + root / "aurora", + status_timestamp=10.0, + events=[ + ("agent_started", { + "agent_id": "agent-00", "role": "observer", + "placement": {"hostname": "aur1", "short_hostname": "aur1"}, + "hostname": "aur1", "short_hostname": "aur1", + }), + ], + ) + _seed_site( + root / "crux", + status_timestamp=20.0, + events=[ + ("agent_started", { + "agent_id": "agent-01", "role": "observer", + "placement": {"hostname": "crux1", "short_hostname": "crux1"}, + "hostname": "crux1", "short_hostname": "crux1", + }), + ], + ) + + payload = dashboard.events_payload(root) + + assert {e["site"] for e in payload["events"]} == {"aurora", "crux"} + # Both sites' agents are visible in the merged stream. + agents_by_site = {e["site"]: e["agent_id"] for e in payload["events"]} + assert agents_by_site == {"aurora": "agent-00", "crux": "agent-01"} + + +def test_events_payload_sorts_merged_stream_by_timestamp(tmp_path) -> None: + """Per-site clocks don't have to agree, but the merged dashboard + view must be readable -- order by event timestamp regardless of + which site emitted each.""" + root = tmp_path / "federated-run" + + # Sites are seeded in reverse-time order; the merge must still + # produce timestamp-ascending output. 
+ aurora_dir = root / "aurora" + aurora_dir.mkdir(parents=True) + (aurora_dir / "status.json").write_text("{}", encoding="utf-8") + aurora_log = EventLog(aurora_dir / "events.jsonl") + aurora_log.emit("agent_started", agent_id="ag-aur", role="r", payload={ + "agent_id": "ag-aur", "role": "r", + "placement": {"hostname": "h"}, "hostname": "h", + }) + + crux_dir = root / "crux" + crux_dir.mkdir(parents=True) + (crux_dir / "status.json").write_text("{}", encoding="utf-8") + crux_log = EventLog(crux_dir / "events.jsonl") + crux_log.emit("agent_started", agent_id="ag-crux", role="r", payload={ + "agent_id": "ag-crux", "role": "r", + "placement": {"hostname": "h"}, "hostname": "h", + }) + + events = dashboard.events_payload(root)["events"] + timestamps = [e["timestamp"] for e in events] + assert timestamps == sorted(timestamps) + + +def test_status_payload_nests_under_sites_for_federated_layout(tmp_path) -> None: + """Single-site clients use ``payload['status']`` / + ``payload['summary']`` etc directly. Federated clients want a + ``sites: {: {...}}`` shape so the UI can render per-site + sub-panels. Pin the structural difference so a future "make them + uniform" refactor must be a conscious choice.""" + root = tmp_path / "federated-run" + _seed_site( + root / "aurora", + status_timestamp=10.0, + events=[], + ) + _seed_site( + root / "crux", + status_timestamp=15.0, + events=[], + ) + + class Handler: + pass + handler = Handler() + handler.run_dir = root + + payload = dashboard.status_payload(handler) + assert "sites" in payload + assert set(payload["sites"]) == {"aurora", "crux"} + for site_name, site_payload in payload["sites"].items(): + assert "status" in site_payload + assert "summary" in site_payload + assert "placement" in site_payload + # Top-level 'updated' reflects the latest per-site update so the + # dashboard header has a meaningful timestamp. 
+ assert payload["updated"] == 15.0 + + +def test_status_payload_keeps_legacy_shape_for_single_site(tmp_path) -> None: + """Existing single-site dashboard clients must see exactly the + pre-federation payload shape. The federated nesting only kicks in + when ``events.jsonl`` is absent at the top level.""" + root = tmp_path / "single-run" + _seed_site( + root, + status_timestamp=10.0, + events=[], + ) + + class Handler: + pass + handler = Handler() + handler.run_dir = root + + payload = dashboard.status_payload(handler) + # Single-site keys, no ``sites`` nesting. + assert "sites" not in payload + assert set(payload) == { + "placement", "run_dir", "schema", "status", "summary", "updated", + } + + +def test_iter_site_dirs_recognizes_metadata_only_sites(tmp_path) -> None: + """A site that's started but hasn't emitted any events yet still + has a ``dashboard_metadata.json`` written by the launcher. The + iterator must recognize it so a federated dashboard doesn't + briefly look like 'empty single-site' during startup.""" + from chemgraph.academy.dashboard.server import _iter_site_dirs + + root = tmp_path / "early-startup" + (root / "aurora").mkdir(parents=True) + (root / "aurora" / "dashboard_metadata.json").write_text("{}", encoding="utf-8") + (root / "crux").mkdir(parents=True) + (root / "crux" / "dashboard_metadata.json").write_text("{}", encoding="utf-8") + + sites = _iter_site_dirs(root) + assert {name for name, _ in sites} == {"aurora", "crux"} + + +def test_iter_site_dirs_falls_back_to_single_site_when_empty(tmp_path) -> None: + """Just-created run dir with neither events.jsonl nor recognizable + subdirs: behave as single-site so the dashboard renders an + empty-but-valid view instead of erroring out.""" + from chemgraph.academy.dashboard.server import _iter_site_dirs + + root = tmp_path / "brand-new" + root.mkdir() + sites = _iter_site_dirs(root) + assert sites == [(None, root)] + + +# --------------------------------------------------------------------------- 
+# Multi-site launcher argument parsing +# --------------------------------------------------------------------------- + + +def test_parse_systems_list_accepts_single_name() -> None: + """Single-site invocation is the legacy case; tuple-of-one keeps + the rest of the launcher uniform.""" + from chemgraph.academy.runtime.dashboard_launcher import _parse_systems_list + assert _parse_systems_list("aurora") == ("aurora",) + + +def test_parse_systems_list_accepts_comma_list_and_trims() -> None: + """The federated UX. Whitespace-tolerant for paste-from-doc.""" + from chemgraph.academy.runtime.dashboard_launcher import _parse_systems_list + assert _parse_systems_list(" aurora , crux ") == ("aurora", "crux") + assert _parse_systems_list("aurora,crux,") == ("aurora", "crux") + + +def test_parse_systems_list_rejects_empty() -> None: + """Operator typo or unexpected expansion -- fail at argparse-resolve + time with a clean message.""" + import argparse + from chemgraph.academy.runtime.dashboard_launcher import _parse_systems_list + with pytest.raises(argparse.ArgumentTypeError, match="at least one"): + _parse_systems_list("") + with pytest.raises(argparse.ArgumentTypeError, match="at least one"): + _parse_systems_list(",") + + +def test_parse_systems_list_rejects_duplicates() -> None: + """Listing the same site twice would set up duplicate tunnels + + rsync threads racing on the same mirror dir. 
Fail closed.""" + import argparse + from chemgraph.academy.runtime.dashboard_launcher import _parse_systems_list + with pytest.raises(argparse.ArgumentTypeError, match="duplicate"): + _parse_systems_list("aurora,crux,aurora") From 3893825d71dc20e25f473ed84b8eb1d48d8ee9a8 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 09:51:53 -0500 Subject: [PATCH 08/25] feat(academy): crux profile + federated-hello campaign + federated validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The non-code artifacts that turn the federation primitives shipped in B.1-B.4c into an actually runnable Aurora ⇄ Crux demo. runtime/profiles/crux.template.json (new) - Mirrors polaris.template.json (/eagle paths) but with Crux-specific bits: separate venv directory name (academy-swarm-crux) so it doesn't collide with the existing x86_64 Polaris venv on the same /eagle workspace; -crux suffix on the relay host file so per-site relays in the multi-site dashboard don't fight over the same path. - Registered in profiles/__init__.py BUILTIN_SYSTEM_PROFILES so ``chemgraph academy spawn-site --system crux`` and the multi-site dashboard launcher both recognize it. - Same unset_env policy as Aurora/Polaris -- proxies stripped by default for the LM-relay path; the launcher's exchange_type=='http' branch already overrides this so http exchange works via the ALCF proxy (proxy reachability empirically verified on Crux compute today). campaigns/federated-hello/ (new) - Two agents (agent-aurora, agent-crux), each declaring the other as its only allowed peer. No MCP servers, no resources, no science tools -- the smallest possible end-to-end campaign that exercises cross-site discovery + cross-site send_message + LM-driven decision turns. ~$0.01-0.05 of GPT-5-mini calls per run. - agent-aurora's mission: send ONE 'hello from aurora' to agent-crux, finish_turn, then finish_turn on every subsequent wakeup. 
- agent-crux's mission: wait, reply ONCE, finish_turn. Strong anti-loop guidance in both missions + the prompt profile. - prompt_profiles/default.json: tight system + protocol prompts that explicitly say "no science tools, only send_message and finish_turn." langchain_recursion_limit=32 since neither agent should ever loop more than a handful of rounds. - lm_config.json: GPT-5-mini template (no temperature field, since reasoning models reject non-default values -- the launcher's auto-strip would handle it but cleaner to just omit). - Registered under 'federated-hello' in CAMPAIGNS + CAMPAIGN_LAUNCH_DEFAULTS so ``--campaign federated-hello`` works as a packaged name (no rsync of the campaign dir required). core/campaign.py: validate_campaign(*, federated=False) - New keyword-only flag loosens two single-machine assumptions that break in federated spawn-site flows: * initial_agent may name an agent hosted on another site * each agent's allowed_peers may reference cross-site agents Both are looked up via the exchange at runtime, so the validator legitimately can't pre-check them in a federated slice. - Intra-slice checks (duplicate names, self-peer, MCP server / tool / resource resolvability) still run. Self-peer in particular stays a hard error because it would loop messages regardless of how many sites the campaign spans. runtime/daemon.py - Passes federated=bool(config.agents) to validate_campaign. The presence of an --agents slice is the canonical indicator of "I'm one site of a federated launch." Single-machine run-compute flows pass federated=False (the default), so prior behavior is byte-identical. Tests (+2, 80/80 academy sweep green; was 78) - validate_campaign federated=True accepts the cross-site peer reference in a federated-hello slice that strict validation rejects (regression guard for the relaxation). - validate_campaign federated=True still rejects self-peer (regression guard against accidentally relaxing too much). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/campaigns/__init__.py | 13 ++++++ .../campaigns/federated-hello/campaign.jsonc | 45 +++++++++++++++++++ .../campaigns/federated-hello/lm_config.json | 11 +++++ .../prompt_profiles/default.json | 12 +++++ src/chemgraph/academy/core/campaign.py | 32 ++++++++++--- src/chemgraph/academy/runtime/daemon.py | 9 +++- .../academy/runtime/profiles/__init__.py | 1 + .../runtime/profiles/crux.template.json | 36 +++++++++++++++ tests/test_academy_campaign.py | 41 +++++++++++++++++ 9 files changed, 192 insertions(+), 8 deletions(-) create mode 100644 src/chemgraph/academy/campaigns/federated-hello/campaign.jsonc create mode 100644 src/chemgraph/academy/campaigns/federated-hello/lm_config.json create mode 100644 src/chemgraph/academy/campaigns/federated-hello/prompt_profiles/default.json create mode 100644 src/chemgraph/academy/runtime/profiles/crux.template.json diff --git a/src/chemgraph/academy/campaigns/__init__.py b/src/chemgraph/academy/campaigns/__init__.py index 8c3f5cd..9948bf8 100644 --- a/src/chemgraph/academy/campaigns/__init__.py +++ b/src/chemgraph/academy/campaigns/__init__.py @@ -6,13 +6,16 @@ EXAMPLE_002 = 'example-002-mace-ensemble-screening' +FEDERATED_HELLO = 'federated-hello' CAMPAIGNS = { 'mace-ensemble-screening-20': f'{EXAMPLE_002}/campaign.jsonc', + 'federated-hello': f'{FEDERATED_HELLO}/campaign.jsonc', } LM_CONFIG_TEMPLATES = { 'argo-gpt54-mace-template': f'{EXAMPLE_002}/lm_config.json', + 'argo-gpt5mini-federated-hello': f'{FEDERATED_HELLO}/lm_config.json', } @@ -33,6 +36,16 @@ class CampaignLaunchDefaults: agents_per_node=1, max_decisions=24, ), + # Two-agent federated hello-world. Per-site spawn-site invocations + # override --agent-count from the slice length, so the value below + # is only the "ran via run-compute" single-machine fallback (which + # nobody should actually do for this campaign). 
+ 'federated-hello': CampaignLaunchDefaults( + lm_config_template='argo-gpt5mini-federated-hello', + agent_count=2, + agents_per_node=1, + max_decisions=4, + ), } diff --git a/src/chemgraph/academy/campaigns/federated-hello/campaign.jsonc b/src/chemgraph/academy/campaigns/federated-hello/campaign.jsonc new file mode 100644 index 0000000..f22b31e --- /dev/null +++ b/src/chemgraph/academy/campaigns/federated-hello/campaign.jsonc @@ -0,0 +1,45 @@ +{ + // --------------------------------------------------------------------- + // Federated-hello: the smallest possible cross-HPC ChemGraph Academy + // campaign. Two agents, no science tools, no MCP servers. Each agent + // is intended to live on a DIFFERENT machine (one on Aurora, one on + // Crux). The campaign proves the federation story works end-to-end: + // * each site's spawn-site registers its slice on the hosted + // exchange, + // * cross-site peer discovery via transport.discover() succeeds, + // * a real LM-driven send_message round-trip crosses the HPC + // boundary through the public Academy exchange. + // Run as: + // chemgraph academy dashboard -- demo --system aurora,crux \ + // --campaign federated-hello + // chemgraph academy spawn-site -- --system aurora \ + // --campaign federated-hello --agents agent-aurora --exchange-type http + // chemgraph academy spawn-site -- --system crux \ + // --campaign federated-hello --agents agent-crux --exchange-type http + // chemgraph academy bootstrap -- --campaign federated-hello --exchange-type http + // --------------------------------------------------------------------- + "run_id": "federated-hello", + "user_task": "Federated hello: greet your peer once across the HPC boundary.", + "prompt_profile": "prompt_profiles/default.json", + "initial_agent": "agent-aurora", + "resources": {}, + "mcp_servers": [], + "agents": [ + { + "name": "agent-aurora", + "role": "FederatedHelloInitiator", + "mission": "You are agent-aurora, running on the Aurora HPC. 
On your FIRST decision round (when you receive the campaign bootstrap), send EXACTLY ONE message to agent-crux saying 'hello from aurora' with tldr 'hello'. Set reply_requested=true. Then call finish_turn. On subsequent rounds, if you have received a reply from agent-crux acknowledging your hello, do not send anything further -- just call finish_turn. NEVER send more than one outgoing hello.", + "allowed_peers": ["agent-crux"], + "mcp_servers": [], + "resources": [] + }, + { + "name": "agent-crux", + "role": "FederatedHelloResponder", + "mission": "You are agent-crux, running on the Crux HPC. You are waiting for one incoming message from agent-aurora. When you receive it, send EXACTLY ONE reply back to agent-aurora saying 'hello from crux, received your message' with tldr 'ack'. Set reply_requested=false. Then call finish_turn. NEVER initiate a message on your own; you only ever reply.", + "allowed_peers": ["agent-aurora"], + "mcp_servers": [], + "resources": [] + } + ] +} diff --git a/src/chemgraph/academy/campaigns/federated-hello/lm_config.json b/src/chemgraph/academy/campaigns/federated-hello/lm_config.json new file mode 100644 index 0000000..20c4b7f --- /dev/null +++ b/src/chemgraph/academy/campaigns/federated-hello/lm_config.json @@ -0,0 +1,11 @@ +{ + "provider": "openai_compatible_tools", + "base_url": "http://:18186/argoapi/v1", + "model": "GPT-5-mini", + "api_key": "dummy", + "user": "", + "timeout_s": 180, + "max_tokens": 4096, + "max_retries": 3, + "retry_delay_s": 2 +} diff --git a/src/chemgraph/academy/campaigns/federated-hello/prompt_profiles/default.json b/src/chemgraph/academy/campaigns/federated-hello/prompt_profiles/default.json new file mode 100644 index 0000000..68d3059 --- /dev/null +++ b/src/chemgraph/academy/campaigns/federated-hello/prompt_profiles/default.json @@ -0,0 +1,12 @@ +{ + "prompt_version": "federated-hello-v1", + "prompt_style": "json_state", + "system_prompt": "You are a persistent ChemGraph-style LM agent hosted inside an 
Academy daemon on HPC. You communicate with peers ONLY through send_message. This campaign has NO science tools; your only useful actions are send_message and finish_turn. Follow your mission literally; do not invent additional work.", + "protocol_prompt": "Return one or more tool calls. If no action is useful, call finish_turn. Every send_message call must include tldr: one short line for the dashboard. Set reply_requested=true when the peer should answer, otherwise false. Keep arguments concise. Do NOT loop -- if you already sent your one hello (initiator) or already sent your one ack (responder), call finish_turn immediately on every subsequent round.", + "langchain_recursion_limit": 32, + "state_limits": { + "received_messages_last_n": 8, + "tool_results_last_n": 4, + "actions_last_n": 8 + } +} diff --git a/src/chemgraph/academy/core/campaign.py b/src/chemgraph/academy/core/campaign.py index 3a3704a..2593929 100644 --- a/src/chemgraph/academy/core/campaign.py +++ b/src/chemgraph/academy/core/campaign.py @@ -401,7 +401,24 @@ def _resolve_campaign_relative_path( return path.resolve() -def validate_campaign(campaign: ChemGraphCampaign, agent_count: int) -> None: +def validate_campaign( + campaign: ChemGraphCampaign, + agent_count: int, + *, + federated: bool = False, +) -> None: + """Validate a campaign before the daemon constructs agents from it. + + ``federated=True`` loosens two single-machine assumptions that don't + hold for federated spawn-site launches: + * ``initial_agent`` may name an agent hosted on another site + (this site only has a slice). + * each agent's ``allowed_peers`` may reference agents on other + sites that aren't in this slice. Those are looked up via the + exchange at runtime; the validator can't know about them. + The intra-slice checks (no duplicate names, no self-peer, MCP + server / resource references all resolvable) still run. 
+ """ if len(campaign.agents) != agent_count: raise RuntimeError( f'campaign defines {len(campaign.agents)} agents but ' @@ -410,7 +427,7 @@ def validate_campaign(campaign: ChemGraphCampaign, agent_count: int) -> None: names = [agent.name for agent in campaign.agents] if len(set(names)) != len(names): raise RuntimeError('campaign agent names must be unique') - if campaign.initial_agent not in names: + if not federated and campaign.initial_agent not in names: raise RuntimeError( f'initial_agent {campaign.initial_agent!r} is not an agent', ) @@ -419,11 +436,12 @@ def validate_campaign(campaign: ChemGraphCampaign, agent_count: int) -> None: raise RuntimeError('campaign MCP server names must be unique') declared_servers = set(server_names) for agent in campaign.agents: - unknown = sorted(set(agent.allowed_peers).difference(names)) - if unknown: - raise RuntimeError( - f'{agent.name} has unknown allowed peers: {unknown}', - ) + if not federated: + unknown = sorted(set(agent.allowed_peers).difference(names)) + if unknown: + raise RuntimeError( + f'{agent.name} has unknown allowed peers: {unknown}', + ) if agent.name in agent.allowed_peers: raise RuntimeError(f'{agent.name} must not list itself as a peer') unknown_servers = sorted(set(agent.mcp_servers).difference(declared_servers)) diff --git a/src/chemgraph/academy/runtime/daemon.py b/src/chemgraph/academy/runtime/daemon.py index fbde139..1d887b8 100644 --- a/src/chemgraph/academy/runtime/daemon.py +++ b/src/chemgraph/academy/runtime/daemon.py @@ -52,7 +52,14 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: # all agree on the same agent ordering. if config.agents: campaign = filter_agents(campaign, config.agents) - validate_campaign(campaign, config.agent_count) + # Loosen cross-site peer / initial_agent checks for federated + # slices -- those names may legitimately reference agents this + # site doesn't own (they're discovered through the exchange). 
+ validate_campaign( + campaign, + config.agent_count, + federated=bool(config.agents), + ) agent_spec = selected_agent(campaign, config.rank) placement = placement_payload(config, agent_spec.name) supervisor = MCPServerSupervisor( diff --git a/src/chemgraph/academy/runtime/profiles/__init__.py b/src/chemgraph/academy/runtime/profiles/__init__.py index 2ead8a2..740d00e 100644 --- a/src/chemgraph/academy/runtime/profiles/__init__.py +++ b/src/chemgraph/academy/runtime/profiles/__init__.py @@ -7,6 +7,7 @@ BUILTIN_SYSTEM_PROFILES = { "aurora": "aurora.template.json", "polaris": "polaris.template.json", + "crux": "crux.template.json", } diff --git a/src/chemgraph/academy/runtime/profiles/crux.template.json b/src/chemgraph/academy/runtime/profiles/crux.template.json new file mode 100644 index 0000000..7774822 --- /dev/null +++ b/src/chemgraph/academy/runtime/profiles/crux.template.json @@ -0,0 +1,36 @@ +{ + "name": "crux", + "remote_host": "${ALCF_USER}@crux.alcf.anl.gov", + "remote_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}", + "repo_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/ChemGraph", + "run_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/runs", + "relay_host_file": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/uan-relay-18186-crux.host", + "relay_port": 18186, + "venv_python": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/venvs/academy-swarm-crux/bin/python", + "redis_bin_dir": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/tools/redis-crux/bin", + "redis_port": 6392, + "redis_bind": "0.0.0.0", + "redis_protected_mode": "no", + "mpiexec": "mpiexec", + "pythonpath_entries": [ + "/eagle/${ALCF_PROJECT}/${ALCF_USER}/ChemGraph/src" + ], + "path_entries": [ + "/eagle/${ALCF_PROJECT}/${ALCF_USER}/tools/redis-crux/bin", + "/eagle/${ALCF_PROJECT}/${ALCF_USER}/bin" + ], + "env": { + "NUMEXPR_MAX_THREADS": "256", + "NUMEXPR_NUM_THREADS": "64", + "SETUPTOOLS_SCM_PRETEND_VERSION_FOR_ACADEMY_PY": "0.0.0+crux" + }, + "unset_env": [ + "http_proxy", + "HTTP_PROXY", + "https_proxy", + "HTTPS_PROXY", + 
"all_proxy", + "ALL_PROXY" + ], + "no_proxy": "127.0.0.1,localhost,.alcf.anl.gov,*.alcf.anl.gov" +} diff --git a/tests/test_academy_campaign.py b/tests/test_academy_campaign.py index 17726e1..b3fa829 100644 --- a/tests/test_academy_campaign.py +++ b/tests/test_academy_campaign.py @@ -414,3 +414,44 @@ def test_filter_agents_rejects_duplicate_names() -> None: campaign = load_campaign("mace-ensemble-screening-20") with pytest.raises(RuntimeError, match="duplicate agent names"): filter_agents(campaign, ["mace-agent", "mace-agent"]) + + +def test_validate_campaign_federated_loosens_cross_site_peer_check() -> None: + """In a federated spawn-site slice, allowed_peers / initial_agent + may legitimately reference agents owned by another site. Strict + validation (the default) rejects those; ``federated=True`` lets + them through because the daemon will discover those peers via the + exchange at runtime instead of from this slice's agent list.""" + from chemgraph.academy.core.campaign import ( + filter_agents, load_campaign, validate_campaign, + ) + campaign = load_campaign("federated-hello") + slice_aurora = filter_agents(campaign, ["agent-aurora"]) + + # Strict validation rejects the cross-site peer reference. + with pytest.raises(RuntimeError, match="unknown allowed peers"): + validate_campaign(slice_aurora, agent_count=1) + + # federated=True accepts it. 
+ validate_campaign(slice_aurora, agent_count=1, federated=True) + + +def test_validate_campaign_federated_still_rejects_self_peer() -> None: + """The 'agent must not list itself as a peer' invariant is local + to the slice and stays a hard error even in federated mode -- + self-peering would loop messages back to the sender, regardless + of how many sites the campaign spans.""" + from chemgraph.academy.core.campaign import ( + ChemGraphAgentSpec, ChemGraphCampaign, validate_campaign, + ) + import pathlib + bad = ChemGraphCampaign( + run_id="r", user_task="t", initial_agent="a", + prompt_profile=pathlib.Path("p"), + agents=(ChemGraphAgentSpec( + name="a", role="r", mission="m", + allowed_peers=("a",), # <-- self-peer + ),), + ) + with pytest.raises(RuntimeError, match="must not list itself as a peer"): + validate_campaign(bad, agent_count=1, federated=True) From fdc257f95bf0e30dfd007b273d230184588b2cc9 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 10:01:25 -0500 Subject: [PATCH 09/25] feat(academy/dashboard): per-site rendering for federated runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The minimum-scope UI work that makes the federation story visible in the dashboard. Without it the merged event payload (B.4c.2) landed on a UI that displayed everything as if it were a single machine -- timeline still rendered, agent graph still rendered, but operators / demo viewers had no visual cue that the campaign was spanning multiple HPCs. What the UI now shows in federated runs: * Header bar: "Sites: aurora (1🤖 / 12📨) · crux (1🤖 / 8📨)" so the multi-site nature is immediately legible from the top of the dashboard. * Agent-graph swimlanes labelled by site ("aurora", "crux") instead of by individual compute hostnames ("x4708...", "x1000...") -- same nodes, same edges, far clearer story. 
* Message-flow detail panel: route is labelled "cross-site" (federated) or "cross-node" (single-machine) depending on context, with "From site" / "To site" rows showing aurora vs crux. The literal hostname is still available in each agent's detail panel. * Cross-node-messages metric becomes meaningful as "messages that crossed the HPC boundary" in federated runs. Single-site runs are visually byte-identical to before: ``snapshot.federated`` is false so ``agentGroup`` falls back to ``agentHost``, the sitesBadge stays hidden, route labels stay "cross-node" / "same-node", detail rows stay "From host" / "To host". Test suite (80/80) confirms server-side payload shape is unchanged for single-site. Implementation - ``load()``: detect ``snapshot.sites`` (set by server-side ``_iter_site_dirs`` in B.4c.2), set ``snapshot.federated``, build a flat ``sitesByAgent`` index from ``sites[*].status.agents`` and ``sites[*].placement.agents``, then backfill it from per-event ``site`` tags for any agent not already indexed (the tag names the site of the agent that emitted the event; it fills gaps and never overrides an existing entry). - ``agentSite(agent)`` / ``agentGroup(agent)``: the single point where federated vs single-site rendering diverges. Every renderer that asks "what bucket does this agent belong to" now goes through ``agentGroup`` instead of ``agentHost``. - ``renderSitesBadge()``: header-bar federation indicator with per-site agent counts and per-site event counts. - Three message-route detail panels updated to label by group rather than hardcoding "host", and to show "cross-site" in federated mode.
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/dashboard/static/app.js | 146 +++++++++++++++--- .../academy/dashboard/static/index.html | 5 + 2 files changed, 132 insertions(+), 19 deletions(-) diff --git a/src/chemgraph/academy/dashboard/static/app.js b/src/chemgraph/academy/dashboard/static/app.js index 1796c0e..f821e63 100644 --- a/src/chemgraph/academy/dashboard/static/app.js +++ b/src/chemgraph/academy/dashboard/static/app.js @@ -88,6 +88,38 @@ const statusData = await statusRes.json(); const eventsData = await eventsRes.json(); const nextSnapshot = {...statusData, events: eventsData.events || []}; + // Federated detection: server-side B.4c.2 wraps per-site state + // under ``sites: {: {status, placement, summary, updated, schema}}`` + // for multi-site runs and tags every event with ``site``. Build + // a flat agent->site index so renderers can ask "which site does + // this agent live on?" without re-walking sites every time. + nextSnapshot.federated = !!(nextSnapshot.sites && Object.keys(nextSnapshot.sites).length); + nextSnapshot.siteNames = nextSnapshot.federated + ? Object.keys(nextSnapshot.sites).sort() + : []; + nextSnapshot.sitesByAgent = {}; + if (nextSnapshot.federated) { + for (const [siteName, siteData] of Object.entries(nextSnapshot.sites)) { + const agents = (siteData?.status?.agents) || []; + agents.forEach(spec => { + const agentId = spec.agent_id || spec.agent_name || spec.name; + if (agentId) nextSnapshot.sitesByAgent[agentId] = siteName; + }); + const placements = (siteData?.placement?.agents) || {}; + Object.keys(placements).forEach(agentId => { + if (!(agentId in nextSnapshot.sitesByAgent)) { + nextSnapshot.sitesByAgent[agentId] = siteName; + } + }); + } + // Backfill from events too, since the per-event ``site`` tag + // is authoritative for the agent that emitted each event. 
+ (nextSnapshot.events || []).forEach(event => { + if (event.site && event.agent_id && !(event.agent_id in nextSnapshot.sitesByAgent)) { + nextSnapshot.sitesByAgent[event.agent_id] = event.site; + } + }); + } const nextIdentity = identityForSnapshot(nextSnapshot); const previousEventCount = snapshot?.events?.length || 0; const nextEventCount = nextSnapshot.events.length; @@ -304,6 +336,57 @@ return agent?.placement?.short_hostname || agent?.placement?.hostname || (agent?.started ? 'unknown host' : 'pending'); } + function agentSite(agent) { + // In federated runs the meaningful grouping is "which HPC" + // (aurora vs crux), not individual compute hostnames. The + // server tags every event with ``site`` and we built a + // sitesByAgent index in load(); fall back to "pending" so + // not-yet-registered agents still render. + if (!snapshot?.federated) return null; + const agentId = agent?.agent_id || agent?.agent_name || agent?.name; + return snapshot.sitesByAgent[agentId] || 'pending'; + } + + function agentGroup(agent) { + // Single source of truth for "what bucket does this agent + // belong to in the current view": site (federated) or host + // (single-machine). Renderers that ask "what color / what + // label" go through here so federated vs single-site rendering + // diverges in exactly one place. + return agentSite(agent) || agentHost(agent); + } + + function renderSitesBadge() { + // Header-bar federation indicator. Hidden in single-site runs + // so the existing single-site UI looks unchanged; in federated + // runs it shows e.g. "Sites: aurora · crux (2 agents on aurora, + // 1 on crux, 0 events from aurora)" so operators / demo + // viewers can confirm at a glance that the campaign really + // is spanning multiple HPCs. 
+ const badge = document.getElementById('sitesBadge'); + if (!badge) return; + if (!snapshot?.federated || !snapshot.siteNames.length) { + badge.style.display = 'none'; + badge.textContent = ''; + return; + } + const eventCounts = {}; + (snapshot.events || []).forEach(e => { + if (e.site) eventCounts[e.site] = (eventCounts[e.site] || 0) + 1; + }); + const agentCounts = {}; + Object.values(snapshot.sitesByAgent || {}).forEach(site => { + agentCounts[site] = (agentCounts[site] || 0) + 1; + }); + const parts = snapshot.siteNames.map(site => { + const a = agentCounts[site] || 0; + const e = eventCounts[site] || 0; + return `${site} (${a}🤖 / ${e}📨)`; + }); + badge.textContent = 'Sites: ' + parts.join(' · '); + badge.style.display = ''; + } + function hostColor(index) { const colors = ['#dbeafe', '#dcfce7', '#fef3c7', '#fce7f3', '#e0e7ff', '#ccfbf1', '#fee2e2', '#ede9fe']; return colors[index % colors.length]; @@ -318,6 +401,7 @@ const detailScroll = captureDetailScrollSnapshot(); document.getElementById('updated').textContent = snapshot.updated ? new Date(snapshot.updated * 1000).toLocaleTimeString() : ''; document.getElementById('runPath').textContent = snapshot.run_dir || ''; + renderSitesBadge(); document.getElementById('graphTitle').textContent = isWorkflowMode() ? 'ChemGraph Workflow' : 'Agent Graph'; renderTimeline(); renderMetrics(); @@ -358,8 +442,12 @@ events.forEach(event => { counts[event.event] = (counts[event.event] || 0) + 1; }); const currentAgents = agents(); const startedAgents = currentAgents.filter(agent => agent.started); - const hostByAgent = new Map(currentAgents.map(agent => [agent.agent_id, agentHost(agent)])); - const hosts = new Set(startedAgents.map(agentHost).filter(host => host && host !== 'pending')); + // Group by site in federated mode, by host otherwise. 
The + // "cross-node messages" metric below stays meaningful either + // way -- in federated mode it becomes "messages that crossed + // the HPC boundary," which is exactly what we want to surface. + const hostByAgent = new Map(currentAgents.map(agent => [agent.agent_id, agentGroup(agent)])); + const hosts = new Set(startedAgents.map(agentGroup).filter(host => host && host !== 'pending')); const finish = latestEventOf('campaign_finished')?.payload || {}; const messageEvents = events.filter(event => event.event === 'message_sent'); const crossNodeMessages = messageEvents.filter(event => { @@ -1251,9 +1339,14 @@ return; } + // Group by site (federated) or hostname (single-site). The + // graph layout is unchanged either way -- only the swimlane + // labels and the band per-agent-group differ. In federated + // mode you see "aurora" and "crux" swimlanes instead of + // "x4708..." and "x1000...". Same nodes, clearer story. const byHost = new Map(); currentAgents.forEach(agent => { - const host = agentHost(agent); + const host = agentGroup(agent); if (!byHost.has(host)) byHost.set(host, []); byHost.get(host).push(agent); }); @@ -1944,21 +2037,28 @@ const currentAgents = agents(); const senderAgent = currentAgents.find(agent => agent.agent_id === sender); const recipientAgent = currentAgents.find(agent => agent.agent_id === recipient); - const senderHost = agentHost(senderAgent); - const recipientHost = agentHost(recipientAgent); + // "Group" = site in federated mode, host in single-machine. + // Route label becomes "cross-site" or "cross-node" accordingly, + // which is what the operator actually wants to see at a glance. + const senderGroup = agentGroup(senderAgent); + const recipientGroup = agentGroup(recipientAgent); + const groupLabel = snapshot?.federated ? 'site' : 'host'; const messages = eventsOf('message_sent').filter(e => { const p = e.payload || {}; return p.sender === sender && p.recipient === recipient; }); const latest = messages.length ? 
messages[messages.length - 1] : null; const latestPayload = latest?.payload || {}; - const route = senderHost && recipientHost && senderHost !== recipientHost ? 'cross-node' : 'same-node'; + const crossGroup = senderGroup && recipientGroup && senderGroup !== recipientGroup; + const route = crossGroup + ? (snapshot?.federated ? 'cross-site' : 'cross-node') + : 'same-' + groupLabel; document.getElementById('detailTitle').textContent = `${sender} -> ${recipient}`; document.getElementById('detailCards').innerHTML = detailCards([ ['Route', route], ['Messages', messages.length], - ['From host', senderHost], - ['To host', recipientHost], + [`From ${groupLabel}`, senderGroup], + [`To ${groupLabel}`, recipientGroup], ]); setDetailHtmlBlock('detailPrimaryTitle', 'Latest Message', 'detailPrimary', latest ? messageDetailHtml(latest) @@ -2256,16 +2356,20 @@ const currentAgents = agents(); const sender = currentAgents.find(agent => agent.agent_id === p.sender); const recipient = currentAgents.find(agent => agent.agent_id === p.recipient); - const senderHost = agentHost(sender); - const recipientHost = agentHost(recipient); - const route = senderHost && recipientHost && senderHost !== recipientHost ? 'cross-node' : 'same-node'; + const senderGroup = agentGroup(sender); + const recipientGroup = agentGroup(recipient); + const groupLabel = snapshot?.federated ? 'site' : 'host'; + const crossGroup = senderGroup && recipientGroup && senderGroup !== recipientGroup; + const route = crossGroup + ? (snapshot?.federated ? 'cross-site' : 'cross-node') + : 'same-' + groupLabel; return detailRich( detailSection('Route', detailKvGrid([ ['Type', route], - ['Sender host', senderHost], - ['Recipient host', recipientHost], + [`Sender ${groupLabel}`, senderGroup], + [`Recipient ${groupLabel}`, recipientGroup], ['Message id', p.message_id || '-', 'mono'], - ]), route === 'cross-node' ? 'ok' : 'info'), + ]), crossGroup ? 
'ok' : 'info'), ); } @@ -2510,13 +2614,17 @@ const currentAgents = agents(); const sender = currentAgents.find(agent => agent.agent_id === p.sender); const recipient = currentAgents.find(agent => agent.agent_id === p.recipient); - const senderHost = agentHost(sender); - const recipientHost = agentHost(recipient); - const route = senderHost && recipientHost && senderHost !== recipientHost ? 'cross-node' : 'same-node'; + const senderGroup = agentGroup(sender); + const recipientGroup = agentGroup(recipient); + const groupLabel = snapshot?.federated ? 'site' : 'host'; + const crossGroup = senderGroup && recipientGroup && senderGroup !== recipientGroup; + const route = crossGroup + ? (snapshot?.federated ? 'cross-site' : 'cross-node') + : 'same-' + groupLabel; return [ `Route: ${route}`, - `Sender host: ${senderHost}`, - `Recipient host: ${recipientHost}`, + `Sender ${groupLabel}: ${senderGroup}`, + `Recipient ${groupLabel}: ${recipientGroup}`, `Message id: ${p.message_id || '-'}`, ].join('\n'); } diff --git a/src/chemgraph/academy/dashboard/static/index.html b/src/chemgraph/academy/dashboard/static/index.html index f26c106..bdd2ab4 100644 --- a/src/chemgraph/academy/dashboard/static/index.html +++ b/src/chemgraph/academy/dashboard/static/index.html @@ -590,6 +590,11 @@

ChemGraph Academy Dashboard

+ +
From 5549dbb6b8ae69c83252cbc2c443c94482857e56 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 10:07:00 -0500 Subject: [PATCH 10/25] fix(academy/runtime/profiles): separate ALCF_SSH_USER from ALCF_USER For accounts whose SSH login differs from the workspace directory name. ALCF_USER drives every path interpolation (``/flare/${ALCF_PROJECT}/${ALCF_USER}/``) while ALCF_SSH_USER drives only ``remote_host`` (``${ALCF_SSH_USER}@aurora.alcf.anl.gov``). The two collided on a single env var until now, forcing operators to choose: set ALCF_USER for paths and get the wrong SSH login (which triggered an ALCF Cyber security challenge on Aurora), or set it right for SSH and have all the run-dir / venv paths point at a non-existent directory. The relevant operator on this repo has SSH login ``jinchuli`` but their Aurora/Crux/Polaris workspace lives under ``/{flare,eagle}/${ALCF_PROJECT}/jinchu/`` (no trailing 'i'), so the ALCF_USER=jinchu setting was producing the right paths but the wrong SSH user. Now they set ALCF_USER=jinchu for paths and ALCF_SSH_USER=jinchuli for SSH and both work. Default ALCF_SSH_USER to ALCF_USER when unset, so the majority of users for whom the two are equal don't have to set both. system.py - New ``_expand_with(text, env)`` does ``os.path.expandvars``-style substitution (``${VAR}`` form only) against a caller-supplied env dict rather than the process environment, so the SSH-USER default doesn't leak into ``os.environ`` for subsequent callers. - ``load_system_profile`` copies the environ, fills in the default, and substitutes through ``_expand_with``. profiles/{aurora,crux,polaris}.template.json - ``remote_host`` now interpolates ``${ALCF_SSH_USER}``; every other field still uses ``${ALCF_USER}`` for the path component. Tests: 80/80 academy sweep still green. Default-case behavior (both env vars equal) is byte-identical to the prior single-var setup.
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../runtime/profiles/aurora.template.json | 2 +- .../runtime/profiles/crux.template.json | 2 +- .../runtime/profiles/polaris.template.json | 2 +- .../academy/runtime/profiles/system.py | 27 ++++++++++++++++++- 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/chemgraph/academy/runtime/profiles/aurora.template.json b/src/chemgraph/academy/runtime/profiles/aurora.template.json index 1e3e40a..921155a 100644 --- a/src/chemgraph/academy/runtime/profiles/aurora.template.json +++ b/src/chemgraph/academy/runtime/profiles/aurora.template.json @@ -1,6 +1,6 @@ { "name": "aurora", - "remote_host": "${ALCF_USER}@aurora.alcf.anl.gov", + "remote_host": "${ALCF_SSH_USER}@aurora.alcf.anl.gov", "remote_root": "/flare/${ALCF_PROJECT}/${ALCF_USER}", "repo_root": "/flare/${ALCF_PROJECT}/${ALCF_USER}/ChemGraph", "run_root": "/flare/${ALCF_PROJECT}/${ALCF_USER}/runs", diff --git a/src/chemgraph/academy/runtime/profiles/crux.template.json b/src/chemgraph/academy/runtime/profiles/crux.template.json index 7774822..4a28193 100644 --- a/src/chemgraph/academy/runtime/profiles/crux.template.json +++ b/src/chemgraph/academy/runtime/profiles/crux.template.json @@ -1,6 +1,6 @@ { "name": "crux", - "remote_host": "${ALCF_USER}@crux.alcf.anl.gov", + "remote_host": "${ALCF_SSH_USER}@crux.alcf.anl.gov", "remote_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}", "repo_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/ChemGraph", "run_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/runs", diff --git a/src/chemgraph/academy/runtime/profiles/polaris.template.json b/src/chemgraph/academy/runtime/profiles/polaris.template.json index 7be57c9..2af485c 100644 --- a/src/chemgraph/academy/runtime/profiles/polaris.template.json +++ b/src/chemgraph/academy/runtime/profiles/polaris.template.json @@ -1,6 +1,6 @@ { "name": "polaris", - "remote_host": "${ALCF_USER}@polaris.alcf.anl.gov", + "remote_host": "${ALCF_SSH_USER}@polaris.alcf.anl.gov", "remote_root": 
"/eagle/${ALCF_PROJECT}/${ALCF_USER}", "repo_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/ChemGraph", "run_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/runs", diff --git a/src/chemgraph/academy/runtime/profiles/system.py b/src/chemgraph/academy/runtime/profiles/system.py index 02ed6dc..0de3cd7 100644 --- a/src/chemgraph/academy/runtime/profiles/system.py +++ b/src/chemgraph/academy/runtime/profiles/system.py @@ -39,7 +39,16 @@ class SystemProfile(BaseModel): def load_system_profile(path_or_name: str | Path) -> SystemProfile: profile_path = resolve_builtin_system_profile(path_or_name) - text = os.path.expandvars(profile_path.read_text(encoding="utf-8")) + # Default ALCF_SSH_USER to ALCF_USER when unset. This separates the + # *SSH login* (used in ``remote_host``) from the *path component* + # (used everywhere else), which matters for accounts whose login + # differs from their workspace dir name -- e.g. login ``jinchuli`` + # but workspace under ``/flare/.../jinchu/``. Most users have one + # equal to the other and the default keeps their setup unchanged. + env = os.environ.copy() + if "ALCF_USER" in env and not env.get("ALCF_SSH_USER"): + env["ALCF_SSH_USER"] = env["ALCF_USER"] + text = _expand_with(profile_path.read_text(encoding="utf-8"), env) unresolved = sorted(set(re.findall(r"\$\{([^}]+)\}", text))) if unresolved: raise ValueError( @@ -48,3 +57,19 @@ def load_system_profile(path_or_name: str | Path) -> SystemProfile: ) data = json.loads(text) return SystemProfile.model_validate(data) + + +def _expand_with(text: str, env: dict[str, str]) -> str: + """``os.path.expandvars`` but reading from a caller-supplied env dict. + + The stdlib's ``expandvars`` always reads ``os.environ`` directly, + which means ``ALCF_SSH_USER`` defaulted to ``ALCF_USER`` only by + mutating the process environment. That'd leak the default into + every subsequent caller. Substituting via regex keeps the change + local. 
+ """ + return re.sub( + r"\$\{([^}]+)\}", + lambda m: env.get(m.group(1), m.group(0)), + text, + ) From e162f131f1a449bb0b0470b0696d53bffcacf3bc Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 10:40:17 -0500 Subject: [PATCH 11/25] fix(academy/runtime): startup-timeout-s default 600s + operator override; crux relay port 18187 Two operational fixes from the Aurora<->Crux federated demo. compute_launcher.py - Default startup_timeout_s 120s -> 600s. The realistic worst case for federated launches is one site's HPC queue wait + Python imports outpacing the other site's peer-discovery patience; 120s is comfortably too short. 600s comfortably absorbs debug-scaling / workq schedule delays. Single-machine launches reach discover_peer_agent_ids in seconds so the new ceiling never matters for them. - New --startup-timeout-s CLI flag so operators can extend the window further when they know a site will be slow. profiles/crux.template.json - Bump relay_port 18186 -> 18187 to dodge a leftover ssh -R reverse-tunnel that's still bound to 127.0.0.1:18186 on crux-uan-0001 from a prior failed dashboard launch. Follow-up cleanup: launcher should probe for a free port instead of insisting on the profile's hardcoded one. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../academy/runtime/compute_launcher.py | 21 ++++++++++++++++++- .../runtime/profiles/crux.template.json | 4 ++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/chemgraph/academy/runtime/compute_launcher.py b/src/chemgraph/academy/runtime/compute_launcher.py index 82ca761..2d20e4a 100644 --- a/src/chemgraph/academy/runtime/compute_launcher.py +++ b/src/chemgraph/academy/runtime/compute_launcher.py @@ -127,6 +127,18 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: "'chemgraph academy bootstrap' subcommand." 
), ) + parser.add_argument( + "--startup-timeout-s", + type=float, + default=None, + help=( + "How long the daemon's cross-site peer-discovery loop " + "waits before giving up. Default 600s. Bump higher for " + "federated launches where HPC queue waits + Python " + "imports + cold-cache rsyncs can push one site's startup " + "well past the other site's discovery patience." + ), + ) parser.add_argument("--no-start-redis", action="store_true") return parser.parse_args(argv) @@ -317,7 +329,14 @@ def prepare_compute_launch(args: argparse.Namespace) -> AllocationPlan: max_decisions=max_decisions, poll_timeout_s=2.0, idle_timeout_s=600.0, - startup_timeout_s=120.0, + # Default 600s (was 120s). Single-machine runs reach this + # codepath in seconds; the realistic worst case is federated + # launches where one site's HPC queue wait + Python imports + # outpaces the other site's peer-discovery patience. 10 min + # comfortably covers debug-scaling / workq scheduling delays + # without making single-machine failures slow to surface + # (the daemon prints a clear "missing=..." message regardless). 
+ startup_timeout_s=(getattr(args, "startup_timeout_s", None) or 600.0), completion_timeout_s=60.0, status_interval_s=5.0, redis_host=socket.getfqdn(), diff --git a/src/chemgraph/academy/runtime/profiles/crux.template.json b/src/chemgraph/academy/runtime/profiles/crux.template.json index 4a28193..810d2c8 100644 --- a/src/chemgraph/academy/runtime/profiles/crux.template.json +++ b/src/chemgraph/academy/runtime/profiles/crux.template.json @@ -4,8 +4,8 @@ "remote_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}", "repo_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/ChemGraph", "run_root": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/runs", - "relay_host_file": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/uan-relay-18186-crux.host", - "relay_port": 18186, + "relay_host_file": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/uan-relay-18187-crux.host", + "relay_port": 18187, "venv_python": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/venvs/academy-swarm-crux/bin/python", "redis_bin_dir": "/eagle/${ALCF_PROJECT}/${ALCF_USER}/tools/redis-crux/bin", "redis_port": 6392, From 90b636f445621b38ba2748212cbd3fcf526926aa Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 10:50:26 -0500 Subject: [PATCH 12/25] feat(academy): operator-visible daemon lifecycle prints The daemon was opaque during its slow stretches -- import, registration, peer discovery, runtime entry, and waiting for the bootstrap message all happened silently from the operator terminal. You could not distinguish "still importing" from "stuck on discovery" from "alive and waiting for bootstrap" without tailing events.jsonl or checking the dashboard. Add four landmark prints, all grep-able as ``[daemon]`` or ``[agent ]``: daemon.py - ``[daemon] rankN registered on the exchange`` -- own- registration completed; next step is peer discovery - ``[daemon] rankN discovering peers [...] 
(timeout 600s)...`` -- entering the wait - ``[daemon] rankN discovered N peer(s): [...]`` -- past discovery, about to enter Runtime - ``[daemon] rankN agent is now running inside Academy Runtime`` -- agent is alive and listening - ``[daemon] rankN dispatched inline bootstrap to `` / ``... skipping inline bootstrap (federated mode); waiting for chemgraph academy bootstrap ...`` so the operator knows whether to fire the standalone bootstrap subcommand core/agent.py - ``[agent ] first message arrived from (kind=...): `` on the FIRST inbox message. For the federated demo the recipient agents both print this -- agent-aurora when bootstrap lands, agent-crux when the hello arrives. Concrete "kickoff worked" signal without needing the dashboard. All prints flush=True so they survive PALS/MPICH buffering when mpiexec is forwarding many ranks stdout simultaneously. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/core/agent.py | 16 ++++++++ src/chemgraph/academy/runtime/daemon.py | 54 ++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/src/chemgraph/academy/core/agent.py b/src/chemgraph/academy/core/agent.py index 6f2c81c..562ee3b 100644 --- a/src/chemgraph/academy/core/agent.py +++ b/src/chemgraph/academy/core/agent.py @@ -90,10 +90,26 @@ async def agent_on_startup(self) -> None: @action async def receive_message(self, message: dict[str, Any]) -> None: validate_message(message) + first_message = not self.received_message_history self.received_message_history.append(message) self._trace('message_received', message) if self._wake_event is not None: self._wake_event.set() + if first_message: + # Operator-visible lifecycle landmark: the FIRST message + # to land on this agent (almost always the campaign + # bootstrap for initial_agent, or a peer's reply on + # everyone else) is the canonical "kickoff arrived" + # signal. Use print so it surfaces on stdout regardless + # of log level configuration on the rank. 
+ sender = message.get('sender', '?') + kind = message.get('kind', '?') + tldr = message.get('tldr') or message.get('content', '')[:60] + print( + f"[agent {self.spec.name}] first message arrived from " + f"{sender!r} (kind={kind}): {tldr}", + flush=True, + ) @action async def get_status(self) -> dict[str, Any]: diff --git a/src/chemgraph/academy/runtime/daemon.py b/src/chemgraph/academy/runtime/daemon.py index 1d887b8..b0beba6 100644 --- a/src/chemgraph/academy/runtime/daemon.py +++ b/src/chemgraph/academy/runtime/daemon.py @@ -109,15 +109,42 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: ChemGraphLogicalAgent, name=agent_spec.name, ) + # Operator-visible lifecycle prints. The default INFO + # logging is too verbose for these landmark events to be + # spottable; use print so they land on stdout regardless + # of log level. Each line is a single grep-able phrase + # so an operator can ``grep -E '\[daemon\]'`` to follow + # progress through the silent stretches. + print( + f"[daemon] rank{config.rank} registered " + f"{agent_spec.name!r} on the exchange", + flush=True, + ) + wanted_peers = [ + p for p in agent_spec.allowed_peers if p != agent_spec.name + ] + if wanted_peers: + print( + f"[daemon] rank{config.rank} discovering peers " + f"{wanted_peers} (timeout {config.startup_timeout_s:.0f}s)...", + flush=True, + ) peer_agent_ids = await discover_peer_agent_ids( registrar._transport, # Skip self if the campaign mistakenly lists own name # as a peer (validate_campaign rejects this, but # defense-in-depth costs nothing). 
- [p for p in agent_spec.allowed_peers if p != agent_spec.name], + wanted_peers, agent_class=ChemGraphLogicalAgent, timeout_s=config.startup_timeout_s, ) + if wanted_peers: + print( + f"[daemon] rank{config.rank} discovered " + f"{len(peer_agent_ids)} peer(s): " + f"{sorted(peer_agent_ids.keys())}", + flush=True, + ) finally: await registrar.close() @@ -147,6 +174,11 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: ) async with runtime: await agent.write_runtime_status() + print( + f"[daemon] rank{config.rank} agent {agent_spec.name!r} " + "is now running inside Academy Runtime", + flush=True, + ) # Rank 0 normally dispatches the campaign bootstrap message # to ``initial_agent``. Two conditions skip it: @@ -186,6 +218,11 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: 'via': 'academy_action', }, ) + print( + f"[daemon] rank{config.rank} dispatched inline " + f"bootstrap to {campaign.initial_agent!r}", + flush=True, + ) elif config.rank == 0: # Record the reason for skipping so investigators can # tell "deferred to operator" apart from "silently @@ -199,6 +236,21 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: 'initial_is_local': bool(initial_is_local), }, ) + if config.skip_bootstrap: + print( + f"[daemon] rank{config.rank} skipping inline " + f"bootstrap (federated mode); waiting for " + f"'chemgraph academy bootstrap' to deliver " + f"the kickoff message...", + flush=True, + ) + else: + print( + f"[daemon] rank{config.rank} initial_agent " + f"{campaign.initial_agent!r} is not on this " + f"site; another site owns the bootstrap", + flush=True, + ) await runtime.wait_shutdown() From 52fa7b5f99206fdcf2ce2a280a1018bff88325ec Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 11:06:06 -0500 Subject: [PATCH 13/25] fix(academy/runtime): deterministic peer UIDs for hosted-exchange rendezvous The B.4 federated demo kept timing out at discovery -- both sites registered their agents, neither could find the other. 
Root cause turned out to be that Academy's hosted HttpExchange strips ``AgentId.name`` from ``discover()`` responses: only ``uid`` and ``role`` round-trip. Our name-based filter ``if agent_id.name in wanted`` was silently never matching across sites because every discovered AgentId came back with name=None. (The original ChemGraph test suite missed this because it used ``AgentId.new('worker-a')`` fakes that preserve the name -- something the real hosted exchange does not do.) Replacement: deterministic UIDs. registration.py - ``deterministic_agent_uid(run_id, agent_name)`` derives a stable uuid5 from a fixed namespace + ``"{run_id}/{agent_name}"``. Same inputs on every site produce the same UID, so each rank constructs every peer's AgentId locally instead of needing ``discover()`` to echo the name back. - ``deterministic_agent_id(run_id, agent_name)`` builds the full AgentId with the local name preserved (for trace events) and the deterministic UID. - ``register_agent_with_uid(transport, agent_class, agent_id)`` bypasses the SDK's ``register_agent`` (which always generates a random UID via ``AgentId.new``) and POSTs the pre-built deterministic AgentId directly to the same mailbox endpoint. - ``wait_for_peers_alive(transport, peer_ids, ...)`` replaces ``discover_peer_agent_ids``. Matches on UID (preserved by discover()) instead of name (stripped). Times out with a message listing missing peer names+UIDs. daemon.py - Imports + uses the new helpers. Each rank computes its own AgentId deterministically and registers with it, then computes every peer's AgentId locally and waits for the peer's mailbox to be visible on the exchange. No "discover by name" anywhere. - Runtime is still handed a real HttpAgentRegistration wrapping the deterministic AgentId, so the agent runs unchanged. bootstrap.py - New ``--run-id`` required arg.
The recipient's mailbox UID is derived from (run-id, recipient-name); operator must pass the same run-id they used for spawn-site or the bootstrap addresses a different mailbox than the daemons registered. - Bumped ``--discover-timeout-s`` default 120s -> 600s to match spawn-site's startup_timeout_s. - Uses ``deterministic_agent_id`` + ``wait_for_peers_alive`` instead of name-based discovery. Side effect: agent names are now campaign-scoped via the run-id. Two operators running the SAME campaign with the SAME run-id will collide on the mailbox UIDs and the second registration will fail with "mailbox already exists" -- correct fail-fast behavior. The old run-id-prefixing convention from the original docstring is now load-bearing rather than advisory. Tests (+5, 85/85 academy sweep green) - deterministic_agent_uid: stable; differs by run_id; differs by agent_name - deterministic_agent_id: name preserved locally - wait_for_peers_alive: empty list short-circuits; succeeds when all UIDs present (with names stripped, mirroring the real exchange response); waits across polls for late peers; times out naming missing UIDs; ignores unrelated agents - bootstrap: requires --run-id; defaults discover-timeout to 600s; sends to deterministic recipient AgentId; closes client on timeout; main() returns 2 with stderr message Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/runtime/bootstrap.py | 41 +++- src/chemgraph/academy/runtime/daemon.py | 69 ++++--- src/chemgraph/academy/runtime/registration.py | 178 +++++++++------- tests/test_academy_bootstrap.py | 90 ++++++--- tests/test_academy_exchange_registration.py | 191 ++++++++++-------- 5 files changed, 363 insertions(+), 206 deletions(-) diff --git a/src/chemgraph/academy/runtime/bootstrap.py b/src/chemgraph/academy/runtime/bootstrap.py index f18a810..d0ef87c 100644 --- a/src/chemgraph/academy/runtime/bootstrap.py +++ b/src/chemgraph/academy/runtime/bootstrap.py @@ -36,7 +36,8 @@ from 
chemgraph.academy.core.peer_protocol import build_message from chemgraph.academy.runtime.exchange import build_exchange_factory from chemgraph.academy.runtime.exchange import SUPPORTED_EXCHANGE_TYPES -from chemgraph.academy.runtime.registration import discover_peer_agent_ids +from chemgraph.academy.runtime.registration import deterministic_agent_id +from chemgraph.academy.runtime.registration import wait_for_peers_alive logger = logging.getLogger(__name__) @@ -54,6 +55,15 @@ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: '--campaign', required=True, help='Campaign config (packaged name or path to campaign.jsonc).', ) + parser.add_argument( + '--run-id', required=True, + help=( + "The run-id used by the spawn-site invocations. The bootstrap " + "recipient's mailbox UID is derived deterministically from " + "(run-id, agent-name); the same run-id must be passed here " + "and to every spawn-site in the campaign." + ), + ) parser.add_argument( '--recipient', help=( @@ -89,11 +99,12 @@ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: help='Redis namespace (only used for hybrid; defaults from run-id).', ) parser.add_argument( - '--discover-timeout-s', type=float, default=120.0, + '--discover-timeout-s', type=float, default=600.0, help=( - "How long to wait for the recipient agent to appear on the " - "exchange. Defaults to 2 minutes; bump it if a federated " - "site is slow to come up." + "How long to wait for the recipient agent's mailbox to be " + "visible on the exchange. Defaults to 10 minutes to match " + "spawn-site's startup_timeout_s; bump it higher if a " + "federated site is unusually slow to come up." 
), ) return parser.parse_args(argv) @@ -134,6 +145,7 @@ def _config_for_factory(args: argparse.Namespace) -> ChemGraphDaemonConfig: async def dispatch_bootstrap( *, campaign: ChemGraphCampaign, + run_id: str, recipient: str, exchange_factory: Any, discover_timeout_s: float, @@ -142,19 +154,31 @@ async def dispatch_bootstrap( Returns the dispatched message_id so the operator can correlate it with what shows up on the recipient site's event log. + + The recipient's AgentId is constructed deterministically from + ``(run_id, recipient_name)`` -- same scheme spawn-site uses on + the daemon side -- so no name-based discovery is needed (the + hosted exchange strips names from discover() responses, which + made the old discover-by-name approach silently fail). """ client = await exchange_factory.create_user_client( name='chemgraph-bootstrap', start_listener=False, ) try: - recipient_ids = await discover_peer_agent_ids( + recipient_id = deterministic_agent_id( + run_id=run_id, agent_name=recipient, + ) + # Liveness probe: wait for the recipient's mailbox to actually + # be registered on the exchange before sending. Without this + # we'd happily POST a message to a mailbox that doesn't exist + # yet -- the exchange would reject it. 
+ await wait_for_peers_alive( client._transport, - [recipient], + [recipient_id], agent_class=ChemGraphLogicalAgent, timeout_s=discover_timeout_s, ) - recipient_id = recipient_ids[recipient] message = build_message( sender='campaign', @@ -193,6 +217,7 @@ def main(argv: Sequence[str] | None = None) -> int: message_id = asyncio.run( dispatch_bootstrap( campaign=campaign, + run_id=args.run_id, recipient=recipient, exchange_factory=factory, discover_timeout_s=args.discover_timeout_s, diff --git a/src/chemgraph/academy/runtime/daemon.py b/src/chemgraph/academy/runtime/daemon.py index b0beba6..4361640 100644 --- a/src/chemgraph/academy/runtime/daemon.py +++ b/src/chemgraph/academy/runtime/daemon.py @@ -4,7 +4,9 @@ import asyncio import pathlib import signal +from typing import Any +from academy.exchange.cloud.client import HttpAgentRegistration from academy.handle import Handle from academy.runtime import Runtime from academy.runtime import RuntimeConfig @@ -12,7 +14,9 @@ from chemgraph.academy.core.peer_protocol import build_message from chemgraph.academy.runtime.exchange import build_exchange_factory from chemgraph.academy.runtime.exchange import SUPPORTED_EXCHANGE_TYPES -from chemgraph.academy.runtime.registration import discover_peer_agent_ids +from chemgraph.academy.runtime.registration import deterministic_agent_id +from chemgraph.academy.runtime.registration import register_agent_with_uid +from chemgraph.academy.runtime.registration import wait_for_peers_alive from chemgraph.academy.observability.run_artifacts import initialize_run_files from chemgraph.academy.observability.run_artifacts import ( wait_for_agent_statuses_finished, @@ -105,44 +109,57 @@ async def run_daemon(config: ChemGraphDaemonConfig) -> int: start_listener=False, ) try: - registration = await registrar.register_agent( - ChemGraphLogicalAgent, - name=agent_spec.name, + # Register THIS rank's agent with a deterministic UID + # derived from (run_id, agent_name). 
The hosted exchange + # strips AgentId.name from discover() responses, so the + # name-based filter we used to do never matched on the + # public HttpExchange. Deterministic UIDs let every site + # construct the same AgentId for the same (run, agent) + # without ever needing discover() to echo the name back. + my_agent_id = deterministic_agent_id( + run_id=config.run_dir.name, agent_name=agent_spec.name, ) - # Operator-visible lifecycle prints. The default INFO - # logging is too verbose for these landmark events to be - # spottable; use print so they land on stdout regardless - # of log level. Each line is a single grep-able phrase - # so an operator can ``grep -E '\[daemon\]'`` to follow - # progress through the silent stretches. + await register_agent_with_uid( + registrar._transport, ChemGraphLogicalAgent, my_agent_id, + ) + registration = HttpAgentRegistration(agent_id=my_agent_id) print( f"[daemon] rank{config.rank} registered " - f"{agent_spec.name!r} on the exchange", + f"{agent_spec.name!r} on the exchange " + f"(uid={my_agent_id.uid})", flush=True, ) + wanted_peers = [ p for p in agent_spec.allowed_peers if p != agent_spec.name ] + # Compute peer AgentIds locally -- no discovery polling + # needed for the identity itself, since every site agrees + # on the deterministic UID. We still poll discover() as a + # LIVENESS PROBE so we don't proceed to bootstrap before + # the other side's mailbox is actually up. 
+ peer_agent_ids: dict[str, Any] = { + peer: deterministic_agent_id( + run_id=config.run_dir.name, agent_name=peer, + ) + for peer in wanted_peers + } if wanted_peers: print( - f"[daemon] rank{config.rank} discovering peers " - f"{wanted_peers} (timeout {config.startup_timeout_s:.0f}s)...", + f"[daemon] rank{config.rank} waiting for peers " + f"{wanted_peers} to come online " + f"(timeout {config.startup_timeout_s:.0f}s)...", flush=True, ) - peer_agent_ids = await discover_peer_agent_ids( - registrar._transport, - # Skip self if the campaign mistakenly lists own name - # as a peer (validate_campaign rejects this, but - # defense-in-depth costs nothing). - wanted_peers, - agent_class=ChemGraphLogicalAgent, - timeout_s=config.startup_timeout_s, - ) - if wanted_peers: + await wait_for_peers_alive( + registrar._transport, + peer_agent_ids.values(), + agent_class=ChemGraphLogicalAgent, + timeout_s=config.startup_timeout_s, + ) print( - f"[daemon] rank{config.rank} discovered " - f"{len(peer_agent_ids)} peer(s): " - f"{sorted(peer_agent_ids.keys())}", + f"[daemon] rank{config.rank} all {len(peer_agent_ids)} " + f"peer(s) are alive: {sorted(peer_agent_ids.keys())}", flush=True, ) finally: diff --git a/src/chemgraph/academy/runtime/registration.py b/src/chemgraph/academy/runtime/registration.py index 01e37d5..ec9b241 100644 --- a/src/chemgraph/academy/runtime/registration.py +++ b/src/chemgraph/academy/runtime/registration.py @@ -1,35 +1,35 @@ -"""Peer-agent discovery against the Academy exchange. - -The runtime used to coordinate per-rank registrations via a -shared-filesystem JSON file (``/academy_registrations.json``): -rank 0 registered every agent on the campaign and wrote the resulting -``AgentRegistration`` objects to disk; other ranks polled the file. That -mechanism cannot span machines with separate filesystems, which blocks -the federated ``spawn-site`` flow. - -The replacement uses the exchange itself as the lookup service. 
Each -rank registers ONLY its own local agent (returning an -``AgentRegistration`` that goes straight into ``Runtime``), and looks -up the ``AgentId`` of every cross-site peer by polling -``transport.discover(ChemGraphLogicalAgent)`` until the expected names -appear. There is no rank-0 special-casing for registration anymore: any -rank can come up in any order, on any host, as long as eventually -every peer's mailbox is reachable through the exchange before -``startup_timeout_s`` elapses. - -Name collisions: ``discover()`` returns every agent of the given class -registered against the exchange, across all operators and campaigns. To -keep federated campaigns from accidentally seeing each other's -agents, operators should prefix agent names with a campaign-unique -run-id (e.g. ``demo-001-coordinator-agent``). Auto-prefixing is a -future ergonomic improvement; for now it's an operator-runbook -convention. +"""Peer-agent identity + readiness for federated ChemGraph campaigns. + +Cross-site peer rendezvous needs each rank to be able to address every +other site's agents without a shared filesystem. The first cut polled +``transport.discover(ChemGraphLogicalAgent)`` and filtered by +``AgentId.name``, which works on the local/redis/hybrid exchanges but +breaks on Academy's hosted HTTP exchange: ``discover()`` returns +``AgentId`` objects with ``name=None`` and ``role='agent'`` only. +Names round-trip through the server's mailbox state but are not +echoed back in the discovery response. + +The replacement: agree on a **deterministic UID** for every (run-id, +agent-name) pair. Both sites compute the same UID from the same +inputs, so each side knows the recipient's UID before either rank +boots. ``discover()`` is still useful as a liveness probe (matching +on UID, which IS preserved across the server round-trip) so a rank +can wait until its peers have actually registered before proceeding. + +Side effect of this scheme: agent names become campaign-scoped. 
Two +operators running the SAME ``federated-hello`` campaign concurrently +would clash on the same UIDs and crash the registration POST with +"mailbox already exists". The run-id is part of the UID namespace, +so as long as operators bump ``--run-id`` (federated-hello-001 vs +federated-hello-002) the UIDs differ and the campaigns don't see +each other. """ from __future__ import annotations import asyncio import logging import time +import uuid from collections.abc import Iterable from typing import Any @@ -39,63 +39,105 @@ logger = logging.getLogger(__name__) -async def discover_peer_agent_ids( +# Stable namespace UUID used as the seed for uuid5 derivation. The +# value itself doesn't matter -- only that every site computes the +# same UID for the same (run_id, agent_name) pair. Bumping this +# constant would invalidate every running deployment, so don't. +_PEER_UID_NAMESPACE = uuid.UUID('1e7eda44-1b34-4f5a-b2a1-cf5ca5db8e8b') + + +def deterministic_agent_uid(*, run_id: str, agent_name: str) -> uuid.UUID: + """Derive the AgentId.uid that every site will use for ``agent_name``. + + Same inputs on Aurora and Crux ⇒ same UID. The recipient side + registers with this UID; the sender side constructs an + ``AgentId`` with the same UID locally and uses it to build a + ``Handle`` without ever calling ``discover()``. + """ + return uuid.uuid5(_PEER_UID_NAMESPACE, f"{run_id}/{agent_name}") + + +def deterministic_agent_id(*, run_id: str, agent_name: str) -> AgentId[Any]: + """Construct the ``AgentId`` peers can reconstruct from name alone.""" + return AgentId( + uid=deterministic_agent_uid(run_id=run_id, agent_name=agent_name), + name=agent_name, + role='agent', + ) + + +async def register_agent_with_uid( transport: ExchangeTransportT, - peer_names: Iterable[str], + agent_class: type, + agent_id: AgentId[Any], +) -> AgentId[Any]: + """Register ``agent_id`` on the exchange, reusing the supplied UID. 
+ + Bypasses ``transport.register_agent`` (which always calls + ``AgentId.new`` and generates a random UID) by POSTing directly + to the same mailbox endpoint with our pre-built AgentId. Returns + the same AgentId on success so callers can hand it to Runtime. + """ + # Reach into the transport for the same session + URL the SDK uses. + # The shape mirrors HttpExchangeTransport.register_agent exactly, + # we just swap the auto-generated AgentId for the deterministic one. + session = transport._session + mailbox_url = transport._mailbox_url + payload = { + 'mailbox': agent_id.model_dump_json(), + 'agent': ','.join(agent_class._agent_mro()), + } + async with session.post(mailbox_url, json=payload) as response: + # _raise_for_status is what the SDK uses; reach in for it too. + from academy.exchange.cloud.client import _raise_for_status + _raise_for_status(response, transport.mailbox_id, agent_id) + return agent_id + + +async def wait_for_peers_alive( + transport: ExchangeTransportT, + peer_ids: Iterable[AgentId[Any]], *, agent_class: type, timeout_s: float, poll_interval_s: float = 1.0, -) -> dict[str, AgentId[Any]]: - """Poll ``transport.discover()`` until every named peer is found. - - Args: - transport: An open exchange transport already registered for the - local rank's own agent. Discovery is read-only from this - rank's perspective -- it does not create or mutate mailboxes. - peer_names: Names of agents this rank intends to message. Each - name must match the ``AgentId.name`` of an agent previously - registered against the same exchange (potentially by a - different process on a different host). - agent_class: Concrete ``Agent`` subclass to scope the discovery - query (``transport.discover`` is class-typed). All ChemGraph - agents are ``ChemGraphLogicalAgent``, so callers pass that. - timeout_s: Wall-clock budget. 
On expiry a ``TimeoutError`` is - raised whose message lists the peers we never saw, so - operators can immediately tell which remote site is missing - or whose agent failed to register. - poll_interval_s: Backoff between ``discover()`` retries. The - default keeps startup snappy without hammering the exchange. - - Returns: - Mapping from peer name to the discovered ``AgentId``. Empty - ``peer_names`` short-circuits to an empty dict. +) -> None: + """Block until every peer in ``peer_ids`` is visible to ``discover()``. + + UID-based matching: ``discover()`` strips names but preserves + UIDs, so we filter the discover response by UID and wait until + every expected peer's mailbox shows up. If ``peer_ids`` is empty + (single-agent or self-only slice), return immediately. + + Raises ``TimeoutError`` listing the missing peers' UIDs after + ``timeout_s`` so the operator can correlate with their other + site's launch logs. """ - wanted = set(peer_names) + wanted = {peer.uid: peer for peer in peer_ids} if not wanted: - return {} - found: dict[str, AgentId[Any]] = {} + return + seen: set[uuid.UUID] = set() deadline = time.monotonic() + timeout_s while True: agent_ids = await transport.discover(agent_class) - for agent_id in agent_ids: - name = getattr(agent_id, 'name', None) - if isinstance(name, str) and name in wanted and name not in found: - found[name] = agent_id - missing = wanted.difference(found) + for aid in agent_ids: + if aid.uid in wanted: + seen.add(aid.uid) + missing = set(wanted).difference(seen) if not missing: - return found + return if time.monotonic() >= deadline: + missing_descs = sorted(f'{wanted[u].name}({u})' for u in missing) raise TimeoutError( - f'Timed out after {timeout_s:.1f}s waiting to discover ' - f'peer agents on the exchange: missing={sorted(missing)}. 
' - f'Confirm every site of the federated campaign has ' - f'started and registered its agents under the expected ' - f'names (run-id-prefixed names are required when the ' - f'hosted exchange is shared across operators).', + f'Timed out after {timeout_s:.1f}s waiting for peer agents ' + f'to register on the exchange: missing={missing_descs}. ' + f'Confirm every site of the federated campaign has started ' + f'and that all sites are using the same --run-id (the run-id ' + f'is part of the UID namespace; mismatches make the peers ' + f'invisible to each other).', ) logger.debug( - 'discover() missing %d peers (%s); sleeping %.1fs', + 'wait_for_peers_alive: missing %d (%s); sleeping %.1fs', len(missing), sorted(missing), poll_interval_s, ) await asyncio.sleep(poll_interval_s) diff --git a/tests/test_academy_bootstrap.py b/tests/test_academy_bootstrap.py index 2b9e70a..18ee8c1 100644 --- a/tests/test_academy_bootstrap.py +++ b/tests/test_academy_bootstrap.py @@ -8,6 +8,7 @@ from academy.identifier import AgentId from chemgraph.academy.runtime import bootstrap +from chemgraph.academy.runtime.registration import deterministic_agent_id # --------------------------------------------------------------------------- @@ -16,21 +17,33 @@ def test_parse_args_requires_campaign() -> None: - """``--campaign`` is the only field that doesn't have a default -- - bootstrap is useless without knowing which campaign's - ``user_task`` to send.""" + """``--campaign`` is one of the two required fields. Bootstrap is + useless without knowing which campaign's user_task to send.""" with pytest.raises(SystemExit): - bootstrap.parse_args([]) + bootstrap.parse_args(['--run-id', 'r-001']) + + +def test_parse_args_requires_run_id() -> None: + """``--run-id`` is required because the recipient's mailbox UID + is derived deterministically from (run_id, agent_name). 
Without + it the bootstrap would address a different mailbox than the + spawn-site invocations registered.""" + with pytest.raises(SystemExit): + bootstrap.parse_args(['--campaign', 'mace-ensemble-screening-20']) def test_parse_args_defaults_exchange_type_to_http() -> None: """Federated bootstrap is the main use case so http is the right default. Operators on single-machine runs can override but they rarely need this command at all (run-compute auto-bootstraps).""" - args = bootstrap.parse_args(['--campaign', 'mace-ensemble-screening-20']) + args = bootstrap.parse_args([ + '--campaign', 'mace-ensemble-screening-20', + '--run-id', 'r-001', + ]) assert args.exchange_type == 'http' assert args.recipient is None # defaults to campaign.initial_agent - assert args.discover_timeout_s == pytest.approx(120.0) + # discover-timeout-s default now matches spawn-site's 600s. + assert args.discover_timeout_s == pytest.approx(600.0) def test_parse_args_accepts_recipient_override() -> None: @@ -38,6 +51,7 @@ def test_parse_args_accepts_recipient_override() -> None: e.g. partial re-runs or debugging.""" args = bootstrap.parse_args([ '--campaign', 'foo.jsonc', + '--run-id', 'r-001', '--recipient', 'worker-a', ]) assert args.recipient == 'worker-a' @@ -45,6 +59,11 @@ def test_parse_args_accepts_recipient_override() -> None: # --------------------------------------------------------------------------- # dispatch_bootstrap -- the core async path +# +# The hosted HttpExchange strips AgentId.name from discover() responses, +# so our fake transport returns UID-only AgentIds to mirror that. The +# bootstrap path constructs the recipient AgentId deterministically +# from (run_id, recipient_name) -- no name lookup happens. 
# --------------------------------------------------------------------------- @@ -81,14 +100,26 @@ def __init__(self, user_task: str = 'do the thing'): self.resources = {} -def test_dispatch_bootstrap_sends_one_message_to_discovered_recipient( +def _seen_agent_id(name: str, run_id: str) -> AgentId[Any]: + """Mirror what the hosted exchange returns from discover(): + deterministic UID, but with the name stripped to None.""" + aid = deterministic_agent_id(run_id=run_id, agent_name=name) + return AgentId(uid=aid.uid, name=None, role='agent') + + +def test_dispatch_bootstrap_sends_one_message_to_deterministic_recipient( monkeypatch, ) -> None: - """Happy path: recipient is on the exchange, helper discovers them, - one Handle.action call gets made, the message_id returned matches - what was sent.""" - target = AgentId.new('coordinator-agent') - transport = _FakeTransport(agents=[target, AgentId.new('other-agent')]) + """Happy path: recipient's mailbox is visible on the exchange, + the wait succeeds, one Handle.action call gets made. The + recipient AgentId is built deterministically from (run_id, + recipient_name), NOT discovered by name.""" + run_id = 'demo-001' + seen = _seen_agent_id('coordinator-agent', run_id) + transport = _FakeTransport(agents=[ + seen, + _seen_agent_id('some-other-campaign-agent', 'unrelated'), + ]) client = _FakeClient(transport) factory = _FakeFactory(client) @@ -106,6 +137,7 @@ async def action(self, name, message): message_id = asyncio.run( bootstrap.dispatch_bootstrap( campaign=_FakeCampaign(), + run_id=run_id, recipient='coordinator-agent', exchange_factory=factory, discover_timeout_s=1.0, @@ -114,36 +146,46 @@ async def action(self, name, message): assert len(sent) == 1 agent_id, action_name, message = sent[0] - assert agent_id is target + # Handle is built with the DETERMINISTIC AgentId -- same UID as + # what the recipient daemon registered, so the message routes to + # the right mailbox. 
+ expected_uid = deterministic_agent_id( + run_id=run_id, agent_name='coordinator-agent', + ).uid + assert agent_id.uid == expected_uid assert action_name == 'receive_message' assert message['recipient'] == 'coordinator-agent' assert message['sender'] == 'campaign' assert message['message_id'] == message_id - # The bootstrap content embeds the campaign's user_task; the - # recipient agent's first round will parse this content. assert 'do the thing' in message['content'] # Client must be closed on the happy path so we don't leak the # aiohttp session that backs the http exchange transport. client.close.assert_awaited_once() -def test_dispatch_bootstrap_closes_client_on_discover_timeout( +def test_dispatch_bootstrap_closes_client_on_recipient_timeout( monkeypatch, ) -> None: - """If the recipient never appears on the exchange the helper must - raise TimeoutError -- AND the client must still be closed so we - don't leak the underlying network resources.""" - transport = _FakeTransport(agents=[AgentId.new('other-agent')]) + """If the recipient's mailbox never appears on the exchange the + helper must raise TimeoutError -- AND the client must still be + closed so we don't leak the underlying network resources.""" + # Transport returns SOME unrelated agent but not our recipient. 
+ transport = _FakeTransport(agents=[ + _seen_agent_id('not-our-recipient', 'unrelated-run'), + ]) client = _FakeClient(transport) factory = _FakeFactory(client) - monkeypatch.setattr(bootstrap, 'Handle', - lambda agent_id: pytest.fail("Handle must not be built on timeout")) + monkeypatch.setattr( + bootstrap, 'Handle', + lambda agent_id: pytest.fail("Handle must not be built on timeout"), + ) with pytest.raises(TimeoutError): asyncio.run( bootstrap.dispatch_bootstrap( campaign=_FakeCampaign(), + run_id='demo-001', recipient='coordinator-agent', exchange_factory=factory, discover_timeout_s=0.05, @@ -162,9 +204,8 @@ def test_main_returns_2_on_recipient_timeout(monkeypatch, capsys) -> None: bootstrap didn't actually happen. The stderr message should be the TimeoutError's text (which names the missing recipient).""" async def _raise(*args, **kwargs): - raise TimeoutError('Timed out ... missing=[\'coordinator-agent\']') + raise TimeoutError("Timed out ... missing=['coordinator-agent']") monkeypatch.setattr(bootstrap, 'dispatch_bootstrap', _raise) - # Bypass the campaign-file load to keep the test offline. 
monkeypatch.setattr(bootstrap, 'load_campaign', lambda path: _FakeCampaign()) monkeypatch.setattr(bootstrap, 'build_exchange_factory', @@ -172,6 +213,7 @@ async def _raise(*args, **kwargs): code = bootstrap.main([ '--campaign', 'mace-ensemble-screening-20', + '--run-id', 'demo-001', '--exchange-type', 'local', ]) assert code == 2 diff --git a/tests/test_academy_exchange_registration.py b/tests/test_academy_exchange_registration.py index 6ee1246..e3c2f91 100644 --- a/tests/test_academy_exchange_registration.py +++ b/tests/test_academy_exchange_registration.py @@ -11,7 +11,9 @@ from chemgraph.academy.runtime.exchange import build_exchange_factory from chemgraph.academy.runtime.exchange import exchange_uses_redis from chemgraph.academy.runtime.exchange import SUPPORTED_EXCHANGE_TYPES -from chemgraph.academy.runtime.registration import discover_peer_agent_ids +from chemgraph.academy.runtime.registration import deterministic_agent_id +from chemgraph.academy.runtime.registration import deterministic_agent_uid +from chemgraph.academy.runtime.registration import wait_for_peers_alive def _config( @@ -118,14 +120,15 @@ def test_http_exchange_factory_honors_custom_url(tmp_path) -> None: # --------------------------------------------------------------------------- -# discover_peer_agent_ids (federated peer discovery) +# Deterministic peer identity + wait_for_peers_alive # -# Real exchange transports require a running broker. We exercise the -# discovery helper against a fake transport whose ``discover`` returns -# a sequence we control across successive polls. This keeps the tests -# fast and deterministic while still pinning the behavior that matters: -# wait-for-peers, success when all are present, timeout listing the -# missing ones. +# The hosted HttpExchange strips AgentId.name from discover() responses +# (only ``uid`` and ``role`` round-trip). Name-based discovery was +# silently never finding any peer across sites. 
The replacement: derive +# each peer's mailbox UID deterministically from (run_id, agent_name) +# so every site can construct the same AgentId locally without needing +# discover() to echo the name back. discover() stays useful as a +# liveness probe (matching on UID, which IS preserved). # --------------------------------------------------------------------------- @@ -148,79 +151,110 @@ async def discover(self, agent_class): # noqa: ARG002 - sig match only return tuple(self._rounds[index]) -def _agent_id(name: str) -> AgentId[Any]: - return AgentId.new(name) +def test_deterministic_agent_uid_is_stable() -> None: + """Same (run_id, agent_name) inputs must produce the same UID, + every call, on every machine. This is the load-bearing invariant + of the federated rendezvous: Aurora and Crux compute the same + UID locally and addressing works without any shared lookup.""" + a = deterministic_agent_uid(run_id='r-001', agent_name='worker') + b = deterministic_agent_uid(run_id='r-001', agent_name='worker') + assert a == b + + +def test_deterministic_agent_uid_differs_by_run_id() -> None: + """Different run-ids must yield different UIDs so two operators + running the SAME campaign with different --run-ids don't collide + on the same mailboxes.""" + a = deterministic_agent_uid(run_id='r-001', agent_name='worker') + b = deterministic_agent_uid(run_id='r-002', agent_name='worker') + assert a != b + + +def test_deterministic_agent_uid_differs_by_agent_name() -> None: + """Different agent names within the same run must yield different + UIDs so per-agent mailboxes don't collide.""" + a = deterministic_agent_uid(run_id='r-001', agent_name='worker-a') + b = deterministic_agent_uid(run_id='r-001', agent_name='worker-b') + assert a != b + + +def test_deterministic_agent_id_preserves_name_locally() -> None: + """The AgentId we build for our OWN registration carries the + name so it shows up in trace events; the name is only stripped + when the AgentId is round-tripped through the 
hosted exchange's + discover() response.""" + aid = deterministic_agent_id(run_id='r-001', agent_name='worker-a') + assert aid.name == 'worker-a' + assert aid.uid == deterministic_agent_uid( + run_id='r-001', agent_name='worker-a', + ) -def test_discover_peer_agent_ids_returns_empty_for_empty_peer_list() -> None: - """When the local agent has no allowed_peers the helper short-circuits - -- it must not poll the exchange unnecessarily (the network round-trip - would block daemon startup for nothing).""" - transport = _FakeTransport(rounds=[[_agent_id('anyone')]]) - result = asyncio.run( - discover_peer_agent_ids( +def test_wait_for_peers_alive_returns_immediately_for_empty_list() -> None: + """When the local agent has no allowed_peers the helper short- + circuits -- it must not poll the exchange unnecessarily.""" + transport = _FakeTransport(rounds=[[]]) + asyncio.run( + wait_for_peers_alive( transport, [], agent_class=object, timeout_s=0.01, ), ) - assert result == {} assert transport._calls == 0 -def test_discover_peer_agent_ids_finds_all_peers_on_first_poll() -> None: - """Happy path: every peer is already on the exchange. The helper - must return promptly with a name->AgentId mapping in the same - direction the daemon will use it (peer name -> AgentId for Handle - construction).""" - a = _agent_id('worker-a') - b = _agent_id('worker-b') - c = _agent_id('coordinator') - transport = _FakeTransport(rounds=[[a, b, c]]) - result = asyncio.run( - discover_peer_agent_ids( - transport, ['worker-a', 'worker-b'], - agent_class=object, timeout_s=1.0, +def test_wait_for_peers_alive_succeeds_when_all_uids_present() -> None: + """Happy path: every expected peer's mailbox is on the exchange. 
+ Match by UID (the field discover() preserves), not name.""" + a = deterministic_agent_id(run_id='r-001', agent_name='worker-a') + b = deterministic_agent_id(run_id='r-001', agent_name='worker-b') + # Simulate what the hosted exchange actually returns: AgentIds + # with the right UID but name stripped. Tests would have caught + # the original bug if they'd matched this shape. + a_seen = AgentId(uid=a.uid, name=None, role='agent') + b_seen = AgentId(uid=b.uid, name=None, role='agent') + transport = _FakeTransport(rounds=[[a_seen, b_seen]]) + asyncio.run( + wait_for_peers_alive( + transport, [a, b], agent_class=object, timeout_s=1.0, ), ) - assert result == {'worker-a': a, 'worker-b': b} - # Did NOT include the un-requested coordinator -- filtering by name - # is what keeps cross-operator agents on the shared hosted exchange - # from leaking into each other's peer dicts. - assert 'coordinator' not in result - - -def test_discover_peer_agent_ids_waits_for_late_peers() -> None: - """The federated convergence story: site A registers at t=0 and - polls; site B doesn't register its agent until poll #3; the helper - must keep polling and succeed once B is visible. 
This is the - behavior that lets operators bring sites up in any order.""" - a = _agent_id('worker-a') - b = _agent_id('worker-b') + + +def test_wait_for_peers_alive_waits_across_polls_for_late_peer() -> None: + """The federated convergence story: bring sites up in any order; + the wait keeps polling and unblocks the moment all UIDs are seen.""" + a = deterministic_agent_id(run_id='r-001', agent_name='worker-a') + b = deterministic_agent_id(run_id='r-001', agent_name='worker-b') + a_seen = AgentId(uid=a.uid, name=None, role='agent') + b_seen = AgentId(uid=b.uid, name=None, role='agent') rounds = [ - [a], # poll 1: only A visible - [a], # poll 2: still waiting for B - [a, b], # poll 3: B comes up + [a_seen], # poll 1: only A visible + [a_seen], # poll 2: still waiting for B + [a_seen, b_seen], # poll 3: B comes up ] transport = _FakeTransport(rounds=rounds) - result = asyncio.run( - discover_peer_agent_ids( - transport, ['worker-a', 'worker-b'], + asyncio.run( + wait_for_peers_alive( + transport, [a, b], agent_class=object, timeout_s=2.0, - poll_interval_s=0.01, # keep the test fast + poll_interval_s=0.01, ), ) - assert result == {'worker-a': a, 'worker-b': b} -def test_discover_peer_agent_ids_times_out_naming_missing_peers() -> None: - """When a remote site never shows up, the helper must raise with a - message that names the missing peers. Operators reading the log - should immediately know which site to bring up / debug.""" - transport = _FakeTransport(rounds=[[_agent_id('worker-a')]]) - with pytest.raises(TimeoutError, match=r"missing=\['coordinator', 'worker-b'\]"): +def test_wait_for_peers_alive_times_out_naming_missing_uids() -> None: + """When a remote site never registers, raise with a message + naming the missing peers (name + uid). 
Operators reading the + log can correlate with the missing site's launch logs.""" + a = deterministic_agent_id(run_id='r-001', agent_name='worker-a') + missing = deterministic_agent_id(run_id='r-001', agent_name='no-such-peer') + a_seen = AgentId(uid=a.uid, name=None, role='agent') + transport = _FakeTransport(rounds=[[a_seen]]) + with pytest.raises(TimeoutError, match='no-such-peer'): asyncio.run( - discover_peer_agent_ids( - transport, ['worker-a', 'worker-b', 'coordinator'], + wait_for_peers_alive( + transport, [a, missing], agent_class=object, timeout_s=0.05, poll_interval_s=0.01, @@ -228,27 +262,24 @@ def test_discover_peer_agent_ids_times_out_naming_missing_peers() -> None: ) -def test_discover_peer_agent_ids_skips_already_found_peers_on_re_poll() -> None: - """If poll N saw peer A, poll N+1 must not overwrite A's AgentId - even if discover returns a fresh AgentId object with the same name - (which the hosted exchange doesn't actually do, but the helper's - behavior shouldn't depend on that). Pin the 'first found wins' - invariant explicitly.""" - a_first = _agent_id('worker-a') - a_again = _agent_id('worker-a') # different uuid, same name - b = _agent_id('worker-b') - rounds = [ - [a_first], - [a_again, b], +def test_wait_for_peers_alive_ignores_unrelated_agents_with_same_class() -> None: + """The hosted exchange returns every ChemGraphLogicalAgent registered + across all operators / campaigns. 
The wait must filter strictly by + UID and not get confused by other operators' agents.""" + a = deterministic_agent_id(run_id='r-001', agent_name='worker-a') + a_seen = AgentId(uid=a.uid, name=None, role='agent') + # Lots of noise from other operators / runs: + noise = [ + AgentId.new('stranger-1'), + AgentId.new('stranger-2'), + AgentId.new('stranger-3'), ] - transport = _FakeTransport(rounds=rounds) - result = asyncio.run( - discover_peer_agent_ids( - transport, ['worker-a', 'worker-b'], + transport = _FakeTransport(rounds=[noise + [a_seen]]) + asyncio.run( + wait_for_peers_alive( + transport, [a], agent_class=object, - timeout_s=2.0, + timeout_s=1.0, poll_interval_s=0.01, ), ) - assert result['worker-a'] is a_first - assert result['worker-b'] is b From cd353e7b6b7cf8f10d6461ddde3b2c859eb9edae Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 11:11:03 -0500 Subject: [PATCH 14/25] fix(academy/observability): add bootstrap_message_skipped to allowed event names The skip-trace I added in 5549dbb (operator-visible daemon lifecycle prints) writes a system trace with event name bootstrap_message_skipped, but that name was never added to the CampaignEvent.event Literal enum. The pydantic validator rejected it, crashing the daemon RIGHT AFTER the [daemon] ... is now running inside Academy Runtime print. Cosmetic-but-fatal regression that the test suite missed because no test exercises the skip-bootstrap code path through append_system_trace -- the federated demo is the first place this code path runs end to end. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/observability/event_log.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/chemgraph/academy/observability/event_log.py b/src/chemgraph/academy/observability/event_log.py index c42a41c..a61ef72 100644 --- a/src/chemgraph/academy/observability/event_log.py +++ b/src/chemgraph/academy/observability/event_log.py @@ -45,6 +45,7 @@ "daemon_started", "daemon_stopped", "bootstrap_message_dispatched", + "bootstrap_message_skipped", "llm_tool_calls", "turn_finished_without_external_action", "chemgraph_reasoning_turn_started", From 1b5b9f8cdc5cead89ef2d8992626e2f34e871dae Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 11:17:11 -0500 Subject: [PATCH 15/25] fix(academy/bootstrap): register_handle before action to bind exchange context handle.action reads the outbound exchange from a contextvar that is only set when a client is active. Runtime sets it for daemon- side code, but the standalone bootstrap command needs to set it explicitly via client.register_handle(handle) -- otherwise Handle.action raises ExchangeClientNotFoundError. The first federated demo attempt failed here: discovery succeeded, the message was built, the Handle was constructed -- and the action call died because the Handle did not know which client to route through. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/runtime/bootstrap.py | 8 ++++++++ tests/test_academy_bootstrap.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/src/chemgraph/academy/runtime/bootstrap.py b/src/chemgraph/academy/runtime/bootstrap.py index d0ef87c..945c911 100644 --- a/src/chemgraph/academy/runtime/bootstrap.py +++ b/src/chemgraph/academy/runtime/bootstrap.py @@ -190,6 +190,14 @@ async def dispatch_bootstrap( confidence=1.0, ) handle: Handle[Any] = Handle(recipient_id) + # The Handle reads its outbound exchange from a contextvar that + # only gets set when a client is "active" -- either via + # ``async with client:`` or via ``client.register_handle()``. + # The daemon-side code path uses Runtime which sets the context + # for us; the standalone bootstrap command does NOT, so without + # this register_handle call the action raises + # ``ExchangeClientNotFoundError``. + client.register_handle(handle) await handle.action('receive_message', message) logger.info( 'Bootstrap message dispatched: recipient=%s message_id=%s', diff --git a/tests/test_academy_bootstrap.py b/tests/test_academy_bootstrap.py index 18ee8c1..fee9e3c 100644 --- a/tests/test_academy_bootstrap.py +++ b/tests/test_academy_bootstrap.py @@ -80,6 +80,14 @@ class _FakeClient: def __init__(self, transport): self._transport = transport self.close = AsyncMock() + self.registered_handles: list = [] + + def register_handle(self, handle): + # Real client binds the handle to its exchange contextvar. + # The fake just records it so tests can assert the bootstrap + # path called register_handle before dispatching the action + # (without which Handle.action raises ExchangeClientNotFoundError). 
+ self.registered_handles.append(handle) class _FakeFactory: From 4a4fa3db6da26505e3a6815ea7aa98b8b90907c5 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 11:21:30 -0500 Subject: [PATCH 16/25] fix(academy/bootstrap): enter client as async context manager so Handle finds exchange UserExchangeClient.__aenter__ is what sets the academy.handle.exchange_context ContextVar that Handle.action reads to find the outbound exchange. The prior register_handle-only fix binds the handle for inbox routing but does NOT set the contextvar, so the action call still raised ExchangeClientNotFoundError. Restructure dispatch_bootstrap to run the whole send inside async with client: -- exchange_context is set on enter, restored on exit. The aiohttp session gets closed by __aexit__, so the explicit client.close() became redundant. Test fixture _FakeClient is now an async-context-manager stand-in; the two close-on-success / close-on-timeout assertions check enter_count/exit_count instead. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/runtime/bootstrap.py | 84 +++++++++++----------- tests/test_academy_bootstrap.py | 40 +++++++---- 2 files changed, 70 insertions(+), 54 deletions(-) diff --git a/src/chemgraph/academy/runtime/bootstrap.py b/src/chemgraph/academy/runtime/bootstrap.py index 945c911..c2a95b4 100644 --- a/src/chemgraph/academy/runtime/bootstrap.py +++ b/src/chemgraph/academy/runtime/bootstrap.py @@ -165,47 +165,49 @@ async def dispatch_bootstrap( name='chemgraph-bootstrap', start_listener=False, ) - try: - recipient_id = deterministic_agent_id( - run_id=run_id, agent_name=recipient, - ) - # Liveness probe: wait for the recipient's mailbox to actually - # be registered on the exchange before sending. Without this - # we'd happily POST a message to a mailbox that doesn't exist - # yet -- the exchange would reject it. 
- await wait_for_peers_alive( - client._transport, - [recipient_id], - agent_class=ChemGraphLogicalAgent, - timeout_s=discover_timeout_s, - ) - - message = build_message( - sender='campaign', - recipient=recipient, - content=campaign_bootstrap_text(campaign), - kind='message', - tldr='Campaign bootstrap', - reason='Initial campaign task dispatch (operator-triggered).', - confidence=1.0, - ) - handle: Handle[Any] = Handle(recipient_id) - # The Handle reads its outbound exchange from a contextvar that - # only gets set when a client is "active" -- either via - # ``async with client:`` or via ``client.register_handle()``. - # The daemon-side code path uses Runtime which sets the context - # for us; the standalone bootstrap command does NOT, so without - # this register_handle call the action raises - # ``ExchangeClientNotFoundError``. - client.register_handle(handle) - await handle.action('receive_message', message) - logger.info( - 'Bootstrap message dispatched: recipient=%s message_id=%s', - recipient, message['message_id'], - ) - return message['message_id'] - finally: - await client.close() + # ``Handle.action`` reads its outbound exchange from a + # ``ContextVar`` that ``UserExchangeClient.__aenter__`` sets to + # self. Without entering the client as an async-context-manager + # the contextvar stays unset and Handle.action raises + # ``ExchangeClientNotFoundError``. The daemon-side path gets this + # for free because Academy's Runtime enters the client; the + # standalone bootstrap command has to do it explicitly. + async with client: + try: + recipient_id = deterministic_agent_id( + run_id=run_id, agent_name=recipient, + ) + # Liveness probe: wait for the recipient's mailbox to + # actually be registered on the exchange before sending. + # Without this we'd happily POST a message to a mailbox + # that doesn't exist yet -- the exchange would reject it. 
+ await wait_for_peers_alive( + client._transport, + [recipient_id], + agent_class=ChemGraphLogicalAgent, + timeout_s=discover_timeout_s, + ) + + message = build_message( + sender='campaign', + recipient=recipient, + content=campaign_bootstrap_text(campaign), + kind='message', + tldr='Campaign bootstrap', + reason='Initial campaign task dispatch (operator-triggered).', + confidence=1.0, + ) + handle: Handle[Any] = Handle(recipient_id) + await handle.action('receive_message', message) + logger.info( + 'Bootstrap message dispatched: recipient=%s message_id=%s', + recipient, message['message_id'], + ) + return message['message_id'] + finally: + # __aexit__ does close + clear the contextvar; close() + # here would double-close. The async with handles it. + pass def main(argv: Sequence[str] | None = None) -> int: diff --git a/tests/test_academy_bootstrap.py b/tests/test_academy_bootstrap.py index fee9e3c..3074b1b 100644 --- a/tests/test_academy_bootstrap.py +++ b/tests/test_academy_bootstrap.py @@ -2,7 +2,6 @@ import asyncio from typing import Any -from unittest.mock import AsyncMock import pytest from academy.identifier import AgentId @@ -77,17 +76,27 @@ async def discover(self, agent_class): # noqa: ARG002 - sig match only class _FakeClient: + """Async-context-manager stand-in for academy's UserExchangeClient. + + The real client's __aenter__ sets a contextvar that Handle.action + reads to find the outbound exchange. We don't reproduce that + contextvar plumbing in the fake -- the test monkeypatches + ``bootstrap.Handle`` with a recording stand-in that bypasses the + contextvar lookup entirely -- but we DO support the + async-with shape so the bootstrap code path runs unchanged. + """ def __init__(self, transport): self._transport = transport - self.close = AsyncMock() - self.registered_handles: list = [] + self.enter_count = 0 + self.exit_count = 0 - def register_handle(self, handle): - # Real client binds the handle to its exchange contextvar. 
- # The fake just records it so tests can assert the bootstrap - # path called register_handle before dispatching the action - # (without which Handle.action raises ExchangeClientNotFoundError). - self.registered_handles.append(handle) + async def __aenter__(self): + self.enter_count += 1 + return self + + async def __aexit__(self, exc_type, exc, tb): + self.exit_count += 1 + return False class _FakeFactory: @@ -166,9 +175,11 @@ async def action(self, name, message): assert message['sender'] == 'campaign' assert message['message_id'] == message_id assert 'do the thing' in message['content'] - # Client must be closed on the happy path so we don't leak the - # aiohttp session that backs the http exchange transport. - client.close.assert_awaited_once() + # Client entered as async-context-manager (which sets the exchange + # context the Handle needs) and exited cleanly. The __aexit__ + # closes the aiohttp session backing the http exchange transport. + assert client.enter_count == 1 + assert client.exit_count == 1 def test_dispatch_bootstrap_closes_client_on_recipient_timeout( @@ -199,7 +210,10 @@ def test_dispatch_bootstrap_closes_client_on_recipient_timeout( discover_timeout_s=0.05, ), ) - client.close.assert_awaited_once() + # async-with __aexit__ runs even on the error path -- the aiohttp + # session is released regardless of whether dispatch succeeded. + assert client.enter_count == 1 + assert client.exit_count == 1 # --------------------------------------------------------------------------- From f84b57f83303f3a62d339cfed0f536d5636cc4e7 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 11:27:30 -0500 Subject: [PATCH 17/25] fix(academy/models): strip temperature for GPT-5/o-series reasoning models Ported from the synth branch. GPT-5* and o1/o3/o4* reject any non- default temperature with HTTP 400 'Unsupported value: temperature does not support 0.0'. 
Both ChatOpenAI construction sites (load_openai_model and agent/turn._custom_openai_compatible_kwargs) now consult is_reasoning_model() and drop temperature + the other sampling knobs when the model is one of those. Same module-level is_reasoning_model() helper as on the synth branch so a future merge stays mechanical. This was the last bug between the federated-hello demo daemons making their first LM call and completing the round trip. Both sites successfully discovered each other, received the bootstrap message, and entered their first reasoning round; the round crashed at the LM call because the demo uses GPT-5-mini. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/agent/turn.py | 8 +++++++ src/chemgraph/models/openai.py | 42 +++++++++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/src/chemgraph/agent/turn.py b/src/chemgraph/agent/turn.py index e565213..61a98b5 100644 --- a/src/chemgraph/agent/turn.py +++ b/src/chemgraph/agent/turn.py @@ -161,6 +161,14 @@ def _custom_openai_compatible_kwargs( user = argo_user or os.getenv("ARGO_USER") if base_url and "argoapi" in base_url and user: kwargs["model_kwargs"] = {"user": user} + # GPT-5* / o-series reject any non-default temperature + sampling + # knobs. Drop them so the request payload matches what the model + # accepts. Import is local to avoid an import cycle with + # chemgraph.models.openai which itself imports langchain_openai. + from chemgraph.models.openai import is_reasoning_model + if is_reasoning_model(model_name): + for k in ("temperature", "top_p", "frequency_penalty", "presence_penalty"): + kwargs.pop(k, None) return kwargs diff --git a/src/chemgraph/models/openai.py b/src/chemgraph/models/openai.py index bb33d38..230ffb2 100644 --- a/src/chemgraph/models/openai.py +++ b/src/chemgraph/models/openai.py @@ -157,6 +157,36 @@ def _is_local_http_endpoint(base_url: str | None) -> bool: } +# Reasoning-model detection. 
These models (GPT-5*, o1/o3/o4*) reject +# any non-default ``temperature`` (only 1 is supported) and similarly +# reject ``top_p`` / ``frequency_penalty`` / ``presence_penalty``. +# The Argo shim passes these through to OpenAI with the same +# constraint, so the LLM construction sites must drop the field +# entirely from the request payload. Match is case-insensitive and +# covers "GPT-5", "GPT-5-mini", "GPT-5.1" ... "GPT-5.5", "o1", "o3", +# "o3-mini", "o4-mini", their argo: variants, and the hosted-wire +# short forms ("gpt5", "gpt5mini", "gpto3mini"). +_REASONING_MODEL_PREFIXES: tuple[str, ...] = ("gpt-5", "o1", "o3", "o4") + + +def is_reasoning_model(model_name: str | None) -> bool: + """Return True when ``model_name`` is an OpenAI reasoning model. + + See module-level note above for rationale. + """ + if not model_name: + return False + name = model_name.strip().lower().removeprefix("argo:") + if name.startswith("gpto"): + return True # gpto1, gpto3mini, gpto4mini hosted-wire forms + if name.startswith("gpt5"): + return True # gpt5, gpt5mini, gpt5nano hosted-wire forms + return any( + name == p or name.startswith(p + "-") or name.startswith(p + ".") + for p in _REASONING_MODEL_PREFIXES + ) + + def load_openai_model( model_name: str, temperature: float, @@ -264,15 +294,25 @@ def load_openai_model( logger.info( "Using Argo user from config/ARGO_USER/default: %s", argo_user ) + if is_reasoning_model(model_name): + # Reasoning models (GPT-5*, o-series) reject any non- + # default temperature + sampling knobs. Drop them at + # construction time so the request payload matches what + # the model accepts. 
+ for k in ("temperature", "top_p", "frequency_penalty", "presence_penalty"): + llm_kwargs.pop(k, None) llm = ChatOpenAI(**llm_kwargs) else: logger.info(f"Loading OpenAI model: {model_name}") - llm = ChatOpenAI( + openai_kwargs = dict( model=model_name, temperature=temperature, api_key=api_key, max_tokens=6000, ) + if is_reasoning_model(model_name): + openai_kwargs.pop("temperature", None) + llm = ChatOpenAI(**openai_kwargs) # Authentication happens only during invocation. logger.info(f"Requested model: {model_name}") logger.info("OpenAI model loaded successfully") From 547e64725f8d2f794f320e96738792f92d025fc5 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 11:35:15 -0500 Subject: [PATCH 18/25] feat(academy/campaigns): federated-chat multi-turn cross-HPC demo campaign Federated-hello produced 2-3 events per site and the demo was over before the dashboard could render anything. Federated-chat is a back-and-forth counter game between agent-aurora and agent-crux: each turn one agent increments a counter and sends to the peer, until the counter hits 10. ~6 reasoning rounds per agent = ~40 events total in the merged dashboard timeline, plus visible message-flow with cross-site Route labels. Same two-agent shape as federated-hello so the same operator runbook works -- only --campaign federated-chat changes. Registered under 'federated-chat' name with max_decisions=20 slack for retries. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/campaigns/__init__.py | 13 ++++++ .../campaigns/federated-chat/campaign.jsonc | 45 +++++++++++++++++++ .../campaigns/federated-chat/lm_config.json | 11 +++++ .../prompt_profiles/default.json | 12 +++++ 4 files changed, 81 insertions(+) create mode 100644 src/chemgraph/academy/campaigns/federated-chat/campaign.jsonc create mode 100644 src/chemgraph/academy/campaigns/federated-chat/lm_config.json create mode 100644 src/chemgraph/academy/campaigns/federated-chat/prompt_profiles/default.json diff --git a/src/chemgraph/academy/campaigns/__init__.py b/src/chemgraph/academy/campaigns/__init__.py index 9948bf8..7837691 100644 --- a/src/chemgraph/academy/campaigns/__init__.py +++ b/src/chemgraph/academy/campaigns/__init__.py @@ -7,15 +7,18 @@ EXAMPLE_002 = 'example-002-mace-ensemble-screening' FEDERATED_HELLO = 'federated-hello' +FEDERATED_CHAT = 'federated-chat' CAMPAIGNS = { 'mace-ensemble-screening-20': f'{EXAMPLE_002}/campaign.jsonc', 'federated-hello': f'{FEDERATED_HELLO}/campaign.jsonc', + 'federated-chat': f'{FEDERATED_CHAT}/campaign.jsonc', } LM_CONFIG_TEMPLATES = { 'argo-gpt54-mace-template': f'{EXAMPLE_002}/lm_config.json', 'argo-gpt5mini-federated-hello': f'{FEDERATED_HELLO}/lm_config.json', + 'argo-gpt5mini-federated-chat': f'{FEDERATED_CHAT}/lm_config.json', } @@ -46,6 +49,16 @@ class CampaignLaunchDefaults: agents_per_node=1, max_decisions=4, ), + # Multi-turn cross-HPC counter chat. ~10 send/receive round-trips + # so the dashboard has actual material to render. Each agent runs + # ~6 reasoning rounds (send, receive, send, ..., reach 10, + # finish_turn). max_decisions=20 gives slack for retries. 
+ 'federated-chat': CampaignLaunchDefaults( + lm_config_template='argo-gpt5mini-federated-chat', + agent_count=2, + agents_per_node=1, + max_decisions=20, + ), } diff --git a/src/chemgraph/academy/campaigns/federated-chat/campaign.jsonc b/src/chemgraph/academy/campaigns/federated-chat/campaign.jsonc new file mode 100644 index 0000000..622510e --- /dev/null +++ b/src/chemgraph/academy/campaigns/federated-chat/campaign.jsonc @@ -0,0 +1,45 @@ +{ + // --------------------------------------------------------------------- + // Federated-chat: a multi-turn cross-HPC conversation. Two agents + // bounce a counter back and forth N times, each incrementing it, + // until the counter hits a threshold and they finish_turn. Designed + // for demos -- produces ~40 events on the dashboard (decisions, + // send_message, message_received) so the federated UI (Sites + // header, message-flow edge, cross-site Route label) has real + // material to render. + // + // Run as: + // chemgraph academy dashboard -- federated-chat-XXX \ + // --system aurora,crux --campaign federated-chat --overwrite-run + // chemgraph academy spawn-site -- --system aurora --run-id ... \ + // --campaign federated-chat --agents agent-aurora --exchange-type http + // chemgraph academy spawn-site -- --system crux --run-id ... \ + // --campaign federated-chat --agents agent-crux --exchange-type http + // chemgraph academy bootstrap -- --campaign federated-chat \ + // --run-id ... --exchange-type http + // --------------------------------------------------------------------- + "run_id": "federated-chat", + "user_task": "Federated counter chat: bounce an integer counter between agent-aurora and agent-crux, each incrementing it by 1, until it reaches 10. 
Then both finish_turn.", + "prompt_profile": "prompt_profiles/default.json", + "initial_agent": "agent-aurora", + "resources": {}, + "mcp_servers": [], + "agents": [ + { + "name": "agent-aurora", + "role": "FederatedCounterInitiator", + "mission": "You are agent-aurora, running on the Aurora HPC. You are playing a counter-bouncing game with agent-crux across the HPC boundary. Rules: (1) On the bootstrap round, send EXACTLY ONE message to agent-crux with content 'counter=1' and tldr 'counter=1'. Set reply_requested=true. Then call finish_turn. (2) On every subsequent round where you receive a message from agent-crux containing 'counter=N', if N < 10 then send EXACTLY ONE reply to agent-crux with content 'counter=N+1' (you compute N+1 yourself, e.g. counter=3 if you received counter=2) and tldr 'counter=N+1', reply_requested=true, then finish_turn. If N >= 10, send NOTHING and just call finish_turn -- the game is over. (3) NEVER send more than one message per round. (4) NEVER initiate a new chain; only reply when a peer message arrives.", + "allowed_peers": ["agent-crux"], + "mcp_servers": [], + "resources": [] + }, + { + "name": "agent-crux", + "role": "FederatedCounterResponder", + "mission": "You are agent-crux, running on the Crux HPC. You are playing a counter-bouncing game with agent-aurora across the HPC boundary. Rules: (1) You NEVER initiate a message; you only ever reply. (2) On every round where you receive a message from agent-aurora containing 'counter=N', if N < 10 then send EXACTLY ONE reply to agent-aurora with content 'counter=N+1' (you compute N+1 yourself) and tldr 'counter=N+1', reply_requested=true, then finish_turn. If N >= 10, send NOTHING and just call finish_turn -- the game is over. 
(3) NEVER send more than one message per round.", + "allowed_peers": ["agent-aurora"], + "mcp_servers": [], + "resources": [] + } + ] +} diff --git a/src/chemgraph/academy/campaigns/federated-chat/lm_config.json b/src/chemgraph/academy/campaigns/federated-chat/lm_config.json new file mode 100644 index 0000000..20c4b7f --- /dev/null +++ b/src/chemgraph/academy/campaigns/federated-chat/lm_config.json @@ -0,0 +1,11 @@ +{ + "provider": "openai_compatible_tools", + "base_url": "http://:18186/argoapi/v1", + "model": "GPT-5-mini", + "api_key": "dummy", + "user": "", + "timeout_s": 180, + "max_tokens": 4096, + "max_retries": 3, + "retry_delay_s": 2 +} diff --git a/src/chemgraph/academy/campaigns/federated-chat/prompt_profiles/default.json b/src/chemgraph/academy/campaigns/federated-chat/prompt_profiles/default.json new file mode 100644 index 0000000..ccbe79f --- /dev/null +++ b/src/chemgraph/academy/campaigns/federated-chat/prompt_profiles/default.json @@ -0,0 +1,12 @@ +{ + "prompt_version": "federated-chat-v1", + "prompt_style": "json_state", + "system_prompt": "You are a persistent ChemGraph-style LM agent hosted inside an Academy daemon on HPC. You communicate with peers ONLY through send_message. This campaign has NO science tools; your only useful actions are send_message and finish_turn. Follow your mission literally; do not invent additional work. The campaign has a clear termination condition (counter reaches 10); when reached, call finish_turn and STOP.", + "protocol_prompt": "Return one or more tool calls. If no action is useful, call finish_turn. Every send_message call must include tldr: one short line for the dashboard. Set reply_requested=true when the peer should answer, otherwise false. Keep arguments concise. Per your mission: send AT MOST ONE message per round. The counter you receive looks like 'counter=N'; parse N, compute N+1, send 'counter=N+1' as both content and tldr. 
When N>=10 the game is over -- send nothing, just finish_turn.", + "langchain_recursion_limit": 32, + "state_limits": { + "received_messages_last_n": 8, + "tool_results_last_n": 4, + "actions_last_n": 8 + } +} From 5856e7dd00b5d6dd23cc28f1aa5e3677c476d4ff Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 14:20:37 -0500 Subject: [PATCH 19/25] fix(academy/dashboard): render federated runs by merging per-site status to top level The server returns per-site state nested under snapshot.sites[].status / .placement in federated mode, but agents() reads snapshot.status?.agents, so the dashboard rendered an empty graph + zeroed metrics for federated runs even though events were streaming through correctly. Synthesize merged top-level status/placement during load() so every existing single-site reader (agents, metrics, workflow mode detection) works unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/dashboard/static/app.js | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/src/chemgraph/academy/dashboard/static/app.js b/src/chemgraph/academy/dashboard/static/app.js index f821e63..45bed8c 100644 --- a/src/chemgraph/academy/dashboard/static/app.js +++ b/src/chemgraph/academy/dashboard/static/app.js @@ -99,17 +99,49 @@ : []; nextSnapshot.sitesByAgent = {}; if (nextSnapshot.federated) { + // Merge per-site status/placement/summary up to the top level so + // the rest of the app (agents(), renderMetrics(), workflow code) + // can read snapshot.status / snapshot.placement exactly like in + // single-site mode. Without this merge `snapshot.status?.agents` + // is undefined in federated runs and the entire graph + metrics + // panel renders empty even though events stream in. 
+ const mergedAgents = []; + const mergedPlacements = {}; + const seenAgentIds = new Set(); + let mergedSchema = null; + const mergedStatusExtras = {}; for (const [siteName, siteData] of Object.entries(nextSnapshot.sites)) { const agents = (siteData?.status?.agents) || []; agents.forEach(spec => { const agentId = spec.agent_id || spec.agent_name || spec.name; - if (agentId) nextSnapshot.sitesByAgent[agentId] = siteName; + if (agentId) { + nextSnapshot.sitesByAgent[agentId] = siteName; + if (!seenAgentIds.has(agentId)) { + seenAgentIds.add(agentId); + mergedAgents.push({...spec, site: siteName}); + } + } }); const placements = (siteData?.placement?.agents) || {}; - Object.keys(placements).forEach(agentId => { + Object.entries(placements).forEach(([agentId, placement]) => { if (!(agentId in nextSnapshot.sitesByAgent)) { nextSnapshot.sitesByAgent[agentId] = siteName; } + if (!(agentId in mergedPlacements)) { + mergedPlacements[agentId] = placement; + } + }); + // Carry a representative schema + common status scalars so + // isWorkflowMode() and "campaign / mode" lookups behave the + // same as single-site. Last-write-wins is fine here -- all + // sites in a federated run share the same campaign/mode. 
+ if (siteData?.schema) mergedSchema = siteData.schema; + const siteStatus = siteData?.status || {}; + ['campaign', 'campaign_kind', 'mode', 'converged', 'query', + 'workflow_type', 'model_name'].forEach(key => { + if (siteStatus[key] !== undefined && mergedStatusExtras[key] === undefined) { + mergedStatusExtras[key] = siteStatus[key]; + } }); } // Backfill from events too, since the per-event ``site`` tag @@ -119,6 +151,14 @@ nextSnapshot.sitesByAgent[event.agent_id] = event.site; } }); + nextSnapshot.status = { + ...mergedStatusExtras, + agents: mergedAgents, + }; + nextSnapshot.placement = {agents: mergedPlacements}; + if (mergedSchema && !nextSnapshot.schema) { + nextSnapshot.schema = mergedSchema; + } } const nextIdentity = identityForSnapshot(nextSnapshot); const previousEventCount = snapshot?.events?.length || 0; From 9bb441f4eca7a14d0059f73f3c8f17f6edca98b4 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 14:20:46 -0500 Subject: [PATCH 20/25] fix(academy/relay): handle SIGTERM/SIGINT so kill actually releases the port The relay had no signal handlers, so SIGTERM hit the default disposition and bash kill(1) calls were silently ignored. The python kept running, the port stayed bound, and the next launch failed with "Address already in use" -- requiring a manual UAN sweep to recover. Install SIGTERM/SIGINT handlers that close the listen socket and exit cleanly, with try/except around accept() so the close-from-handler path returns instead of tracebacking. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../runtime/templates/uan_http_relay.py | 47 +++++++++++++++---- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/src/chemgraph/academy/runtime/templates/uan_http_relay.py b/src/chemgraph/academy/runtime/templates/uan_http_relay.py index 8ce424f..442492e 100644 --- a/src/chemgraph/academy/runtime/templates/uan_http_relay.py +++ b/src/chemgraph/academy/runtime/templates/uan_http_relay.py @@ -19,7 +19,9 @@ from __future__ import annotations import argparse +import signal import socket +import sys import threading @@ -72,17 +74,36 @@ def parse_args() -> argparse.Namespace: def main() -> int: args = parse_args() - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server: - server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server.bind((args.listen_host, args.listen_port)) - server.listen(128) - print( - f'relay listening on {args.listen_host}:{args.listen_port} ' - f'-> {args.target_host}:{args.target_port}', - flush=True, - ) + server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server.bind((args.listen_host, args.listen_port)) + server.listen(128) + print( + f'relay listening on {args.listen_host}:{args.listen_port} ' + f'-> {args.target_host}:{args.target_port}', + flush=True, + ) + + def shutdown(_signo: int, _frame: object) -> None: + # Closing the listen socket inside the handler interrupts + # server.accept() with EBADF / OSError, which we catch below + # to fall through to a clean exit. Without this the relay + # ignores SIGTERM (default action) and orphans the port. 
+ try: + server.close() + except OSError: + pass + sys.exit(0) + + signal.signal(signal.SIGTERM, shutdown) + signal.signal(signal.SIGINT, shutdown) + + try: while True: - client, addr = server.accept() + try: + client, addr = server.accept() + except OSError: + break print(f'accepted connection from {addr[0]}:{addr[1]}', flush=True) thread = threading.Thread( target=handle_client, @@ -90,6 +111,12 @@ def main() -> int: daemon=True, ) thread.start() + finally: + try: + server.close() + except OSError: + pass + return 0 if __name__ == '__main__': From 767a97e9a9c752ad4285df34163e0a9f152dd168 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 14:21:47 -0500 Subject: [PATCH 21/25] fix(academy/relay): reap stale relays per-UAN before binding, with xtrace and self-kill guard Aurora's login alias round-robins across uan-0001..uan-0010, so a single pid file on the shared FS was meaningless: the pid only exists on the UAN that ran the relay, and the next launch usually lands on a different UAN where that pid is either absent or belongs to someone else. As a result every crashed launch left an orphan relay holding the port, and manual ssh-into-each-UAN cleanup was the only recovery path. Replace the single-pid bookkeeping with per-UAN cleanup that scans ps for python processes whose argv contains the relay script path, owned by $USER, excluding $$ and $PPID. The self-exclusion is load-bearing: pgrep -f matched our own bash script (the relay script path appears in our argv as well), so the previous attempt killed the caller instead of the orphan. Also drop set -e (pgrep returning 1 was triggering silent exits with no log output) and add exec 2>&1 + set -x so the local relay log contains a full trace when something fails -- previously failures produced empty logs and "Local relay log:" with nothing after it.
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../academy/runtime/templates/start_relay.sh | 99 +++++++++++++++++-- 1 file changed, 92 insertions(+), 7 deletions(-) diff --git a/src/chemgraph/academy/runtime/templates/start_relay.sh b/src/chemgraph/academy/runtime/templates/start_relay.sh index 1bb9e5f..2e93356 100644 --- a/src/chemgraph/academy/runtime/templates/start_relay.sh +++ b/src/chemgraph/academy/runtime/templates/start_relay.sh @@ -1,5 +1,5 @@ #!/bin/bash -set -euo pipefail +set -uo pipefail REMOTE_ROOT="$1" RELAY_SCRIPT="$2" @@ -10,16 +10,97 @@ RELAY_PORT="$6" REVERSE_PORT="$7" RELAY_PYTHON="$8" +# Trace every line so the local relay log captures exactly which step +# fails. Without xtrace, a bash failure (e.g. cd, redirect) produces +# zero diagnostics and the launcher just reports "Local relay log: +# ". The output goes through the SSH stdout pipe so it lands +# in the Mac-side relay log -- no remote-side tail needed. +exec 2>&1 +echo "[start_relay] uan=$(hostname -f) port=${RELAY_PORT} reverse=${REVERSE_PORT} pid=$$" +set -x + cd "${REMOTE_ROOT}" UAN_HOST="$(hostname -f)" +UAN_SHORT="$(hostname -s)" printf '%s\n' "${UAN_HOST}" > "${RELAY_HOST_FILE}" -if [ -f "${RELAY_PID_FILE}" ]; then - OLD_PID="$(cat "${RELAY_PID_FILE}" 2>/dev/null || true)" - case "${OLD_PID}" in - ''|*[!0-9]*) ;; - *) kill "${OLD_PID}" 2>/dev/null || true ;; - esac +# Pid bookkeeping has to be per-UAN. Aurora's login alias round-robins +# across uan-0001..uan-0010, the shared filesystem is the same, but a +# pid only means something on the UAN that holds the process. Without +# per-host scoping, the launcher would happily kill the wrong pid (or +# none at all) on a sibling UAN and leave an orphan holding the port. +PER_HOST_PID_FILE="${RELAY_PID_FILE%.pid}.${UAN_SHORT}.pid" + +find_orphan_pids() { + # Match python processes whose first argv after the interpreter is + # the relay script path. 
Using `comm=python` + an argv prefix is far + # more precise than `pgrep -f `, which would also match this + # very bash script (because the path is in OUR argv too) and the + # subsequent `pgrep` invocation itself -- killing them all and + # taking the whole start_relay session down with them. That bug + # produced the silent "log shows kill then nothing" + # failure mode. + local self_pid="$$" + local parent_pid="${PPID:-0}" + ps -u "${USER}" -o pid=,comm=,args= 2>/dev/null \ + | awk -v rs="${RELAY_SCRIPT}" -v me="${self_pid}" -v pp="${parent_pid}" ' + $1 == me || $1 == pp { next } + $2 ~ /python/ { + for (i = 3; i <= NF; i++) if ($i == rs) { print $1; next } + } + ' +} + +kill_local_orphans() { + # Kill prior relay processes on THIS UAN. Scope: only python + # processes that have the relay script as an argv element, owned + # by us, excluding our own pid/ppid. + local pids + pids="$(find_orphan_pids)" + if [ -n "${pids}" ]; then + echo "[start_relay] killing prior relay pids on $(hostname -s): ${pids}" + # shellcheck disable=SC2086 + kill ${pids} 2>/dev/null || true + sleep 1 + pids="$(find_orphan_pids)" + if [ -n "${pids}" ]; then + echo "[start_relay] forcing kill -9 on stubborn pids: ${pids}" + # shellcheck disable=SC2086 + kill -9 ${pids} 2>/dev/null || true + sleep 1 + fi + fi + # Also try the per-host pid file in case the process was renamed or + # something matched a previous launch's bookkeeping that the ps + # scan didn't see. Best-effort. + if [ -f "${PER_HOST_PID_FILE}" ]; then + local old_pid + old_pid="$(cat "${PER_HOST_PID_FILE}" 2>/dev/null || true)" + case "${old_pid}" in + ''|*[!0-9]*) ;; + *) + # Don't kill ourselves or our parent even if a stale file + # happens to record our pid (shouldn't happen, but cheap). 
+ if [ "${old_pid}" != "$$" ] && [ "${old_pid}" != "${PPID:-0}" ]; then + kill "${old_pid}" 2>/dev/null || true + fi + ;; + esac + fi +} + +kill_local_orphans + +# After reaping local orphans, fail fast and clearly if the port is +# still held -- it means another user (or another UAN's process via +# some unusual route) owns it and we can't take it over. +if command -v ss >/dev/null 2>&1; then + if ss -tln 2>/dev/null | awk '{print $4}' | grep -qE "[:.]${RELAY_PORT}\$"; then + echo "ERROR: port ${RELAY_PORT} on ${UAN_SHORT} is still in use after" >&2 + echo " cleaning up our own relays. Inspect with: ss -tlnp | grep ${RELAY_PORT}" >&2 + ss -tlnp 2>/dev/null | grep -E "[:.]${RELAY_PORT}\\b" >&2 || true + exit 1 + fi fi "${RELAY_PYTHON}" "${RELAY_SCRIPT}" \ @@ -29,10 +110,14 @@ fi --target-port "${REVERSE_PORT}" \ > "${RELAY_LOG_FILE}" 2>&1 & RELAY_PID="$!" +printf '%s\n' "${RELAY_PID}" > "${PER_HOST_PID_FILE}" +# Also write the legacy pid path so older launchers / debugging scripts +# that look for the bare uan-relay-.pid see *something* sensible. printf '%s\n' "${RELAY_PID}" > "${RELAY_PID_FILE}" cleanup_remote() { kill "${RELAY_PID}" 2>/dev/null || true + rm -f "${PER_HOST_PID_FILE}" 2>/dev/null || true } trap cleanup_remote EXIT From b51f1891f71f86fa3660cfd7ceff986d73864704 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Fri, 19 Jun 2026 15:30:31 -0500 Subject: [PATCH 22/25] chore(academy/campaigns): drop federated-hello, superseded by federated-chat federated-chat covers the same Aurora<->Crux federation smoke path with more dashboard material to render, so the hello campaign is dead weight. Remove the campaign files + registry entries, and re-point the validate_campaign(federated=True) regression test + registration.py docstring example to federated-chat. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/campaigns/__init__.py | 13 ------ .../campaigns/federated-hello/campaign.jsonc | 45 ------------------- .../campaigns/federated-hello/lm_config.json | 11 ----- .../prompt_profiles/default.json | 12 ----- src/chemgraph/academy/runtime/registration.py | 6 +-- tests/test_academy_campaign.py | 2 +- 6 files changed, 4 insertions(+), 85 deletions(-) delete mode 100644 src/chemgraph/academy/campaigns/federated-hello/campaign.jsonc delete mode 100644 src/chemgraph/academy/campaigns/federated-hello/lm_config.json delete mode 100644 src/chemgraph/academy/campaigns/federated-hello/prompt_profiles/default.json diff --git a/src/chemgraph/academy/campaigns/__init__.py b/src/chemgraph/academy/campaigns/__init__.py index 7837691..f3a003a 100644 --- a/src/chemgraph/academy/campaigns/__init__.py +++ b/src/chemgraph/academy/campaigns/__init__.py @@ -6,18 +6,15 @@ EXAMPLE_002 = 'example-002-mace-ensemble-screening' -FEDERATED_HELLO = 'federated-hello' FEDERATED_CHAT = 'federated-chat' CAMPAIGNS = { 'mace-ensemble-screening-20': f'{EXAMPLE_002}/campaign.jsonc', - 'federated-hello': f'{FEDERATED_HELLO}/campaign.jsonc', 'federated-chat': f'{FEDERATED_CHAT}/campaign.jsonc', } LM_CONFIG_TEMPLATES = { 'argo-gpt54-mace-template': f'{EXAMPLE_002}/lm_config.json', - 'argo-gpt5mini-federated-hello': f'{FEDERATED_HELLO}/lm_config.json', 'argo-gpt5mini-federated-chat': f'{FEDERATED_CHAT}/lm_config.json', } @@ -39,16 +36,6 @@ class CampaignLaunchDefaults: agents_per_node=1, max_decisions=24, ), - # Two-agent federated hello-world. Per-site spawn-site invocations - # override --agent-count from the slice length, so the value below - # is only the "ran via run-compute" single-machine fallback (which - # nobody should actually do for this campaign). 
- 'federated-hello': CampaignLaunchDefaults( - lm_config_template='argo-gpt5mini-federated-hello', - agent_count=2, - agents_per_node=1, - max_decisions=4, - ), # Multi-turn cross-HPC counter chat. ~10 send/receive round-trips # so the dashboard has actual material to render. Each agent runs # ~6 reasoning rounds (send, receive, send, ..., reach 10, diff --git a/src/chemgraph/academy/campaigns/federated-hello/campaign.jsonc b/src/chemgraph/academy/campaigns/federated-hello/campaign.jsonc deleted file mode 100644 index f22b31e..0000000 --- a/src/chemgraph/academy/campaigns/federated-hello/campaign.jsonc +++ /dev/null @@ -1,45 +0,0 @@ -{ - // --------------------------------------------------------------------- - // Federated-hello: the smallest possible cross-HPC ChemGraph Academy - // campaign. Two agents, no science tools, no MCP servers. Each agent - // is intended to live on a DIFFERENT machine (one on Aurora, one on - // Crux). The campaign proves the federation story works end-to-end: - // * each site's spawn-site registers its slice on the hosted - // exchange, - // * cross-site peer discovery via transport.discover() succeeds, - // * a real LM-driven send_message round-trip crosses the HPC - // boundary through the public Academy exchange. 
- // Run as: - // chemgraph academy dashboard -- demo --system aurora,crux \ - // --campaign federated-hello - // chemgraph academy spawn-site -- --system aurora \ - // --campaign federated-hello --agents agent-aurora --exchange-type http - // chemgraph academy spawn-site -- --system crux \ - // --campaign federated-hello --agents agent-crux --exchange-type http - // chemgraph academy bootstrap -- --campaign federated-hello --exchange-type http - // --------------------------------------------------------------------- - "run_id": "federated-hello", - "user_task": "Federated hello: greet your peer once across the HPC boundary.", - "prompt_profile": "prompt_profiles/default.json", - "initial_agent": "agent-aurora", - "resources": {}, - "mcp_servers": [], - "agents": [ - { - "name": "agent-aurora", - "role": "FederatedHelloInitiator", - "mission": "You are agent-aurora, running on the Aurora HPC. On your FIRST decision round (when you receive the campaign bootstrap), send EXACTLY ONE message to agent-crux saying 'hello from aurora' with tldr 'hello'. Set reply_requested=true. Then call finish_turn. On subsequent rounds, if you have received a reply from agent-crux acknowledging your hello, do not send anything further -- just call finish_turn. NEVER send more than one outgoing hello.", - "allowed_peers": ["agent-crux"], - "mcp_servers": [], - "resources": [] - }, - { - "name": "agent-crux", - "role": "FederatedHelloResponder", - "mission": "You are agent-crux, running on the Crux HPC. You are waiting for one incoming message from agent-aurora. When you receive it, send EXACTLY ONE reply back to agent-aurora saying 'hello from crux, received your message' with tldr 'ack'. Set reply_requested=false. Then call finish_turn. 
NEVER initiate a message on your own; you only ever reply.", - "allowed_peers": ["agent-aurora"], - "mcp_servers": [], - "resources": [] - } - ] -} diff --git a/src/chemgraph/academy/campaigns/federated-hello/lm_config.json b/src/chemgraph/academy/campaigns/federated-hello/lm_config.json deleted file mode 100644 index 20c4b7f..0000000 --- a/src/chemgraph/academy/campaigns/federated-hello/lm_config.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "provider": "openai_compatible_tools", - "base_url": "http://:18186/argoapi/v1", - "model": "GPT-5-mini", - "api_key": "dummy", - "user": "", - "timeout_s": 180, - "max_tokens": 4096, - "max_retries": 3, - "retry_delay_s": 2 -} diff --git a/src/chemgraph/academy/campaigns/federated-hello/prompt_profiles/default.json b/src/chemgraph/academy/campaigns/federated-hello/prompt_profiles/default.json deleted file mode 100644 index 68d3059..0000000 --- a/src/chemgraph/academy/campaigns/federated-hello/prompt_profiles/default.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "prompt_version": "federated-hello-v1", - "prompt_style": "json_state", - "system_prompt": "You are a persistent ChemGraph-style LM agent hosted inside an Academy daemon on HPC. You communicate with peers ONLY through send_message. This campaign has NO science tools; your only useful actions are send_message and finish_turn. Follow your mission literally; do not invent additional work.", - "protocol_prompt": "Return one or more tool calls. If no action is useful, call finish_turn. Every send_message call must include tldr: one short line for the dashboard. Set reply_requested=true when the peer should answer, otherwise false. Keep arguments concise. 
Do NOT loop -- if you already sent your one hello (initiator) or already sent your one ack (responder), call finish_turn immediately on every subsequent round.", - "langchain_recursion_limit": 32, - "state_limits": { - "received_messages_last_n": 8, - "tool_results_last_n": 4, - "actions_last_n": 8 - } -} diff --git a/src/chemgraph/academy/runtime/registration.py b/src/chemgraph/academy/runtime/registration.py index ec9b241..638a24c 100644 --- a/src/chemgraph/academy/runtime/registration.py +++ b/src/chemgraph/academy/runtime/registration.py @@ -17,11 +17,11 @@ can wait until its peers have actually registered before proceeding. Side effect of this scheme: agent names become campaign-scoped. Two -operators running the SAME ``federated-hello`` campaign concurrently +operators running the SAME ``federated-chat`` campaign concurrently would clash on the same UIDs and crash the registration POST with "mailbox already exists". The run-id is part of the UID namespace, -so as long as operators bump ``--run-id`` (federated-hello-001 vs -federated-hello-002) the UIDs differ and the campaigns don't see +so as long as operators bump ``--run-id`` (federated-chat-001 vs +federated-chat-002) the UIDs differ and the campaigns don't see each other. """ from __future__ import annotations diff --git a/tests/test_academy_campaign.py b/tests/test_academy_campaign.py index 0165df1..af1218c 100644 --- a/tests/test_academy_campaign.py +++ b/tests/test_academy_campaign.py @@ -432,7 +432,7 @@ def test_validate_campaign_federated_loosens_cross_site_peer_check() -> None: from chemgraph.academy.core.campaign import ( filter_agents, load_campaign, validate_campaign, ) - campaign = load_campaign("federated-hello") + campaign = load_campaign("federated-chat") slice_aurora = filter_agents(campaign, ["agent-aurora"]) # Strict validation rejects the cross-site peer reference. 
From b3b34bf8a5d78c46f6a5557820b8d84e4179ff03 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Mon, 22 Jun 2026 12:38:39 -0500 Subject: [PATCH 23/25] fix(academy/compute): default --lm-user from $ARGO_USER + reject placeholder Two related ergonomics fixes for spawn-site / run-compute launches: * When --lm-user is omitted, fall back to $ARGO_USER from the env. HPC users already export ARGO_USER for the rest of the ChemGraph workflow, so requiring a duplicate --lm-user flag was busywork. * _write_lm_config now refuses to ship lm_config.json with the template's literal "" placeholder. Argo would otherwise silently accept the launch and only reject at first LM call time, after the daemon + relay stack was already running -- expensive to debug. The hard error names the fix directly. Tests: stub HttpExchangeFactory in test_academy_exchange_registration so the http-dispatch tests don't try to authenticate against the real hosted exchange. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../academy/runtime/compute_launcher.py | 20 +++++++++++++++- tests/test_academy_exchange_registration.py | 24 +++++++++++++++++-- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/src/chemgraph/academy/runtime/compute_launcher.py b/src/chemgraph/academy/runtime/compute_launcher.py index 2d20e4a..6f0a0cc 100644 --- a/src/chemgraph/academy/runtime/compute_launcher.py +++ b/src/chemgraph/academy/runtime/compute_launcher.py @@ -244,6 +244,18 @@ def _write_lm_config( if max_tokens is not None: data["max_tokens"] = max_tokens + # Refuse to ship a config whose `user` is still the template + # placeholder. Argo rejects requests with an unknown user, but + # only at first-call time on the compute node, after the whole + # daemon + relay stack is already running -- expensive to debug. + # Fail here instead, with a message pointing at the fix. + if data.get("user") in (None, "", ""): + raise RuntimeError( + f"lm_config.json was written with user={data.get('user')!r}. 
" + "Pass --lm-user or export ARGO_USER " + "before launching spawn-site / run-compute." + ) + path = run_dir / "lm_config.json" path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8") return path @@ -288,12 +300,18 @@ def prepare_compute_launch(args: argparse.Namespace) -> AllocationPlan: profile=profile, metadata=metadata, ) + # Default lm-user from $ARGO_USER so HPC users who already export it + # for their normal ChemGraph workflow don't have to pass --lm-user + # again. The packaged template ships with a literal "" + # placeholder so it can't accidentally point at someone else's + # Argo account; _write_lm_config refuses to keep that placeholder. + lm_user = args.lm_user or os.environ.get("ARGO_USER") lm_config = _write_lm_config( run_dir=run_dir, template_name=defaults.lm_config_template, base_url=lm_base_url, lm_model=args.lm_model, - lm_user=args.lm_user, + lm_user=lm_user, max_tokens=args.max_tokens, ) _export_workflow_lm_environment(lm_config) diff --git a/tests/test_academy_exchange_registration.py b/tests/test_academy_exchange_registration.py index eb87a44..582706a 100644 --- a/tests/test_academy_exchange_registration.py +++ b/tests/test_academy_exchange_registration.py @@ -2,6 +2,7 @@ import asyncio from pathlib import Path +from types import SimpleNamespace from typing import Any import pytest @@ -11,6 +12,7 @@ pytest.importorskip("academy") from academy.identifier import AgentId +from academy.exchange.cloud.client import DEFAULT_EXCHANGE_URL from chemgraph.academy.core.campaign import ChemGraphDaemonConfig from chemgraph.academy.runtime.exchange import build_exchange_factory @@ -50,6 +52,18 @@ def _config( ) +class HttpExchangeFactory: + def __init__(self, url: str = DEFAULT_EXCHANGE_URL, **kwargs: Any) -> None: + self._info = SimpleNamespace(url=url) + self.kwargs = kwargs + + +def _stub_http_exchange_factory(monkeypatch: pytest.MonkeyPatch) -> None: + import academy.exchange.cloud as cloud + + monkeypatch.setattr(cloud, 
'HttpExchangeFactory', HttpExchangeFactory) + + # --------------------------------------------------------------------------- # Exchange factory dispatch # --------------------------------------------------------------------------- @@ -68,7 +82,11 @@ def test_build_exchange_factory_dispatches_by_config( tmp_path, exchange_type, expected_class, + monkeypatch, ) -> None: + if exchange_type == 'http': + _stub_http_exchange_factory(monkeypatch) + factory = build_exchange_factory(_config(tmp_path, exchange_type)) assert type(factory).__name__ == expected_class @@ -99,12 +117,13 @@ def test_exchange_uses_redis_helper_matches_dispatch_table() -> None: def test_http_exchange_factory_uses_hosted_default_when_url_omitted( tmp_path, + monkeypatch, ) -> None: """A ``None`` ``http_exchange_url`` must select Academy's hosted default (https://exchange.academy-agents.org/v1). This is the path every cross-HPC campaign takes unless the operator stands up a self-hosted exchange.""" - from academy.exchange.cloud.client import DEFAULT_EXCHANGE_URL + _stub_http_exchange_factory(monkeypatch) factory = build_exchange_factory(_config(tmp_path, 'http')) # Upstream stores connection details on factory._info; reach into @@ -112,11 +131,12 @@ def test_http_exchange_factory_uses_hosted_default_when_url_omitted( assert factory._info.url == DEFAULT_EXCHANGE_URL -def test_http_exchange_factory_honors_custom_url(tmp_path) -> None: +def test_http_exchange_factory_honors_custom_url(tmp_path, monkeypatch) -> None: """Operators must be able to point at a self-hosted HTTP exchange server (``python -m academy.exchange.cloud``). 
This is the escape hatch when the public Academy server is unavailable or undesired.""" custom = 'https://my-private-exchange.example.com/v1' + _stub_http_exchange_factory(monkeypatch) factory = build_exchange_factory( _config(tmp_path, 'http', http_exchange_url=custom), ) From 6e0198ba8f4480be814f80e8004d5bb8ee211bdb Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Mon, 22 Jun 2026 13:12:36 -0500 Subject: [PATCH 24/25] docs(academy): federated-chat e2e guide + README Four-terminal walkthrough for the cross-HPC demo: dashboard on Mac + spawn-site on Aurora + spawn-site on Crux + bootstrap kickoff. Mirrors the existing example-002 guide's shape, swaps in the federated flow (deterministic peer UIDs, HTTP exchange, ALCF proxy passthrough, Globus device-flow login). README links to the e2e guide and points at the packaged campaign location. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/academy/federated-chat/README.md | 25 +++ examples/academy/federated-chat/e2e_guide.md | 179 +++++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 examples/academy/federated-chat/README.md create mode 100644 examples/academy/federated-chat/e2e_guide.md diff --git a/examples/academy/federated-chat/README.md b/examples/academy/federated-chat/README.md new file mode 100644 index 0000000..134588f --- /dev/null +++ b/examples/academy/federated-chat/README.md @@ -0,0 +1,25 @@ +# Federated-Chat + +Two ChemGraph Academy logical agents running on **different HPCs** +(Aurora and Crux), discovering each other through Academy's hosted +HTTP exchange and exchanging messages across the HPC boundary: + +```text +agent-aurora (running on Aurora) +agent-crux (running on Crux) +``` + +The agents play a counter-bouncing game: agent-aurora sends `counter=1` +to agent-crux, agent-crux replies `counter=2`, and so on until the +counter reaches 10. 
Tiny on purpose — exercises the federated stack +(deterministic peer UIDs, HTTP exchange, multi-site dashboard) without +needing any science tools. + +The campaign assets are packaged under: + +```text +src/chemgraph/academy/campaigns/federated-chat/ +``` + +See [`e2e_guide.md`](e2e_guide.md) for the full four-terminal walkthrough +(dashboard + Aurora compute + Crux compute + bootstrap kickoff). diff --git a/examples/academy/federated-chat/e2e_guide.md b/examples/academy/federated-chat/e2e_guide.md new file mode 100644 index 0000000..1e912b4 --- /dev/null +++ b/examples/academy/federated-chat/e2e_guide.md @@ -0,0 +1,179 @@ +# Federated-Chat E2E Guide + +This guide runs the `federated-chat` ChemGraph Academy campaign across +**two HPCs simultaneously** (Aurora and Crux), with the dashboard on +your laptop merging both sites into one view. + +The campaign is intentionally minimal: two agents bounce a counter back +and forth across the HPC boundary, each incrementing it, until it hits 10. +It exercises every part of the cross-HPC stack (deterministic peer +discovery, HTTP exchange, cross-site send_message, multi-site dashboard) +without needing any science tools. + +```text +agent-aurora agent-crux + ↓ counter=1 ──► receive + receive ◄── counter=2 ↓ + ↓ counter=3 ──► receive + ... ... + receive ◄── counter=10 ↓ + finish_turn finish_turn +``` + +Four terminals: dashboard (Mac), Aurora compute, Crux compute, bootstrap (Mac). + +## Configure Paths + +Set these in every shell (Mac + both HPCs): + +```bash +export ALCF_PROJECT=ChemGraph +export ALCF_USER= # e.g. jinchu +export ALCF_SSH_USER= # may differ, e.g. jinchuli +export ARGO_USER= # e.g. jinchu.li +export LOCAL_CHEMGRAPH= +``` + +`ALCF_USER` is the shared-filesystem path component (`/flare/$ALCF_PROJECT/$ALCF_USER`). +`ALCF_SSH_USER` is the SSH login. They may differ; the loader defaults +`ALCF_SSH_USER` to `ALCF_USER` if you don't set it. 
+ +## One-Time Setup + +You need the same setup as `example-002-mace-ensemble-screening` (sync +ChemGraph, install `[academy]` extra, build Redis once) on **both** Aurora +and Crux. Plus one extra step: log in to Academy's hosted exchange so the +Globus token is cached on both compute environments: + +```bash +# On Aurora compute (inside an interactive allocation): +python -c "from academy.exchange.cloud import HttpExchangeFactory; HttpExchangeFactory()" +# Follow the device-flow URL printed in the terminal. Same on Crux. +``` + +The token is written to `~/.local/share/academy/storage.db` and is +shared across runs. + +## Terminal 1: Dashboard (Mac) + +```bash +cd "$LOCAL_CHEMGRAPH" + +export RUN_ID=federated-chat-001 + +chemgraph academy dashboard -- "$RUN_ID" \ + --system aurora,crux \ + --campaign federated-chat \ + --reverse-port 18190 \ + --overwrite-run +``` + +This brings up: + +- one SSH ControlMaster + UAN relay + rsync mirror **per site** (`aurora` and `crux`), +- a single merged dashboard server at `http://127.0.0.1:8765`. + +Wait for both relays to print `... relay ready at ...` before continuing. Export the same `RUN_ID` in Terminals 2, 3, and 4 as well; each of their commands passes `--run-id "$RUN_ID"`. + +## Terminal 2: Aurora compute (inside Aurora PBS allocation) + +```bash +module load frameworks +source /flare/$ALCF_PROJECT/$ALCF_USER/venvs/academy-swarm/bin/activate +export PATH=/flare/$ALCF_PROJECT/$ALCF_USER/bin:$PATH + +# HTTP exchange must reach exchange.academy-agents.org through the ALCF proxy. +export http_proxy=http://proxy.alcf.anl.gov:3128 +export https_proxy=http://proxy.alcf.anl.gov:3128 +export HTTP_PROXY=$http_proxy +export HTTPS_PROXY=$https_proxy +export no_proxy=127.0.0.1,localhost,.alcf.anl.gov +export NO_PROXY=$no_proxy + +chemgraph academy spawn-site -- \ + --system aurora \ + --run-id "$RUN_ID" \ + --campaign federated-chat \ + --agents agent-aurora \ + --exchange-type http +``` + +Look for the lifecycle landmarks: + +```text +[daemon] rank0 registered 'agent-aurora' on the exchange (uid=...) 
+[daemon] rank0 waiting for peers ['agent-crux'] to come online (timeout 600s)... +[daemon] rank0 all 1 peer(s) are alive: ['agent-crux'] +[daemon] rank0 agent 'agent-aurora' is now running inside Academy Runtime +[daemon] rank0 skipping inline bootstrap (federated mode); waiting for 'chemgraph academy bootstrap'... +``` + +## Terminal 3: Crux compute (inside Crux PBS allocation) + +```bash +source /eagle/$ALCF_PROJECT/$ALCF_USER/venvs/academy-swarm-crux/bin/activate +export PATH=/eagle/$ALCF_PROJECT/$ALCF_USER/bin:$PATH + +export http_proxy=http://proxy.alcf.anl.gov:3128 +export https_proxy=http://proxy.alcf.anl.gov:3128 +export HTTP_PROXY=$http_proxy +export HTTPS_PROXY=$https_proxy +export no_proxy=127.0.0.1,localhost,.alcf.anl.gov +export NO_PROXY=$no_proxy + +chemgraph academy spawn-site -- \ + --system crux \ + --run-id "$RUN_ID" \ + --campaign federated-chat \ + --agents agent-crux \ + --exchange-type http +``` + +Same landmarks. Both daemons will block at `waiting for 'chemgraph academy +bootstrap'` once they've discovered each other. + +## Terminal 4: Bootstrap kickoff (Mac, once both sites are waiting) + +```bash +chemgraph academy bootstrap -- \ + --campaign federated-chat \ + --run-id "$RUN_ID" \ + --exchange-type http +``` + +Prints `ok: sent bootstrap to agent-aurora (message_id=...)`. + +## What You Should See + +- **Aurora terminal**: `[agent agent-aurora] first message arrived from + 'campaign' ...`, then decisions firing, then `message_sent` to agent-crux. +- **Crux terminal**: `[agent agent-crux] first message arrived from + 'agent-aurora' ...`, then back-and-forth. +- **Dashboard**: agent nodes appear in the graph, metrics tick up, the + cross-site message-flow edge between aurora and crux fills in, counter + messages climb in the activity log from 1 → 10. + +## Troubleshooting + +**Both sides stuck at `waiting for peers` past ~60s** → one site isn't +actually registered. Check each compute terminal for the `registered` line. 
+If one is missing, the daemon hit an exception before registration; scroll +up. + +**`Address already in use` on relay startup** → a prior crashed launch +left an orphan. The new self-cleaning relay should handle it +automatically; if it doesn't, the local relay log under +`/tmp/chemgraph-academy---relay.log` will have a full `set +-x` trace showing exactly which step failed. + +**Bootstrap times out** → both sites must already be at `waiting for +'chemgraph academy bootstrap'`. If only one is up, bootstrap can't find +the recipient. + +**Argo `` error** → you didn't export `ARGO_USER` before +launching spawn-site. The launcher refuses to ship a config with the +template placeholder; the error message names the fix. + +**`Could not validate Globus token`** → the device-flow login expired. +Re-run the `python -c "from academy.exchange.cloud ..."` snippet from +the one-time setup section. From 1af22be1a6c07c229a4d1133d5d430d441d5fe00 Mon Sep 17 00:00:00 2001 From: Jinchu Li Date: Mon, 22 Jun 2026 14:08:02 -0500 Subject: [PATCH 25/25] style(academy): fix ruff lint violations introduced by this branch * dashboard_launcher.py: split `import os, shlex, ...` onto separate lines (E401). * mpi.py: drop unused `write_json_atomic` import (F401), left over from the file-based registration scheme that was deleted in 52fa7b5. Pre-existing ruff failures elsewhere in the repo (parsl_tools, mace_mcp_parsl, etc.) are not from this PR and untouched. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chemgraph/academy/runtime/dashboard_launcher.py | 7 ++++++- src/chemgraph/academy/runtime/mpi.py | 1 - 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/chemgraph/academy/runtime/dashboard_launcher.py b/src/chemgraph/academy/runtime/dashboard_launcher.py index ef474bf..6a28d17 100644 --- a/src/chemgraph/academy/runtime/dashboard_launcher.py +++ b/src/chemgraph/academy/runtime/dashboard_launcher.py @@ -3,7 +3,12 @@ import argparse import dataclasses import json -import os, shlex, shutil, signal, subprocess, threading +import os +import shlex +import shutil +import signal +import subprocess +import threading import time import urllib.error import urllib.request diff --git a/src/chemgraph/academy/runtime/mpi.py b/src/chemgraph/academy/runtime/mpi.py index b0439ab..2ad3531 100644 --- a/src/chemgraph/academy/runtime/mpi.py +++ b/src/chemgraph/academy/runtime/mpi.py @@ -8,7 +8,6 @@ from typing import Any from chemgraph.academy.observability.event_log import EventLog -from chemgraph.academy.observability.run_files import write_json_atomic MPI_RANK_ENV = ( 'PMI_RANK',