diff --git a/src/lib/shields-timer.ts b/src/lib/shields-timer.ts
index 7fd2aad2c8..ef5ab8c1e9 100644
--- a/src/lib/shields-timer.ts
+++ b/src/lib/shields-timer.ts
@@ -7,17 +7,18 @@
 // restores the captured policy snapshot.
 //
 // Usage (internal — called by shields.ts via fork()):
-//   node shields-timer.js <sandboxName> <snapshotPath> <restoreAtIso>
+//   node shields-timer.js <sandboxName> <snapshotPath> <restoreAtIso> <configPath> <configDir>
 
 const fs = require("fs");
 const path = require("path");
 const { run } = require("./runner");
 const { buildPolicySetCommand } = require("./policies");
+const { lockAgentConfig } = require("./shields");
 
 const STATE_DIR = path.join(process.env.HOME ?? "/tmp", ".nemoclaw", "state");
 const AUDIT_FILE = path.join(STATE_DIR, "shields-audit.jsonl");
-const [sandboxName, snapshotPath, restoreAtIso] = process.argv.slice(2);
+const [sandboxName, snapshotPath, restoreAtIso, configPath, configDir] = process.argv.slice(2);
 const STATE_FILE = path.join(STATE_DIR, `shields-${sandboxName}.json`);
 
 const restoreAtMs = new Date(restoreAtIso).getTime();
 const delayMs = Math.max(0, restoreAtMs - Date.now());
@@ -75,23 +76,10 @@ setTimeout(() => {
       process.exit(1);
     }
 
-    // Update state first — policy restore can take seconds and callers
-    // check state to determine whether shields are up or down.
-    updateState({
-      shieldsDown: false,
-      shieldsDownAt: null,
-      shieldsDownTimeout: null,
-      shieldsDownReason: null,
-      shieldsDownPolicy: null,
-    });
-
 
     // Restore policy (slow — openshell policy set --wait blocks)
     const result = run(buildPolicySetCommand(snapshotPath, sandboxName), { ignoreError: true });
     if (result.status !== 0) {
-      // Policy restore failed — revert state so callers know shields are
-      // still down. The sandbox remains relaxed.
-      updateState({ shieldsDown: true });
       appendAudit({
         action: "shields_up_failed",
         sandbox: sandboxName,
@@ -103,14 +91,58 @@ setTimeout(() => {
       process.exit(1);
     }
 
-    // Audit
-    appendAudit({
-      action: "shields_auto_restore",
-      sandbox: sandboxName,
-      timestamp: now,
-      restored_by: "auto_timer",
-      policy_snapshot: snapshotPath,
-    });
+    // Re-lock config file using the shared lockAgentConfig from shields.ts.
+    // lockAgentConfig runs each operation independently and verifies the
+    // on-disk state — it throws if verification fails.
+    let lockVerified = true;
+    if (configPath) {
+      try {
+        lockAgentConfig(sandboxName, { configPath, configDir });
+      } catch (lockErr) {
+        lockVerified = false;
+        appendAudit({
+          action: "shields_auto_restore_lock_warning",
+          sandbox: sandboxName,
+          timestamp: now,
+          restored_by: "auto_timer",
+          warning: lockErr?.message ?? String(lockErr),
+          lock_verified: false,
+        });
+      }
+    }
+
+    // Only mark shields as UP if the lock was verified (or no config path)
+    if (lockVerified) {
+      updateState({
+        shieldsDown: false,
+        shieldsDownAt: null,
+        shieldsDownTimeout: null,
+        shieldsDownReason: null,
+        shieldsDownPolicy: null,
+      });
+
+      appendAudit({
+        action: "shields_auto_restore",
+        sandbox: sandboxName,
+        timestamp: now,
+        restored_by: "auto_timer",
+        policy_snapshot: snapshotPath,
+      });
+    } else {
+      // Explicitly ensure state reflects shields are still DOWN.
+      // shieldsDown() already wrote shieldsDown: true, but be explicit
+      // rather than relying on the absence of an update.
+      updateState({ shieldsDown: true });
+      appendAudit({
+        action: "shields_up_failed",
+        sandbox: sandboxName,
+        timestamp: now,
+        restored_by: "auto_timer",
+        error: "Config re-lock verification failed — shields remain DOWN",
+      });
+      cleanupMarker();
+      process.exit(1);
+    }
   } catch (err) {
     appendAudit({
       action: "shields_up_failed",
diff --git a/src/lib/shields.ts b/src/lib/shields.ts
index 3a4e359c81..7b7bab1270 100644
--- a/src/lib/shields.ts
+++ b/src/lib/shields.ts
@@ -44,6 +44,14 @@ function kubectlExec(sandboxName: string, cmd: string[]): void {
   ], { stdio: ["ignore", "pipe", "pipe"], timeout: 15000 });
 }
 
+function kubectlExecCapture(sandboxName: string, cmd: string[]): string {
+  return execFileSync("docker", [
+    "exec", K3S_CONTAINER,
+    "kubectl", "exec", "-n", "openshell", sandboxName, "-c", "agent", "--",
+    ...cmd,
+  ], { stdio: ["ignore", "pipe", "pipe"], timeout: 15000 }).toString().trim();
+}
+
 // Re-export for tests and external consumers
 const MAX_TIMEOUT_SECONDS = MAX_SECONDS;
 const DEFAULT_TIMEOUT_SECONDS = DEFAULT_SECONDS;
@@ -130,17 +138,114 @@ function killTimer(sandboxName: string): void {
 //
 // Sets permissions to sandbox:sandbox 0600/0700, matching what OpenClaw
 // writes natively (mode 384 = 0o600). This keeps `openclaw doctor` happy.
+//
+// Note on chattr: The entrypoint (nemoclaw-start.sh) sets chattr +i on the
+// .openclaw DIRECTORY and its SYMLINKS, but NOT on openclaw.json itself.
+// The chattr -i here is best-effort — it may silently fail if kubectl exec
+// lacks CAP_LINUX_IMMUTABLE or if the file was never immutable. That's fine:
+// the file becomes writable through the permissive policy (disables Landlock
+// read_only) + chown/chmod below.
 // ---------------------------------------------------------------------------
 function unlockAgentConfig(sandboxName: string, target: { configPath: string; configDir: string }): void {
+  const errors: string[] = [];
+  try { kubectlExec(sandboxName, ["chattr", "-i", target.configPath]); } catch { errors.push("chattr -i"); }
+  try { kubectlExec(sandboxName, ["chown", "sandbox:sandbox", target.configPath]); } catch { errors.push("chown config file"); }
+  try { kubectlExec(sandboxName, ["chmod", "600", target.configPath]); } catch { errors.push("chmod 600 config file"); }
+  try { kubectlExec(sandboxName, ["chown", "sandbox:sandbox", target.configDir]); } catch { errors.push("chown config dir"); }
+  try { kubectlExec(sandboxName, ["chmod", "700", target.configDir]); } catch { errors.push("chmod 700 config dir"); }
+  if (errors.length > 0) {
+    console.error(` Warning: Some unlock operations failed: ${errors.join(", ")}. Config may remain read-only.`);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Config lock — shared between shields-up, auto-restore timer, and rollback
+//
+// Each operation runs independently so a single failure does not skip the
+// rest. After all attempts, we verify the actual on-disk state and throw
+// if the config is not properly locked.
+//
+// The config file's protection comes from three layers:
+//   1. Landlock read_only path — kernel-level, restored by policy snapshot
+//   2. UNIX permissions — 444 root:root (mandatory, verified here)
+//   3. chattr +i immutable bit — defense-in-depth (best-effort)
+//
+// Layer 3 is best-effort because the entrypoint (nemoclaw-start.sh) only
+// sets chattr +i on the .openclaw directory and its symlinks, not on
+// openclaw.json itself. kubectl exec may also lack CAP_LINUX_IMMUTABLE.
+// The file was never immutable via chattr, so failing to set it is not a
+// regression — layers 1+2 are sufficient. We still attempt it in case the
+// runtime environment supports it.
+// ---------------------------------------------------------------------------
+
+function lockAgentConfig(sandboxName: string, target: { configPath: string; configDir: string }): void {
+  const errors: string[] = [];
+
+  try { kubectlExec(sandboxName, ["chmod", "444", target.configPath]); }
+  catch { errors.push("chmod 444 config file"); }
+
+  try { kubectlExec(sandboxName, ["chown", "root:root", target.configPath]); }
+  catch { errors.push("chown root:root config file"); }
+
+  try { kubectlExec(sandboxName, ["chmod", "755", target.configDir]); }
+  catch { errors.push("chmod 755 config dir"); }
+
+  try { kubectlExec(sandboxName, ["chown", "root:root", target.configDir]); }
+  catch { errors.push("chown root:root config dir"); }
+
+  // Best-effort: the config file was never chattr +i'd by the entrypoint
+  // (only the directory and symlinks were). kubectl exec may also lack
+  // CAP_LINUX_IMMUTABLE. Track the result so verification doesn't require
+  // something that was never there.
+  let chattrSucceeded = true;
+  try { kubectlExec(sandboxName, ["chattr", "+i", target.configPath]); }
+  catch {
+    chattrSucceeded = false;
+  }
+
+  if (errors.length > 0) {
+    console.error(` Some lock operations failed: ${errors.join(", ")}`);
+  }
+
+  // Verify the lock actually took effect.
+  // Mode + ownership are mandatory (layers 1+2 depend on them).
+  // Immutable bit is only verified if chattr succeeded above.
+  const issues: string[] = [];
   try {
-    kubectlExec(sandboxName, ["chattr", "-i", target.configPath]);
-    kubectlExec(sandboxName, ["chown", "sandbox:sandbox", target.configPath]);
-    kubectlExec(sandboxName, ["chmod", "600", target.configPath]);
-    kubectlExec(sandboxName, ["chown", "sandbox:sandbox", target.configDir]);
-    kubectlExec(sandboxName, ["chmod", "700", target.configDir]);
-  } catch {
-    console.error(" Warning: Could not unlock config file. Config may remain read-only.");
+    const perms = kubectlExecCapture(sandboxName, ["stat", "-c", "%a %U:%G", target.configPath]);
+    const [mode, owner] = perms.split(" ");
+    if (!/^4[0-4][0-4]$/.test(mode)) issues.push(`file mode=${mode} (expected 444)`);
+    if (owner !== "root:root") issues.push(`file owner=${owner} (expected root:root)`);
+  } catch (err: unknown) {
+    const msg = err instanceof Error ? err.message : String(err);
+    issues.push(`file stat failed: ${msg}`);
+  }
+
+  try {
+    const dirPerms = kubectlExecCapture(sandboxName, ["stat", "-c", "%a %U:%G", target.configDir]);
+    const [dirMode, dirOwner] = dirPerms.split(" ");
+    if (dirMode !== "755") issues.push(`dir mode=${dirMode} (expected 755)`);
+    if (dirOwner !== "root:root") issues.push(`dir owner=${dirOwner} (expected root:root)`);
+  } catch (err: unknown) {
+    const msg = err instanceof Error ? err.message : String(err);
+    issues.push(`dir stat failed: ${msg}`);
+  }
+
+  if (chattrSucceeded) {
+    try {
+      const attrs = kubectlExecCapture(sandboxName, ["lsattr", "-d", target.configPath]);
+      // lsattr format: "----i---------e----- /path/to/file"
+      // First whitespace-delimited token is the flags field.
+      const [flags] = attrs.trim().split(/\s+/, 1);
+      if (!flags.includes("i")) issues.push("immutable bit not set");
+    } catch {
+      // lsattr may not be available on all images — skip
+    }
+  }
+
+  if (issues.length > 0) {
+    throw new Error(`Config not locked: ${issues.join(", ")}`);
   }
 }
 
@@ -250,7 +355,7 @@ function shieldsDown(sandboxName: string, opts: ShieldsDownOpts = {}): void {
   const actualScript = fs.existsSync(timerScriptJs) ? timerScriptJs : timerScript;
 
   try {
-    const child = fork(actualScript, [sandboxName, snapshotPath, restoreAt.toISOString()], {
+    const child = fork(actualScript, [sandboxName, snapshotPath, restoreAt.toISOString(), target.configPath, target.configDir], {
       detached: true,
      stdio: ["ignore", "ignore", "ignore", "ipc"],
     });
@@ -273,22 +378,32 @@ function shieldsDown(sandboxName: string, opts: ShieldsDownOpts = {}): void {
     const message = err instanceof Error ? err.message : String(err);
     console.error(` Cannot start auto-restore timer: ${message}`);
     console.error(" Rolling back — restoring policy from snapshot...");
-    try {
-      run(buildPolicySetCommand(snapshotPath, sandboxName), { ignoreError: true });
-      kubectlExec(sandboxName, ["chmod", "444", target.configPath]);
-      kubectlExec(sandboxName, ["chown", "root:root", target.configPath]);
-      kubectlExec(sandboxName, ["chattr", "+i", target.configPath]);
-    } catch {
-      // Best effort rollback
+    const rollbackResult = run(buildPolicySetCommand(snapshotPath, sandboxName), { ignoreError: true });
+    let rollbackLocked = false;
+    if (rollbackResult.status === 0) {
+      try {
+        lockAgentConfig(sandboxName, target);
+        rollbackLocked = true;
+      } catch {
+        console.error(" Warning: Rollback re-lock could not be verified. Check config manually.");
+      }
+    } else {
+      console.error(" Warning: Policy restore failed during rollback.");
+    }
+    if (rollbackLocked) {
+      saveShieldsState(sandboxName, {
+        shieldsDown: false,
+        shieldsDownAt: null,
+        shieldsDownTimeout: null,
+        shieldsDownReason: null,
+        shieldsDownPolicy: null,
+      });
+      console.error(" Shields restored to UP. The sandbox was never left unguarded.");
+    } else {
+      // Leave state as shieldsDown: true — don't lie about protection level
+      console.error(" Shields remain DOWN — manual intervention required.");
+      console.error(` Re-lock manually via kubectl exec, then run: nemoclaw ${sandboxName} shields up`);
     }
-    saveShieldsState(sandboxName, {
-      shieldsDown: false,
-      shieldsDownAt: null,
-      shieldsDownTimeout: null,
-      shieldsDownReason: null,
-      shieldsDownPolicy: null,
-    });
-    console.error(" Shields restored to UP. The sandbox was never left unguarded.");
 
     process.exit(1);
   }
@@ -351,16 +466,18 @@ function shieldsUp(sandboxName: string): void {
   // 2b. Re-lock config file to read-only.
   // Restore the Dockerfile's original permissions and immutable bit.
   // Uses kubectl exec to bypass Landlock (same as shields down).
+  // Each operation runs independently and the result is verified.
+  // If verification fails, shields remain DOWN — we do not lie about state.
   const target = resolveAgentConfig(sandboxName);
   console.log(` Locking ${target.agentName} config (${target.configPath})...`);
   try {
-    kubectlExec(sandboxName, ["chmod", "444", target.configPath]);
-    kubectlExec(sandboxName, ["chown", "root:root", target.configPath]);
-    kubectlExec(sandboxName, ["chmod", "755", target.configDir]);
-    kubectlExec(sandboxName, ["chown", "root:root", target.configDir]);
-    kubectlExec(sandboxName, ["chattr", "+i", target.configPath]);
-  } catch {
-    console.error(" Warning: Could not re-lock config file.");
+    lockAgentConfig(sandboxName, target);
+  } catch (err: unknown) {
+    const message = err instanceof Error ? err.message : String(err);
+    console.error(` ERROR: ${message}`);
+    console.error(" Shields remain DOWN — manual intervention required.");
+    console.error(` Re-lock manually via kubectl exec, then run: nemoclaw ${sandboxName} shields up`);
+    process.exit(1);
   }
 
   // 3. Calculate duration
@@ -537,6 +654,7 @@ export {
   shieldsStatus,
   isShieldsDown,
   parseDuration,
+  lockAgentConfig,
   MAX_TIMEOUT_SECONDS,
   DEFAULT_TIMEOUT_SECONDS,
 };
diff --git a/test/e2e/test-shields-config.sh b/test/e2e/test-shields-config.sh
index b9218a1b20..2816b21e8d 100755
--- a/test/e2e/test-shields-config.sh
+++ b/test/e2e/test-shields-config.sh
@@ -485,8 +485,8 @@ else
  fail "shields should be DOWN: ${STATUS_TIMER}"
 fi
 
-info "Waiting 15s for auto-restore..."
-sleep 15
+info "Waiting 25s for auto-restore..."
+sleep 25
 
 # Check if the timer process restored shields
 # The timer runs as a detached process — it restores the policy and
@@ -499,7 +499,7 @@ else
  info "Status: ${STATUS_AFTER_TIMER}"
  # Clean up manually
  nemoclaw "${SANDBOX_NAME}" shields up 2>/dev/null || true
-  fail "Auto-restore timer did not restore shields within 15s"
+  fail "Auto-restore timer did not restore shields within 25s"
 fi
 
 # Verify config is re-locked after auto-restore
diff --git a/test/e2e/test-token-rotation.sh b/test/e2e/test-token-rotation.sh
index 6e3aef2624..daca2ed1b7 100755
--- a/test/e2e/test-token-rotation.sh
+++ b/test/e2e/test-token-rotation.sh
@@ -25,7 +25,7 @@ set -uo pipefail
 
 if [ -z "${NEMOCLAW_E2E_NO_TIMEOUT:-}" ]; then
   export NEMOCLAW_E2E_NO_TIMEOUT=1
-  TIMEOUT_SECONDS="${NEMOCLAW_E2E_TIMEOUT_SECONDS:-900}"
+  TIMEOUT_SECONDS="${NEMOCLAW_E2E_TIMEOUT_SECONDS:-2400}"
   exec timeout -s TERM "$TIMEOUT_SECONDS" "$0" "$@"
 fi
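
Not part of the patch: a minimal standalone sketch of the on-disk checks that lockAgentConfig performs after re-locking, assuming `stat -c "%a %U:%G"` and `lsattr -d` output in the shapes the new code expects. The helper name checkFileLock and the sample values below are illustrative only.

    // Illustrative sketch only: the same parsing rules as the verification
    // block in lockAgentConfig, applied to captured command output.
    function checkFileLock(statOut: string, lsattrOut?: string): string[] {
      const issues: string[] = [];
      const [mode, owner] = statOut.trim().split(" ");   // e.g. "444 root:root"
      if (!/^4[0-4][0-4]$/.test(mode)) issues.push(`file mode=${mode} (expected 444)`);
      if (owner !== "root:root") issues.push(`file owner=${owner} (expected root:root)`);
      if (lsattrOut !== undefined) {
        // lsattr -d prints the flags field first, then the path
        const [flags] = lsattrOut.trim().split(/\s+/, 1);
        if (!flags.includes("i")) issues.push("immutable bit not set");
      }
      return issues;
    }

    // checkFileLock("444 root:root", "----i---------e----- /path/to/openclaw.json")
    //   => []
    // checkFileLock("600 sandbox:sandbox")
    //   => ["file mode=600 (expected 444)", "file owner=sandbox:sandbox (expected root:root)"]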