Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
634e682
fix(shields): verify config lock and fail hard on re-lock failure
ericksoa Apr 18, 2026
eec9189
Merge branch 'main' into fix/shields-relock-verification
ericksoa Apr 18, 2026
537a717
fix(shields): parse lsattr flags field and verify timer re-lock
ericksoa Apr 18, 2026
3630270
fix(shields): treat immutable bit as best-effort on unsupported fs
ericksoa Apr 19, 2026
499731e
docs(shields): clarify chattr +i is best-effort on config file
ericksoa Apr 19, 2026
596f372
fix(shields): independent unlock ops, timer dir verification, timer h…
ericksoa Apr 19, 2026
11aa705
Merge branch 'main' into fix/shields-relock-verification
ericksoa Apr 19, 2026
4defd49
fix(e2e): increase timeouts for token-rotation and shields auto-restore
ericksoa Apr 19, 2026
e3b7cc9
Merge branch 'main' into fix/shields-relock-verification
ericksoa Apr 19, 2026
c43a37d
Merge branch 'main' into fix/shields-relock-verification
ericksoa Apr 20, 2026
e963103
Merge branch 'main' into fix/shields-relock-verification
ericksoa Apr 20, 2026
e445f42
Merge branch 'main' into fix/shields-relock-verification
ericksoa Apr 20, 2026
893e7a6
Merge branch 'main' into fix/shields-relock-verification
ericksoa Apr 20, 2026
6e7ca4b
fix(shields): address review feedback — dedup lock logic, fix state gaps
ericksoa Apr 20, 2026
81582d1
Merge branch 'main' into fix/shields-relock-verification
ericksoa Apr 21, 2026
b8f2695
Merge branch 'main' into fix/shields-relock-verification
ericksoa Apr 21, 2026
51b9e51
Merge branch 'main' into fix/shields-relock-verification
ericksoa Apr 21, 2026
b044d86
Merge branch 'main' into fix/shields-relock-verification
ericksoa Apr 21, 2026
41646f5
Merge branch 'main' into fix/shields-relock-verification
ericksoa Apr 21, 2026
753c7ca
Merge branch 'main' into fix/shields-relock-verification
ericksoa Apr 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 55 additions & 23 deletions src/lib/shields-timer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,18 @@
// restores the captured policy snapshot.
//
// Usage (internal — called by shields.ts via fork()):
// node shields-timer.js <sandbox-name> <snapshot-path> <restore-at-iso>
// node shields-timer.js <sandbox-name> <snapshot-path> <restore-at-iso> <config-path> <config-dir>

const fs = require("fs");
const path = require("path");
const { run } = require("./runner");
const { buildPolicySetCommand } = require("./policies");
const { lockAgentConfig } = require("./shields");

const STATE_DIR = path.join(process.env.HOME ?? "/tmp", ".nemoclaw", "state");
const AUDIT_FILE = path.join(STATE_DIR, "shields-audit.jsonl");

const [sandboxName, snapshotPath, restoreAtIso] = process.argv.slice(2);
const [sandboxName, snapshotPath, restoreAtIso, configPath, configDir] = process.argv.slice(2);
const STATE_FILE = path.join(STATE_DIR, `shields-${sandboxName}.json`);
const restoreAtMs = new Date(restoreAtIso).getTime();
const delayMs = Math.max(0, restoreAtMs - Date.now());
Expand Down Expand Up @@ -75,23 +76,10 @@ setTimeout(() => {
process.exit(1);
}

// Update state first — policy restore can take seconds and callers
// check state to determine whether shields are up or down.
updateState({
shieldsDown: false,
shieldsDownAt: null,
shieldsDownTimeout: null,
shieldsDownReason: null,
shieldsDownPolicy: null,
});

// Restore policy (slow — openshell policy set --wait blocks)
const result = run(buildPolicySetCommand(snapshotPath, sandboxName), { ignoreError: true });

if (result.status !== 0) {
// Policy restore failed — revert state so callers know shields are
// still down. The sandbox remains relaxed.
updateState({ shieldsDown: true });
appendAudit({
action: "shields_up_failed",
sandbox: sandboxName,
Expand All @@ -103,14 +91,58 @@ setTimeout(() => {
process.exit(1);
}

// Audit
appendAudit({
action: "shields_auto_restore",
sandbox: sandboxName,
timestamp: now,
restored_by: "auto_timer",
policy_snapshot: snapshotPath,
});
// Re-lock config file using the shared lockAgentConfig from shields.ts.
// lockAgentConfig runs each operation independently and verifies the
// on-disk state — it throws if verification fails.
let lockVerified = true;
if (configPath) {
try {
lockAgentConfig(sandboxName, { configPath, configDir });
} catch (lockErr) {
lockVerified = false;
appendAudit({
action: "shields_auto_restore_lock_warning",
sandbox: sandboxName,
timestamp: now,
restored_by: "auto_timer",
warning: lockErr?.message ?? String(lockErr),
lock_verified: false,
});
}
}

// Only mark shields as UP if the lock was verified (or no config path)
if (lockVerified) {
updateState({
shieldsDown: false,
shieldsDownAt: null,
shieldsDownTimeout: null,
shieldsDownReason: null,
shieldsDownPolicy: null,
});

appendAudit({
action: "shields_auto_restore",
sandbox: sandboxName,
timestamp: now,
restored_by: "auto_timer",
policy_snapshot: snapshotPath,
});
} else {
// Explicitly ensure state reflects shields are still DOWN.
// shieldsDown() already wrote shieldsDown: true, but be explicit
// rather than relying on the absence of an update.
updateState({ shieldsDown: true });
appendAudit({
action: "shields_up_failed",
sandbox: sandboxName,
timestamp: now,
restored_by: "auto_timer",
error: "Config re-lock verification failed — shields remain DOWN",
});
cleanupMarker();
process.exit(1);
}
} catch (err) {
appendAudit({
action: "shields_up_failed",
Expand Down
178 changes: 148 additions & 30 deletions src/lib/shields.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,14 @@ function kubectlExec(sandboxName: string, cmd: string[]): void {
], { stdio: ["ignore", "pipe", "pipe"], timeout: 15000 });
}

// Run a command inside the sandbox's agent container (via the k3s docker
// container) and return its trimmed stdout. Mirrors kubectlExec, which
// discards output; this variant is used by verification code that needs
// to inspect stat/lsattr results.
function kubectlExecCapture(sandboxName: string, cmd: string[]): string {
  const dockerArgs = [
    "exec",
    K3S_CONTAINER,
    "kubectl", "exec", "-n", "openshell", sandboxName, "-c", "agent", "--",
    ...cmd,
  ];
  const output = execFileSync("docker", dockerArgs, {
    stdio: ["ignore", "pipe", "pipe"],
    timeout: 15000,
  });
  return output.toString().trim();
}

// Re-export for tests and external consumers
const MAX_TIMEOUT_SECONDS = MAX_SECONDS;
const DEFAULT_TIMEOUT_SECONDS = DEFAULT_SECONDS;
Expand Down Expand Up @@ -130,17 +138,114 @@ function killTimer(sandboxName: string): void {
//
// Sets permissions to sandbox:sandbox 0600/0700, matching what OpenClaw
// writes natively (mode 384 = 0o600). This keeps `openclaw doctor` happy.
//
// Note on chattr: The entrypoint (nemoclaw-start.sh) sets chattr +i on the
// .openclaw DIRECTORY and its SYMLINKS, but NOT on openclaw.json itself.
// The chattr -i here is best-effort — it may silently fail if kubectl exec
// lacks CAP_LINUX_IMMUTABLE or if the file was never immutable. That's fine:
// the file becomes writable through the permissive policy (disables Landlock
// read_only) + chown/chmod below.
Comment thread
coderabbitai[bot] marked this conversation as resolved.
// ---------------------------------------------------------------------------

// Unlock the agent config so the sandbox user can edit it while shields
// are down: clear the (best-effort) immutable bit, then hand ownership
// back to sandbox:sandbox with 0600/0700 — the permissions OpenClaw
// writes natively, which keeps `openclaw doctor` happy.
//
// Each operation runs independently so a single failure does not skip
// the rest; failures are collected and reported once as a warning.
function unlockAgentConfig(sandboxName: string, target: { configPath: string; configDir: string }): void {
  // Declarative op table: label (for the warning message) + kubectl exec argv.
  const ops: Array<{ label: string; cmd: string[] }> = [
    { label: "chattr -i", cmd: ["chattr", "-i", target.configPath] },
    { label: "chown config file", cmd: ["chown", "sandbox:sandbox", target.configPath] },
    { label: "chmod 600 config file", cmd: ["chmod", "600", target.configPath] },
    { label: "chown config dir", cmd: ["chown", "sandbox:sandbox", target.configDir] },
    { label: "chmod 700 config dir", cmd: ["chmod", "700", target.configDir] },
  ];
  const errors: string[] = [];
  for (const { label, cmd } of ops) {
    try {
      kubectlExec(sandboxName, cmd);
    } catch {
      errors.push(label);
    }
  }
  if (errors.length > 0) {
    console.error(` Warning: Some unlock operations failed: ${errors.join(", ")}. Config may remain read-only.`);
  }
}

// ---------------------------------------------------------------------------
// Config lock — shared between shields-up, auto-restore timer, and rollback
//
// Each operation runs independently so a single failure does not skip the
// rest. After all attempts, we verify the actual on-disk state and throw
// if the config is not properly locked.
//
// The config file's protection comes from three layers:
// 1. Landlock read_only path — kernel-level, restored by policy snapshot
// 2. UNIX permissions — 444 root:root (mandatory, verified here)
// 3. chattr +i immutable bit — defense-in-depth (best-effort)
//
// Layer 3 is best-effort because the entrypoint (nemoclaw-start.sh) only
// sets chattr +i on the .openclaw directory and its symlinks, not on
// openclaw.json itself. kubectl exec may also lack CAP_LINUX_IMMUTABLE.
// The file was never immutable via chattr, so failing to set it is not a
// regression — layers 1+2 are sufficient. We still attempt it in case the
// runtime environment supports it.
// ---------------------------------------------------------------------------
Comment thread
coderabbitai[bot] marked this conversation as resolved.

function lockAgentConfig(sandboxName: string, target: { configPath: string; configDir: string }): void {
const errors: string[] = [];

try { kubectlExec(sandboxName, ["chmod", "444", target.configPath]); }
catch { errors.push("chmod 444 config file"); }

try { kubectlExec(sandboxName, ["chown", "root:root", target.configPath]); }
catch { errors.push("chown root:root config file"); }

try { kubectlExec(sandboxName, ["chmod", "755", target.configDir]); }
catch { errors.push("chmod 755 config dir"); }

try { kubectlExec(sandboxName, ["chown", "root:root", target.configDir]); }
catch { errors.push("chown root:root config dir"); }

// Best-effort: the config file was never chattr +i'd by the entrypoint
// (only the directory and symlinks were). kubectl exec may also lack
// CAP_LINUX_IMMUTABLE. Track the result so verification doesn't require
// something that was never there.
let chattrSucceeded = true;
try { kubectlExec(sandboxName, ["chattr", "+i", target.configPath]); }
catch {
chattrSucceeded = false;
}

if (errors.length > 0) {
console.error(` Some lock operations failed: ${errors.join(", ")}`);
}

// Verify the lock actually took effect.
// Mode + ownership are mandatory (layers 1+2 depend on them).
// Immutable bit is only verified if chattr succeeded above.
const issues: string[] = [];
try {
kubectlExec(sandboxName, ["chattr", "-i", target.configPath]);
kubectlExec(sandboxName, ["chown", "sandbox:sandbox", target.configPath]);
kubectlExec(sandboxName, ["chmod", "600", target.configPath]);
kubectlExec(sandboxName, ["chown", "sandbox:sandbox", target.configDir]);
kubectlExec(sandboxName, ["chmod", "700", target.configDir]);
} catch {
console.error(" Warning: Could not unlock config file. Config may remain read-only.");
const perms = kubectlExecCapture(sandboxName, ["stat", "-c", "%a %U:%G", target.configPath]);
const [mode, owner] = perms.split(" ");
if (!/^4[0-4][0-4]$/.test(mode)) issues.push(`file mode=${mode} (expected 444)`);
if (owner !== "root:root") issues.push(`file owner=${owner} (expected root:root)`);
} catch (err: unknown) {
const msg = err instanceof Error ? err.message : String(err);
issues.push(`file stat failed: ${msg}`);
}

try {
const dirPerms = kubectlExecCapture(sandboxName, ["stat", "-c", "%a %U:%G", target.configDir]);
const [dirMode, dirOwner] = dirPerms.split(" ");
if (dirMode !== "755") issues.push(`dir mode=${dirMode} (expected 755)`);
if (dirOwner !== "root:root") issues.push(`dir owner=${dirOwner} (expected root:root)`);
} catch (err: unknown) {
const msg = err instanceof Error ? err.message : String(err);
issues.push(`dir stat failed: ${msg}`);
}

if (chattrSucceeded) {
try {
const attrs = kubectlExecCapture(sandboxName, ["lsattr", "-d", target.configPath]);
// lsattr format: "----i---------e----- /path/to/file"
// First whitespace-delimited token is the flags field.
const [flags] = attrs.trim().split(/\s+/, 1);
if (!flags.includes("i")) issues.push("immutable bit not set");
} catch {
// lsattr may not be available on all images — skip
}
}

if (issues.length > 0) {
throw new Error(`Config not locked: ${issues.join(", ")}`);
}
}

Expand Down Expand Up @@ -250,7 +355,7 @@ function shieldsDown(sandboxName: string, opts: ShieldsDownOpts = {}): void {
const actualScript = fs.existsSync(timerScriptJs) ? timerScriptJs : timerScript;

try {
const child = fork(actualScript, [sandboxName, snapshotPath, restoreAt.toISOString()], {
const child = fork(actualScript, [sandboxName, snapshotPath, restoreAt.toISOString(), target.configPath, target.configDir], {
detached: true,
stdio: ["ignore", "ignore", "ignore", "ipc"],
});
Expand All @@ -273,22 +378,32 @@ function shieldsDown(sandboxName: string, opts: ShieldsDownOpts = {}): void {
const message = err instanceof Error ? err.message : String(err);
console.error(` Cannot start auto-restore timer: ${message}`);
console.error(" Rolling back — restoring policy from snapshot...");
try {
run(buildPolicySetCommand(snapshotPath, sandboxName), { ignoreError: true });
kubectlExec(sandboxName, ["chmod", "444", target.configPath]);
kubectlExec(sandboxName, ["chown", "root:root", target.configPath]);
kubectlExec(sandboxName, ["chattr", "+i", target.configPath]);
} catch {
// Best effort rollback
const rollbackResult = run(buildPolicySetCommand(snapshotPath, sandboxName), { ignoreError: true });
let rollbackLocked = false;
if (rollbackResult.status === 0) {
try {
lockAgentConfig(sandboxName, target);
rollbackLocked = true;
} catch {
console.error(" Warning: Rollback re-lock could not be verified. Check config manually.");
}
} else {
console.error(" Warning: Policy restore failed during rollback.");
}
if (rollbackLocked) {
saveShieldsState(sandboxName, {
shieldsDown: false,
shieldsDownAt: null,
shieldsDownTimeout: null,
shieldsDownReason: null,
shieldsDownPolicy: null,
});
console.error(" Shields restored to UP. The sandbox was never left unguarded.");
} else {
// Leave state as shieldsDown: true — don't lie about protection level
console.error(" Shields remain DOWN — manual intervention required.");
console.error(` Re-lock manually via kubectl exec, then run: nemoclaw ${sandboxName} shields up`);
}
saveShieldsState(sandboxName, {
shieldsDown: false,
shieldsDownAt: null,
shieldsDownTimeout: null,
shieldsDownReason: null,
shieldsDownPolicy: null,
});
console.error(" Shields restored to UP. The sandbox was never left unguarded.");
process.exit(1);
}

Expand Down Expand Up @@ -351,16 +466,18 @@ function shieldsUp(sandboxName: string): void {
// 2b. Re-lock config file to read-only.
// Restore the Dockerfile's original permissions and immutable bit.
// Uses kubectl exec to bypass Landlock (same as shields down).
// Each operation runs independently and the result is verified.
// If verification fails, shields remain DOWN — we do not lie about state.
const target = resolveAgentConfig(sandboxName);
console.log(` Locking ${target.agentName} config (${target.configPath})...`);
try {
kubectlExec(sandboxName, ["chmod", "444", target.configPath]);
kubectlExec(sandboxName, ["chown", "root:root", target.configPath]);
kubectlExec(sandboxName, ["chmod", "755", target.configDir]);
kubectlExec(sandboxName, ["chown", "root:root", target.configDir]);
kubectlExec(sandboxName, ["chattr", "+i", target.configPath]);
} catch {
console.error(" Warning: Could not re-lock config file.");
lockAgentConfig(sandboxName, target);
} catch (err: unknown) {
const message = err instanceof Error ? err.message : String(err);
console.error(` ERROR: ${message}`);
console.error(" Shields remain DOWN — manual intervention required.");
console.error(` Re-lock manually via kubectl exec, then run: nemoclaw ${sandboxName} shields up`);
process.exit(1);
}

// 3. Calculate duration
Expand Down Expand Up @@ -537,6 +654,7 @@ export {
shieldsStatus,
isShieldsDown,
parseDuration,
lockAgentConfig,
MAX_TIMEOUT_SECONDS,
DEFAULT_TIMEOUT_SECONDS,
};
6 changes: 3 additions & 3 deletions test/e2e/test-shields-config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -485,8 +485,8 @@ else
fail "shields should be DOWN: ${STATUS_TIMER}"
fi

info "Waiting 15s for auto-restore..."
sleep 15
info "Waiting 25s for auto-restore..."
sleep 25

# Check if the timer process restored shields
# The timer runs as a detached process — it restores the policy and
Expand All @@ -499,7 +499,7 @@ else
info "Status: ${STATUS_AFTER_TIMER}"
# Clean up manually
nemoclaw "${SANDBOX_NAME}" shields up 2>/dev/null || true
fail "Auto-restore timer did not restore shields within 15s"
fail "Auto-restore timer did not restore shields within 25s"
fi

# Verify config is re-locked after auto-restore
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/test-token-rotation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ set -uo pipefail

if [ -z "${NEMOCLAW_E2E_NO_TIMEOUT:-}" ]; then
export NEMOCLAW_E2E_NO_TIMEOUT=1
TIMEOUT_SECONDS="${NEMOCLAW_E2E_TIMEOUT_SECONDS:-900}"
TIMEOUT_SECONDS="${NEMOCLAW_E2E_TIMEOUT_SECONDS:-2400}"
exec timeout -s TERM "$TIMEOUT_SECONDS" "$0" "$@"
fi

Expand Down
Loading