Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions src/lib/onboard-session.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,88 @@ describe("onboard session", () => {
expect(written.pid).toBe(process.pid);
});

it("regression #1281: stale-cleanup race does not unlink a fresh lock claimed by another process", () => {
// Reproduces the race: the lock file we read as 'stale' gets replaced
// with a fresh claim from a faster concurrent process between our
// read and our unlink. The slower process must NOT unlink the fresh
// lock, otherwise both processes end up thinking they hold the lock.
fs.mkdirSync(path.dirname(session.LOCK_FILE), { recursive: true });

// 1. Lay down a stale lock from a dead PID (PID 999999 on the test box).
const staleLock = JSON.stringify({
pid: 999999,
startedAt: "2026-03-25T00:00:00.000Z",
command: "nemoclaw onboard",
});
fs.writeFileSync(session.LOCK_FILE, staleLock, { mode: 0o600 });

// 2. Wrap fs.statSync so the swap happens just before stat #2:
// - stat #1 (inside acquireOnboardLock): reads the stale inode
// and returns it unmodified. readFileSync then reads the
// ORIGINAL stale lock (dead PID 999999), isProcessAlive
// returns false, and acquireOnboardLock enters the stale-
// cleanup path calling unlinkIfInodeMatches.
// - stat #2 (inside unlinkIfInodeMatches): BEFORE the actual
// stat, swap the file for a fresh claim. stat #2 then sees
// a different inode → must skip the unlink.
//
// CodeRabbit correctly flagged the original test: swapping on
// stat #1 caused readFileSync to see the live PID and exit
// via isProcessAlive, never reaching unlinkIfInodeMatches.
let statCallCount = 0;
const originalStatSync = fs.statSync;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(fs as any).statSync = function (...args: unknown[]) {
statCallCount += 1;
// Just before stat #2 (inside unlinkIfInodeMatches), simulate
// the race: a concurrent fast process unlinks the stale lock
// and writes a fresh claim. stat #2 then sees a new inode.
if (statCallCount === 2) {
// Write the fresh claim to a temp file first, then rename over
// the stale lock. This guarantees a different inode even on
// tmpfs/overlayfs which can reuse inodes after unlink+recreate.
const tmpClaim = session.LOCK_FILE + ".race-tmp";
fs.writeFileSync(
tmpClaim,
JSON.stringify({
pid: process.ppid,
startedAt: new Date().toISOString(),
command: "nemoclaw onboard (fresh claim from concurrent process)",
}),
{ mode: 0o600 },
);
fs.renameSync(tmpClaim, session.LOCK_FILE);
}
return (originalStatSync as unknown as (...a: unknown[]) => unknown).apply(fs, args);
};

try {
// The acquire call will see EEXIST (stale lock present), stat it,
// then the swap happens, then the second stat (inside the cleanup
// helper) sees a different inode → must NOT unlink.
const result = session.acquireOnboardLock("nemoclaw onboard --resume");
// The fresh lock that the simulated concurrent process wrote
// should still be on disk after acquireOnboardLock returns.
expect(fs.existsSync(session.LOCK_FILE)).toBe(true);
const onDisk = JSON.parse(fs.readFileSync(session.LOCK_FILE, "utf8"));
// The lock content should be the fresh claim, NOT the stale one
// and NOT a new one written by acquireOnboardLock after a wrong
// unlink.
expect(onDisk.command).toContain("fresh claim from concurrent process");
// The fresh claim is held by a different live PID (process.ppid),
// so acquireOnboardLock MUST report acquisition failure and
// surface that pid as the holder. This is the mutual-exclusion
// loser path — without it, the regression would only verify the
// fresh file survived, not that the contender correctly stood
// down.
expect(result.acquired).toBe(false);
expect(result.holderPid).toBe(process.ppid);
} finally {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(fs as any).statSync = originalStatSync;
}
});

it("treats unreadable or transient lock contents as a retry, not a stale lock", () => {
fs.mkdirSync(path.dirname(session.LOCK_FILE), { recursive: true });
fs.writeFileSync(session.LOCK_FILE, "{not-json", { mode: 0o600 });
Expand Down
142 changes: 131 additions & 11 deletions src/lib/onboard-session.ts
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,13 @@ function isProcessAlive(pid: number): boolean {
}
}

// File descriptor we hold across the lifetime of an acquired lock. On
// release, fstat(fd).ino vs stat(path).ino confirms the on-disk path
// still resolves to the file we created — closing the residual TOCTOU
// window in the inode-only check by tying ownership to a live
// descriptor rather than a value re-read from disk. See #1281.
let heldLockFd: number | null = null;

export function acquireOnboardLock(command: string | null = null): LockResult {
ensureSessionDir();
const payload = JSON.stringify(
Expand All @@ -346,17 +353,38 @@ export function acquireOnboardLock(command: string | null = null): LockResult {
2,
);

for (let attempt = 0; attempt < 2; attempt++) {
// The retry budget here used to be 2, which is the bare minimum needed
// for "see-stale → cleanup → reclaim". With the inode-verified cleanup
// below it can take a few additional spins under contention because
// multiple concurrent stale-cleaners can race and lose to each other
// before one reclaims, so give the loop a little more room.
// See issue #1281.
const MAX_ATTEMPTS = 5;

for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
let fd: number;
try {
fs.writeFileSync(LOCK_FILE, payload, { flag: "wx", mode: 0o600 });
return { acquired: true, lockFile: LOCK_FILE, stale: false };
// openSync(..., "wx", mode) is the atomic create-or-fail
// primitive. We hold the resulting fd at module scope so
// releaseOnboardLock() can later confirm the on-disk path still
// resolves to the same file we created (fstat ino vs stat ino).
fd = fs.openSync(LOCK_FILE, "wx", 0o600);
} catch (error: unknown) {
if ((error as NodeJS.ErrnoException)?.code !== "EEXIST") {
throw error;
}

// Capture both the parsed lock and the inode so we can verify the
// file we're about to unlink is STILL the same stale file we read.
// Without the inode check, two concurrent processes can both read
// the same stale lock, and the slower one will unlink the fresh
// lock the faster one just claimed, breaking mutual exclusion.
// See issue #1281.
let existing: LockInfo | null;
let staleInode: bigint | null;
try {
const stat = fs.statSync(LOCK_FILE, { bigint: true });
staleInode = stat.ino;
existing = parseLockFile(fs.readFileSync(LOCK_FILE, "utf8"));
} catch (readError: unknown) {
if ((readError as NodeJS.ErrnoException)?.code === "ENOENT") {
Expand All @@ -365,9 +393,12 @@ export function acquireOnboardLock(command: string | null = null): LockResult {
throw readError;
}
if (!existing) {
// Malformed lock file — leave it on disk (a human or another
// process may be mid-write) and retry. Pre-#1281 behavior
// preserved: never unlink a malformed lock automatically.
continue;
}
if (existing && isProcessAlive(existing.pid)) {
if (isProcessAlive(existing.pid)) {
return {
acquired: false,
lockFile: LOCK_FILE,
Expand All @@ -378,20 +409,109 @@ export function acquireOnboardLock(command: string | null = null): LockResult {
};
}

try {
fs.unlinkSync(LOCK_FILE);
} catch (unlinkError: unknown) {
if ((unlinkError as NodeJS.ErrnoException)?.code !== "ENOENT") {
throw unlinkError;
}
}
// Stale: unlink ONLY if the file on disk is still the same inode
// we just read. If a concurrent process already cleaned up and
// claimed the lock, the inode will have changed and we'll fall
// through to the next iteration where openSync(wx) will either
// succeed (we win) or fail EEXIST against the new holder (and we
// re-read it).
unlinkIfInodeMatches(LOCK_FILE, staleInode);
continue;
}

// Atomic create succeeded — write the payload and keep the fd open
// for the lifetime of the lock so releaseOnboardLock() can verify
// ownership via the live descriptor.
try {
fs.writeSync(fd, payload);
} catch (writeError) {
Comment on lines +425 to +427
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# Check the file exists and read the relevant section
head -n 5 src/lib/onboard-session.ts

Repository: NVIDIA/NemoClaw

Length of output: 269


🏁 Script executed:

# Read the code around lines 421-423 with broader context
sed -n '410,435p' src/lib/onboard-session.ts

Repository: NVIDIA/NemoClaw

Length of output: 932


🏁 Script executed:

# Check for SPDX header compliance
head -n 5 src/lib/onboard-session.ts

Repository: NVIDIA/NemoClaw

Length of output: 269


🏁 Script executed:

# Get the full lock file management section to understand the context
sed -n '333,463p' src/lib/onboard-session.ts | cat -n

Repository: NVIDIA/NemoClaw

Length of output: 6028


🌐 Web query:

Node.js fs.writeFileSync file descriptor parameter usage

💡 Result:

In fs.writeFileSync(file, data[, options]), the first argument is not a special “fd parameter”—it’s the file parameter, and it can be either:

  • a path (string | Buffer | URL), or
  • a numeric file descriptor (integer) you previously opened with fs.openSync() (or received elsewhere). (nodejs.org)

Using a file descriptor (fd)

import fs from "node:fs";

const fd = fs.openSync("out.txt", "w");   // open -> fd
fs.writeFileSync(fd, "hello\n");          // write using the fd
fs.closeSync(fd);                         // you must close it

Key behaviors when file is an fd:

  • It behaves like writing directly via fs.write() (Node even recommends using fs.write() for the fd case). (nodejs.org)
  • The file is not automatically “replaced/truncated” the way it is when you pass a filename; data is written at the current fd position, and old bytes may remain before/after your write. (nodejs.org)
  • Because you opened the fd yourself, you’re responsible for closing it (otherwise you can leak descriptors). (This is explicitly called out for similar “*FileSync” APIs that accept an fd.) (nodejs.org)

Citations:


🌐 Web query:

Node.js fs.writeSync short write behavior regular files

💡 Result:

fs.writeSync() can legally perform a short write (return a value < length) even when fd refers to a regular file.

  • In Node.js, fs.writeSync(...) returns “the number of bytes written” (it does not promise to write the full requested length). If it returns n, only the first n bytes were written and you must write the remainder yourself if you need “write all”. [1]
  • On POSIX systems, the underlying write(2) may write fewer than count bytes for reasons such as insufficient space, process file-size limits (RLIMIT_FSIZE), or being interrupted by a signal after writing some bytes. [2]

Practical guidance: if you need a “guaranteed full write”, loop until all bytes are written (or an exception is thrown), even for regular files.

Sources: [1] Node.js fs.writeSync docs, [2] Linux write(2) semantics.


Ensure fs.writeSync() writes the full payload in a single call, or loop until all bytes are written.

fs.writeSync(fd, payload) can return fewer bytes than requested, leaving the JSON truncated. For the lock mechanism to function correctly, the entire payload must be written atomically. The suggested fs.writeFileSync() has identical short-write behavior on file descriptors and does not solve this.

Either ensure the payload fits in a single fs.writeSync() call (it does at ~150–300 bytes, but this is not guaranteed across all platforms and configurations), or loop until all bytes are written:

let bytesWritten = 0;
while (bytesWritten < payload.length) {
  bytesWritten += fs.writeSync(fd, payload, bytesWritten);
}

Alternatively, use a platform-safe wrapper if the codebase has one.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/lib/onboard-session.ts` around lines 421 - 423, The current call
fs.writeSync(fd, payload) may perform a short write and truncate the JSON;
update the write logic in onboard-session.ts where fs.writeSync is used (the fd
and payload variables) to loop until the entire payload buffer/string is written
(track bytesWritten and call fs.writeSync repeatedly with an offset/length until
bytesWritten === payload.length), or replace with a platform-safe helper that
guarantees full-write semantics; ensure the loop handles the returned byte count
and throws on unexpected errors.

try { fs.closeSync(fd); } catch { /* ignore */ }
try { fs.unlinkSync(LOCK_FILE); } catch { /* ignore */ }
throw writeError;
}
heldLockFd = fd;
return { acquired: true, lockFile: LOCK_FILE, stale: false };
}

return { acquired: false, lockFile: LOCK_FILE, stale: true };
}

/**
* Unlink LOCK_FILE only if its current inode equals `expectedInode`.
* The dual stat-then-unlink is the only portable POSIX primitive Node
* exposes for this — there's no atomic "unlink-if-inode" syscall — so
* a sufficiently unlucky race can still slip through. The window is
* orders of magnitude smaller than the unconditional unlink it
* replaces, and the outer loop will detect a wrong unlink on its next
* `writeFileSync(wx)` attempt because either we re-create the file
* or we observe the new lock with a different inode.
*/
function unlinkIfInodeMatches(filePath: string, expectedInode: bigint | null): void {
if (expectedInode === null) {
return;
}
try {
const stat = fs.statSync(filePath, { bigint: true });
if (stat.ino !== expectedInode) {
// Someone else replaced the file. Leave it alone.
return;
}
} catch (statError: unknown) {
if ((statError as NodeJS.ErrnoException)?.code === "ENOENT") {
return;
}
throw statError;
}
try {
fs.unlinkSync(filePath);
} catch (unlinkError: unknown) {
if ((unlinkError as NodeJS.ErrnoException)?.code !== "ENOENT") {
throw unlinkError;
}
}
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

export function releaseOnboardLock(): void {
// Preferred path: we hold the fd from a successful acquireOnboardLock.
// Verify the on-disk path still resolves to the same file (fstat ino
// == stat ino) before unlinking. If they disagree, another process
// has already replaced the lock and we must NOT touch their file.
if (heldLockFd !== null) {
const fd = heldLockFd;
heldLockFd = null;
try {
const fdStat = fs.fstatSync(fd, { bigint: true });
let pathInode: bigint | null = null;
try {
const pathStat = fs.statSync(LOCK_FILE, { bigint: true });
pathInode = pathStat.ino;
} catch (error: unknown) {
if ((error as NodeJS.ErrnoException)?.code !== "ENOENT") {
// Unexpected — fall through to closing the fd.
}
}
if (pathInode !== null && pathInode === fdStat.ino) {
try {
fs.unlinkSync(LOCK_FILE);
} catch (unlinkError: unknown) {
if ((unlinkError as NodeJS.ErrnoException)?.code !== "ENOENT") {
// Best effort — surfacing this would mask the real error.
}
}
}
} catch {
// fstat can fail if the fd was already closed somehow; nothing
// safe to do beyond closing it below.
} finally {
try { fs.closeSync(fd); } catch { /* ignore */ }
}
return;
}

// Fallback (no fd held — e.g., a test wrote the lock file directly,
// or a previous release already ran): preserve the legacy pid-based
// behavior so we never unlink a malformed lock and never unlink a
// lock owned by another pid.
try {
if (!fs.existsSync(LOCK_FILE)) return;
let existing: LockInfo | null = null;
Expand Down
Loading