Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions tests/reconfiguration.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,81 @@ def test_add_node(network, args, from_snapshot=True):
return network


@reqs.description("Adding a node with corrupted ledger file")
def test_add_node_with_corrupted_ledger(network, args):
    """Check that a node joining with a truncated ledger file fails to start.

    Reproduces issue #6612: a node joining with a corrupted (truncated)
    ledger file should fail to start rather than crash unexpectedly.

    The join node is set up but not started, the last transaction found in
    its main ledger directory is cut in half on disk, and the node is then
    started with the expectation that the join fails.

    Args:
        network: Network under test; the new node is created on it and is
            removed from ``network.nodes`` again before returning.
        args: Test arguments; ``args.package`` and ``args`` itself are
            forwarded to ``network.setup_join_node``.

    Returns:
        The ``network`` that was passed in.

    Raises:
        AssertionError: If the node with the corrupted ledger starts
            successfully.
    """

    def discard_new_node():
        # Shared cleanup for every exit path. The membership check keeps this
        # safe when the node has already been removed from the network.
        new_node.stop()
        if new_node in network.nodes:
            network.nodes.remove(new_node)

    new_node = network.create_node()

    # Set up the join node (copies ledger, snapshots, etc.) but do not start it yet
    network.setup_join_node(
        new_node,
        args.package,
        args,
        from_snapshot=True,
        fetch_recent_snapshot=True,
    )

    # The corruption targets an uncommitted ledger file in the node's working
    # directory, so there is nothing to test if none is present.
    ledger_dir = new_node.remote.get_main_ledger_dir()
    has_uncommitted_file = any(
        name.startswith("ledger_") and not name.endswith(".committed")
        for name in os.listdir(ledger_dir)
    )
    if not has_uncommitted_file:
        LOG.warning("No uncommitted ledger files found, skipping corruption test")
        discard_new_node()
        return network

    # Corrupt the latest ledger file by truncating it in the middle of a
    # transaction, so the transaction size does not match the number of
    # bytes available left to read in the file (as described in issue #6612).
    # The loop deliberately runs to the end: the values kept are those of the
    # last transaction of the last chunk.
    ledger = ccf.ledger.Ledger([ledger_dir], committed_only=False)
    chunk_filename = None
    truncate_offset = None
    for chunk in ledger:
        for tx in chunk:
            offset, next_offset = tx.get_offsets()
            chunk_filename = chunk.filename()
            truncate_offset = offset + (next_offset - offset) // 2

    if truncate_offset is None:
        LOG.warning("Could not find a transaction to corrupt, skipping")
        discard_new_node()
        return network

    LOG.info(
        f"Corrupting ledger file {chunk_filename} by truncating at offset {truncate_offset}"
    )
    # Ledger chunks are binary data: open in binary mode so that no text
    # decoding layer sits between the truncation and the file.
    with open(chunk_filename, "r+b") as f:
        f.truncate(truncate_offset)

    # Attempt to start the node - it should fail due to the corrupted ledger
    try:
        network.run_join_node(new_node, timeout=3)
    except (RuntimeError, TimeoutError) as e:
        LOG.info(
            f"Node {new_node.local_node_id} with corrupted ledger failed to start, as expected: {e}"
        )
        # Cleanup: run_join_node may have already stopped and removed the node
        # on TimeoutError, but not on RuntimeError
        discard_new_node()
    else:
        # Raise explicitly rather than `assert False`: assert statements are
        # stripped under `python -O`, which would let this failure pass silently.
        raise AssertionError(
            f"Node {new_node.local_node_id} with corrupted ledger unexpectedly started"
        )

    return network


@reqs.description("Test ignore_first_sigterm")
def test_ignore_first_sigterm(network, args):
# Note: host is supplied explicitly to avoid having differently
Expand Down
1 change: 1 addition & 0 deletions tests/suite/test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
reconfiguration.test_retire_primary,
e2e_logging.test_rekey,
reconfiguration.test_add_node,
reconfiguration.test_add_node_with_corrupted_ledger,
nodes.test_kill_primary,
nodes.test_commit_view_history,
reconfiguration.test_add_node,
Expand Down