Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
d5b4485
Implement Kive collation functionality and update related configurations
Donaim May 7, 2026
1538d79
Refactor Kive collation inputs and update argument parsing for metada…
Donaim May 7, 2026
31a377b
Enhance error handling in Kive collation and validate metadata manife…
Donaim May 7, 2026
ebb3329
Add separator argument to main collate test for metadata handling
Donaim May 7, 2026
1f31239
Refactor KiveWatcher dataset retrieval and update test assertions for…
Donaim May 7, 2026
04ef497
Remove unused imports from test files to clean up code
Donaim May 7, 2026
fda7888
Add kive_collate.py to the list of executables in the main script
Donaim May 7, 2026
76ccecb
Enhance KiveWatcher error handling for pipeline failures and update t…
Donaim May 8, 2026
ba8e74a
Add method to move stitcher plot SVG files to results directory
Donaim May 8, 2026
c9f0be2
Refactor KiveWatcher to simplify error handling and folder management…
Donaim May 8, 2026
c53e65d
Enhance kive_collate.py and test_kive_collate.py with improved error …
Donaim May 8, 2026
d5deada
Update KiveWatcher to filter extracted files during tar extraction
Donaim May 8, 2026
110a4ef
Update test_main_collates_csv_and_fasta_from_multiple_samples to filt…
Donaim May 8, 2026
008c493
Refactor KiveWatcher to improve folder state management and enhance e…
Donaim May 11, 2026
b4b9a03
Refactor argument parsing in kive_collate.py to remove optional flag …
Donaim May 11, 2026
09bd9ea
Refactor argument parsing in kive_collate.py to accept command-line a…
Donaim May 11, 2026
347ac6f
Refactor main function in kive_collate.py to improve variable handlin…
Donaim May 11, 2026
02a4d74
Enhance logging in kive_collate.py for better traceability; add debug…
Donaim May 11, 2026
70516da
Refactor logging configuration in kive_collate.py; remove debug2 verb…
Donaim May 11, 2026
fd35fd0
Add debug flag to collation run command in Singularity definition
Donaim May 11, 2026
ea38181
Add debug logging at the start of the main function in kive_collate.py
Donaim May 11, 2026
2f9bd20
Set logging level to DEBUG in kive_collate.py for enhanced log visibi…
Donaim May 11, 2026
63a73fc
Revert "Refactor argument parsing in kive_collate.py to remove option…
Donaim May 11, 2026
ceee679
Remove assertions for deprecated debug2 flag in argument parsing tests
Donaim May 11, 2026
276b433
Refactor argument parsing in kive_collate.py to use --inputs flag; up…
Donaim May 11, 2026
4f48d50
Refactor argument parsing in kive_collate.py to use repeatable --inpu…
Donaim May 11, 2026
fea3bc7
Update argument parsing in kive_collate.py to allow multiple input pa…
Donaim May 11, 2026
8234f9c
Update argument parsing help text in kive_collate.py for clarity; adj…
Donaim May 11, 2026
348f8e5
Remove DEBUG logging and replace with print statement in kive_collate…
Donaim May 11, 2026
e872df8
Update --inputs argument in parse_args to remove required flag in kiv…
Donaim May 11, 2026
ffcb4f1
Redirect kive_collate startup message to stderr for better error visi…
Donaim May 12, 2026
d21d206
Refactor argument parsing in kive_collate.py to make --inputs a posit…
Donaim May 12, 2026
a2c4c16
Debug print input directory
Donaim May 12, 2026
661751d
Replace debug tree command with head for input file preview in main f…
Donaim May 12, 2026
52269d3
Replace head command with cat for full input file preview in main fun…
Donaim May 12, 2026
51df1a0
Add validation for collation app signature in KiveWatcher class
Donaim May 12, 2026
e50212a
Update argument parsing to require inputs as a list and adjust main f…
Donaim May 12, 2026
efc70c1
Add dummy input for manifest dataset in KiveWatcher for debugging pur…
Donaim May 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Singularity
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,15 @@ From: debian:bookworm-slim
%apphelp denovo
Standard pipeline with de novo assembly instead of mapping to reference
sequences.

%apphelp collation
Collate per-sample MiCall outputs into run-level grouped files.

%applabels collation
KIVE_INPUTS --inputs*
KIVE_OUTPUTS output
KIVE_THREADS 1
KIVE_MEMORY 1000

%apprun collation
python -m micall.utils.kive_collate --debug "$@"
1 change: 1 addition & 0 deletions micall/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@
"micall/utils/check_sample_sheet.py",
"micall/utils/cache.py",
"micall/monitor/run_completion_watcher.py",
"micall/utils/kive_collate.py",
]


Expand Down
525 changes: 250 additions & 275 deletions micall/monitor/kive_watcher.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions micall/monitor/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class ConfigInterface(Protocol):
denovo_main_pipeline_id: Optional[int]
micall_filter_quality_pipeline_id: Optional[int]
micall_resistance_pipeline_id: Optional[int]
micall_collation_pipeline_id: Optional[int]
proviral_pipeline_id: Optional[int]
max_active: int
pipeline_version: str
Expand Down
9 changes: 9 additions & 0 deletions micall/monitor/watcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ def parse_args(argv=None):
type=int,
default=os.environ.get('MICALL_RESISTANCE_PIPELINE_ID', None),
help="id of resistance pipeline's container app")
parser.add_argument(
'--micall_collation_pipeline_id',
type=int,
default=os.environ.get('MICALL_COLLATION_PIPELINE_ID', None),
help="id of collation pipeline's container app")
parser.add_argument(
'--mixed_hcv_pipeline_id',
type=int,
Expand Down Expand Up @@ -120,6 +125,10 @@ def parse_args(argv=None):
else:
parser.error(f"No arguments or environment variables set for main "
f"pipeline ids ({', '.join(main_pipeline_ids)}).")
if args.micall_collation_pipeline_id is None:
parser.error("Argument --micall_collation_pipeline_id not set and "
"$MICALL_COLLATION_PIPELINE_ID environment variable "
"not set.")

return args

Expand Down
155 changes: 155 additions & 0 deletions micall/tests/test_kive_collate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import sys
import tarfile
from pathlib import Path

import pytest

from micall.utils import kive_collate


def test_parse_args_with_optional_multiple_and_separator(monkeypatch, tmp_path):
    """Parse several ``--inputs`` values, a ``--`` separator, and an output path.

    The command line is injected through ``sys.argv`` because
    ``kive_collate.parse_args()`` is called without arguments here.
    Expects every value before ``--`` to land in ``args.inputs`` as a
    ``Path``, the value after it to become ``args.output``, and all three
    verbosity switches to be falsy when none was given.
    """
    manifest = tmp_path / 'metadata.csv'
    manifest.write_text('index,sample,output_name\n')
    tar_path = tmp_path / 'out.tar'
    command_line = [
        'kive_collate', '--inputs', 'a.csv', 'b.csv', str(manifest), '--', str(tar_path)]
    monkeypatch.setattr(sys, 'argv', command_line)

    parsed = kive_collate.parse_args()

    assert parsed.inputs == [Path('a.csv'), Path('b.csv'), manifest]
    assert parsed.output == tar_path
    # No verbosity switch was passed, so each one must be falsy.
    for switch in ('verbose', 'debug', 'quiet'):
        assert not getattr(parsed, switch)


def test_parse_args_with_debug_flag(monkeypatch, tmp_path):
    """Check that ``--debug`` sets ``args.debug`` and leaves the other switches off.

    The command line is injected through ``sys.argv`` and parsed with a
    bare ``kive_collate.parse_args()`` call.
    """
    manifest = tmp_path / 'metadata.csv'
    manifest.write_text('index,sample,output_name\n')
    tar_path = tmp_path / 'out.tar'
    command_line = [
        'kive_collate', '--debug', '--inputs', 'a.csv', str(manifest), '--', str(tar_path)]
    monkeypatch.setattr(sys, 'argv', command_line)

    parsed = kive_collate.parse_args()

    assert parsed.debug
    # Only --debug was requested; the sibling switches must stay falsy.
    for switch in ('verbose', 'quiet'):
        assert not getattr(parsed, switch)


def test_main_collates_csv_and_fasta_from_multiple_samples(monkeypatch, tmp_path):
    """Run ``kive_collate.main()`` end to end on two samples and inspect the tar.

    Four input files (a cascade CSV and a ``wg`` FASTA for each of two
    samples) plus a metadata CSV mapping input index to sample and output
    name are passed via ``sys.argv``.  The resulting tar archive is expected
    to contain ``cascade.csv`` with a leading ``sample`` column and
    ``wg.fasta`` whose headers are prefixed with the sample name.
    """
    # Insertion order matters: it must match the ``index`` column below.
    fixture_contents = {
        'sample1_cascade.csv': 'x,y\n1,2\n',
        'sample1_wg.fasta': '>seed\nACTG\n',
        'sample2_cascade.csv': 'x,y\n3,4\n',
        'sample2_wg.fasta': '>seed\nACTG\n',
    }
    input_files = []
    for file_name, content in fixture_contents.items():
        input_file = tmp_path / file_name
        input_file.write_text(content)
        input_files.append(input_file)

    manifest = tmp_path / 'metadata.csv'
    manifest.write_text(
        'index,sample,output_name\n'
        '0,E11111,cascade_csv\n'
        '1,E11111,wg_fasta\n'
        '2,E22222,cascade_csv\n'
        '3,E22222,wg_fasta\n')
    tar_path = tmp_path / 'collated.tar'

    command_line = ['kive_collate', '--inputs']
    command_line.extend(str(input_file) for input_file in input_files)
    command_line.extend([str(manifest), '--', str(tar_path)])
    monkeypatch.setattr(sys, 'argv', command_line)

    kive_collate.main()

    unpacked = tmp_path / 'extract'
    unpacked.mkdir()
    with tarfile.open(tar_path) as archive:
        archive.extractall(unpacked, filter='data')

    expected_cascade = (
        'sample,x,y\n'
        'E11111,1,2\n'
        'E22222,3,4\n'
    )
    assert (unpacked / 'cascade.csv').read_text() == expected_cascade

    expected_fasta = (
        '>E11111,seed\n'
        'ACTG\n'
        '>E22222,seed\n'
        'ACTG\n'
    )
    assert (unpacked / 'wg.fasta').read_text() == expected_fasta


def test_stage_inputs_by_sample_rejects_invalid_index(tmp_path):
    """A non-numeric ``index`` value in the metadata CSV must raise ``ValueError``."""
    manifest = tmp_path / 'metadata.csv'
    manifest.write_text('index,sample,output_name\nabc,E11111,cascade_csv\n')
    cascade = tmp_path / 'cascade.csv'
    cascade.write_text('x,y\n1,2\n')
    scratch = tmp_path / 'scratch'

    with pytest.raises(ValueError, match='invalid index'):
        kive_collate.stage_inputs_by_sample([cascade], manifest, scratch)


def test_stage_inputs_by_sample_rejects_invalid_sample_name(tmp_path):
    """A sample name of ``../escape`` in the metadata CSV must raise ``ValueError``."""
    manifest = tmp_path / 'metadata.csv'
    manifest.write_text('index,sample,output_name\n0,../escape,cascade_csv\n')
    cascade = tmp_path / 'cascade.csv'
    cascade.write_text('x,y\n1,2\n')
    scratch = tmp_path / 'scratch'

    with pytest.raises(ValueError, match='invalid sample name'):
        kive_collate.stage_inputs_by_sample([cascade], manifest, scratch)


def test_stage_inputs_by_sample_rejects_missing_required_columns(tmp_path):
    """A metadata CSV lacking the ``output_name`` column must raise ``ValueError``."""
    manifest = tmp_path / 'metadata.csv'
    manifest.write_text('index,sample\n0,E11111\n')
    cascade = tmp_path / 'cascade.csv'
    cascade.write_text('x,y\n1,2\n')
    scratch = tmp_path / 'scratch'

    with pytest.raises(ValueError, match='missing required columns'):
        kive_collate.stage_inputs_by_sample([cascade], manifest, scratch)


def test_stage_inputs_by_sample_rejects_duplicate_output_for_sample(tmp_path):
    """Two metadata rows with the same sample and output name must raise ``ValueError``."""
    manifest = tmp_path / 'metadata.csv'
    manifest.write_text(
        'index,sample,output_name\n'
        '0,E11111,cascade_csv\n'
        '1,E11111,cascade_csv\n'
    )
    first_cascade = tmp_path / 'cascade1.csv'
    second_cascade = tmp_path / 'cascade2.csv'
    first_cascade.write_text('x,y\n1,2\n')
    second_cascade.write_text('x,y\n3,4\n')
    scratch = tmp_path / 'scratch'

    with pytest.raises(ValueError, match='duplicates output'):
        kive_collate.stage_inputs_by_sample(
            [first_cascade, second_cascade], manifest, scratch)


def test_parse_args_with_explicit_output_separator(monkeypatch, tmp_path):
    """Check that the value after ``--`` becomes the output, not another input.

    NOTE(review): the argv and both assertions here repeat those in
    ``test_parse_args_with_optional_multiple_and_separator``; consider
    merging the two or giving this one a distinct command line.
    """
    manifest = tmp_path / 'metadata.csv'
    manifest.write_text('index,sample,output_name\n')
    tar_path = tmp_path / 'out.tar'
    command_line = [
        'kive_collate', '--inputs', 'a.csv', 'b.csv', str(manifest), '--', str(tar_path)]
    monkeypatch.setattr(sys, 'argv', command_line)

    parsed = kive_collate.parse_args()

    expected_inputs = [Path('a.csv'), Path('b.csv'), manifest]
    assert parsed.inputs == expected_inputs
    assert parsed.output == tar_path
Loading
Loading