diff --git a/main.nf b/main.nf index 857415a56..a522892ba 100644 --- a/main.nf +++ b/main.nf @@ -566,7 +566,7 @@ workflow NFCORE_PROTEINFOLD { // POST PROCESSING: generate visualisation reports // ch_multiqc_config = channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true).first() - ch_multiqc_custom_config = params.multiqc_config ? channel.fromPath( params.multiqc_config ).first() : channel.empty() + ch_multiqc_custom_config = params.multiqc_config ? channel.fromPath( params.multiqc_config ).first() : channel.fromPath("${projectDir}/multiqc_proteinfold/multiqc_proteinfold_config.yml") ch_multiqc_logo = params.multiqc_logo ? channel.fromPath( params.multiqc_logo ).first() : channel.empty() ch_multiqc_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) ch_report_template = channel.value(file("$projectDir/assets/report_template.html", checkIfExists: true)) diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 8f2086b4f..f17c8bbbe 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -32,7 +32,8 @@ process MULTIQC { def replace = replace_names ? "--replace-names ${replace_names}" : '' def samples = sample_names ? "--sample-names ${sample_names}" : '' """ - multiqc \\ + pip install --target "$PWD/.multiqc_plugins" ${workflow.projectDir} + PYTHONPATH="$PWD/.multiqc_plugins" multiqc \\ --force \\ $args \\ $config \\ diff --git a/multiqc_proteinfold/__init__.py b/multiqc_proteinfold/__init__.py new file mode 100644 index 000000000..d57194bdf --- /dev/null +++ b/multiqc_proteinfold/__init__.py @@ -0,0 +1,6 @@ +"""MultiQC plugin for proteinfold pipeline outputs converting metrics to simple tsvs.""" + +from .proteinfold import MultiqcModule + +__version__ = '0.1.0' +__all__ = ["MultiqcModule"] diff --git a/multiqc_proteinfold/multiqc_proteinfold_config.yml b/multiqc_proteinfold/multiqc_proteinfold_config.yml new file mode 100644 index 000000000..8324880ec --- /dev/null +++ b/multiqc_proteinfold/multiqc_proteinfold_config.yml @@ -0,0 +1,3 @@ +sp: + proteinfold: + fn: "*_{plddt,msa,ptm,iptm,pae}.tsv" diff --git a/multiqc_proteinfold/proteinfold.py b/multiqc_proteinfold/proteinfold.py new file mode 100644 index 000000000..b8693b14c --- /dev/null +++ b/multiqc_proteinfold/proteinfold.py @@ -0,0 +1,248 @@ +from pathlib import Path +import pandas as pd + +from multiqc.base_module import BaseMultiqcModule +from multiqc.plots import linegraph +from multiqc import config # I want the ranks to merge by default, not a user problem +from multiqc.plots.table_object import ColumnDict + +from typing import Dict, Any, cast + + +class MultiqcModule(BaseMultiqcModule): + """ + The module parses results generated by a variety of protein structure prediction programs in the [ProteinFold](https://nf-co.re/proteinfold/) pipeline. + This includes (as of release v2.0): + - [AlphaFold2](https://github.com/google-deepmind/alphafold) + - [AlphaFold3](https://github.com/google-deepmind/alphafold3) + - [ColabFold](https://github.com/sokrypton/ColabFold) + - [ESMFold](https://github.com/facebookresearch/esm) + - [RoseTTaFold-All-Atom](https://github.com/baker-laboratory/RoseTTAFold-All-Atom) + - [RoseTTaFold2-Nucleic-Acids](https://github.com/uw-ipd/RoseTTAFold2NA) [Wait on merge] + - [HelixFold3](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3) + - [Boltz](https://github.com/jwohlwend/boltz) + + This is intended to provide a summary of useful metrics for mass 'folding' a large set of proteins, either in terms of fishing for multimer interactions or comparing methods across whole proteomes. + It provides a visual 'at-a-glance' report of relevant metrics (average pLDDT, ipTM, *etc*) and does not replace the per-protein interactive plot from GENERATE_REPORT in nfcore/proteinfold + """ + + def __init__(self): + super(MultiqcModule, self).__init__( + name="ProteinFold", + anchor="proteinfold", + href=[ + "https://nf-co.re/proteinfold", + "https://github.com/google-deepmind/alphafold", + "https://github.com/google-deepmind/alphafold3", + "https://github.com/sokrypton/ColabFold", + "https://github.com/facebookresearch/esm", + "https://github.com/baker-laboratory/RoseTTAFold-All-Atom", + "https://github.com/uw-ipd/RoseTTAFold2NA", + "https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3", + "https://github.com/jwohlwend/boltz", + ], + info="ProteinFold - protein structure inference methods through a single nextflow pipeline interface", + doi=[ + "10.1038/s41586-021-03819-2", + "10.1038/s41592-022-01488-1", + "10.1126/science.ade2574", + "10.1126/science.adl2528", + "10.1038/s41592-023-02086-5", + "10.48550/arXiv.2408.16975", + "10.1101/2024.11.19.624167", + ], + ) + + # Want to treat the ranked inference runs as 'sub-samples' for grouping logic, even if not separate files + if not hasattr(config, "table_sample_merge"): + config.table_sample_merge = {} + + # Some codes generated 5 inferences for 5 models and all 25 are processed. If a user sets more they're an expert and can custom handle + rank_merge = {f"rank_{i}": [f"_rank_{i}"] for i in range(25)} + + # Load user sample merge config in preference to rank_N default ranking, if user config exists + config.table_sample_merge = {**rank_merge, **getattr(config, "table_sample_merge", {})} + + mode_dict = { + "alphafold2": "AlphaFold2", + "alphafold3": "AlphaFold3", + "colabfold": "ColabFold", + "esmfold": "ESMFold", + "rosettafold2na": "RoseTTAFold2-Nucleic-Acids", + "rosettafold_all_atom": "RoseTTAFold-All-Atom", + "helixfold3": "HelixFold3", + "boltz": "Boltz", + } + + self.proteinfold_data: Dict[str, Any] = {} + # I want to enable sample grouping: https://docs.seqera.io/multiqc/reports/customisation#sample-grouping + + for f in self.find_log_files("proteinfold"): + self.add_data_source(f) + + raw_samplename = f["s_name"].split("rank_")[0] + filepath = Path(f["root"]) / f["fn"] + mode = "UNKNOWN" + for parent in filepath.parents: + if parent.name in mode_dict: # traverse up the filepath until you hit a mode labelled dir + mode = mode_dict[parent.name] + break + + mode_samplename = f"{raw_samplename}_{mode}" + samplename = self.clean_s_name(mode_samplename, f) + self.log.debug(filepath) + self.log.debug(samplename) + + self.proteinfold_data.setdefault(samplename, {}) # Set default creates if doesn't already exist + + if f["fn"].endswith("_msa.tsv"): + self.proteinfold_data[samplename]["msa_depth"] = f["f"].count("\n") + + if f["fn"].endswith("_plddt.tsv"): + df = pd.read_csv(filepath, sep="\t") + rank_cols = [col for col in df.columns if col.startswith("rank_")] + + # Full plddt data frame for plotting purposes + plddt_data = {col: df.set_index("Positions")[col].to_dict() for col in rank_cols} + self.proteinfold_data[samplename]["plddt"] = plddt_data + + rank_means = df[rank_cols].mean().to_dict() + # Parent sample entry should still have the top ranked value + self.proteinfold_data[samplename]["mean_plddt"] = rank_means.get("rank_0") + + for rank in rank_cols: + rank_num = rank.split("rank_")[1] + subsample = f"{samplename}_rank_{rank_num}" + + self.proteinfold_data.setdefault(subsample, {}) + self.proteinfold_data[subsample]["mean_plddt"] = rank_means[rank] + + if f["fn"].endswith("_iptm.tsv") and not f["fn"].endswith("_chainwise_iptm.tsv"): + iptm_series = cast( + pd.Series, pd.read_csv(filepath, sep="\t", header=None, index_col=0).squeeze(axis=1) + ) # Squeeze makes a series accessible on index label + if ( + 0 in iptm_series.index + ): # Since pandas infers the index as an int this is an exact int match not a greedy string match + self.proteinfold_data[samplename]["iptm"] = iptm_series.loc[ + 0 + ] # Remember loc is an *int* match on rank 0 index + + for rank_num in iptm_series.index: + subsample = f"{samplename}_rank_{rank_num}" + self.proteinfold_data.setdefault(subsample, {})["iptm"] = iptm_series.loc[rank_num] + + if f["fn"].endswith("_ptm.tsv") and not f["fn"].endswith("_chainwise_ptm.tsv"): + ptm_series = cast( + pd.Series, pd.read_csv(filepath, sep="\t", header=None, index_col=0).squeeze(axis=1) + ) # Squeeze on axis avoids int conversion for single entries + if 0 in ptm_series.index: + self.proteinfold_data[samplename]["ptm"] = ptm_series.loc[0] + + for rank_num in ptm_series.index: + subsample = f"{samplename}_rank_{rank_num}" + self.proteinfold_data.setdefault(subsample, {})["ptm"] = ptm_series.loc[rank_num] + + self.write_data_file( + self.proteinfold_data, "proteinfold_data" + ) # I want to structure and rename from avg_plDDT to summary_stats + self.general_stats_table() + # Togglable plDDT by residue plots of all ranks + self.plddt_line_plot() + + def general_stats_table(self): + """ + Put protein structure prediction metrics into a general table for all different Deep Learning methods + """ + # Check for empty metrics to drop those columns where not appropriate + + has_iptm = any( + sample_data.get("iptm") and sample_data.get("iptm") != 0.0 for sample_data in self.proteinfold_data.values() + ) + has_ptm = any( + sample_data.get("ptm") and sample_data.get("ptm") != 0.0 for sample_data in self.proteinfold_data.values() + ) + + headers: Dict[str, ColumnDict] = { + "msa_depth": { + "title": "Related sequence depth (MSA)", + "description": "The number of related sequences (across the whole protein) that could be retrieved from the MSA (Multiple Sequence Alignment) stage", + }, + "mean_plddt": { + "title": "Structure confidence (average pLDDT)", + "description": "Structure prediction confidence score across all residues in the top ranked protein structure - from the mean pLDDT (predicted Local Distance Difference Test) value", + "max": 100, + "min": 0, + "cond_formatting_rules": { + "very-low": [{"lt": 50}], + "low": [{"gt": 50}, {"lt": 70}], + "high": [{"gt": 70}, {"lt": 90}], + "very-high": [{"gt": 90}], + }, + "cond_formatting_colours": [ + {"very-low": "#f0743e"}, + {"low": "#f9d613"}, + {"high": "#60c2e8"}, + {"very-high": "#014ecc"}, + ], + }, + } + + if has_iptm: + headers["iptm"] = { + "title": "Interface accuracy (ipTM)", + "description": "Accuracy of the relative positions of two protein subunits from a multimer calculation - from the ipTM (interface predicted Template Modelling) score", + "max": 1, + "min": 0, + "format": "{:,.2f}", + "scale": "Purples", + } + + if has_ptm: + headers["ptm"] = { + "title": "Global accuracy (TM)", + "description": "Global accuracy of the protein folded, less sensitive to localised inaccuracies than raw 3D atomic deviations (RMSD) - from the pTM (predicted Template Modelling) score", + "max": 1, + "min": 0, + "format": "{:,.2f}", + "scale": "Blues", + } + + self.general_stats_addcols(self.proteinfold_data, headers) + + def plddt_line_plot(self): + """Line plot showing pLDDT confidence across residue position of selected sample, for all ranks""" + + parent_samples = {} + + for sample, metrics in self.proteinfold_data.items(): + if "plddt" in metrics: + if "_rank_" not in sample: # The parent sample already has the plddt data for all ranks + parent_samples[sample] = metrics["plddt"] + + data_labels = [] # Need a data_labels list for the sample switcher + plot_data_list = [] + + # The populated data_labels plot config section is what the switcher uses + for parent_sample, rank_data in parent_samples.items(): + data_labels.append({"name": parent_sample, "ylab": "pLDDT score"}) + plot_data_list.append(rank_data) + + pconfig = { + "id": "proteinfold_plddt_lineplot", + "title": "ProteinFold: pLDDT by Position", + "xlab": "Residue Position", + "ylab": "pLDDT Score", + "ymin": 0, + "ymax": 100, + "data_labels": data_labels, + } + + plot_html = linegraph.plot(plot_data_list, pconfig) + + self.add_section( + name="pLDDT Confidence by residue", + anchor="proteinfold-plddt-per-res", + description="Per-residue confidence scores across all predicted ranks", + plot=plot_html, + ) diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..23bf31f9a --- /dev/null +++ b/setup.py @@ -0,0 +1,29 @@ +from setuptools import setup, find_packages + +setup( + name='multiqc-proteinfold', + version='0.1.0', + author='Keiran Rowell', + author_email='k.rowell@unsw.edu.au', + description='MultiQC plugin for proteinfold pipeline metric tsv outputs', + url='https://github.com/nf-core/proteinfold', + packages=["multiqc_proteinfold"], + package_data={"multiqc_proteinfold": ["*.yaml"]}, + include_package_data=True, + entry_points={ + 'multiqc.modules.v1': [ + 'proteinfold = multiqc_proteinfold.proteinfold:MultiqcModule', + ], + }, + install_requires=[ + 'multiqc>=1.15', + 'pandas', + ], + classifiers=[ + 'Development Status :: 4 - Beta', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 3', + ], + python_requires='>=3.8', +)