nf-core · keiran-rowell-unsw · Jan 28, 2026 · Jan 28, 2026 · Jan 28, 2026 · Jan 28, 2026
diff --git a/main.nf b/main.nf
@@ -566,7 +566,7 @@ workflow NFCORE_PROTEINFOLD {
     // POST PROCESSING: generate visualisation reports
     //
     ch_multiqc_config        = channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true).first()
-    ch_multiqc_custom_config = params.multiqc_config ? channel.fromPath( params.multiqc_config ).first()  : channel.empty()
+    ch_multiqc_custom_config = params.multiqc_config ? channel.fromPath( params.multiqc_config ).first()  : channel.fromPath("${projectDir}/multiqc_proteinfold/multiqc_proteinfold_config.yml")
     ch_multiqc_logo          = params.multiqc_logo   ? channel.fromPath( params.multiqc_logo ).first()    : channel.empty()
     ch_multiqc_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true)
     ch_report_template     = channel.value(file("$projectDir/assets/report_template.html", checkIfExists: true))

diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf
diff --git a/multiqc_proteinfold/__init__.py b/multiqc_proteinfold/__init__.py
@@ -0,0 +1,6 @@
+"""MultiQC plugin for proteinfold pipeline outputs converting metrics to simple tsvs."""
+
+from .proteinfold import MultiqcModule
+
+__version__ = '0.1.0'
+__all__ = ["MultiqcModule"]
diff --git a/multiqc_proteinfold/multiqc_proteinfold_config.yml b/multiqc_proteinfold/multiqc_proteinfold_config.yml
@@ -0,0 +1,3 @@
+sp:
+  proteinfold:
+    fn: "*_{plddt,msa,ptm,iptm,pae}.tsv"
diff --git a/multiqc_proteinfold/proteinfold.py b/multiqc_proteinfold/proteinfold.py
@@ -0,0 +1,248 @@
+from pathlib import Path
+import pandas as pd
+
+from multiqc.base_module import BaseMultiqcModule
+from multiqc.plots import linegraph
+from multiqc import config  # I want the ranks to merge by default, not a user problem
+from multiqc.plots.table_object import ColumnDict
+
+from typing import Dict, Any, cast
+
+
+class MultiqcModule(BaseMultiqcModule):
+    """
+    The module parses results generated by a variety of protein structure prediction programs in the [ProteinFold](https://nf-co.re/proteinfold/) pipeline.
+    This includes (as of release v2.0):
+        - [AlphaFold2](https://github.com/google-deepmind/alphafold)
+        - [AlphaFold3](https://github.com/google-deepmind/alphafold3)
+        - [ColabFold](https://github.com/sokrypton/ColabFold)
+        - [ESMFold](https://github.com/facebookresearch/esm)
+        - [RoseTTaFold-All-Atom](https://github.com/baker-laboratory/RoseTTAFold-All-Atom)
+        - [RoseTTaFold2-Nucleic-Acids](https://github.com/uw-ipd/RoseTTAFold2NA) [Wait on merge]
+        - [HelixFold3](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3)
+        - [Boltz](https://github.com/jwohlwend/boltz)
+
+    This is intended to provide a summary of useful metrics for mass 'folding' a large set of proteins, either in terms of fishing for multimer interactions or comparing methods across whole proteomes.
+    It provides a visual 'at-a-glance' report of relevant metrics (average pLDDT, ipTM, *etc*) and does not replace the per-protein interactive plot from GENERATE_REPORT in  nfcore/proteinfold
+    """
+
+    def __init__(self):
+        super(MultiqcModule, self).__init__(
+            name="ProteinFold",
+            anchor="proteinfold",
+            href=[
+                "https://nf-co.re/proteinfold",
+                "https://github.com/google-deepmind/alphafold",
+                "https://github.com/google-deepmind/alphafold3",
+                "https://github.com/sokrypton/ColabFold",
+                "https://github.com/facebookresearch/esm",
+                "https://github.com/baker-laboratory/RoseTTAFold-All-Atom",
+                "https://github.com/uw-ipd/RoseTTAFold2NA",
+                "https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3",
+                "https://github.com/jwohlwend/boltz",
+            ],
+            info="ProteinFold - protein structure inference methods through a single nextflow pipeline interface",
+            doi=[
+                "10.1038/s41586-021-03819-2",
+                "10.1038/s41592-022-01488-1",
+                "10.1126/science.ade2574",
+                "10.1126/science.adl2528",
+                "10.1038/s41592-023-02086-5",
+                "10.48550/arXiv.2408.16975",
+                "10.1101/2024.11.19.624167",
+            ],
+        )
+
+        # Want to treat the ranked inference runs as 'sub-samples' for grouping logic, even if not separate files
+        if not hasattr(config, "table_sample_merge"):
+            config.table_sample_merge = {}
+
+        # Some codes generated 5 inferences for 5 models and all 25 are processed. If a user sets more they're an expert and can custom handle
+        rank_merge = {f"rank_{i}": [f"_rank_{i}"] for i in range(25)}
+
+        # Load user sample merge config in preference to rank_N default ranking, if user config exists
+        config.table_sample_merge = {**rank_merge, **getattr(config, "table_sample_merge", {})}
+
+        mode_dict = {
+            "alphafold2": "AlphaFold2",
+            "alphafold3": "AlphaFold3",
+            "colabfold": "ColabFold",
+            "esmfold": "ESMFold",
+            "rosettafold2na": "RoseTTAFold2-Nucleic-Acids",
+            "rosettafold_all_atom": "RoseTTAFold-All-Atom",
+            "helixfold3": "HelixFold3",
+            "boltz": "Boltz",
+        }
+
+        self.proteinfold_data: Dict[str, Any] = {}
+        # I want to enable sample grouping: https://docs.seqera.io/multiqc/reports/customisation#sample-grouping
+
+        for f in self.find_log_files("proteinfold"):
+            self.add_data_source(f)
+
+            raw_samplename = f["s_name"].split("rank_")[0]
+            filepath = Path(f["root"]) / f["fn"]
+            mode = "UNKNOWN"
+            for parent in filepath.parents:
+                if parent.name in mode_dict:  # traverse up the filepath until you hit a mode labelled dir
+                    mode = mode_dict[parent.name]
+                    break
+
+            mode_samplename = f"{raw_samplename}_{mode}"
+            samplename = self.clean_s_name(mode_samplename, f)
+            self.log.debug(filepath)
+            self.log.debug(samplename)
+
+            self.proteinfold_data.setdefault(samplename, {})  # Set default creates if doesn't already exist
+
+            if f["fn"].endswith("_msa.tsv"):
+                self.proteinfold_data[samplename]["msa_depth"] = f["f"].count("\n")
+
+            if f["fn"].endswith("_plddt.tsv"):
+                df = pd.read_csv(filepath, sep="\t")
+                rank_cols = [col for col in df.columns if col.startswith("rank_")]
+
+                # Full plddt data frame for plotting purposes
+                plddt_data = {col: df.set_index("Positions")[col].to_dict() for col in rank_cols}
+                self.proteinfold_data[samplename]["plddt"] = plddt_data
+
+                rank_means = df[rank_cols].mean().to_dict()
+                # Parent sample entry should still have the top ranked value
+                self.proteinfold_data[samplename]["mean_plddt"] = rank_means.get("rank_0")
+
+                for rank in rank_cols:
+                    rank_num = rank.split("rank_")[1]
+                    subsample = f"{samplename}_rank_{rank_num}"
+
+                    self.proteinfold_data.setdefault(subsample, {})
+                    self.proteinfold_data[subsample]["mean_plddt"] = rank_means[rank]
+
+            if f["fn"].endswith("_iptm.tsv") and not f["fn"].endswith("_chainwise_iptm.tsv"):
+                iptm_series = cast(
+                    pd.Series, pd.read_csv(filepath, sep="\t", header=None, index_col=0).squeeze(axis=1)
+                )  # Squeeze makes a series accessible on index label
+                if (
+                    0 in iptm_series.index
+                ):  # Since pandas infers the index as an int this is an exact int match not a greedy string match
+                    self.proteinfold_data[samplename]["iptm"] = iptm_series.loc[
+                        0
+                    ]  # Remember loc is an *int* match on rank 0 index
+
+                for rank_num in iptm_series.index:
+                    subsample = f"{samplename}_rank_{rank_num}"
+                    self.proteinfold_data.setdefault(subsample, {})["iptm"] = iptm_series.loc[rank_num]
+
+            if f["fn"].endswith("_ptm.tsv") and not f["fn"].endswith("_chainwise_ptm.tsv"):
+                ptm_series = cast(
+                    pd.Series, pd.read_csv(filepath, sep="\t", header=None, index_col=0).squeeze(axis=1)
+                )  # Squeeze on axis avoids int conversion for single entries
+                if 0 in ptm_series.index:
+                    self.proteinfold_data[samplename]["ptm"] = ptm_series.loc[0]
+
+                for rank_num in ptm_series.index:
+                    subsample = f"{samplename}_rank_{rank_num}"
+                    self.proteinfold_data.setdefault(subsample, {})["ptm"] = ptm_series.loc[rank_num]
+
+        self.write_data_file(
+            self.proteinfold_data, "proteinfold_data"
+        )  # I want to structure and rename from avg_plDDT to summary_stats
+        self.general_stats_table()
+        # Togglable plDDT by residue plots of all ranks
+        self.plddt_line_plot()
+
+    def general_stats_table(self):
+        """
+        Put protein structure prediction metrics into a general table for all different Deep Learning methods
+        """
+        # Check for empty metrics to drop those columns where not appropriate
+
+        has_iptm = any(
+            sample_data.get("iptm") and sample_data.get("iptm") != 0.0 for sample_data in self.proteinfold_data.values()
+        )
+        has_ptm = any(
+            sample_data.get("ptm") and sample_data.get("ptm") != 0.0 for sample_data in self.proteinfold_data.values()
+        )
+
+        headers: Dict[str, ColumnDict] = {
+            "msa_depth": {
+                "title": "Related sequence depth (MSA)",
+                "description": "The number of related sequences (across the whole protein) that could be retrieved from the MSA (Multiple Sequence Alignment) stage",
+            },
+            "mean_plddt": {
+                "title": "Structure confidence (average pLDDT)",
+                "description": "Structure prediction confidence score across all residues in the top ranked protein structure - from the mean pLDDT (predicted Local Distance Difference Test) value",
+                "max": 100,
+                "min": 0,
+                "cond_formatting_rules": {
+                    "very-low": [{"lt": 50}],
+                    "low": [{"gt": 50}, {"lt": 70}],
+                    "high": [{"gt": 70}, {"lt": 90}],
+                    "very-high": [{"gt": 90}],
+                },
+                "cond_formatting_colours": [
+                    {"very-low": "#f0743e"},
+                    {"low": "#f9d613"},
+                    {"high": "#60c2e8"},
+                    {"very-high": "#014ecc"},
+                ],
+            },
+        }
+
+        if has_iptm:
+            headers["iptm"] = {
+                "title": "Interface accuracy (ipTM)",
+                "description": "Accuracy of the relative positions of two protein subunits from a multimer calculation - from the ipTM (interface predicted Template Modelling) score",
+                "max": 1,
+                "min": 0,
+                "format": "{:,.2f}",
+                "scale": "Purples",
+            }
+
+        if has_ptm:
+            headers["ptm"] = {
+                "title": "Global accuracy (TM)",
+                "description": "Global accuracy of the protein folded, less sensitive to localised inaccuracies than raw 3D atomic deviations (RMSD) - from the pTM (predicted Template Modelling) score",
+                "max": 1,
+                "min": 0,
+                "format": "{:,.2f}",
+                "scale": "Blues",
+            }
+
+        self.general_stats_addcols(self.proteinfold_data, headers)
+
+    def plddt_line_plot(self):
+        """Line plot showing pLDDT confidence across residue position of selected sample, for all ranks"""
+
+        parent_samples = {}
+
+        for sample, metrics in self.proteinfold_data.items():
+            if "plddt" in metrics:
+                if "_rank_" not in sample:  # The parent sample already has the plddt data for all ranks
+                    parent_samples[sample] = metrics["plddt"]
+
+        data_labels = []  # Need a data_labels list for the sample switcher
+        plot_data_list = []
+
+        # The populated data_labels plot config section is what the switcher uses
+        for parent_sample, rank_data in parent_samples.items():
+            data_labels.append({"name": parent_sample, "ylab": "pLDDT score"})
+            plot_data_list.append(rank_data)
+
+        pconfig = {
+            "id": "proteinfold_plddt_lineplot",
+            "title": "ProteinFold: pLDDT by Position",
+            "xlab": "Residue Position",
+            "ylab": "pLDDT Score",
+            "ymin": 0,
+            "ymax": 100,
+            "data_labels": data_labels,
+        }
+
+        plot_html = linegraph.plot(plot_data_list, pconfig)
+
+        self.add_section(
+            name="pLDDT Confidence by residue",
+            anchor="proteinfold-plddt-per-res",
+            description="Per-residue confidence scores across all predicted ranks",
+            plot=plot_html,
+        )
diff --git a/setup.py b/setup.py
@@ -0,0 +1,29 @@
+from setuptools import setup, find_packages
+
+setup(
+    name='multiqc-proteinfold',
+    version='0.1.0',
+    author='Keiran Rowell',
+    author_email='k.rowell@unsw.edu.au',
+    description='MultiQC plugin for proteinfold pipeline metric tsv outputs',
+    url='https://github.com/nf-core/proteinfold',
+    packages=["multiqc_proteinfold"],
+    package_data={"multiqc_proteinfold": ["*.yaml"]},
+    include_package_data=True,
+    entry_points={
+        'multiqc.modules.v1': [
+            'proteinfold = multiqc_proteinfold.proteinfold:MultiqcModule',
+        ],
+    },
+    install_requires=[
+        'multiqc>=1.15',
+        'pandas',
+    ],
+    classifiers=[
+        'Development Status :: 4 - Beta',
+        'Intended Audience :: Science/Research',
+        'License :: OSI Approved :: MIT License',
+        'Programming Language :: Python :: 3',
+    ],
+    python_requires='>=3.8',
+)