nf-core · ziadbkh · May 28, 2025 · May 28, 2025 · Jun 5, 2025 · Jul 6, 2025
diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config
@@ -40,7 +40,7 @@ process {
         if (params.use_gpu) {
             accelerator = 1
         }
-        
+
         ext.args    = [
             params.use_gpu                  ? '--use-gpu-relax' : '',
             params.colabfold_use_amber      ? '--amber' : '',

diff --git a/conf/test_colabfold_local.config b/conf/test_colabfold_local.config
@@ -13,8 +13,8 @@ stubRun = true
 // Limit resources so that this can run on GitHub Actions
 process {
     resourceLimits = [
-        cpus: 4,
-        memory: '15.GB',
+        cpus: 1,
+        memory: '1.GB',
         time: '1.h'
     ]
 }

diff --git a/main.nf b/main.nf
@@ -265,7 +265,8 @@ workflow NFCORE_PROTEINFOLD {
             PREPARE_COLABFOLD_DBS.out.params,
             PREPARE_COLABFOLD_DBS.out.colabfold_db,
             PREPARE_COLABFOLD_DBS.out.uniref30,
-            params.colabfold_num_recycles
+            params.colabfold_num_recycles,
+            params.mmseqs_batch_size
         )
 
         ch_multiqc          = ch_multiqc.mix(COLABFOLD.out.multiqc_report)
@@ -549,7 +550,8 @@ workflow NFCORE_PROTEINFOLD {
             PREPARE_BOLTZ_DBS.out.boltz2_mols,
             PREPARE_COLABFOLD_DBS.out.colabfold_db,
             PREPARE_COLABFOLD_DBS.out.uniref30,
-            params.use_msa_server
+            params.use_msa_server,
+            params.mmseqs_batch_size
         )
         ch_multiqc                  = ch_multiqc.mix(BOLTZ.out.multiqc_report)
         ch_versions                 = ch_versions.mix(BOLTZ.out.versions)

diff --git a/modules/local/mmseqs_colabfoldsearch/main.nf b/modules/local/mmseqs_colabfoldsearch/main.nf
@@ -42,7 +42,20 @@ process MMSEQS_COLABFOLDSEARCH {
     stub:
     """
     mkdir results
-    touch results/${meta.id}.a3m
+    input_file="${fasta}"
+    if [[ "\${input_file##*.}" == "csv" ]]; then
+        skip_first=true
+        while IFS=',' read -r filename _; do
+            if \$skip_first; then
+                skip_first=false
+                continue
+            fi
+            [[ -z "\$filename" ]] && continue
+            touch "results/\${filename}.a3m"
+        done < "\$input_file"
+    else
+        touch results/${meta.id}.a3m
+    fi
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/nextflow.config b/nextflow.config
@@ -15,6 +15,7 @@ params {
     use_gpu                     = false
     save_intermediates          = false
     split_fasta                 = false
+    mmseqs_batch_size            = 1000
     db                          = null
     full_dbs                    = false // true/false, globally sets full_dbs if not independently set
     use_msa_server              = false
@@ -390,7 +391,6 @@ profiles {
     test_full_esmfold              { includeConfig 'conf/test_full_esmfold.config'                      }
     test_full_esmfold_multimer     { includeConfig 'conf/test_full_esmfold_multimer.config'             }
     test_full_helixfold3           { includeConfig 'conf/test_full_helixfold3.config'                   }
-    test_full_boltz                { includeConfig 'conf/test_full_boltz.config'                        }
     test_full_rosettafold_all_atom { includeConfig 'conf/test_full_rosettafold_all_atom.config'              }
     test_full_rosettafold2na       { includeConfig 'conf/test_full_rosettafold2na.config'                    }
     test_rosettafold_all_atom      { includeConfig 'conf/test_rosettafold_all_atom.config'              }

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -98,6 +98,12 @@
                     "fa_icon": "fas fa-link",
                     "format": "uri",
                     "errorMessage": "Please provide a valid URL for the MMSeqs2 API server"
+                },
+                "mmseqs_batch_size": {
+                    "type": "integer",
+                    "default": 1000,
+                    "description": "Number of input samples grouped into each MMseqs2 search run. Batching reduces the total run time.",
+                    "fa_icon": "fas fa-microchip"
                 }
             }
         },

diff --git a/subworkflows/local/msa.nf b/subworkflows/local/msa.nf
@@ -0,0 +1,152 @@
+//
+// Batched MSA generation with the MMseqs2 ColabFold search for FASTA and YAML inputs
+//
+
+//
+// MODULE: Local MMseqs2 ColabFold search
+//
+include { MMSEQS_COLABFOLDSEARCH } from '../../modules/local/mmseqs_colabfoldsearch'
+
+workflow MSA {
+
+    take:
+    ch_samplesheet         // channel: [ meta, file ] - element 0 is cloned as meta, element 1 is read as the input file
+    ch_colabfold_db        // channel: path(colabfold_db)
+    ch_uniref30            // channel: path(uniref30)
+    mmseq_batch_size       // number of CSV rows (one per distinct input file) buffered into each search batch
+
+    main:
+    ch_versions = Channel.empty()
+    ch_a3m      = Channel.empty()
+    // Route every input by the extension of its file (Path.extension carries no leading dot)
+    ch_samplesheet
+    .branch {
+        fasta: it[1].extension == "fasta" || it[1].extension == "fa"
+        yaml: it[1].extension == "yaml" || it[1].extension == "yml"
+        json: it[1].extension == "json"  // NOTE(review): this branch is not consumed anywhere below
+    }
+    .set{ch_input}
+    // Store the number of parsed sequence entries in meta.cnt; `def` keeps meta local to each closure call
+    ch_input.fasta
+    .map{
+        def meta = it[0].clone();
+        meta.cnt = getFastaSequences(it[1].text).size();
+        [meta, it[1]]
+    }
+    .set{ch_input_fasta}
+
+    ch_input.yaml
+    .map {
+        def meta = it[0].clone();
+        meta.cnt = getYamlSequences(it[1].text).size();
+        [meta, it[1]]
+    }
+    .set { ch_input_yaml }
+
+    ch_input_full = ch_input_fasta.mix(ch_input_yaml)
+    // Build one "<baseName>,<seq1>:<seq2>..." row per distinct file and write the rows out as batch CSVs
+    def batch_itr = 0
+    ch_input_full
+    .map{it[1]}
+    .unique()
+    .map {
+        def sequences = it.name.endsWith(".yaml") || it.name.endsWith(".yml")
+            ? getYamlSequences(it.text)
+            : getFastaSequences(it.text)
+        // NOTE(review): an entry without a 'sequence' key (if YAML inputs can contain one) is joined as the text "null"
+        "${it.baseName},${sequences.collect { it.sequence }.join(':')}"
+    }
+    .buffer( size: mmseq_batch_size, remainder: true )
+    .collectFile {
+        batch_itr += 1;
+        [ "input_seqs_${batch_itr}.csv", "id,sequence\n" + it.join("\n") + '\n' ]
+    }
+    .map{[["id": it.baseName], it]}
+    .set {ch_input_seqs}
+    // Run one search per batch CSV
+    MMSEQS_COLABFOLDSEARCH (
+        ch_input_seqs,
+        ch_colabfold_db,
+        ch_uniref30
+    )
+    ch_versions = ch_versions.mix(MMSEQS_COLABFOLDSEARCH.out.versions)
+    // Join each file from the a3m output to the meta of the input whose file has the same baseName
+    ch_a3m = ch_a3m.mix(
+        ch_input_full
+        .map{[it[1].baseName, it[0]]}
+        .combine(
+            MMSEQS_COLABFOLDSEARCH.out.a3m
+            .map{it[1]}
+            .flatten()
+            .map {[it.baseName, it]},
+            by:0
+        )
+        .map{[it[1], it[2]]}
+    )
+
+    emit:
+    formated_input          = ch_input_full
+    a3m            = ch_a3m
+    versions       = ch_versions
+}
+
+// Line-based extraction of the entries listed under the top-level `sequences:` key of a YAML text.
+// Returns a list of maps: `type` comes from the "- <type>:" line, the other keys from the "key: value" lines under it.
+// NOTE(review): not a full YAML parser - lines of the section must be indented by at least two spaces.
+def getYamlSequences(yamlData) {
+    List<Map> enrichedEntries = []
+    Map currentEntry = [:]
+    def inSequences = false  // `def` keeps the flag local instead of leaking it into the script binding
+    yamlData.split("\n").each { line ->
+        def trimmed = line.trim()
+        // Detect start of sequences section
+        if (trimmed == 'sequences:') {
+            inSequences = true
+            return
+        }
+        // A non-blank line without the two-space indent is the next top-level key: the section is over
+        if (inSequences && !line.startsWith('  ') && !trimmed.isEmpty()) {
+            inSequences = false
+            return
+        }
+        // Skip lines outside the section, plus blank and comment lines, which carry no entry data
+        if (!inSequences || trimmed.isEmpty() || trimmed.startsWith('#')) {
+            return
+        }
+        if (trimmed.startsWith('-') && trimmed.endsWith(':')) {
+            if (!currentEntry.isEmpty()) {
+                enrichedEntries << currentEntry
+            }
+            currentEntry = ['type': trimmed[1..-2].trim()]
+        } else {
+            def (key, value) = trimmed.split(':', 2)*.trim()
+            currentEntry[key] = value
+        }
+    }
+    if (!currentEntry.isEmpty()) { enrichedEntries << currentEntry }
+    return enrichedEntries
+}
+
+// Parse FASTA text into a list of maps of the form [id: <header minus '>'>, sequence: <joined residue lines>].
+def getFastaSequences(fastaData) {
+    List<Map> records = []
+    String header = null
+    StringBuilder residues = new StringBuilder()
+    // Stores the record gathered so far; a null or empty header stores nothing
+    def flush = { ->
+        if (header) {
+            records << [id: header, sequence: residues.toString()]
+        }
+    }
+    fastaData.split("\n").each { row ->
+        if (!row.startsWith(">")) {
+            residues.append(row.trim())
+            return
+        }
+        flush()
+        header = row[1..-1].trim()
+        residues = new StringBuilder()
+    }
+    flush()
+    return records
+}
diff --git a/tests/colabfold_local.nf.test.snap b/tests/colabfold_local.nf.test.snap
@@ -15,9 +15,6 @@
                     "colabfold_search": "unknown",
                     "mmseqs": null
                 },
-                "MULTIFASTA_TO_CSV": {
-                    "sed": 4.7
-                },
                 "Workflow": {
                     "nf-core/proteinfold": "v1.2.0dev"
                 }
@@ -39,8 +36,6 @@
                 "colabfold/top_ranked_structures",
                 "colabfold/top_ranked_structures/T1024.pdb",
                 "colabfold/top_ranked_structures/T1026.pdb",
-                "multifasta",
-                "multifasta/input.csv",
                 "multiqc",
                 "multiqc/multiqc_data",
                 "multiqc/multiqc_plots",
@@ -61,7 +56,6 @@
                 "T1026_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
                 "T1024.pdb:md5,d41d8cd98f00b204e9800998ecf8427e",
                 "T1026.pdb:md5,d41d8cd98f00b204e9800998ecf8427e",
-                "input.csv:md5,d41d8cd98f00b204e9800998ecf8427e",
                 "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e"
             ]
         ],

diff --git a/workflows/boltz.nf b/workflows/boltz.nf
@@ -21,14 +21,14 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main'
 include { BOLTZ_FASTA } from '../modules/local/boltz_fasta'
 include { SPLIT_MSA } from '../modules/local/split_msa'
 include { MMSEQS_COLABFOLDSEARCH } from '../modules/local/mmseqs_colabfoldsearch'
-include { MULTIFASTA_TO_CSV      } from '../modules/local/multifasta_to_csv'
 //
 // SUBWORKFLOW: Consisting entirely of nf-core/modules
 //
 include { paramsSummaryMap       } from 'plugin/nf-schema'
 include { paramsSummaryMultiqc   } from '../subworkflows/nf-core/utils_nfcore_pipeline'
 include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
 include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_proteinfold_pipeline'
+include { MSA                    } from '../subworkflows/local/msa'
 
 //
 // MODULE: Boltz
@@ -54,6 +54,7 @@ workflow BOLTZ {
     ch_colabfold_db // channel: [ path(colabfold_db) ]
     ch_uniref30     // channel: [ path(uniref30) ]
     msa_server
+    mmseq_batch_size
 
     main:
     ch_samplesheet
@@ -85,20 +86,16 @@ workflow BOLTZ {
         .set{ch_input}
 
     if (!msa_server){
-        MULTIFASTA_TO_CSV(
-            ch_input.multimer
+        MSA(
+            ch_samplesheet,
+            ch_colabfold_db,
+            ch_uniref30,
+            mmseq_batch_size
         )
-        ch_versions = ch_versions.mix(MULTIFASTA_TO_CSV.out.versions)
-
-        MMSEQS_COLABFOLDSEARCH (
-                ch_input.monomer.mix(MULTIFASTA_TO_CSV.out.input_csv),
-                ch_colabfold_db,
-                ch_uniref30
-        )
-        ch_versions = ch_versions.mix(MMSEQS_COLABFOLDSEARCH.out.versions)
+        ch_versions = ch_versions.mix(MSA.out.versions)
 
         SPLIT_MSA(
-            MMSEQS_COLABFOLDSEARCH.out.a3m
+            MSA.out.a3m
         )
         ch_versions = ch_versions.mix(SPLIT_MSA.out.versions)
         ch_input.monomer

diff --git a/workflows/colabfold.nf b/workflows/colabfold.nf
@@ -12,13 +12,12 @@ include { MMSEQS_COLABFOLDSEARCH } from '../modules/local/mmseqs_colabfoldsearch
 include { MULTIFASTA_TO_CSV      } from '../modules/local/multifasta_to_csv'
 
 include { modeChannel            } from '../subworkflows/local/utils_nfcore_proteinfold_pipeline'
-
+include { MSA                    } from '../subworkflows/local/msa'
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     IMPORT NF-CORE MODULES/SUBWORKFLOWS
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
-
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN MAIN WORKFLOW
@@ -35,7 +34,7 @@ workflow COLABFOLD {
     ch_colabfold_db        // channel: path(colabfold_db)
     ch_uniref30            // channel: path(uniref30)
     num_recycles           // int: Number of recycles for esmfold
-
+    mmseq_batch_size
     main:
     ch_multiqc_report = channel.empty()
 
@@ -63,23 +62,19 @@ workflow COLABFOLD {
         //
         // MODULE: Run mmseqs
         //
-        //Multimer mode
-        MULTIFASTA_TO_CSV(
-            ch_samplesheet
-        )
-        ch_versions = ch_versions.mix(MULTIFASTA_TO_CSV.out.versions)
-        MMSEQS_COLABFOLDSEARCH (
-            MULTIFASTA_TO_CSV.out.input_csv,
+        MSA(
+            ch_samplesheet,
             ch_colabfold_db,
-            ch_uniref30
+            ch_uniref30,
+            mmseq_batch_size
         )
-        ch_versions = ch_versions.mix(MMSEQS_COLABFOLDSEARCH.out.versions)
+        ch_versions = ch_versions.mix(MSA.out.versions)
 
         //
         // MODULE: Run colabfold
         //
         COLABFOLD_BATCH(
-            MMSEQS_COLABFOLDSEARCH.out.a3m,
+            MSA.out.a3m,
             colabfold_model_preset,
             ch_colabfold_params,
             ch_colabfold_db,