diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config
index 4d49fa599..3bc01d143 100644
--- a/conf/modules_colabfold.config
+++ b/conf/modules_colabfold.config
@@ -40,7 +40,7 @@ process {
         if (params.use_gpu) {
             accelerator = 1
         }
-        
+
         ext.args = [
             params.use_gpu ? '--use-gpu-relax' : '',
             params.colabfold_use_amber ? '--amber' : '',
diff --git a/conf/test_colabfold_local.config b/conf/test_colabfold_local.config
index 3019f1963..5cf0c1c04 100644
--- a/conf/test_colabfold_local.config
+++ b/conf/test_colabfold_local.config
@@ -13,8 +13,8 @@ stubRun = true
 // Limit resources so that this can run on GitHub Actions
 process {
     resourceLimits = [
-        cpus: 4,
-        memory: '15.GB',
+        cpus: 1,
+        memory: '1.GB',
         time: '1.h'
     ]
 }
diff --git a/main.nf b/main.nf
index 9ddbee2c5..42527e31f 100644
--- a/main.nf
+++ b/main.nf
@@ -265,7 +265,8 @@ workflow NFCORE_PROTEINFOLD {
             PREPARE_COLABFOLD_DBS.out.params,
             PREPARE_COLABFOLD_DBS.out.colabfold_db,
             PREPARE_COLABFOLD_DBS.out.uniref30,
-            params.colabfold_num_recycles
+            params.colabfold_num_recycles,
+            params.mmseqs_batch_size
         )
         ch_multiqc = ch_multiqc.mix(COLABFOLD.out.multiqc_report)
 
@@ -549,7 +550,8 @@ workflow NFCORE_PROTEINFOLD {
             PREPARE_BOLTZ_DBS.out.boltz2_mols,
             PREPARE_COLABFOLD_DBS.out.colabfold_db,
             PREPARE_COLABFOLD_DBS.out.uniref30,
-            params.use_msa_server
+            params.use_msa_server,
+            params.mmseqs_batch_size
         )
         ch_multiqc = ch_multiqc.mix(BOLTZ.out.multiqc_report)
         ch_versions = ch_versions.mix(BOLTZ.out.versions)
diff --git a/modules/local/mmseqs_colabfoldsearch/main.nf b/modules/local/mmseqs_colabfoldsearch/main.nf
index 5cd2b362a..f21583fed 100644
--- a/modules/local/mmseqs_colabfoldsearch/main.nf
+++ b/modules/local/mmseqs_colabfoldsearch/main.nf
@@ -42,7 +42,21 @@ process MMSEQS_COLABFOLDSEARCH {
     stub:
     """
     mkdir results
-    touch results/${meta.id}.a3m
+    input_file="${fasta}"
+    if [[ "\${input_file##*.}" == "csv" ]]; then
+        # CSV input: create one stub result per data row; the first row is skipped as the header
+        skip_first=true
+        while IFS=',' read -r filename _ || [[ -n "\$filename" ]]; do
+            if \$skip_first; then
+                skip_first=false
+                continue
+            fi
+            [[ -z "\$filename" ]] && continue
+            touch "results/\${filename}.a3m"
+        done < "\$input_file"
+    else
+        touch results/${meta.id}.a3m
+    fi
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/nextflow.config b/nextflow.config
index f5908948c..58b9b672d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -15,6 +15,7 @@ params {
     use_gpu = false
     save_intermediates = false
     split_fasta = false
+    mmseqs_batch_size = 1000
     db = null
     full_dbs = false // true/false, globally sets full_dbs if not independently set
     use_msa_server = false
@@ -390,7 +391,6 @@ profiles {
     test_full_esmfold { includeConfig 'conf/test_full_esmfold.config' }
     test_full_esmfold_multimer { includeConfig 'conf/test_full_esmfold_multimer.config' }
     test_full_helixfold3 { includeConfig 'conf/test_full_helixfold3.config' }
-    test_full_boltz { includeConfig 'conf/test_full_boltz.config' }
     test_full_rosettafold_all_atom { includeConfig 'conf/test_full_rosettafold_all_atom.config' }
     test_full_rosettafold2na { includeConfig 'conf/test_full_rosettafold2na.config' }
     test_rosettafold_all_atom { includeConfig 'conf/test_rosettafold_all_atom.config' }
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 40b72bdd2..4d61f6f32 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -98,6 +98,13 @@
                     "fa_icon": "fas fa-link",
                     "format": "uri",
                     "errorMessage": "Please provide a valid URL for the MMSeqs2 API server"
+                },
+                "mmseqs_batch_size": {
+                    "type": "integer",
+                    "default": 1000,
+                    "minimum": 1,
+                    "description": "Number of input samples batched into a single MMseqs2 search run (larger batches reduce the total run time).",
+                    "fa_icon": "fas fa-microchip"
                 }
             }
         },
diff --git a/subworkflows/local/msa.nf b/subworkflows/local/msa.nf
new file mode 100644
index 000000000..33f4b6c75
--- /dev/null
+++ b/subworkflows/local/msa.nf
@@ -0,0 +1,170 @@
+//
+// SUBWORKFLOW: Build MSAs by batching input sequences into local MMseqs2 ColabFold searches
+//
+
+//
+// MODULE: Loaded from modules/local/
+//
+include { MMSEQS_COLABFOLDSEARCH } from '../../modules/local/mmseqs_colabfoldsearch'
+
+workflow MSA {
+
+    take:
+    ch_samplesheet   // channel: [ meta, input_file ]
+    ch_colabfold_db  // channel: path(colabfold_db)
+    ch_uniref30      // channel: path(uniref30)
+    mmseq_batch_size // int: number of input files written to each search CSV
+
+    main:
+    ch_versions = Channel.empty()
+    ch_a3m      = Channel.empty()
+
+    // Route inputs by file extension (extensions are compared without the leading dot)
+    ch_samplesheet
+        .branch { row ->
+            fasta: row[1].extension == "fasta" || row[1].extension == "fa"
+            yaml: row[1].extension == "yaml" || row[1].extension == "yml"
+            json: row[1].extension == "json"
+        }
+        .set { ch_input }
+
+    // Record the number of parsed entries of every input in meta.cnt
+    ch_input.fasta
+        .map { row ->
+            def meta = row[0].clone()
+            meta.cnt = getFastaSequences(row[1].text).size()
+            [ meta, row[1] ]
+        }
+        .set { ch_input_fasta }
+
+    ch_input.yaml
+        .map { row ->
+            def meta = row[0].clone()
+            meta.cnt = getYamlSequences(row[1].text).size()
+            [ meta, row[1] ]
+        }
+        .set { ch_input_yaml }
+
+    ch_input_full = ch_input_fasta.mix(ch_input_yaml)
+
+    // One CSV row per unique input file ("<basename>,<seq1>:<seq2>:..."), grouped into
+    // files of at most mmseq_batch_size rows so that one search serves a whole batch
+    def batch_itr = 0
+    ch_input_full
+        .map { row -> row[1] }
+        .unique()
+        .map { input ->
+            def entries = input.name.endsWith(".yaml") || input.name.endsWith(".yml")
+                ? getYamlSequences(input.text)
+                : getFastaSequences(input.text)
+
+            // Entries without a sequence value would otherwise be joined in as the text "null"
+            def chains = entries.findAll { entry -> entry.sequence }.collect { entry -> entry.sequence }
+            "${input.baseName},${chains.join(':')}"
+        }
+        .buffer( size: mmseq_batch_size, remainder: true )
+        .collectFile { rows ->
+            batch_itr += 1
+            [ "input_seqs_${batch_itr}.csv", "id,sequence\n" + rows.join("\n") + '\n' ]
+        }
+        .map { csv -> [ [ "id": csv.baseName ], csv ] }
+        .set { ch_input_seqs }
+
+    MMSEQS_COLABFOLDSEARCH (
+        ch_input_seqs,
+        ch_colabfold_db,
+        ch_uniref30
+    )
+    ch_versions = ch_versions.mix(MMSEQS_COLABFOLDSEARCH.out.versions)
+
+    // Pair every result file with the meta of the input sharing its base name
+    ch_a3m = ch_a3m.mix(
+        ch_input_full
+            .map { row -> [ row[1].baseName, row[0] ] }
+            .combine(
+                MMSEQS_COLABFOLDSEARCH.out.a3m
+                    .map { result -> result[1] }
+                    .flatten()
+                    .map { a3m_file -> [ a3m_file.baseName, a3m_file ] },
+                by: 0
+            )
+            .map { joined -> [ joined[1], joined[2] ] }
+    )
+
+    emit:
+    formated_input = ch_input_full
+    a3m = ch_a3m
+    versions = ch_versions
+}
+
+//
+// Line-based reader for the top-level `sequences:` section of a YAML input.
+// Returns one map per `- <type>:` item: its `type` plus every `key: value` line below it.
+//
+def getYamlSequences(yamlData) {
+    List enrichedEntries = []
+    Map currentEntry = [:]
+    // Declared locally so the flag does not leak into the shared script binding
+    boolean inSequences = false
+    yamlData.split("\n").each { line ->
+        def trimmed = line.trim()
+
+        // Detect start of sequences section
+        if (trimmed == 'sequences:') {
+            inSequences = true
+            return
+        }
+        // Blank lines and comments carry no data and do not end the section
+        if (!inSequences || trimmed.isEmpty() || trimmed.startsWith('#')) {
+            return
+        }
+        // Any other non-indented line that is not a list item starts the next section
+        if (!line.startsWith(' ') && !trimmed.startsWith('-')) {
+            inSequences = false
+            return
+        }
+
+        if (trimmed.startsWith('-') && trimmed.endsWith(':')) {
+            if (!currentEntry.isEmpty()) {
+                enrichedEntries << currentEntry
+            }
+            // Drop the leading '-' and trailing ':' together with the blank between them
+            currentEntry = ['type': trimmed.substring(1, trimmed.length() - 1).trim()]
+        } else {
+            def (key, value) = trimmed.split(':', 2)*.trim()
+            currentEntry[key] = value
+        }
+    }
+    if (!currentEntry.isEmpty()) {
+        enrichedEntries << currentEntry
+    }
+    return enrichedEntries
+}
+
+//
+// Split FASTA text into records.
+// Returns a list of [id: <header without '>'>, sequence: <sequence lines joined>] maps.
+//
+def getFastaSequences(fastaData) {
+    List fastaEntries = []
+    String currentId = null
+    StringBuilder currentSeq = new StringBuilder()
+
+    fastaData.split("\n").each { line ->
+        if (line.startsWith(">")) {
+            // Compare against null so that a record with an empty header is still kept
+            if (currentId != null) {
+                fastaEntries << [id: currentId, sequence: currentSeq.toString()]
+            }
+            currentId = line.substring(1).trim() // Remove '>' and trim
+            currentSeq = new StringBuilder()
+        } else {
+            currentSeq.append(line.trim())
+        }
+    }
+
+    if (currentId != null) {
+        fastaEntries << [id: currentId, sequence: currentSeq.toString()]
+    }
+
+    return fastaEntries
+}
diff --git a/tests/colabfold_local.nf.test.snap b/tests/colabfold_local.nf.test.snap
index 4f6e1fbf6..4fd6787c2 100644
--- a/tests/colabfold_local.nf.test.snap
+++ b/tests/colabfold_local.nf.test.snap
@@ -15,9 +15,6 @@
                     "colabfold_search": "unknown",
                     "mmseqs": null
                 },
-                "MULTIFASTA_TO_CSV": {
-                    "sed": 4.7
-                },
                 "Workflow": {
                     "nf-core/proteinfold": "v1.2.0dev"
                 }
@@ -39,8 +36,6 @@
                 "colabfold/top_ranked_structures",
                 "colabfold/top_ranked_structures/T1024.pdb",
                 "colabfold/top_ranked_structures/T1026.pdb",
-                "multifasta",
-                "multifasta/input.csv",
                 "multiqc",
                 "multiqc/multiqc_data",
                 "multiqc/multiqc_plots",
@@ -61,7 +56,6 @@
                 "T1026_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
                 "T1024.pdb:md5,d41d8cd98f00b204e9800998ecf8427e",
                 "T1026.pdb:md5,d41d8cd98f00b204e9800998ecf8427e",
-                "input.csv:md5,d41d8cd98f00b204e9800998ecf8427e",
                 "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e"
             ]
         ],
diff --git a/workflows/boltz.nf b/workflows/boltz.nf
index a52a5cce3..1c891aad3 100644
--- a/workflows/boltz.nf
+++ b/workflows/boltz.nf
@@ -21,7 +21,6 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main'
 include { BOLTZ_FASTA } from '../modules/local/boltz_fasta'
 include { SPLIT_MSA } from '../modules/local/split_msa'
 include { MMSEQS_COLABFOLDSEARCH } from '../modules/local/mmseqs_colabfoldsearch'
-include { MULTIFASTA_TO_CSV } from '../modules/local/multifasta_to_csv'
 //
 // SUBWORKFLOW: Consisting entirely of nf-core/modules
 //
@@ -29,6 +28,7 @@ include { paramsSummaryMap } from 'plugin/nf-schema'
 include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline'
 include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
 include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_proteinfold_pipeline'
+include { MSA } from '../subworkflows/local/msa'
 
 //
 // MODULE: Boltz
@@ -54,6 +54,7 @@ workflow BOLTZ {
     ch_colabfold_db // channel: [ path(colabfold_db) ]
     ch_uniref30 // channel: [ path(uniref30) ]
     msa_server
+    mmseq_batch_size // int: batch size forwarded to the MSA subworkflow
 
     main:
     ch_samplesheet
@@ -85,20 +86,16 @@ workflow BOLTZ {
         .set{ch_input}
 
     if (!msa_server){
-        MULTIFASTA_TO_CSV(
-            ch_input.multimer
+        MSA(
+            ch_samplesheet,
+            ch_colabfold_db,
+            ch_uniref30,
+            mmseq_batch_size
         )
-        ch_versions = ch_versions.mix(MULTIFASTA_TO_CSV.out.versions)
-
-        MMSEQS_COLABFOLDSEARCH (
-            ch_input.monomer.mix(MULTIFASTA_TO_CSV.out.input_csv),
-            ch_colabfold_db,
-            ch_uniref30
-        )
-        ch_versions = ch_versions.mix(MMSEQS_COLABFOLDSEARCH.out.versions)
+        ch_versions = ch_versions.mix(MSA.out.versions)
 
         SPLIT_MSA(
-            MMSEQS_COLABFOLDSEARCH.out.a3m
+            MSA.out.a3m
         )
         ch_versions = ch_versions.mix(SPLIT_MSA.out.versions)
         ch_input.monomer
diff --git a/workflows/colabfold.nf b/workflows/colabfold.nf
index f085c0c7a..ab43584d5 100644
--- a/workflows/colabfold.nf
+++ b/workflows/colabfold.nf
@@ -12,13 +12,12 @@ include { MMSEQS_COLABFOLDSEARCH } from '../modules/local/mmseqs_colabfoldsearch
 include { MULTIFASTA_TO_CSV } from '../modules/local/multifasta_to_csv'
 
 include { modeChannel } from '../subworkflows/local/utils_nfcore_proteinfold_pipeline'
-
+include { MSA } from '../subworkflows/local/msa'
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     IMPORT NF-CORE MODULES/SUBWORKFLOWS
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
-
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN MAIN WORKFLOW
@@ -35,7 +34,7 @@ workflow COLABFOLD {
     ch_colabfold_db // channel: path(colabfold_db)
     ch_uniref30 // channel: path(uniref30)
     num_recycles // int: Number of recycles for esmfold
-
+    mmseq_batch_size // int: batch size forwarded to the MSA subworkflow
     main:
     ch_multiqc_report = channel.empty()
 
@@ -63,23 +62,19 @@ workflow COLABFOLD {
         //
         // MODULE: Run mmseqs
         //
-        //Multimer mode
-        MULTIFASTA_TO_CSV(
-            ch_samplesheet
-        )
-        ch_versions = ch_versions.mix(MULTIFASTA_TO_CSV.out.versions)
-        MMSEQS_COLABFOLDSEARCH (
-            MULTIFASTA_TO_CSV.out.input_csv,
+        MSA(
+            ch_samplesheet,
             ch_colabfold_db,
-            ch_uniref30
+            ch_uniref30,
+            mmseq_batch_size
         )
-        ch_versions = ch_versions.mix(MMSEQS_COLABFOLDSEARCH.out.versions)
+        ch_versions = ch_versions.mix(MSA.out.versions)
 
         //
         // MODULE: Run colabfold
         //
         COLABFOLD_BATCH(
-            MMSEQS_COLABFOLDSEARCH.out.a3m,
+            MSA.out.a3m,
             colabfold_model_preset,
             ch_colabfold_params,
             ch_colabfold_db,