Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion conf/modules_colabfold.config
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ process {
if (params.use_gpu) {
accelerator = 1
}

ext.args = [
params.use_gpu ? '--use-gpu-relax' : '',
params.colabfold_use_amber ? '--amber' : '',
Expand Down
4 changes: 2 additions & 2 deletions conf/test_colabfold_local.config
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ stubRun = true
// Limit resources so that this can run on GitHub Actions
process {
resourceLimits = [
cpus: 4,
memory: '15.GB',
cpus: 1,
memory: '1.GB',
time: '1.h'
]
}
Expand Down
6 changes: 4 additions & 2 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,8 @@ workflow NFCORE_PROTEINFOLD {
PREPARE_COLABFOLD_DBS.out.params,
PREPARE_COLABFOLD_DBS.out.colabfold_db,
PREPARE_COLABFOLD_DBS.out.uniref30,
params.colabfold_num_recycles
params.colabfold_num_recycles,
params.mmseqs_batch_size
)

ch_multiqc = ch_multiqc.mix(COLABFOLD.out.multiqc_report)
Expand Down Expand Up @@ -549,7 +550,8 @@ workflow NFCORE_PROTEINFOLD {
PREPARE_BOLTZ_DBS.out.boltz2_mols,
PREPARE_COLABFOLD_DBS.out.colabfold_db,
PREPARE_COLABFOLD_DBS.out.uniref30,
params.use_msa_server
params.use_msa_server,
params.mmseqs_batch_size
)
ch_multiqc = ch_multiqc.mix(BOLTZ.out.multiqc_report)
ch_versions = ch_versions.mix(BOLTZ.out.versions)
Expand Down
15 changes: 14 additions & 1 deletion modules/local/mmseqs_colabfoldsearch/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,20 @@ process MMSEQS_COLABFOLDSEARCH {
stub:
"""
mkdir results
touch results/${meta.id}.a3m
input_file="${fasta}"
if [[ "\${input_file##*.}" == "csv" ]]; then
skip_first=true
while IFS=',' read -r filename _; do
if \$skip_first; then
skip_first=false
continue
fi
[[ -z "\$filename" ]] && continue
touch "results/\${filename}.a3m"
done < "\$input_file"
else
touch results/${meta.id}.a3m
fi

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ params {
use_gpu = false
save_intermediates = false
split_fasta = false
mmseqs_batch_size = 1000
db = null
full_dbs = false // true/false, globally sets full_dbs if not independently set
use_msa_server = false
Expand Down Expand Up @@ -390,7 +391,6 @@ profiles {
test_full_esmfold { includeConfig 'conf/test_full_esmfold.config' }
test_full_esmfold_multimer { includeConfig 'conf/test_full_esmfold_multimer.config' }
test_full_helixfold3 { includeConfig 'conf/test_full_helixfold3.config' }
test_full_boltz { includeConfig 'conf/test_full_boltz.config' }
test_full_rosettafold_all_atom { includeConfig 'conf/test_full_rosettafold_all_atom.config' }
test_full_rosettafold2na { includeConfig 'conf/test_full_rosettafold2na.config' }
test_rosettafold_all_atom { includeConfig 'conf/test_rosettafold_all_atom.config' }
Expand Down
6 changes: 6 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,12 @@
"fa_icon": "fas fa-link",
"format": "uri",
"errorMessage": "Please provide a valid URL for the MMSeqs2 API server"
},
"mmseqs_batch_size": {
"type": "integer",
"default": 1000,
"description": "Number of samples in each mmseq run (batching leads to better performance time in total.)",
"fa_icon": "fas fa-microchip"
}
}
},
Expand Down
152 changes: 152 additions & 0 deletions subworkflows/local/msa.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
//
// Batched MSA generation (MMseqs2 colabfold_search) for the input sequences
//

//
// MODULE: Local MMseqs2 ColabFold search module
//
include { MMSEQS_COLABFOLDSEARCH } from '../../modules/local/mmseqs_colabfoldsearch'

workflow MSA {

    take:
    ch_samplesheet     // channel: [ meta, input_file ] -- only element 0 (meta) and 1 (file) are used here
    ch_colabfold_db    // channel: path(colabfold_db)
    ch_uniref30        // channel: path(uniref30)
    mmseq_batch_size   // int: maximum number of input files written into one search CSV

    main:
    ch_versions = Channel.empty()
    ch_a3m      = Channel.empty()

    // Route inputs by file extension. `extension` never contains the leading
    // dot, so the YAML branch must compare against "yml" (not ".yml").
    // Files with any other extension are dropped by `branch`.
    ch_samplesheet
        .branch { row ->
            fasta: row[1].extension == "fasta" || row[1].extension == "fa"
            yaml: row[1].extension == "yaml" || row[1].extension == "yml"
            json: row[1].extension == "json"
        }
        .set { ch_input }

    // Annotate each FASTA input with the number of records it contains.
    // `def` keeps `meta` local to the closure instead of leaking it into the
    // shared script scope, where concurrent items could overwrite each other.
    ch_input.fasta
        .map { row ->
            def meta = row[0].clone()
            meta.cnt = getFastaSequences(row[1].text).size()
            [meta, row[1]]
        }
        .set { ch_input_fasta }

    // Same annotation for YAML inputs.
    ch_input.yaml
        .map { row ->
            def meta = row[0].clone()
            meta.cnt = getYamlSequences(row[1].text).size()
            [meta, row[1]]
        }
        .set { ch_input_yaml }

    // NOTE(review): the `json` branch is never consumed, so JSON inputs do not
    // take part in the MSA search -- confirm this is intended.
    ch_input_full = ch_input_fasta.mix(ch_input_yaml)

    // Build one "<basename>,<seq1>:<seq2>:..." CSV row per unique input file,
    // group the rows into batches of `mmseq_batch_size`, and write each batch
    // to its own CSV file (input_seqs_<n>.csv) with an "id,sequence" header.
    def batch_itr = 0
    ch_input_full
        .map { row -> row[1] }
        .unique()
        .map { input_file ->
            def sequences = input_file.name.endsWith(".yaml") || input_file.name.endsWith(".yml")
                ? getYamlSequences(input_file.text)
                : getFastaSequences(input_file.text)

            // Entries without a `sequence` key (e.g. YAML ligand entries) would
            // otherwise be rendered as the literal text "null" in the CSV.
            def chains = sequences
                .findAll { entry -> entry.sequence != null }
                .collect { entry -> entry.sequence }

            "${input_file.baseName},${chains.join(':')}"
        }
        .buffer( size: mmseq_batch_size, remainder: true )
        .collectFile { batch ->
            batch_itr += 1
            [ "input_seqs_${batch_itr}.csv", "id,sequence\n" + batch.join("\n") + '\n' ]
        }
        .map { csv -> [["id": csv.baseName], csv] }
        .set { ch_input_seqs }

    MMSEQS_COLABFOLDSEARCH (
        ch_input_seqs,
        ch_colabfold_db,
        ch_uniref30
    )
    ch_versions = ch_versions.mix(MMSEQS_COLABFOLDSEARCH.out.versions)

    // Re-associate every produced .a3m with the meta of the input whose file
    // basename it shares, emitting [ meta, a3m ].
    // NOTE(review): assumes element 1 of `out.a3m` holds the a3m file(s) of a
    // batch and that each a3m is named after the CSV `id` column -- confirm
    // against the module definition.
    ch_a3m = ch_a3m.mix(
        ch_input_full
            .map { row -> [row[1].baseName, row[0]] }
            .combine(
                MMSEQS_COLABFOLDSEARCH.out.a3m
                    .map { result -> result[1] }
                    .flatten()
                    .map { a3m -> [a3m.baseName, a3m] },
                by: 0
            )
            .map { joined -> [joined[1], joined[2]] }
    )

    emit:
    formated_input = ch_input_full
    a3m            = ch_a3m
    versions       = ch_versions
}

/**
 * Collect the entries listed under the `sequences:` key of a YAML document
 * using a lightweight line-based scan (this is not a general YAML parser).
 *
 * Every list item of the form `- <type>:` starts a new entry whose `type` key
 * holds `<type>`; each following `key: value` line is stored on that entry
 * with both sides trimmed.
 *
 * @param yamlData  the YAML document as a single string
 * @return          list of maps, one per entry found in the `sequences` section
 */
def getYamlSequences(yamlData) {
    List<Map> enrichedEntries = []
    Map currentEntry = [:]
    // Must be a local: an undeclared variable would live in the script binding
    // and be shared between concurrent invocations.
    boolean inSequences = false

    yamlData.split("\n").each { line ->
        def trimmed = line.trim()

        // Detect start of sequences section
        if (trimmed == 'sequences:') {
            inSequences = true
            return
        }
        if (!inSequences) {
            return
        }

        // Blank lines and comments carry no data; parsing them as key/value
        // pairs would add a bogus "" key or even a spurious entry.
        if (trimmed.isEmpty() || trimmed.startsWith('#')) {
            return
        }

        // A non-indented line ends the section, unless it is a list item
        // written at column 0, which still belongs to `sequences`.
        if (!line.startsWith(' ') && !line.startsWith('-')) {
            inSequences = false
            return
        }

        if (trimmed.startsWith('-') && trimmed.endsWith(':')) {
            // New entry: flush the previous one first
            if (!currentEntry.isEmpty()) {
                enrichedEntries << currentEntry
            }
            // Strip the leading '-' and trailing ':' and the padding around the type name
            currentEntry = ['type': trimmed.substring(1, trimmed.length() - 1).trim()]
        } else if (trimmed.contains(':')) {
            def (key, value) = trimmed.split(':', 2)*.trim()
            currentEntry[key] = value
        }
    }
    if (!currentEntry.isEmpty()) {
        enrichedEntries << currentEntry
    }
    return enrichedEntries
}

/**
 * Split FASTA text into its records.
 *
 * A line starting with '>' opens a new record whose id is the rest of that
 * line, trimmed. All following lines up to the next header are trimmed and
 * concatenated into the record's sequence. Text before the first header is
 * ignored.
 *
 * @param fastaData  FASTA content as a single string
 * @return           list of maps with keys `id` and `sequence`, in file order
 */
def getFastaSequences(fastaData) {
    List<Map> fastaEntries = []
    String currentId = null
    StringBuilder currentSeq = new StringBuilder()

    fastaData.split("\n").each { line ->
        if (line.startsWith(">")) {
            // Compare against null rather than relying on Groovy truth, so a
            // record with an empty header is still kept.
            if (currentId != null) {
                fastaEntries << [id: currentId, sequence: currentSeq.toString()]
            }
            // substring(1) is safe for a bare ">" line, whereas line[1..-1]
            // throws StringIndexOutOfBoundsException on a one-character string.
            currentId = line.substring(1).trim()
            currentSeq = new StringBuilder()
        } else {
            currentSeq.append(line.trim())
        }
    }

    if (currentId != null) {
        fastaEntries << [id: currentId, sequence: currentSeq.toString()]
    }

    return fastaEntries
}
6 changes: 0 additions & 6 deletions tests/colabfold_local.nf.test.snap
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@
"colabfold_search": "unknown",
"mmseqs": null
},
"MULTIFASTA_TO_CSV": {
"sed": 4.7
},
"Workflow": {
"nf-core/proteinfold": "v1.2.0dev"
}
Expand All @@ -39,8 +36,6 @@
"colabfold/top_ranked_structures",
"colabfold/top_ranked_structures/T1024.pdb",
"colabfold/top_ranked_structures/T1026.pdb",
"multifasta",
"multifasta/input.csv",
"multiqc",
"multiqc/multiqc_data",
"multiqc/multiqc_plots",
Expand All @@ -61,7 +56,6 @@
"T1026_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
"T1024.pdb:md5,d41d8cd98f00b204e9800998ecf8427e",
"T1026.pdb:md5,d41d8cd98f00b204e9800998ecf8427e",
"input.csv:md5,d41d8cd98f00b204e9800998ecf8427e",
"test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
Expand Down
21 changes: 9 additions & 12 deletions workflows/boltz.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main'
include { BOLTZ_FASTA } from '../modules/local/boltz_fasta'
include { SPLIT_MSA } from '../modules/local/split_msa'
include { MMSEQS_COLABFOLDSEARCH } from '../modules/local/mmseqs_colabfoldsearch'
include { MULTIFASTA_TO_CSV } from '../modules/local/multifasta_to_csv'
//
// SUBWORKFLOW: Consisting entirely of nf-core/modules
//
include { paramsSummaryMap } from 'plugin/nf-schema'
include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_proteinfold_pipeline'
include { MSA } from '../subworkflows/local/msa'

//
// MODULE: Boltz
Expand All @@ -54,6 +54,7 @@ workflow BOLTZ {
ch_colabfold_db // channel: [ path(colabfold_db) ]
ch_uniref30 // channel: [ path(uniref30) ]
msa_server
mmseq_batch_size

main:
ch_samplesheet
Expand Down Expand Up @@ -85,20 +86,16 @@ workflow BOLTZ {
.set{ch_input}

if (!msa_server){
MULTIFASTA_TO_CSV(
ch_input.multimer
MSA(
ch_samplesheet,
ch_colabfold_db,
ch_uniref30,
mmseq_batch_size
)
ch_versions = ch_versions.mix(MULTIFASTA_TO_CSV.out.versions)

MMSEQS_COLABFOLDSEARCH (
ch_input.monomer.mix(MULTIFASTA_TO_CSV.out.input_csv),
ch_colabfold_db,
ch_uniref30
)
ch_versions = ch_versions.mix(MMSEQS_COLABFOLDSEARCH.out.versions)
ch_versions = ch_versions.mix(MSA.out.versions)

SPLIT_MSA(
MMSEQS_COLABFOLDSEARCH.out.a3m
MSA.out.a3m
)
ch_versions = ch_versions.mix(SPLIT_MSA.out.versions)
ch_input.monomer
Expand Down
21 changes: 8 additions & 13 deletions workflows/colabfold.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,12 @@ include { MMSEQS_COLABFOLDSEARCH } from '../modules/local/mmseqs_colabfoldsearch
include { MULTIFASTA_TO_CSV } from '../modules/local/multifasta_to_csv'

include { modeChannel } from '../subworkflows/local/utils_nfcore_proteinfold_pipeline'

include { MSA } from '../subworkflows/local/msa'
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
IMPORT NF-CORE MODULES/SUBWORKFLOWS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RUN MAIN WORKFLOW
Expand All @@ -35,7 +34,7 @@ workflow COLABFOLD {
ch_colabfold_db // channel: path(colabfold_db)
ch_uniref30 // channel: path(uniref30)
num_recycles // int: Number of recycles for esmfold

mmseq_batch_size
main:
ch_multiqc_report = channel.empty()

Expand Down Expand Up @@ -63,23 +62,19 @@ workflow COLABFOLD {
//
// MODULE: Run mmseqs
//
//Multimer mode
MULTIFASTA_TO_CSV(
ch_samplesheet
)
ch_versions = ch_versions.mix(MULTIFASTA_TO_CSV.out.versions)
MMSEQS_COLABFOLDSEARCH (
MULTIFASTA_TO_CSV.out.input_csv,
MSA(
ch_samplesheet,
ch_colabfold_db,
ch_uniref30
ch_uniref30,
mmseq_batch_size
)
ch_versions = ch_versions.mix(MMSEQS_COLABFOLDSEARCH.out.versions)
ch_versions = ch_versions.mix(MSA.out.versions)

//
// MODULE: Run colabfold
//
COLABFOLD_BATCH(
MMSEQS_COLABFOLDSEARCH.out.a3m,
MSA.out.a3m,
colabfold_model_preset,
ch_colabfold_params,
ch_colabfold_db,
Expand Down
Loading