diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index b290e0901..97c8c97fe 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,20 +1,20 @@ { "name": "nfcore", - "image": "nfcore/gitpod:latest", - "remoteUser": "gitpod", - "runArgs": ["--privileged"], + "image": "nfcore/devcontainer:latest", - // Configure tool-specific properties. - "customizations": { - // Configure properties specific to VS Code. - "vscode": { - // Set *default* container specific settings.json values on container create. - "settings": { - "python.defaultInterpreterPath": "/opt/conda/bin/python" - }, + "remoteUser": "root", + "privileged": true, - // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] - } + "remoteEnv": { + // Workspace path on the host for mounting with docker-outside-of-docker + "LOCAL_WORKSPACE_FOLDER": "${localWorkspaceFolder}" + }, + + "onCreateCommand": "./.devcontainer/setup.sh", + + "hostRequirements": { + "cpus": 4, + "memory": "16gb", + "storage": "32gb" } } diff --git a/.devcontainer/setup.sh b/.devcontainer/setup.sh new file mode 100755 index 000000000..cdb32f4b3 --- /dev/null +++ b/.devcontainer/setup.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# Customise the terminal command prompt +echo "export PROMPT_DIRTRIM=2" >> $HOME/.bashrc +echo "export PS1='\[\e[3;36m\]\w ->\[\e[0m\\] '" >> $HOME/.bashrc +export PROMPT_DIRTRIM=2 +export PS1='\[\e[3;36m\]\w ->\[\e[0m\\] ' + +# Update Nextflow +nextflow self-update + +# Update welcome message +echo "Welcome to the nf-core/proteinfold devcontainer!" 
> /usr/local/etc/vscode-dev-containers/first-run-notice.txt diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index 72dda289a..000000000 --- a/.editorconfig +++ /dev/null @@ -1,33 +0,0 @@ -root = true - -[*] -charset = utf-8 -end_of_line = lf -insert_final_newline = true -trim_trailing_whitespace = true -indent_size = 4 -indent_style = space - -[*.{md,yml,yaml,html,css,scss,js}] -indent_size = 2 - -# These files are edited and tested upstream in nf-core/modules -[/modules/nf-core/**] -charset = unset -end_of_line = unset -insert_final_newline = unset -trim_trailing_whitespace = unset -indent_style = unset -[/subworkflows/nf-core/**] -charset = unset -end_of_line = unset -insert_final_newline = unset -trim_trailing_whitespace = unset -indent_style = unset - -[/assets/email*] -indent_size = unset - -# ignore python and markdown -[*.{py,md}] -indent_style = unset diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index ad8a7f875..62a22e4fb 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,4 +1,4 @@ -# nf-core/proteinfold: Contributing Guidelines +# `nf-core/proteinfold`: Contributing Guidelines Hi there! Many thanks for taking an interest in improving nf-core/proteinfold. @@ -19,7 +19,7 @@ If you'd like to write some code for nf-core/proteinfold, the standard workflow 1. Check that there isn't already an issue about your idea in the [nf-core/proteinfold issues](https://github.com/nf-core/proteinfold/issues) to avoid duplicating work. If there isn't one already, please create one so that others know you're working on this 2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [nf-core/proteinfold repository](https://github.com/nf-core/proteinfold) to your GitHub account 3. Make the necessary changes / additions within your forked repository following [Pipeline conventions](#pipeline-contribution-conventions) -4. 
Use `nf-core schema build` and add any new parameters to the pipeline JSON schema (requires [nf-core tools](https://github.com/nf-core/tools) >= 1.10). +4. Use `nf-core pipelines schema build` and add any new parameters to the pipeline JSON schema (requires [nf-core tools](https://github.com/nf-core/tools) >= 1.10). 5. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged If you're not used to this workflow with git, you can start with some [docs from GitHub](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests) or even their [excellent `git` resources](https://try.github.io/). @@ -29,7 +29,7 @@ If you're not used to this workflow with git, you can start with some [docs from You have the option to test your changes locally by running the pipeline. For receiving warnings about process selectors and other `debug` information, it is recommended to use the debug profile. Execute all the tests with the following command: ```bash -nextflow run . --profile debug,test,docker --outdir +nextflow run . -profile debug,test,docker --outdir ``` When you create a pull request with changes, [GitHub Actions](https://github.com/features/actions) will run automatic tests. @@ -40,7 +40,7 @@ There are typically two types of tests that run: ### Lint tests `nf-core` has a [set of guidelines](https://nf-co.re/developers/guidelines) which all pipelines must adhere to. -To enforce these and ensure that all pipelines stay in sync, we have developed a helper tool which runs checks on the pipeline code. This is in the [nf-core/tools repository](https://github.com/nf-core/tools) and once installed can be run locally with the `nf-core lint ` command. +To enforce these and ensure that all pipelines stay in sync, we have developed a helper tool which runs checks on the pipeline code. 
This is in the [nf-core/tools repository](https://github.com/nf-core/tools) and once installed can be run locally with the `nf-core pipelines lint ` command. If any failures or warnings are encountered, please follow the listed URL for more documentation. @@ -55,9 +55,9 @@ These tests are run both with the latest available version of `Nextflow` and als :warning: Only in the unlikely and regretful event of a release happening with a bug. -- On your own fork, make a new branch `patch` based on `upstream/master`. +- On your own fork, make a new branch `patch` based on `upstream/main` or `upstream/master`. - Fix the bug, and bump version (X.Y.Z+1). -- A PR should be made on `master` from patch to directly this particular bug. +- Open a pull-request from `patch` to `main`/`master` with the changes. ## Getting help @@ -65,32 +65,32 @@ For further information/help, please consult the [nf-core/proteinfold documentat ## Pipeline contribution conventions -To make the nf-core/proteinfold code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. +To make the `nf-core/proteinfold` code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. ### Adding a new step If you wish to contribute a new step, please use the following coding standards: -1. Define the corresponding input channel into your new process from the expected previous process channel +1. Define the corresponding input channel into your new process from the expected previous process channel. 2. Write the process block (see below). 3. Define the output channel if needed (see below). 4. Add any new parameters to `nextflow.config` with a default (see below). -5. Add any new parameters to `nextflow_schema.json` with help text (via the `nf-core schema build` tool). +5. 
Add any new parameters to `nextflow_schema.json` with help text (via the `nf-core pipelines schema build` tool). 6. Add sanity checks and validation for all relevant parameters. 7. Perform local tests to validate that the new code works as expected. -8. If applicable, add a new test command in `.github/workflow/ci.yml`. +8. If applicable, add a new test in the `tests` directory. 9. Update MultiQC config `assets/multiqc_config.yml` so relevant suffixes, file name clean up and module plots are in the appropriate order. If applicable, add a [MultiQC](https://https://multiqc.info/) module. 10. Add a description of the output files and if relevant any appropriate images from the MultiQC report to `docs/output.md`. ### Default values -Parameters should be initialised / defined with default values in `nextflow.config` under the `params` scope. +Parameters should be initialised / defined with default values within the `params` scope in `nextflow.config`. -Once there, use `nf-core schema build` to add to `nextflow_schema.json`. +Once there, use `nf-core pipelines schema build` to add to `nextflow_schema.json`. ### Default processes resource requirements -Sensible defaults for process resource requirements (CPUs / memory / time) for a process should be defined in `conf/base.config`. These should generally be specified generic with `withLabel:` selectors so they can be shared across multiple processes/steps of the pipeline. A nf-core standard set of labels that should be followed where possible can be seen in the [nf-core pipeline template](https://github.com/nf-core/tools/blob/master/nf_core/pipeline-template/conf/base.config), which has the default process as a single core-process, and then different levels of multi-core configurations for increasingly large memory requirements defined with standardised labels. +Sensible defaults for process resource requirements (CPUs / memory / time) for a process should be defined in `conf/base.config`. 
These should generally be specified generically with `withLabel:` selectors so they can be shared across multiple processes/steps of the pipeline. An nf-core standard set of labels that should be followed where possible can be seen in the [nf-core pipeline template](https://github.com/nf-core/tools/blob/main/nf_core/pipeline-template/conf/base.config), which has the default process as a single-core process, and then different levels of multi-core configurations for increasingly large memory requirements defined with standardised labels. The process resources can be passed on to the tool dynamically within the process with the `${task.cpus}` and `${task.memory}` variables in the `script:` block. @@ -103,7 +103,7 @@ Please use the following naming schemes, to make it easy to understand what is g ### Nextflow version bumping -If you are using a new feature from core Nextflow, you may bump the minimum required version of nextflow in the pipeline with: `nf-core bump-version --nextflow . [min-nf-version]` +If you are using a new feature from core Nextflow, you may bump the minimum required version of nextflow in the pipeline with: `nf-core pipelines bump-version --nextflow . 
[min-nf-version]` ### Images and figures diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 257da8265..dd3ef7a97 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -9,7 +9,6 @@ body: - [nf-core website: troubleshooting](https://nf-co.re/usage/troubleshooting) - [nf-core/proteinfold pipeline documentation](https://nf-co.re/proteinfold/usage) - - type: textarea id: description attributes: diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 8dc3e6a46..992c391e8 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -17,7 +17,7 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/prot - [ ] If you've fixed a bug or added code that should be tested, add tests! - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/proteinfold/tree/master/.github/CONTRIBUTING.md) - [ ] If necessary, also make a PR on the nf-core/proteinfold _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. -- [ ] Make sure your code lints (`nf-core lint`). +- [ ] Make sure your code lints (`nf-core pipelines lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). - [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. 
diff --git a/.github/actions/get-shards/action.yml b/.github/actions/get-shards/action.yml new file mode 100644 index 000000000..34085279f --- /dev/null +++ b/.github/actions/get-shards/action.yml @@ -0,0 +1,69 @@ +name: "Get number of shards" +description: "Get the number of nf-test shards for the current CI job" +inputs: + max_shards: + description: "Maximum number of shards allowed" + required: true + paths: + description: "Component paths to test" + required: false + tags: + description: "Tags to pass as argument for nf-test --tag parameter" + required: false +outputs: + shard: + description: "Array of shard numbers" + value: ${{ steps.shards.outputs.shard }} + total_shards: + description: "Total number of shards" + value: ${{ steps.shards.outputs.total_shards }} +runs: + using: "composite" + steps: + - name: Install nf-test + uses: nf-core/setup-nf-test@v1 + with: + version: ${{ env.NFT_VER }} + - name: Get number of shards + id: shards + shell: bash + run: | + # Run nf-test with dynamic parameter + nftest_output=$(nf-test test \ + --profile +docker \ + $(if [ -n "${{ inputs.tags }}" ]; then echo "--tag ${{ inputs.tags }}"; fi) \ + --dry-run \ + --ci \ + --changed-since HEAD^) || { + echo "nf-test command failed with exit code $?" + echo "Full output: $nftest_output" + exit 1 + } + echo "nf-test dry-run output: $nftest_output" + + # Default values for shard and total_shards + shard="[]" + total_shards=0 + + # Check if there are related tests + if echo "$nftest_output" | grep -q 'No tests to execute'; then + echo "No related tests found." + else + # Extract the number of related tests + number_of_shards=$(echo "$nftest_output" | sed -n 's|.*Executed \([0-9]*\) tests.*|\1|p') + if [[ -n "$number_of_shards" && "$number_of_shards" -gt 0 ]]; then + shards_to_run=$(( $number_of_shards < ${{ inputs.max_shards }} ? $number_of_shards : ${{ inputs.max_shards }} )) + shard=$(seq 1 "$shards_to_run" | jq -R . | jq -c -s .) 
+ total_shards="$shards_to_run" + else + echo "Unexpected output format. Falling back to default values." + fi + fi + + # Write to GitHub Actions outputs + echo "shard=$shard" >> $GITHUB_OUTPUT + echo "total_shards=$total_shards" >> $GITHUB_OUTPUT + + # Debugging output + echo "Final shard array: $shard" + echo "Total number of shards: $total_shards" diff --git a/.github/actions/nf-test/action.yml b/.github/actions/nf-test/action.yml new file mode 100644 index 000000000..3b9724c76 --- /dev/null +++ b/.github/actions/nf-test/action.yml @@ -0,0 +1,111 @@ +name: "nf-test Action" +description: "Runs nf-test with common setup steps" +inputs: + profile: + description: "Profile to use" + required: true + shard: + description: "Shard number for this CI job" + required: true + total_shards: + description: "Total number of test shards(NOT the total number of matrix jobs)" + required: true + paths: + description: "Test paths" + required: true + tags: + description: "Tags to pass as argument for nf-test --tag parameter" + required: false +runs: + using: "composite" + steps: + - name: Setup Nextflow + uses: nf-core/setup-nextflow@v2 + with: + version: "${{ env.NXF_VERSION }}" + + - name: Set up Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 + with: + python-version: "3.14" + + - name: Install nf-test + uses: nf-core/setup-nf-test@v1 + with: + version: "${{ env.NFT_VER }}" + install-pdiff: true + + - name: Setup apptainer + if: contains(inputs.profile, 'singularity') + uses: eWaterCycle/setup-apptainer@main + + - name: Set up Singularity + if: contains(inputs.profile, 'singularity') + shell: bash + run: | + mkdir -p $NXF_SINGULARITY_CACHEDIR + mkdir -p $NXF_SINGULARITY_LIBRARYDIR + + - name: Conda setup + if: contains(inputs.profile, 'conda') + uses: conda-incubator/setup-miniconda@505e6394dae86d6a5c7fbb6e3fb8938e3e863830 # v3 + with: + auto-update-conda: true + conda-solver: libmamba + channels: conda-forge + channel-priority: strict + 
conda-remove-defaults: true + + - name: Run nf-test + shell: bash + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + run: | + nf-test test \ + --profile=+${{ inputs.profile }} \ + $(if [ -n "${{ inputs.tags }}" ]; then echo "--tag ${{ inputs.tags }}"; fi) \ + --ci \ + --changed-since HEAD^ \ + --verbose \ + --tap=test.tap \ + --shard ${{ inputs.shard }}/${{ inputs.total_shards }} + + # Save the absolute path of the test.tap file to the output + echo "tap_file_path=$(realpath test.tap)" >> $GITHUB_OUTPUT + + - name: Generate test summary + if: always() + shell: bash + run: | + # Add header if it doesn't exist (using a token file to track this) + if [ ! -f ".summary_header" ]; then + echo "# 🚀 nf-test results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Status | Test Name | Profile | Shard |" >> $GITHUB_STEP_SUMMARY + echo "|:------:|-----------|---------|-------|" >> $GITHUB_STEP_SUMMARY + touch .summary_header + fi + + if [ -f test.tap ]; then + while IFS= read -r line; do + if [[ $line =~ ^ok ]]; then + test_name="${line#ok }" + # Remove the test number from the beginning + test_name="${test_name#* }" + echo "| ✅ | ${test_name} | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + elif [[ $line =~ ^not\ ok ]]; then + test_name="${line#not ok }" + # Remove the test number from the beginning + test_name="${test_name#* }" + echo "| ❌ | ${test_name} | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + fi + done < test.tap + else + echo "| ⚠️ | No test results found | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + fi + + - name: Clean up + if: always() + shell: bash + run: | + sudo rm -rf /home/ubuntu/tests/ diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 3774758d3..ba6ce42f7 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml 
@@ -1,20 +1,25 @@ name: nf-core AWS full size tests -# This workflow is triggered on published releases. +# This workflow is triggered on PRs opened against the main/master branch. # It can be additionally triggered manually with GitHub actions workflow dispatch button. # It runs the -profile 'test_full' on AWS batch on: + workflow_dispatch: + pull_request_review: + types: [submitted] release: types: [published] - workflow_dispatch: + jobs: run-platform: name: Run AWS full tests - if: github.repository == 'nf-core/proteinfold' + # run only if the PR is approved by at least 2 reviewers and against the master/main branch or manually triggered + if: github.repository == 'nf-core/proteinfold' && github.event.review.state == 'approved' && (github.event.pull_request.base.ref == 'master' || github.event.pull_request.base.ref == 'main') || github.event_name == 'workflow_dispatch' || github.event_name == 'release' runs-on: ubuntu-latest - # Do a full-scale run on each of the mode + # Do a full test on each of the modes strategy: matrix: + compute_env: ["gpu"] mode: [ "alphafold2_standard", @@ -25,27 +30,43 @@ jobs: "colabfold_multimer", "esmfold", "esmfold_multimer", + "boltz", + "helixfold3", + "rosettafold_all_atom", + "rosettafold2na", ] steps: + - name: Set revision variable + id: revision + run: | + echo "revision=${{ (github.event_name == 'workflow_dispatch' || github.event_name == 'release') && github.sha || 'dev' }}" >> "$GITHUB_OUTPUT" + + - name: Set compute environment + id: compute_env + run: | + echo "compute_env=${{ matrix.compute_env == 'gpu' && vars.TOWER_COMPUTE_ENV_GPU || vars.TOWER_COMPUTE_ENV }}" >> "$GITHUB_OUTPUT" + echo "Mode: ${{ matrix.mode }}" + echo "Compute env type: ${{ matrix.compute_env }}" + - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + workspace_id: ${{ vars.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ 
secrets.TOWER_COMPUTE_ENV }} - revision: ${{ github.sha }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/proteinfold/work-${{ github.sha }} + compute_env: ${{ steps.compute_env.outputs.compute_env }} + revision: ${{ steps.revision.outputs.revision }} + workdir: s3://${{ vars.AWS_S3_BUCKET }}/work/proteinfold/work-${{ steps.revision.outputs.revision }} parameters: | { "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/proteinfold/results-${{ github.sha }}/mode_${{ matrix.mode }}" + "outdir": "s3://${{ vars.AWS_S3_BUCKET }}/proteinfold/results-${{ steps.revision.outputs.revision }}", + "use_gpu": true } profiles: test_full_${{ matrix.mode }} - - uses: actions/upload-artifact@v4 - if: success() || failure() + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 with: - name: Seqera Platform debug log file + name: Seqera Platform debug log file ${{ matrix.mode }} path: | - seqera_platform_action_*.log - seqera_platform_action_*.json + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index ee725793f..a134c938e 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -14,20 +14,20 @@ jobs: - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + workspace_id: ${{ vars.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + compute_env: ${{ vars.TOWER_COMPUTE_ENV }} revision: ${{ github.sha }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/proteinfold/work-${{ github.sha }} + workdir: s3://${{ vars.AWS_S3_BUCKET }}/work/proteinfold/work-${{ github.sha }} parameters: | { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/proteinfold/results-test-${{ github.sha }}" + "outdir": "s3://${{ vars.AWS_S3_BUCKET }}/proteinfold/results-test-${{ github.sha }}" } 
profiles: test - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 with: name: Seqera Platform debug log file path: | - seqera_platform_action_*.log - seqera_platform_action_*.json + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index cabcdbddb..d1c7e5f4c 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -1,15 +1,17 @@ name: nf-core branch protection -# This workflow is triggered on PRs to master branch on the repository -# It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev` +# This workflow is triggered on PRs to `main`/`master` branch on the repository +# It fails when someone tries to make a PR against the nf-core `main`/`master` branch instead of `dev` on: pull_request_target: - branches: [master] + branches: + - main + - master jobs: test: runs-on: ubuntu-latest steps: - # PRs to the nf-core repo master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches + # PRs to the nf-core repo main/master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches - name: Check PRs if: github.repository == 'nf-core/proteinfold' run: | @@ -22,7 +24,7 @@ jobs: uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2 with: message: | - ## This PR is against the `master` branch :x: + ## This PR is against the `${{github.event.pull_request.base.ref}}` branch :x: * Do not close this PR * Click _Edit_ and change the `base` to `dev` @@ -32,9 +34,9 @@ jobs: Hi @${{ github.event.pull_request.user.login }}, - It looks like this pull-request is has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `master` branch. - The `master` branch on nf-core repositories should always contain code from the latest release. 
- Because of this, PRs to `master` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch. + It looks like this pull-request has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) ${{github.event.pull_request.base.ref}} branch. + The ${{github.event.pull_request.base.ref}} branch on nf-core repositories should always contain code from the latest release. + Because of this, PRs to ${{github.event.pull_request.base.ref}} are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch. You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page. Note that even after this, the test will continue to show as failing until you push a new commit. 
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 47ad67072..000000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: nf-core CI -# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors -on: - push: - branches: - - dev - pull_request: - release: - types: [published] - -env: - NXF_ANSI_LOG: false - -concurrency: - group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" - cancel-in-progress: true - -jobs: - test: - name: Run pipeline with test data - # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/proteinfold') }}" - runs-on: ubuntu-latest - strategy: - matrix: - NXF_VER: - - "23.04.0" - - "latest-everything" - parameters: - - "test" - - "test_alphafold2_split" - - "test_alphafold2_download" - - "test_colabfold_local" - - "test_colabfold_webserver" - - "test_colabfold_download" - - "test_esmfold" - - steps: - - name: Check out pipeline code - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - - - name: Install Nextflow - uses: nf-core/setup-nextflow@v2 - with: - version: "${{ matrix.NXF_VER }}" - - - name: Disk space cleanup - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - - name: Run pipeline with test data ${{ matrix.parameters }} profile - run: | - nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.parameters }},docker --outdir ./results_${{ matrix.parameters }} diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml index 0b6b1f272..6adb0fff4 100644 --- a/.github/workflows/clean-up.yml +++ b/.github/workflows/clean-up.yml @@ -10,7 +10,7 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9 + - uses: 
actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10 with: stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days." stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful." diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml index 640ac03cf..45884ff90 100644 --- a/.github/workflows/download_pipeline.yml +++ b/.github/workflows/download_pipeline.yml @@ -1,33 +1,42 @@ -name: Test successful pipeline download with 'nf-core download' +name: Test successful pipeline download with 'nf-core pipelines download' # Run the workflow when: # - dispatched manually -# - when a PR is opened or reopened to master branch +# - when a PR is opened or reopened to main/master branch # - the head branch of the pull request is updated, i.e. if fixes for a release are pushed last minute to dev. on: workflow_dispatch: inputs: testbranch: - description: "The specific branch you wish to utilize for the test execution of nf-core download." + description: "The specific branch you wish to utilize for the test execution of nf-core pipelines download." 
required: true default: "dev" pull_request: - types: - - opened - - edited - - synchronize - branches: - - master - pull_request_target: branches: + - main - master env: NXF_ANSI_LOG: false jobs: + configure: + runs-on: ubuntu-latest + outputs: + REPO_LOWERCASE: ${{ steps.get_repo_properties.outputs.REPO_LOWERCASE }} + REPOTITLE_LOWERCASE: ${{ steps.get_repo_properties.outputs.REPOTITLE_LOWERCASE }} + REPO_BRANCH: ${{ steps.get_repo_properties.outputs.REPO_BRANCH }} + steps: + - name: Get the repository name and current branch + id: get_repo_properties + run: | + echo "REPO_LOWERCASE=${GITHUB_REPOSITORY,,}" >> "$GITHUB_OUTPUT" + echo "REPOTITLE_LOWERCASE=$(basename ${GITHUB_REPOSITORY,,})" >> "$GITHUB_OUTPUT" + echo "REPO_BRANCH=${{ github.event.inputs.testbranch || 'dev' }}" >> "$GITHUB_OUTPUT" + download: runs-on: ubuntu-latest + needs: configure steps: - name: Install Nextflow uses: nf-core/setup-nextflow@v2 @@ -35,52 +44,91 @@ jobs: - name: Disk space cleanup uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" architecture: "x64" - - uses: eWaterCycle/setup-singularity@931d4e31109e875b13309ae1d07c70ca8fbc8537 # v7 + + - name: Setup Apptainer + uses: eWaterCycle/setup-apptainer@4bb22c52d4f63406c49e94c804632975787312b3 # v2.0.0 with: - singularity-version: 3.8.3 + apptainer-version: 1.3.4 - name: Install dependencies run: | python -m pip install --upgrade pip pip install git+https://github.com/nf-core/tools.git - - name: Get the repository name and current branch set as environment variable + - name: Make a cache directory for the container images run: | - echo "REPO_LOWERCASE=${GITHUB_REPOSITORY,,}" >> ${GITHUB_ENV} - echo "REPOTITLE_LOWERCASE=$(basename ${GITHUB_REPOSITORY,,})" >> ${GITHUB_ENV} - echo "REPO_BRANCH=${{ 
github.event.inputs.testbranch || 'dev' }}" >> ${GITHUB_ENV} + mkdir -p ./singularity_container_images - name: Download the pipeline env: - NXF_SINGULARITY_CACHEDIR: ./ + NXF_SINGULARITY_CACHEDIR: ./singularity_container_images run: | - nf-core download ${{ env.REPO_LOWERCASE }} \ - --revision ${{ env.REPO_BRANCH }} \ - --outdir ./${{ env.REPOTITLE_LOWERCASE }} \ + nf-core pipelines download ${{ needs.configure.outputs.REPO_LOWERCASE }} \ + --revision ${{ needs.configure.outputs.REPO_BRANCH }} \ + --outdir ./${{ needs.configure.outputs.REPOTITLE_LOWERCASE }} \ --compress "none" \ --container-system 'singularity' \ - --container-library "quay.io" -l "docker.io" -l "ghcr.io" \ + --container-library "quay.io" -l "docker.io" -l "community.wave.seqera.io/library/" \ --container-cache-utilisation 'amend' \ - --download-configuration + --download-configuration 'yes' - name: Inspect download - run: tree ./${{ env.REPOTITLE_LOWERCASE }} + run: tree ./${{ needs.configure.outputs.REPOTITLE_LOWERCASE }} + + - name: Inspect container images + run: tree ./singularity_container_images | tee ./container_initial + + - name: Count the downloaded number of container images + id: count_initial + run: | + image_count=$(ls -1 ./singularity_container_images | wc -l | xargs) + echo "Initial container image count: $image_count" + echo "IMAGE_COUNT_INITIAL=$image_count" >> "$GITHUB_OUTPUT" - name: Run the downloaded pipeline (stub) id: stub_run_pipeline continue-on-error: true env: - NXF_SINGULARITY_CACHEDIR: ./ + NXF_SINGULARITY_CACHEDIR: ./singularity_container_images NXF_SINGULARITY_HOME_MOUNT: true - run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -stub -profile test,singularity --outdir ./results + run: nextflow run ./${{needs.configure.outputs.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ needs.configure.outputs.REPO_BRANCH }}) -stub -profile test,singularity --outdir ./results - name: Run the downloaded pipeline (stub run not 
supported) id: run_pipeline - if: ${{ job.steps.stub_run_pipeline.status == failure() }} + if: ${{ steps.stub_run_pipeline.outcome == 'failure' }} env: - NXF_SINGULARITY_CACHEDIR: ./ + NXF_SINGULARITY_CACHEDIR: ./singularity_container_images NXF_SINGULARITY_HOME_MOUNT: true - run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -profile test,singularity --outdir ./results + run: nextflow run ./${{ needs.configure.outputs.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ needs.configure.outputs.REPO_BRANCH }}) -profile test,singularity --outdir ./results + + - name: Count the downloaded number of container images + id: count_afterwards + run: | + image_count=$(ls -1 ./singularity_container_images | wc -l | xargs) + echo "Post-pipeline run container image count: $image_count" + echo "IMAGE_COUNT_AFTER=$image_count" >> "$GITHUB_OUTPUT" + + - name: Compare container image counts + id: count_comparison + run: | + if [ "${{ steps.count_initial.outputs.IMAGE_COUNT_INITIAL }}" -ne "${{ steps.count_afterwards.outputs.IMAGE_COUNT_AFTER }}" ]; then + initial_count=${{ steps.count_initial.outputs.IMAGE_COUNT_INITIAL }} + final_count=${{ steps.count_afterwards.outputs.IMAGE_COUNT_AFTER }} + difference=$((final_count - initial_count)) + echo "$difference additional container images were \n downloaded at runtime . The pipeline has no support for offline runs!" + tree ./singularity_container_images > ./container_afterwards + diff ./container_initial ./container_afterwards + exit 1 + else + echo "The pipeline can be downloaded successfully!" 
+ fi + + - name: Upload Nextflow logfile for debugging purposes + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 + with: + name: nextflow_logfile.txt + path: .nextflow.log* + include-hidden-files: true diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix_linting.yml similarity index 80% rename from .github/workflows/fix-linting.yml rename to .github/workflows/fix_linting.yml index ddaa085ad..ab3220dd7 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix_linting.yml @@ -13,13 +13,13 @@ jobs: runs-on: ubuntu-latest steps: # Use the @nf-core-bot token to check out so we can push later - - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 with: token: ${{ secrets.nf_core_bot_auth_token }} # indication that the linting is being fixed - name: React on comment - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: eyes @@ -32,9 +32,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} # Install and run pre-commit - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" - name: Install pre-commit run: pip install pre-commit @@ -47,7 +47,7 @@ jobs: # indication that the linting has finished - name: react if linting finished succesfully if: steps.pre-commit.outcome == 'success' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: "+1" @@ -67,21 +67,21 @@ jobs: - name: react if linting 
errors were fixed id: react-if-fixed if: steps.commit-and-push.outcome == 'success' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: hooray - name: react if linting errors were not fixed if: steps.commit-and-push.outcome == 'failure' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: confused - name: react if linting errors were not fixed if: steps.commit-and-push.outcome == 'failure' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: issue-number: ${{ github.event.issue.number }} body: | diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 1fcafe880..7a527a346 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -1,11 +1,8 @@ name: nf-core linting # This workflow is triggered on pushes and PRs to the repository. -# It runs the `nf-core lint` and markdown lint tests to ensure +# It runs the `nf-core pipelines lint` and markdown lint tests to ensure # that the code meets the nf-core guidelines. 
on: - push: - branches: - - dev pull_request: release: types: [published] @@ -14,12 +11,12 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 - - name: Set up Python 3.12 - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + - name: Set up Python 3.14 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" - name: Install pre-commit run: pip install pre-commit @@ -31,27 +28,42 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 - name: Install Nextflow uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" architecture: "x64" + - name: read .nf-core.yml + uses: pietrobolcato/action-read-yaml@9f13718d61111b69f30ab4ac683e67a56d254e1d # 1.1.0 + id: read_yml + with: + config: ${{ github.workspace }}/.nf-core.yml + - name: Install dependencies run: | python -m pip install --upgrade pip - pip install nf-core + pip install nf-core==${{ steps.read_yml.outputs['nf_core_version'] }} + + - name: Run nf-core pipelines lint + if: ${{ github.base_ref != 'master' }} + env: + GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }} + run: nf-core -l lint_log.txt pipelines lint --dir ${GITHUB_WORKSPACE} --markdown lint_results.md - - name: Run nf-core lint + - name: Run nf-core pipelines lint --release + if: ${{ github.base_ref == 'master' }} env: GITHUB_COMMENTS_URL: ${{ 
github.event.pull_request.comments_url }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }} - run: nf-core -l lint_log.txt lint --dir ${GITHUB_WORKSPACE} --markdown lint_results.md + run: nf-core -l lint_log.txt pipelines lint --release --dir ${GITHUB_WORKSPACE} --markdown lint_results.md - name: Save PR number if: ${{ always() }} @@ -59,7 +71,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 40acc23f5..e6e9bc269 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3 + uses: dawidd6/action-download-artifact@ac66b43f0e6a346234dd65d4d0c8fbb31cb316e5 # v11 with: workflow: linting.yml workflow_conclusion: completed @@ -21,7 +21,7 @@ jobs: run: echo "pr_number=$(cat linting-logs/PR_number.txt)" >> $GITHUB_OUTPUT - name: Post PR comment - uses: marocchino/sticky-pull-request-comment@331f8f5b4215f0445d3c07b4967662a32a2d3e31 # v2 + uses: marocchino/sticky-pull-request-comment@773744901bac0e8cbb5a0dc842800d45e9b2b405 # v2 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} number: ${{ steps.pr_number.outputs.pr_number }} diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml new file mode 100644 index 000000000..c75cb0eea --- /dev/null +++ b/.github/workflows/nf-test.yml @@ -0,0 +1,142 @@ +name: Run nf-test +on: + pull_request: + paths-ignore: + - "docs/**" + - "**/meta.yml" + - "**/*.md" + - "**/*.png" + - "**/*.svg" + release: + types: [published] + workflow_dispatch: + +# Cancel if a newer run 
is started +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + NFT_VER: "0.9.3" + NFT_WORKDIR: "~" + NXF_ANSI_LOG: false + NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity + NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity + +jobs: + nf-test-changes: + name: nf-test-changes + runs-on: # use self-hosted runners + - runs-on=${{ github.run_id }}-nf-test-changes + - runner=4cpu-linux-x64 + outputs: + shard: ${{ steps.set-shards.outputs.shard }} + total_shards: ${{ steps.set-shards.outputs.total_shards }} + steps: + - name: Clean Workspace # Purge the workspace in case it's running on a self-hosted runner + run: | + ls -la ./ + rm -rf ./* || true + rm -rf ./.??* || true + ls -la ./ + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + fetch-depth: 0 + + - name: get number of shards + id: set-shards + uses: ./.github/actions/get-shards + env: + NFT_VER: ${{ env.NFT_VER }} + with: + max_shards: 7 + + - name: debug + run: | + echo ${{ steps.set-shards.outputs.shard }} + echo ${{ steps.set-shards.outputs.total_shards }} + + nf-test: + name: "${{ matrix.profile }} | ${{ matrix.NXF_VER }} | ${{ matrix.shard }}/${{ needs.nf-test-changes.outputs.total_shards }}" + needs: [nf-test-changes] + if: ${{ needs.nf-test-changes.outputs.total_shards != '0' }} + runs-on: # use self-hosted runners + - runs-on=${{ github.run_id }}-nf-test + - runner=4cpu-linux-x64 + strategy: + fail-fast: false + matrix: + shard: ${{ fromJson(needs.nf-test-changes.outputs.shard) }} + profile: [docker, singularity] + isMain: + - ${{ github.base_ref == 'master' || github.base_ref == 'main' }} + # # Exclude singularity on dev + exclude: + - isMain: false + profile: "singularity" + NXF_VER: + - "25.10.2" + - "latest-everything" + env: + NXF_ANSI_LOG: false + TOTAL_SHARDS: ${{ needs.nf-test-changes.outputs.total_shards }} + + 
steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + fetch-depth: 0 + + - name: Run nf-test + id: run_nf_test + uses: ./.github/actions/nf-test + continue-on-error: ${{ matrix.NXF_VER == 'latest-everything' }} + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + NXF_VERSION: ${{ matrix.NXF_VER }} + with: + profile: ${{ matrix.profile }} + shard: ${{ matrix.shard }} + total_shards: ${{ env.TOTAL_SHARDS }} + + - name: Report test status + if: ${{ always() }} + run: | + if [[ "${{ steps.run_nf_test.outcome }}" == "failure" ]]; then + echo "::error::Test with ${{ matrix.NXF_VER }} failed" + # Add to workflow summary + echo "## ❌ Test failed: ${{ matrix.profile }} | ${{ matrix.NXF_VER }} | Shard ${{ matrix.shard }}/${{ env.TOTAL_SHARDS }}" >> $GITHUB_STEP_SUMMARY + if [[ "${{ matrix.NXF_VER }}" == "latest-everything" ]]; then + echo "::warning::Test with latest-everything failed but will not cause workflow failure. Please check if the error is expected or if it needs fixing." 
+ fi + if [[ "${{ matrix.NXF_VER }}" != "latest-everything" ]]; then + exit 1 + fi + fi + + confirm-pass: + needs: [nf-test] + if: always() + runs-on: # use self-hosted runners + - runs-on=${{ github.run_id }}-confirm-pass + - runner=2cpu-linux-x64 + steps: + - name: One or more tests failed (excluding latest-everything) + if: ${{ contains(needs.*.result, 'failure') }} + run: exit 1 + + - name: One or more tests cancelled + if: ${{ contains(needs.*.result, 'cancelled') }} + run: exit 1 + + - name: All tests ok + if: ${{ contains(needs.*.result, 'success') }} + run: exit 0 + + - name: debug-print + if: always() + run: | + echo "::group::DEBUG: `needs` Contents" + echo "DEBUG: toJSON(needs) = ${{ toJSON(needs) }}" + echo "DEBUG: toJSON(needs.*.result) = ${{ toJSON(needs.*.result) }}" + echo "::endgroup::" diff --git a/.github/workflows/release-announcements.yml b/.github/workflows/release-announcements.yml index 03ecfcf72..431d3d445 100644 --- a/.github/workflows/release-announcements.yml +++ b/.github/workflows/release-announcements.yml @@ -12,8 +12,12 @@ jobs: - name: get topics and convert to hashtags id: get_topics run: | - echo "topics=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .topics[]' | awk '{print "#"$0}' | tr '\n' ' ')" >> $GITHUB_OUTPUT + echo "topics=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .topics[]' | awk '{print "#"$0}' | tr '\n' ' ')" | sed 's/-//g' >> $GITHUB_OUTPUT + - name: get description + id: get_description + run: | + echo "description=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .description')" >> $GITHUB_OUTPUT - uses: rzr/fediverse-action@master with: access-token: ${{ secrets.MASTODON_ACCESS_TOKEN }} @@ -22,48 +26,15 @@ jobs: # 
https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#release message: | Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! - + ${{ steps.get_description.outputs.description }} Please see the changelog: ${{ github.event.release.html_url }} ${{ steps.get_topics.outputs.topics }} #nfcore #openscience #nextflow #bioinformatics - send-tweet: - runs-on: ubuntu-latest - - steps: - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 - with: - python-version: "3.10" - - name: Install dependencies - run: pip install tweepy==4.14.0 - - name: Send tweet - shell: python - run: | - import os - import tweepy - - client = tweepy.Client( - access_token=os.getenv("TWITTER_ACCESS_TOKEN"), - access_token_secret=os.getenv("TWITTER_ACCESS_TOKEN_SECRET"), - consumer_key=os.getenv("TWITTER_CONSUMER_KEY"), - consumer_secret=os.getenv("TWITTER_CONSUMER_SECRET"), - ) - tweet = os.getenv("TWEET") - client.create_tweet(text=tweet) - env: - TWEET: | - Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! - - Please see the changelog: ${{ github.event.release.html_url }} - TWITTER_CONSUMER_KEY: ${{ secrets.TWITTER_CONSUMER_KEY }} - TWITTER_CONSUMER_SECRET: ${{ secrets.TWITTER_CONSUMER_SECRET }} - TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }} - TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }} - bsky-post: runs-on: ubuntu-latest steps: - - uses: zentered/bluesky-post-action@80dbe0a7697de18c15ad22f4619919ceb5ccf597 # v0.1.0 + - uses: zentered/bluesky-post-action@6461056ea355ea43b977e149f7bf76aaa572e5e8 # v0.3.0 with: post: | Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! 
diff --git a/.github/workflows/template-version-comment.yml b/.github/workflows/template-version-comment.yml new file mode 100644 index 000000000..e8560fc7c --- /dev/null +++ b/.github/workflows/template-version-comment.yml @@ -0,0 +1,46 @@ +name: nf-core template version comment +# This workflow is triggered on PRs to check if the pipeline template version matches the latest nf-core version. +# It posts a comment to the PR, even if it comes from a fork. + +on: pull_request_target + +jobs: + template_version: + runs-on: ubuntu-latest + steps: + - name: Check out pipeline code + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + ref: ${{ github.event.pull_request.head.sha }} + + - name: Read template version from .nf-core.yml + uses: nichmor/minimal-read-yaml@1f7205277e25e156e1f63815781db80a6d490b8f # v0.0.2 + id: read_yml + with: + config: ${{ github.workspace }}/.nf-core.yml + + - name: Install nf-core + run: | + python -m pip install --upgrade pip + pip install nf-core==${{ steps.read_yml.outputs['nf_core_version'] }} + + - name: Check nf-core outdated + id: nf_core_outdated + run: echo "OUTPUT=$(pip list --outdated | grep nf-core)" >> ${GITHUB_ENV} + + - name: Post nf-core template version comment + uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2 + if: | + contains(env.OUTPUT, 'nf-core') + with: + repo-token: ${{ secrets.NF_CORE_BOT_AUTH_TOKEN }} + allow-repeats: false + message: | + > [!WARNING] + > Newer version of the nf-core template is available. + > + > Your pipeline is using an old version of the nf-core template: ${{ steps.read_yml.outputs['nf_core_version'] }}. + > Please update your pipeline to the latest version. + > + > For more documentation on how to update your pipeline, please see the [nf-core documentation](https://github.com/nf-core/tools?tab=readme-ov-file#sync-a-pipeline-with-the-template) and [Synchronisation documentation](https://nf-co.re/docs/contributing/sync). 
+ # diff --git a/.github/workflows/workflow_json.yml b/.github/workflows/workflow_json.yml new file mode 100644 index 000000000..b83d850f2 --- /dev/null +++ b/.github/workflows/workflow_json.yml @@ -0,0 +1,93 @@ +name: workflow JSON generation +# This workflow generates a workflow JSON file for the nf-core/proteinfold pipeline using bioflow-insight +on: + pull_request_target: + branches: + - dev + +concurrency: + group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" + cancel-in-progress: true + +jobs: + generate-workflow-json: + name: Create workflow JSON using bioflow-insight + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6 + with: + token: ${{ secrets.NF_CORE_BOT_AUTH_TOKEN }} + + - name: Safety check + if: | + github.event_name != 'pull_request_target' || + github.event.pull_request.head.ref == 'dev' || + github.event.pull_request.head.ref == 'master' || + github.event.pull_request.head.ref == 'TEMPLATE' + run: | + echo "Safety check failed:" + echo " - github event that triggered the action: ${github.event_name} is not pull_request_target" + echo " - branch of origin: ${github.event.pull_request.head.ref} is either dev, master or TEMPLATE" + exit 1 + + # pull_request_target runs in the context of the base branch, so we need to checkout the PR branch + # Use the GitHub CLI to check out the PR: + - name: Checkout Pull Request + env: + GH_TOKEN: ${{ secrets.NF_CORE_BOT_AUTH_TOKEN }} + run: | + PR_NUMBER="${{ github.event.pull_request.number }}" + gh pr checkout $PR_NUMBER + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install bioflow-insight==2.0.9 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@v2 + + - name: Install Graphviz + run: | + sudo apt-get update + sudo apt-get install -y graphviz + + - name: Install prek + run: curl --proto '=https' --tlsv1.2 -LsSf 
https://github.com/j178/prek/releases/download/v0.2.22/prek-installer.sh | sh + + - name: Run bioflow-insight to generate metro map + id: generate-json + run: | + bioflow-insight ${GITHUB_WORKSPACE}/main.nf --analysis metroflow --output-dir ${GITHUB_WORKSPACE}/assets + + - name: Upload workflow JSON artifact + id: upload-artifact + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 + with: + name: workflow JSON file + path: | + assets/graphs/*.json + + - name: Commit and run prek to workflow JSON files + if: steps.generate-json.outcome == 'success' && steps.upload-artifact.outcome == 'success' + continue-on-error: true + env: + GH_TOKEN: ${{ secrets.NF_CORE_BOT_AUTH_TOKEN }} + run: | + git config user.email "core@nf-co.re" + git config user.name "nf-core-bot" + mkdir -p assets/workflow_jsons + mv assets/graphs/*.json assets/workflow_jsons/ + git add assets/workflow_jsons + prek run --config .pre-commit-config.yaml --files assets/workflow_jsons/* + git add assets/workflow_jsons + + - name: Commit and push workflow JSON files + if: steps.generate-json.outcome == 'success' && steps.upload-artifact.outcome == 'success' + env: + GH_TOKEN: ${{ secrets.NF_CORE_BOT_AUTH_TOKEN }} + run: | + git add assets/workflow_jsons + git status + git commit -m "[automated] Add workflow JSON artifacts (metro map)" + git push diff --git a/.gitignore b/.gitignore index 5124c9ac7..c162b2451 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ results/ testing/ testing* *.pyc +null/ +.nf* diff --git a/.gitpod.yml b/.gitpod.yml deleted file mode 100644 index 105a1821a..000000000 --- a/.gitpod.yml +++ /dev/null @@ -1,20 +0,0 @@ -image: nfcore/gitpod:latest -tasks: - - name: Update Nextflow and setup pre-commit - command: | - pre-commit install --install-hooks - nextflow self-update - - name: unset JAVA_TOOL_OPTIONS - command: | - unset JAVA_TOOL_OPTIONS - -vscode: - extensions: # based on nf-core.nf-core-extensionpack - - esbenp.prettier-vscode # 
Markdown/CommonMark linting and style checking for Visual Studio Code - - EditorConfig.EditorConfig # override user/workspace settings with settings found in .editorconfig files - - Gruntfuggly.todo-tree # Display TODO and FIXME in a tree view in the activity bar - - mechatroner.rainbow-csv # Highlight columns in csv files in different colors - # - nextflow.nextflow # Nextflow syntax highlighting - - oderwat.indent-rainbow # Highlight indentation level - - streetsidesoftware.code-spell-checker # Spelling checker for source code - - charliermarsh.ruff # Code linter Ruff diff --git a/.nf-core.yml b/.nf-core.yml index 69e8d9bfc..7e8bea365 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,6 +1,20 @@ -repository_type: pipeline -nf_core_version: "2.14.1" lint: files_unchanged: + - .github/workflows/linting.yml - .github/CONTRIBUTING.md + actions_schema_validation: false multiqc_config: false +nf_core_version: 3.5.1 +repository_type: pipeline +template: + author: Athanasios Baltzis, Jose Espinosa-Carrasco, Harshil Patel + description: Protein 3D structure prediction pipeline + force: false + is_nfcore: true + name: proteinfold + org: nf-core + outdir: . 
+ skip_features: + - fastqc + - igenomes + version: 2.0.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4dc0f1dcd..d06777a8f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,10 +4,24 @@ repos: hooks: - id: prettier additional_dependencies: - - prettier@3.2.5 - - - repo: https://github.com/editorconfig-checker/editorconfig-checker.python - rev: "2.7.3" + - prettier@3.6.2 + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 hooks: - - id: editorconfig-checker - alias: ec + - id: trailing-whitespace + args: [--markdown-linebreak-ext=md] + exclude: | + (?x)^( + .*ro-crate-metadata.json$| + modules/nf-core/.*| + subworkflows/nf-core/.*| + .*\.snap$ + )$ + - id: end-of-file-fixer + exclude: | + (?x)^( + .*ro-crate-metadata.json$| + modules/nf-core/.*| + subworkflows/nf-core/.*| + .*\.snap$ + )$ diff --git a/.prettierignore b/.prettierignore index 437d763d0..dd749d43d 100644 --- a/.prettierignore +++ b/.prettierignore @@ -10,3 +10,7 @@ testing/ testing* *.pyc bin/ +.nf-test/ +ro-crate-metadata.json +modules/nf-core/ +subworkflows/nf-core/ diff --git a/.prettierrc.yml b/.prettierrc.yml index c81f9a766..07dbd8bb9 100644 --- a/.prettierrc.yml +++ b/.prettierrc.yml @@ -1 +1,6 @@ printWidth: 120 +tabWidth: 4 +overrides: + - files: "*.{md,yml,yaml,html,css,scss,js,cff}" + options: + tabWidth: 2 diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..a33b527cc --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "markdown.styles": ["public/vscode_markdown.css"] +} diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fbddadb6..d91a695b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,12 +3,280 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [[1.1.1](https://github.com/nf-core/proteinfold/releases/tag/1.1.1)] - 2025-07-30 +## [[2.0.0](https://github.com/nf-core/proteinfold/releases/tag/2.0.0)] - 2026-03-27 -- Minor patch release to fix multiqc report. +### Enhancements & fixes + +- [[#177](https://github.com/nf-core/proteinfold/issues/177)] - Fix typo in some instances of model preset `alphafold2_ptm`. +- [[PR #178](https://github.com/nf-core/proteinfold/pull/178)] - Enable running multiple modes in parallel. +- [[#179](https://github.com/nf-core/proteinfold/issues/179)] - Produce an interactive html report for the predicted structures. +- [[#180](https://github.com/nf-core/proteinfold/issues/180)] - Implement Foldseek. +- [[#188](https://github.com/nf-core/proteinfold/issues/188)] - Fix colabfold image to run in gpus. +- [[PR #205](https://github.com/nf-core/proteinfold/pull/205)] - Change input schema from `sequence,fasta` to `id,fasta`. +- [[PR #210](https://github.com/nf-core/proteinfold/pull/210)] - Moving post-processing logic to a subworkflow, change wave images pointing to oras to point to https and refactor module to match nf-core folder structure. +- [[#214](https://github.com/nf-core/proteinfold/issues/214)] - Fix colabfold image to run in cpus after [#188](https://github.com/nf-core/proteinfold/issues/188) fix. +- [[PR #220](https://github.com/nf-core/proteinfold/pull/220)] - Add RoseTTAFold-All-Atom module. +- [[PR #223](https://github.com/nf-core/proteinfold/pull/223)] - Add HelixFold3 module. +- [[#235](https://github.com/nf-core/proteinfold/issues/235)] - Update samplesheet to new version (switch from `sequence` column to `id`). +- [[#239](https://github.com/nf-core/proteinfold/issues/239)] - Update alphafold2 standard mode Dockerfile. +- [[#240](https://github.com/nf-core/proteinfold/issues/240)] - Separate download and input of pdb `mmcif` files and `obsolete` database. 
+- [[#229](https://github.com/nf-core/proteinfold/issues/229)] - Add Boltz pipeline [PR #227](https://github.com/nf-core/proteinfold/pull/227). +- [[PR #249](https://github.com/nf-core/proteinfold/pull/249)] - Update pipeline template to [nf-core/tools 3.2.0](https://github.com/nf-core/tools/releases/tag/3.2.0). +- [[PR #271](https://github.com/nf-core/proteinfold/pull/271)] - Update RFAA and HF3 dockerfiles for quicker building and reduction in image size. +- [[PR #274](https://github.com/nf-core/proteinfold/pull/274)] - Simplify run_helixfold3 module and move arguments to `modules.config`. +- [[#276](https://github.com/nf-core/proteinfold/issues/276)] - Update helixfold3 dockerfile to make it compatible with H100 gpus. +- [[#259](https://github.com/nf-core/proteinfold/issues/259)] - Fix `esmfold` docker image to make it compatible with hopper GPU architecture. +- [[#281](https://github.com/nf-core/proteinfold/issues/281)] - Fix how argument `--nv` is passed to apptainer and singularity in the config. +- [[PR #283](https://github.com/nf-core/proteinfold/pull/283)] - Fixes to meet language server requirements and update link of the helixfold3 image. +- [[PR #287](https://github.com/nf-core/proteinfold/pull/287)] - Fixes symlinking of every mmcif file causing excess I/O. +- [[#293](https://github.com/nf-core/proteinfold/issues/293)] - Add back `alphafold2_model_preset` input to the call to `run_alphafold2_pred`. +- [[PR #294](https://github.com/nf-core/proteinfold/pull/294)] - Temporary downgrade of schema for passing CI tests with Nextflow edge version. +- [[#272](https://github.com/nf-core/proteinfold/issues/272)] - Colouring scheme conforming to AlphaFold2 confidence bands in html report. +- [[PR #297](https://github.com/nf-core/proteinfold/pull/297)] - Update pipeline template to [nf-core/tools 3.2.1](https://github.com/nf-core/tools/releases/tag/3.2.1). 
+- [[#273](https://github.com/nf-core/proteinfold/issues/273)] - Fixes comparison report to correctly label msa coverage plots with corresponding method label. +- [[#290](https://github.com/nf-core/proteinfold/issues/290)] - Update Alphafold2 split images to make them compatible with Hopper gpus. +- [[PR #302](https://github.com/nf-core/proteinfold/pull/302)] - Fix HF3 dbs and max_template_date. +- [[PR #305](https://github.com/nf-core/proteinfold/pull/305)] - Stop RFAA and HF3 symlinking scripts into workdir. +- [[PR #306](https://github.com/nf-core/proteinfold/pull/306)] - extract_output.py -> extract_metrics.py so pLDDT, MSA, PAE emitted as raw data .tsv files +- [[PR #307](https://github.com/nf-core/proteinfold/pull/307)] - Update Boltz-1 boilerplate and formatting. +- [[PR #314](https://github.com/nf-core/proteinfold/pull/314)] - Fix extract metrics for broken modules. +- [[PR #312](https://github.com/nf-core/proteinfold/pull/312)] - pTM & ipTM metrics now extracted +- [[PR #315](https://github.com/nf-core/proteinfold/pull/315)] - Add global db flag. +- [[#263](https://github.com/nf-core/proteinfold/issues/263)] - Removed broken colabfold options (`auto` and `alphafold2`) +- [[PR #316](https://github.com/nf-core/proteinfold/pull/316)] - Add process_gpu label to modules which use GPU. +- [[PR #319](https://github.com/nf-core/proteinfold/pull/319)] - Update boltz workflow to accept YAML as input. +- [[PR #322](https://github.com/nf-core/proteinfold/pull/322)] - Updates and reorganises the reference database directory structure. +- [[PR #329](https://github.com/nf-core/proteinfold/pull/329)] - Updates Boltz module to include Boltz-2. +- [[PR #332](https://github.com/nf-core/proteinfold/pull/332)] - Fix rare superposition bug in reports. +- [[PR #333](https://github.com/nf-core/proteinfold/pull/333)] - Updates the RFAA dockerfile for better versioning and smaller image size. 
+- [[PR #335](https://github.com/nf-core/proteinfold/pull/335)] - Update pipeline template to [nf-core/tools 3.3.1](https://github.com/nf-core/tools/releases/tag/3.3.1). +- [[PR #346](https://github.com/nf-core/proteinfold/pull/346)] - Update pipeline template to [nf-core/tools 3.3.2](https://github.com/nf-core/tools/releases/tag/3.3.2). +- [[PR #351](https://github.com/nf-core/proteinfold/pull/351)] - add chain-wise (i)pTM values and summary file for AF3-generation codes. +- [[PR #354](https://github.com/nf-core/proteinfold/pull/354)] - Update documentation with mode specific usage, capabilities. +- [[PR #355](https://github.com/nf-core/proteinfold/pull/355)] - Remove unnecessary params from Boltz and Helixfold3 modes. +- [[PR #356](https://github.com/nf-core/proteinfold/pull/356)] - Update AF2 defaults to use split mode and monomer_ptm model. +- [[PR #357](https://github.com/nf-core/proteinfold/pull/357)] - Update ColabFold module and image. +- [[PR #359](https://github.com/nf-core/proteinfold/pull/359)] - Harmonize parameters across modes. +- [[PR #360](https://github.com/nf-core/proteinfold/pull/360)] - Rename some DBs paths in the run modules so they are equal to those when DBs are downloaded. +- [[PR #362](https://github.com/nf-core/proteinfold/pull/362)] - Update boltz Dockerfile and image pinning specific version (2.0.3). +- [[#364](https://github.com/nf-core/proteinfold/issues/364)] - Move Dockerfiles to their corresponding modules. +- [[PR #370](https://github.com/nf-core/proteinfold/pull/370)] - Fix extract chain metrics. +- [[#367](https://github.com/nf-core/proteinfold/issues/367)] - Boltz post-processing crashes. +- [[#368](https://github.com/nf-core/proteinfold/issues/368)] - Helixfold3 iPTM output missing when dealing with monomers makes the process fail. +- [[#369](https://github.com/nf-core/proteinfold/issues/369)] - Download all Alphafold3 DBs. 
+- [[PR #350](https://github.com/nf-core/proteinfold/pull/350)] - PAE of model 0 in Boltz HTML report, AlphaFold2 to pass the build system +- [[PR #377](https://github.com/nf-core/proteinfold/pull/377)] - Fix sequence msa synch for af2 split. +- [[#380](https://github.com/nf-core/proteinfold/issues/380)] - Fixes alphafold2_model_preset bug on retry. +- [[#382](https://github.com/nf-core/proteinfold/issues/382)] - Readds `--full_dbs` as a global option. +- [[#378](https://github.com/nf-core/proteinfold/issues/378)] - Fix nested obsolete pdbs from pdb70. +- [[#388](https://github.com/nf-core/proteinfold/issues/388)] - Fix colabfold prefix handling for output metrics. +- [[#387](https://github.com/nf-core/proteinfold/issues/387)] - Fix alphafold2_standard obsolete.dat path error. +- [[#389](https://github.com/nf-core/proteinfold/issues/389)] - Locked version numbers for HelixFold3 image to prevent bug caused by newer mamba versions. +- [[PR #397](https://github.com/nf-core/proteinfold/pull/397)] - Fix AF2 mgnify handling and improve version reporting for AlphaFold2 containers. +- [[PR #398](https://github.com/nf-core/proteinfold/pull/398)] - Fix issues with PREPARE_DBS subworkflows. +- [[PR #399](https://github.com/nf-core/proteinfold/pull/399)] - Update alphafold2 and alphafold2_pred Dockerfiles. +- [[PR #404](https://github.com/nf-core/proteinfold/pull/404)] - Boltz cache files moved to workdir, fixed version checks and Boltz stubRun. +- [[#401](https://github.com/nf-core/proteinfold/issues/401)] - Get rid of symlinking in the prediction tools processes when using "PREPARE_DBS" subworkflows +- [[#410](https://github.com/nf-core/proteinfold/issues/410)] - Switch RosettaFold2NA to Boltz-style multi-chain FASTA inputs and drop the interactions sheet. +- [[PR #407](https://github.com/nf-core/proteinfold/pull/407)] - Several changes to meet nf-core standards. 
+- [[PR #409](https://github.com/nf-core/proteinfold/pull/409)] - Force single pdb workflow outputs to return as a list +- [[PR #396](https://github.com/nf-core/proteinfold/pull/396)] - Split ColabFold into separate optimised containers with version pinning and significant size reduction. +- [[#412](https://github.com/nf-core/proteinfold/issues/412)] - Substitute "/" with "\_" from fasta headers used to name files when using "--split_fasta". +- [[PR #424](https://github.com/nf-core/proteinfold/pull/424)] - Bump docker image version for release to 2.0.0, make code more friendly with Nextflow language server and other format issues/fixes. +- [[#423](https://github.com/nf-core/proteinfold/issues/423)] - Generate json workflow using bioflow-insight. +- [[#425](https://github.com/nf-core/proteinfold/issues/425)] - Pass as a single input channel fasta and features to get rid of meta2 in RUN_ALPHAFOLD2_PRED. +- [[#440](https://github.com/nf-core/proteinfold/issues/440)] - Support single-letter RF2NA type tags (`type=P/R/D/S`) in ROSETTAFOLD2NA FASTA headers. +- [[PR #442](https://github.com/nf-core/proteinfold/pull/442)] - Bump version 2.6.1 of nf-schema, Nextflow minimum version to 25.10.2 and update utils_nfschema_plugin subworkflow. +- [[PR #443](https://github.com/nf-core/proteinfold/pull/443)] - Add documentation guide for contributing new prediction modes. +- [[PR #446](https://github.com/nf-core/proteinfold/pull/446)] - Fix warnings from Nextflow lint. +- [[PR #451](https://github.com/nf-core/proteinfold/pull/451)] - Remove af2 multimer padding from msa plots. +- [[#417](https://github.com/nf-core/proteinfold/issues/417)] - Add `boltz_use_kernels` parameter to enable/disable using optimized Triton-based CUDA kernels for Boltz inference. +- [[#417](https://github.com/nf-core/proteinfold/issues/417)] - Handle incompatible CUDA kernel errors in Boltz by automatically retrying with `--use_kernels` false. 
+- [[#285](https://github.com/nf-core/proteinfold/issues/285)] - Adding contributors to manifest. +- [[PR #460](https://github.com/nf-core/proteinfold/pull/460)] - Use `nvidia-smi` to obtain number of SM. +- [[PR #454](https://github.com/nf-core/proteinfold/pull/454)] - Update publishdir patterns for alphafold2 modules. +- [[PR #458](https://github.com/nf-core/proteinfold/pull/458)] - Update publishdir patterns for colabfold module. +- [[#313](https://github.com/nf-core/proteinfold/issues/313)] - Harmonize colabfold metrics extraction with other modes. +- [[#455](https://github.com/nf-core/proteinfold/issues/455)] - Fix colabfold monomer inheriting id from fasta header. +- [[#457](https://github.com/nf-core/proteinfold/issues/457)] - Fix colabfold multimer always downloading model weights. +- [[PR #461](https://github.com/nf-core/proteinfold/pull/461)] - Update publishdir patterns for HelixFold3 module +- [[PR #462](https://github.com/nf-core/proteinfold/pull/462)] - Update publishdir patterns for RoseTTAFold-All-Atom modules +- [[PR #464](https://github.com/nf-core/proteinfold/pull/464)] - Update publishdir patterns for Boltz module +- [[PR #466](https://github.com/nf-core/proteinfold/pull/466)] - Update module conf and publishdir patterns for ESMFold, pass through container args +- [[PR #469](https://github.com/nf-core/proteinfold/pull/469)] - HTML reports now in /reports output directory +- [[PR #468](https://github.com/nf-core/proteinfold/pull/468)] - Update publishdir patterns for Alphafold3 module +- [[PR #471](https://github.com/nf-core/proteinfold/pull/471)] - Update publishdir patterns for Rosettafold2na module +- [[#473](https://github.com/nf-core/proteinfold/issues/473)] - Add nf-test for `rosettafold-aa`, `rosettafold2na`, `helixfold3` and `boltz` modes. 
+- [[PR #475](https://github.com/nf-core/proteinfold/pull/475)] - Update and simplify outputs.md with the latest structure +- [[#480](https://github.com/nf-core/proteinfold/issues/480)] - Make version reporting consistent for all local modules. +- [[PR #482](https://github.com/nf-core/proteinfold/pull/482)] - Update utils_nfschema to fix help message with strict syntax. +- [[PR #483](https://github.com/nf-core/proteinfold/pull/483)] - Move foldseek logic to the `post_processing` subworkflow and set sensible time to aria2 processes. +- [[PR #492](https://github.com/nf-core/proteinfold/pull/492)] - Clean TODOs from code and create issues instead for 2.0.0 release preparation. +- [[PR #493](https://github.com/nf-core/proteinfold/pull/493)] - Standardise Dockerfiles labels and bump version 2.0.0 to prepare release. +- [[#494](https://github.com/nf-core/proteinfold/issues/494)] - Publish Colabfold DBs when downloaded to be directly consumable using `colabfold_db` parameter. +- [[#496](https://github.com/nf-core/proteinfold/issues/496)] - Publish all DBs when downloaded to be directly consumable using the corresponding mode parameter. +- [[#494](https://github.com/nf-core/proteinfold/issues/494)] - Publish Colabfold DBs when downloaded to be directly consumable using `colabfold_db` parameter. +- [[#499](https://github.com/nf-core/proteinfold/issues/499)] - Get rid of `ENTRYPOINT` in alphafold2 dockerfiles. +- [[PR #501](https://github.com/nf-core/proteinfold/pull/501)] - Move python code of `BOLTZ_FASTA` to a python script in `bin`. +- [[#503](https://github.com/nf-core/proteinfold/issues/503)] - Add checkIfExists validation to user-provided database paths across all prepare DB subworkflows. +- [[#507](https://github.com/nf-core/proteinfold/issues/507)] - Implement missing full tests and check that the others work before release 2.0.0. +- [[PR #509](https://github.com/nf-core/proteinfold/pull/509)] - Setup gpu environment for AWS full tests. 
+- [[#519](https://github.com/nf-core/proteinfold/issues/519)] - Fix AWS full test before release 2.0.0. +- [[PR #525](https://github.com/nf-core/proteinfold/pull/525)] - Reduce AlphaFold2 multimer full test to a single replicate. +- [[PR #531](https://github.com/nf-core/proteinfold/pull/531)] - Fix alphafold2_random_seed type. +- [[PR #538](https://github.com/nf-core/proteinfold/pull/538)] - Bump version 2.0.0 for release preparation. +- [[PR #563](https://github.com/nf-core/proteinfold/pull/563)] - Fix boltz csv sync. +- [[PR #570](https://github.com/nf-core/proteinfold/pull/570)] - Update parameters table in changelog. + +### Parameters + +| Old parameter | New parameter | +| ---------------------------- | ------------------------------------------- | +| `--max_template_date` | `--alphafold2_max_template_date` | +| `--bfd_link` | `--alphafold2_bfd_link` | +| `--small_bfd_link` | `--alphafold2_small_bfd_link` | +| `--mgnify_link` | `--alphafold2_mgnify_link` | +| `--pdb70_link` | `--alphafold2_pdb70_link` | +| `--pdb_mmcif_link` | `--alphafold2_pdb_mmcif_link` | +| `--pdb_obsolete_link` | `--alphafold2_pdb_obsolete_link` | +| `--uniref30_alphafold2_link` | `--alphafold2_uniref30_link` | +| `--uniref90_link` | `--alphafold2_uniref90_link` | +| `--pdb_seqres_link` | `--alphafold2_pdb_seqres_link` | +| `--uniprot_sprot_link` | `--alphafold2_uniprot_sprot_link` | +| `--uniprot_trembl_link` | `--alphafold2_uniprot_trembl_link` | +| `--bfd_path` | `--alphafold2_bfd_path` | +| `--small_bfd_path` | `--alphafold2_small_bfd_path` | +| `--mgnify_path` | `--alphafold2_mgnify_path` | +| `--pdb70_path` | `--alphafold2_pdb70_path` | +| `--pdb_mmcif_path` | `--alphafold2_pdb_mmcif_path` | +| `--uniref30_alphafold2_path` | `--alphafold2_uniref30_path` | +| `--uniref90_path` | `--alphafold2_uniref90_path` | +| `--pdb_seqres_path` | `--alphafold2_pdb_seqres_path` | +| `--uniprot_path` | `--alphafold2_uniprot_path` | +| `--colabfold_server` | `--use_msa_server` | +| 
`--host_url` | `--msa_server_url` | +| `--uniref30_colabfold_link` | `--colabfold_uniref30_link` | +| `--uniref30_colabfold_path` | `--colabfold_uniref30_path` | +| `--colabfold_db_path` | `--colabfold_envdb_path` | +| `--create_colabfold_index` | `--colabfold_create_index` | +| `--use_amber` | `--colabfold_use_amber` | +| `--use_templates` | `--colabfold_use_templates` | +| `--db_load_mode` | `--colabfold_db_load_mode` | +| `--num_recycles_colabfold` | `--colabfold_num_recycles` | +| `--num_recycles_esmfold` | `--esmfold_num_recycles` | +| | `--save_intermediates` | +| | `--split_fasta` | +| | `--db` | +| | `--alphafold2_full_dbs` | +| | `--uniref30_prefix` | +| | `--skip_visualisation` | +| | `--skip_foldseek` | +| | `--foldseek_easysearch_arg` | +| | `--alphafold2_random_seed` | +| | `--alphafold2_pdb_obsolete_path` | +| | `--alphafold3_db` | +| | `--alphafold3_small_bfd_link` | +| | `--alphafold3_small_bfd_path` | +| | `--alphafold3_mgnify_link` | +| | `--alphafold3_mgnify_path` | +| | `--alphafold3_pdb_mmcif_link` | +| | `--alphafold3_pdb_mmcif_path` | +| | `--alphafold3_uniref90_link` | +| | `--alphafold3_uniref90_path` | +| | `--alphafold3_pdb_seqres_link` | +| | `--alphafold3_pdb_seqres_path` | +| | `--alphafold3_uniprot_link` | +| | `--alphafold3_uniprot_path` | +| | `--alphafold3_params_path` | +| | `--alphafold3_rnacentral_link` | +| | `--alphafold3_rnacentral_path` | +| | `--alphafold3_nt_rna_link` | +| | `--alphafold3_nt_rna_path` | +| | `--alphafold3_rfam_link` | +| | `--alphafold3_rfam_path` | +| | `--boltz_model` | +| | `--boltz_use_potentials` | +| | `--boltz_use_kernels` | +| | `--boltz_ccd_link` | +| | `--boltz_ccd_path` | +| | `--boltz_model_link` | +| | `--boltz_model_path` | +| | `--boltz2_aff_link` | +| | `--boltz2_aff_path` | +| | `--boltz2_conf_link` | +| | `--boltz2_conf_path` | +| | `--boltz2_mols_link` | +| | `--boltz2_mols_path` | +| | `--boltz_db` | +| | `--helixfold3_db` | +| | `--helixfold3_precision` | +| | 
`--helixfold3_infer_times` | +| | `--helixfold3_max_template_date` | +| | `--helixfold3_uniclust30_link` | +| | `--helixfold3_uniclust30_path` | +| | `--helixfold3_ccd_preprocessed_link` | +| | `--helixfold3_ccd_preprocessed_path` | +| | `--helixfold3_rfam_link` | +| | `--helixfold3_rfam_path` | +| | `--helixfold3_init_models_link` | +| | `--helixfold3_init_models_path` | +| | `--helixfold3_bfd_link` | +| | `--helixfold3_bfd_path` | +| | `--helixfold3_small_bfd_link` | +| | `--helixfold3_small_bfd_path` | +| | `--helixfold3_uniprot_sprot_link` | +| | `--helixfold3_uniprot_trembl_link` | +| | `--helixfold3_uniprot_path` | +| | `--helixfold3_pdb_seqres_link` | +| | `--helixfold3_pdb_seqres_path` | +| | `--helixfold3_uniref90_link` | +| | `--helixfold3_uniref90_path` | +| | `--helixfold3_mgnify_link` | +| | `--helixfold3_mgnify_path` | +| | `--helixfold3_pdb_mmcif_link` | +| | `--helixfold3_pdb_mmcif_path` | +| | `--helixfold3_obsolete_link` | +| | `--helixfold3_obsolete_path` | +| | `--helixfold3_maxit_src_link` | +| | `--helixfold3_maxit_src_path` | +| | `--rosettafold_all_atom_db` | +| | `--rosettafold_all_atom_uniref30_link` | +| | `--rosettafold_all_atom_uniref30_path` | +| | `--rosettafold_all_atom_pdb100_link` | +| | `--rosettafold_all_atom_pdb100_path` | +| | `--rosettafold_all_atom_bfd_link` | +| | `--rosettafold_all_atom_bfd_path` | +| | `--rosettafold_all_atom_paper_weights_link` | +| | `--rosettafold_all_atom_paper_weights_path` | +| | `--rosettafold2na_db` | +| | `--rosettafold2na_uniref30_link` | +| | `--rosettafold2na_uniref30_path` | +| | `--rosettafold2na_bfd_link` | +| | `--rosettafold2na_bfd_path` | +| | `--rosettafold2na_pdb100_link` | +| | `--rosettafold2na_pdb100_path` | +| | `--rosettafold2na_weights_link` | +| | `--rosettafold2na_weights_path` | +| | `--rosettafold2na_rna_path` | +| | `--rfam_full_region_link` | +| | `--rfam_cm_link` | +| | `--rnacentral_rfam_annotations_link` | +| | `--rnacentral_id_mapping_link` | +| | 
`--rnacentral_sequences_link` | +| `--max_memory` | | +| `--max_cpus` | | +| `--max_time` | | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present. +> **NB:** Parameter has been **added** if just the new parameter information is present. +> **NB:** Parameter has been **removed** if parameter information is present only for the old parameter. + +## [[1.1.1](https://github.com/nf-core/proteinfold/releases/tag/1.1.1)] - 2025-07-30 ### Enhancements & fixes +- Minor patch release to fix multiqc report. + ## [[1.1.0](https://github.com/nf-core/proteinfold/releases/tag/1.1.0)] - 2025-06-25 ### Credits @@ -80,8 +348,8 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements | `--uniprot_sprot` | `--uniprot_sprot_link` | | `--uniprot_trembl` | `--uniprot_trembl_link` | | `--uniclust30_path` | `--uniref30_alphafold2_path` | -| `--uniref30` | `--uniref30_colabfold_link` | -| `--uniref30_path` | `--uniref30_colabfold_path` | +| `--uniref30` | `--colabfold_uniref30_link` | +| `--uniref30_path` | `--colabfold_uniref30_path` | | `--num_recycle` | `--num_recycles_colabfold` | | | `--num_recycles_esmfold` | | | `--uniref30_alphafold2_link` | @@ -92,6 +360,8 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements | | `--esm2_t36_3B_UR50D_contact_regression` | | | `--esmfold_params_path` | | | `--skip_multiqc` | +| | `--rosettafold_all_atom_db` | +| | `--helixfold3_db` | > **NB:** Parameter has been **updated** if both old and new parameter information is present. > **NB:** Parameter has been **added** if just the new parameter information is present. 
diff --git a/LICENSE b/LICENSE index 935559442..7f0ffbad5 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) Athanasios Baltzis, Jose Espinosa-Carrasco, Harshil Patel +Copyright (c) The nf-core/proteinfold team Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 92e39651a..1eed72699 100644 --- a/README.md +++ b/README.md @@ -5,17 +5,18 @@ -[![GitHub Actions CI Status](https://github.com/nf-core/proteinfold/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/proteinfold/actions/workflows/ci.yml) -[![GitHub Actions Linting Status](https://github.com/nf-core/proteinfold/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/proteinfold/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/proteinfold/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7629995-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7629995) -[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) +[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/proteinfold) +[![GitHub Actions CI Status](https://github.com/nf-core/proteinfold/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/proteinfold/actions/workflows/nf-test.yml) +[![GitHub Actions Linting Status](https://github.com/nf-core/proteinfold/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/proteinfold/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/proteinfold/results)[![Cite with 
Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.13135393-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.13135393) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.10.2-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/) +[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/proteinfold) -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23proteinfold-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/proteinfold)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) +[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23proteinfold-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/proteinfold)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) ## Introduction @@ -27,120 +28,82 @@ On release, automated continuous integration tests run the pipeline on a full-si ## Pipeline summary -![Alt text](docs/images/nf-core-proteinfold_metro_map_1.1.0.png?raw=true "nf-core-proteinfold 1.1.0 metro map") +![Alt text](docs/images/nf-core-proteinfold_metro_map_2.0.0.png?raw=true "nf-core-proteinfold 2.0.0 metro map") -1. Choice of protein structure prediction method: +| Mode | Protein | RNA | Small-molecule | PTM | Constraints | pLM | MSA server | Split MSA | +| :------------------------------------------------------------------------------------------------- | :-----: | :-: | :------------: | :-: | :---------: | :-: | :--------: | :-------: | +| [AlphaFold2](https://github.com/deepmind/alphafold) | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | +| [ESMFold](https://github.com/facebookresearch/esm) | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | +| [ColabFold](https://github.com/sokrypton/ColabFold) | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | +| [RoseTTAFold2NA](https://github.com/uw-ipd/RoseTTAFold2NA) | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| [RoseTTAFold-All-Atom](https://github.com/baker-laboratory/RoseTTAFold-All-Atom/) | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | +| [AlphaFold3](https://github.com/google-deepmind/alphafold3) | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | +| [HelixFold3](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3) | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | +| 
[Boltz](https://github.com/jwohlwend/boltz/) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | - i. [AlphaFold2](https://github.com/deepmind/alphafold) - Regular AlphaFold2 (MSA computation and model inference in the same process) +**nf-core/proteinfold** supports multiple tools for general molecular structure prediction. Each of the methods has overlapping functionality that can be utilized within the pipeline. All tools support predicting protein structure from an input amino acid sequence. The pipeline is composed of the following steps: - ii. [AlphaFold2 split](https://github.com/luisas/alphafold_split) - AlphaFold2 MSA computation and model inference in separate processes +1. Split input fasta file (Optional): The pipeline can split large batches of monomeric sequences (e.g. an entire genome) from a multi-entry fasta input using the `--split_fasta` flag. - iii. [ColabFold](https://github.com/sokrypton/ColabFold) - MMseqs2 API server followed by ColabFold +2. Prepare databases for chosen methods: The pipeline downloads any required reference data. - iv. [ColabFold](https://github.com/sokrypton/ColabFold) - MMseqs2 local search followed by ColabFold +3. Structure prediction: - v. [ESMFold](https://github.com/facebookresearch/esm) - Regular ESM + i. Combined: MSA Search + Model Inference: Structures are predicted from MSAs generated using built-in homolog search pipelines. + + ii. Split: AlphaFold2 MSA Search + Model Inference: The AlphaFold2 MSA generation pipeline is executed independently and then provided as input for AlphaFold2 structure prediction. + + iii. Split: ColabFold MSA Search + Model Inference: The ColabFold MSA generation pipeline is used to produce input MSAs which can be used by ColabFold and Boltz. + + iv. pLM: Protein Language Model: The ESMFold model is used to predict structures without generating an MSA. + +4. Generate Report: The pipeline produces an interactive HTML report to visualize structure prediction outputs. + +5. 
Comparison Report: The structures predicted by parallel modes are combined in an interactive HTML report. + +6. MultiQC: The overall QC statistics are summarized. ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. +First, prepare a samplesheet with your input data that looks as follows: + +```csv title="samplesheet.csv" +id,fasta +T1024,T1024.fasta +T1026,T1026.fasta +``` + Now, you can run the pipeline using: ```bash nextflow run nf-core/proteinfold \ -profile \ --input samplesheet.csv \ - --outdir + --outdir \ + --mode +``` + +The pipeline takes care of downloading the databases and parameters required by each of the modes. In case you have already downloaded the required files, you can skip this step by providing the path to the databases using the `--db` parameter. + +```bash +nextflow run nf-core/proteinfold \ + -profile \ + --input samplesheet.csv \ + --outdir \ + --mode \ + --db ``` -The pipeline takes care of downloading the databases and parameters required by AlphaFold2, Colabfold or ESMFold. In case you have already downloaded the required files, you can skip this step by providing the path to the databases using the corresponding parameter [`--alphafold2_db`], [`--colabfold_db`] or [`--esmfold_db`]. Please refer to the [usage documentation](https://nf-co.re/proteinfold/usage) to check the directory structure you need to provide for each of the databases. 
- -- The typical command to run AlphaFold2 mode is shown below: - - ```console - nextflow run nf-core/proteinfold \ - --input samplesheet.csv \ - --outdir \ - --mode alphafold2 \ - --alphafold2_db \ - --full_dbs \ - --alphafold2_model_preset monomer \ - --use_gpu \ - -profile - ``` - -- Here is the command to run AlphaFold2 splitting the MSA from the prediction execution: - - ```console - nextflow run nf-core/proteinfold \ - --input samplesheet.csv \ - --outdir \ - --mode alphafold2 \ - --alphafold2_mode split_msa_prediction \ - --alphafold2_db \ - --full_dbs \ - --alphafold2_model_preset monomer \ - --use_gpu \ - -profile - ``` - -- Below, the command to run colabfold_local mode: - - ```console - nextflow run nf-core/proteinfold \ - --input samplesheet.csv \ - --outdir \ - --mode colabfold \ - --colabfold_server local \ - --colabfold_db \ - --num_recycles_colabfold 3 \ - --use_amber \ - --colabfold_model_preset "AlphaFold2-ptm" \ - --use_gpu \ - --db_load_mode 0 - -profile - ``` - -- The typical command to run colabfold_webserver mode would be: - - ```console - nextflow run nf-core/proteinfold \ - --input samplesheet.csv \ - --outdir \ - --mode colabfold \ - --colabfold_server webserver \ - --host_url \ - --colabfold_db \ - --num_recycles_colabfold 3 \ - --use_amber \ - --colabfold_model_preset "AlphaFold2-ptm" \ - --use_gpu \ - -profile - ``` - - [!WARNING] - - > If you aim to carry out a large amount of predictions using the colabfold_webserver mode, please setup and use your own custom MMSeqs2 API Server. You can find instructions [here](https://github.com/sokrypton/ColabFold/tree/main/MsaServer). 
- -- The esmfold mode can be run using the command below: - - ```console - nextflow run nf-core/proteinfold \ - --input samplesheet.csv \ - --outdir \ - --mode esmfold \ - --esmfold_model_preset \ - --esmfold_db \ - --num_recycles_esmfold 4 \ - --use_gpu \ - -profile - ``` +> [!WARNING] +> The reference data for most methods is extremely large and may exceed individual user disk allocations on shared HPC systems. + +In order to run multiple methods simultaneously where reference data is stored at different locations, the `--db` flag can be overwritten for each specific mode (e.g. `--alphafold2_db`, `--colabfold_db`, `--esmfold_db` and `--rosettafold_all_atom_db`). Please refer to the [usage documentation](https://nf-co.re/proteinfold/usage) to check the directory structure you must provide for each database. > [!WARNING] -> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; -> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). +> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files). For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/proteinfold/usage) and the [parameter documentation](https://nf-co.re/proteinfold/parameters). @@ -150,6 +113,10 @@ To see the results of an example test run with a full size dataset refer to the For more details about the output files and reports, please refer to the [output documentation](https://nf-co.re/proteinfold/output). 
+## Adding new modes to the pipeline + +For details on how to contribute new modes to the pipeline please refer to the [Howto contribute new modes](https://nf-co.re/proteinfold/usage/HOWTO_CONTRIBUTE_NEW_MODES). + ## Credits nf-core/proteinfold was originally written by Athanasios Baltzis ([@athbaltzis](https://github.com/athbaltzis)), Jose Espinosa-Carrasco ([@JoseEspinosa](https://github.com/JoseEspinosa)), Luisa Santus ([@luisas](https://github.com/luisas)) and Leila Mansouri ([@l-mansouri](https://github.com/l-mansouri)) from [The Comparative Bioinformatics Group](https://www.crg.eu/en/cedric_notredame) at [The Centre for Genomic Regulation, Spain](https://www.crg.eu/) under the umbrella of the [BovReg project](https://www.bovreg.eu/) and Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [Seqera Labs, Spain](https://seqera.io/). diff --git a/assets/NO_FILE b/assets/NO_FILE new file mode 100644 index 000000000..e69de29bb diff --git a/assets/NO_FILE_PAE b/assets/NO_FILE_PAE new file mode 100644 index 000000000..e69de29bb diff --git a/assets/comparison_template.html b/assets/comparison_template.html new file mode 100644 index 000000000..fffe4b133 --- /dev/null +++ b/assets/comparison_template.html @@ -0,0 +1,820 @@ + + + + + + + Protein structure comparison + + + + + + + + + + + + + + + +
+ +
+ +
+ + + +
+ +
+ + + + + +
+ +
+ +
+ +
+
Navigation
+
+
+ Scroll up/down + to zoom in and out +
+
+ Click + drag + to rotate the structure +
+
+ CTRL + click + drag + to move the structure +
+
+ Click + an atom to bring it into focus +
+
+
+
+
Display
+
+ + +
+
+
+
+ +
+
+
+ +
+
    +
    + +
    +
    +
    Information
    +
    +
    Program: *prog_name*
    +
    ID: *sample_name*
    +
    + Average pLDDT: + +
    +
    +
    +
    +
    Download
    +
    + + +
    +
    +
    +
    +
    +
    pLDDT
    +
    +
    +
    +
    +
    +
    +
    Sequence Coverage
    +
    +
    + +
    + +
    +
    +
    + + + +
    +
    +

    + The Australian BioCommons + is supported by + Bioplatforms Australia +

    +

    + Bioplatforms Australia + is enabled by + NCRIS +

    +
    +
    +
    + + + diff --git a/assets/dummy_db_dir/RNA/dummy b/assets/dummy_db_dir/RNA/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/UniRef30_2020_06/dummy b/assets/dummy_db_dir/UniRef30_2020_06/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/bfd/dummy b/assets/dummy_db_dir/bfd/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/colabfold_envdb/dummy_colabfold_envdb b/assets/dummy_db_dir/colabfold_envdb/dummy_colabfold_envdb new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/colabfold_uniref30/dummy_uniref30 b/assets/dummy_db_dir/colabfold_uniref30/dummy_uniref30 new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/maxit-v11.200-prod-src/dummy b/assets/dummy_db_dir/maxit-v11.200-prod-src/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/mgnify/dummy b/assets/dummy_db_dir/mgnify/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/nt_rna/dummy b/assets/dummy_db_dir/nt_rna/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/params/HelixFold3-240814.pdparams b/assets/dummy_db_dir/params/HelixFold3-240814.pdparams new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/params/RFAA_paper_weights.pt b/assets/dummy_db_dir/params/RFAA_paper_weights.pt new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/params/alphafold_params_2021-07-14/dummy b/assets/dummy_db_dir/params/alphafold_params_2021-07-14/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/params/alphafold_params_2022-12-06/dummy b/assets/dummy_db_dir/params/alphafold_params_2022-12-06/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/params/boltz1_conf.ckpt b/assets/dummy_db_dir/params/boltz1_conf.ckpt new file mode 100644 
index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/params/boltz2_aff.ckpt b/assets/dummy_db_dir/params/boltz2_aff.ckpt new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/params/boltz2_conf.ckpt b/assets/dummy_db_dir/params/boltz2_conf.ckpt new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/params/ccd.pkl b/assets/dummy_db_dir/params/ccd.pkl new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/params/ccd_preprocessed_etkdg.pkl.gz b/assets/dummy_db_dir/params/ccd_preprocessed_etkdg.pkl.gz new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/params/mols/dummy b/assets/dummy_db_dir/params/mols/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/params/network/weights/RF2NA_apr23.pt b/assets/dummy_db_dir/params/network/weights/RF2NA_apr23.pt new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/pdb100/dummy b/assets/dummy_db_dir/pdb100/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/pdb70/dummy b/assets/dummy_db_dir/pdb70/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/pdb_mmcif/mmcif_files/dummy b/assets/dummy_db_dir/pdb_mmcif/mmcif_files/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/pdb_mmcif/obsolete.dat b/assets/dummy_db_dir/pdb_mmcif/obsolete.dat new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/pdb_seqres/dummy b/assets/dummy_db_dir/pdb_seqres/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/rfam/Rfam-14.9_rep_seq.fasta b/assets/dummy_db_dir/rfam/Rfam-14.9_rep_seq.fasta new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/rnacentral/dummy b/assets/dummy_db_dir/rnacentral/dummy new file mode 100644 index 000000000..e69de29bb diff --git 
a/assets/dummy_db_dir/small_bfd/dummy b/assets/dummy_db_dir/small_bfd/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/uniprot/dummy b/assets/dummy_db_dir/uniprot/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/uniref30/dummy b/assets/dummy_db_dir/uniref30/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/dummy_db_dir/uniref90/dummy b/assets/dummy_db_dir/uniref90/dummy new file mode 100644 index 000000000..e69de29bb diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 3b58e3d09..699e87c8c 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,5 @@ report_comment: > - This report has been generated by the nf-core/proteinfold - analysis pipeline. For information about how to interpret these results, please see the - documentation. + This report has been generated by the nf-core/proteinfold analysis pipeline. For information about how to interpret these results, please see the documentation. report_section_order: "nf-core-proteinfold-methods-description": order: -1000 diff --git a/assets/report_template.html b/assets/report_template.html new file mode 100644 index 000000000..48f644970 --- /dev/null +++ b/assets/report_template.html @@ -0,0 +1,934 @@ + + + + + + + Protein structure prediction + + + + + + + + + + + + + + + +
    + +
    + +
    + + + +
    + +
    + +
    +
    +
    +
    +
    +
    +
    Very High
    +
    +
    +
    +
    +
    +
    High
    +
    +
    +
    +
    +
    +
    Low
    +
    +
    +
    +
    +
    +
    Very Low
    +
    +
    +
    +
    +
    +

    + Alphafold produces a + + per-residue confidence score (pLDDT) + + between 0 and 100. Some regions below 50 pLDDT may be unstructured in isolation. +

    +
    +
    + + + + +
    + +
    + +
    +
    + +
    +
    Information
    +
    +
    +
    Program: *prog_name*
    +
    ID: *sample_name*
    +
    +
    + Average pLDDT: + +
    +
    +
    + +
    +
    Navigation
    +
    +
    + Scroll up/down + to zoom in and out +
    +
    + Click + drag + to rotate the structure +
    +
    + CTRL + click + drag + to move the structure +
    +
    + Click + an atom to bring it into focus +
    +
    +
    +
    + +
    +
    +
    Representations
    +
    + + + + +
    +
    +
    +
    +
    +
    Display
    +
    + + +
    +
    +
    +
    Download
    +
    + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + +
    + +
    +
    +
    + Sequence Coverage – MSA +
    +
    +
    + +
    +
    +
    +
    + + +
    +
    +
    + Residue confidence - pLDDT +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    + Residue-pair alignment error - PAE +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    +
    +
    + + + +
    +
    +

    + The Australian BioCommons + is supported by + Bioplatforms Australia +

    +

    + Bioplatforms Australia + is enabled by + NCRIS +

    +
    +
    +
    + + + diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 467fdcf0f..b458d6043 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,3 @@ -sequence,fasta +id,fasta T1024,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1024.fasta T1026,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1026.fasta diff --git a/assets/schema_input.json b/assets/schema_input.json index b16e3ae50..a8782c43d 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -1,5 +1,5 @@ { - "$schema": "http://json-schema.org/draft-07/schema", + "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/nf-core/proteinfold/master/assets/schema_input.json", "title": "nf-core/proteinfold pipeline - params.input schema", "description": "Schema for the file provided with params.input", @@ -8,6 +8,12 @@ "type": "object", "properties": { "sequence": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Sequence name must be provided and cannot contain spaces", + "meta": ["sequence"] + }, + "id": { "type": "string", "pattern": "^\\S+$", "errorMessage": "Sequence name must be provided and cannot contain spaces", @@ -17,10 +23,11 @@ "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.fa(sta)?$", - "errorMessage": "Fasta file must be provided, cannot contain spaces and must have extension '.fa' or '.fasta'" + "pattern": "^\\S+\\.(fa(sta)?|faa|yaml|yml|json)$", + "errorMessage": "Fasta, yaml or json file must be provided, cannot contain spaces and must have extension '.fa', '.faa', '.fasta', '.yaml', '.yml', or '.json'" } }, - "required": ["sequence", "fasta"] + "required": ["fasta"], + "anyOf": [{ "required": ["sequence"] }, { "required": ["id"] }] } } diff --git a/bin/boltz_wrapper.py b/bin/boltz_wrapper.py new file mode 100755 index 000000000..9de64b0fa --- /dev/null +++ b/bin/boltz_wrapper.py @@ 
-0,0 +1,95 @@ +#!/usr/bin/env python3 + +# Written by Jose Espinosa-Carrasco, released under the MIT license +# See https://opensource.org/license/mit for details +# Created on January 29th 2026 +# See https://github.com/nf-core/proteinfold/issues/417 for context +# Wrapper script to run Boltz with MIG patch for pynvml.nvmlDeviceGetNumGpuCores + +import sys +import pynvml +import os +import torch + +# Cores per SM by architecture +CORES_PER_SM = { + "Fermi": 32, + "Kepler": 192, + "Maxwell": 128, + "Pascal": 64, + "Volta": 64, + "Ampere": 64, + "Hopper": 128, + "Blackwell": 128, +} + +# Get number of CUDA cores for a MIG GPU instance +def get_cuda_cores(sm_count: int) -> int: + """Get CUDA cores for a MIG GPU instance profile.""" + + pynvml.nvmlInit() + handle = pynvml.nvmlDeviceGetHandleByIndex(0) + name = pynvml.nvmlDeviceGetName(handle) + + if "B100" in name or "B200" in name: + arch = "Blackwell" + elif "H100" in name or "H200" in name: + arch = "Hopper" + elif "A100" in name or "A30" in name or "A40" in name: + arch = "Ampere" + elif "V100" in name: + arch = "Volta" + elif "P100" in name: + arch = "Pascal" + else: + raise RuntimeError(f"Unknown GPU architecture for device: {name}") + + n_cores = sm_count * CORES_PER_SM[arch] + print(f">>> Detected GPU: {name}, Architecture: {arch}, SM Count: {sm_count}, Total CUDA Cores: {n_cores}") + + return sm_count * CORES_PER_SM[arch] + +# Apply the monkey patch to "nvmlDeviceGetNumGpuCores" pynvml function +def apply_mig_patch() -> None: + """Monkey-patch pynvml.nvmlDeviceGetNumGpuCores for MIG mode.""" + sm_count = get_mig_devices_and_sm_counts() + n_cores = get_cuda_cores(sm_count) + pynvml.nvmlDeviceGetNumGpuCores = lambda h: n_cores + print(">>> MIG PATCH: Successfully mocked nvmlDeviceGetNumGpuCores", file=sys.stderr) + +# Get the multiprocessor count for a CUDA device +def get_multiprocessor_count(device_index: int) -> int: + properties = torch.cuda.get_device_properties(device_index) + return 
int(properties.multi_processor_count) + +# Get the number of visible GPU devices and their multiprocessor counts, return the minimum +# multiprocessor count across all visible devices (to handle mixed MIG slice sizes) +def get_mig_devices_and_sm_counts() -> int: + + visible_device_count = torch.cuda.device_count() + current_device_index = torch.cuda.current_device() + + all_counts = [get_multiprocessor_count(i) for i in range(visible_device_count)] + current_count = get_multiprocessor_count(current_device_index) + min_count = min(all_counts) + + if visible_device_count > 1: + unique_counts = sorted(set(all_counts)) + if len(unique_counts) > 1: + print ( + "Visible GPU devices report different multiprocessor_count values " + f"(counts per device: {all_counts}). This is expected if you have mixed MIG slice sizes. " + "If you expected identical slices, check your allocation and device isolation." + ) + + print (f">>> Visible GPU devices: {visible_device_count}, Current device index: {current_device_index}, current device multiprocessor_count: {current_count}, all device multiprocessor_counts: {all_counts}, minimal multiprocessor_count: {min_count}", file=sys.stderr) + + return min_count + +# Main execution +if __name__ == "__main__": + apply_mig_patch() + + from boltz.main import cli + sys.argv[0] = 'boltz' + sys.exit(cli()) diff --git a/bin/extract_metrics.py b/bin/extract_metrics.py new file mode 100755 index 000000000..a026aa646 --- /dev/null +++ b/bin/extract_metrics.py @@ -0,0 +1,482 @@ +#!/usr/bin/env python + +import pickle +import os +import argparse +import json +#import torch moved to a conditional import since too bulky import if not used +import numpy as np +import csv +import string +from utils import plddt_from_struct_b_factor, get_chain_ids + +# TODO: Issue #309, make into a proper separate process, it its own module so that dependencies can be managed better +# TODO: Need a sense of ranking, so that metrics can be traced back to correct model 
structure, even if they're not in sequential order. The enumerates() here are not sufficient. +# Needs to be program-dependent, (see item below). +# TODO: look into have a --prog argument that could set filenames etc, logically seperate it? +# {name}_{prog}_{metric}.tsv might be easier for MultiQC to parse a complex workdir, than without the .prog +# TODO: read --prog from ${meta.model} in the NextFlow pipes. This also allows case switching in a proper EXTRACT_METRICS process. +# E.g. in main.nf of EXTACT_METRICS process, we could have: +# match ${meta.mode}: +# case 'alphafold2': +# ... +# case 'rosettafold_all_atom': +# ... +#... +# ^ overwrought with duplication, but can catch program specific weirdness, and lower barrier to adding new programs in the future. + +# TODO: Chain-wise iPTM since the relevant interface might not always be the average of all. +# Would complete Issue #308 +# Proposed format is pair-interfaces in rows, structure inference number in cols: https://github.com/nf-core/proteinfold/pull/312#issuecomment-2917709432 +# KR - changed to have both sides of the matrix, because it's not symmetrical (see comment in Issue #306) + +# Mapping of characters to integers for MSA parsing. +# 20 is for unknown characters, and 21 is for gaps. +AA_to_int = { + "A": 0, "C": 1, "D": 2, "E": 3, "F": 4, "G": 5, "H": 6, "I": 7, "K": 8, "L": 9, + "M": 10, "N": 11, "P": 12, "Q": 13, "R": 14, "S": 15, "T": 16, "V": 17, "W": 18, "Y": 19, + ".": 20, "-": 21 +} + +def a3m_to_int(a3m_file): + """ + Convert an A3M MSA representation into an integer representation (0-21). + """ + with open(a3m_file, "r") as f: + msa = f.read() + + # Convert each sequence in the MSA + int_sequences = [] + for idx, line in enumerate(msa.splitlines()): + if idx == 0 and not line.startswith(">"): # If there's an additional header (non-FASTA) skip it. 
E.g ColabFold + continue + + if not line.startswith(">"): # Ignore header lines + filtered_line = ''.join(char for char in line if not char.islower()) # Remove inserts (lower-case chars) in a3m + int_sequence = [AA_to_int.get(char.upper(), 20) for char in filtered_line] + int_sequences.append(int_sequence) + + int_sequences_array = np.array(int_sequences, dtype=object) + return int_sequences_array + +def format_msa_rows(msa_data): + return [[str(x) for x in val] for val in msa_data] + +def format_pae_rows(pae_data): + return [[f"{num:.4f}" for num in row] for row in pae_data] + +def format_iptm_rows(chain_pair_entries, chain_ids=None): + """ + Format iPTM data into a list of rows for writing to a TSV file. + Each row contains: the chain-pair in uppercase, e.g. "A:B", "B:A", A:C", etc. and then the iPTM value formatted to 4 decimal places. + """ + def idx_to_letter(idx): + """ Convert the index integer of the matrix to a letter representation that wraps to double representation, e.g. 0 -> A, 1 -> B, ..., 25 -> Z, 26 -> AA, 27 -> AB, etc. + This is somewhat compatible with how protein structure chain names are numbered by biochemists. 
+ But we should move away from fixed-format PDB files -- we have nothing to lose but our chains.""" + result = "" + while idx >= 0: + result = string.ascii_uppercase[idx % 26] + result + idx = idx // 26 - 1 + if idx < 0: + break + return result + + if chain_ids: + #would be better with some model_id sorting + iptm_rows = [[""]+[f"{chain_ids[idx[0]]}:{chain_ids[idx[1]]}" for idx, val in next(iter(chain_pair_entries.values()))]] + else: + iptm_rows = [[""]+[f"{idx_to_letter(idx[0])}:{idx_to_letter(idx[1])}" for idx, val in next(iter(chain_pair_entries.values()))]] + + for model_idx, chain_pair_entries_values in chain_pair_entries.items(): + iptm_rows.append([model_idx]+[f"{val:.4f}" for idx, val in chain_pair_entries_values]) + + return [list(row) for row in zip(*iptm_rows)] + + +def chain_iptm_matrix_to_pairs(iptm_matrix): + """ + Convert a chain-wise iPTM matrix to pair values by taking off-diagonal elements. + """ + # From AlphaFold3 output docs: + # 'chain_pair_iptm': An [num_chains, num_chains] array. + # Off-diagonal element (i, j) of the array contains the ipTM restricted to tokens from chains i and j. + # Diagonal element (i, i) contains the pTM restricted to chain i. + return [(idx, val) for idx, val in np.ndenumerate(iptm_matrix) if idx[0] != idx[1]] + +def chainwise_iptm_matrix_to_ptms(iptm_matrix): + return [(idx, val) for idx, val in np.ndenumerate(iptm_matrix) if idx[0] == idx[1]] + +def write_tsv(file_path, rows): + with open(file_path, 'w') as out_f: + writer = csv.writer(out_f, delimiter='\t') + writer.writerows(rows) + +def extract_structs_plddt_to_tsv(name, structures): + """ + Write out a tsv file contain pLDDTs for reading by MultiQC in nf-core/proteinfold + Uses utils function with BioPython PDB package to extract residue pLDDT values from the b-factor column. 
+ """ + plddt_cols = [plddt_from_struct_b_factor(structure) for structure in structures] + res_counts = [len(plddt_col) for plddt_col in plddt_cols] + + if len(set(res_counts)) != 1: + raise ValueError("Not all structures have the same number of residues!") + + rank_names = [f"rank_{i}" for i in range(len(structures))] + # Create header as the first row + plddt_rows = [["Positions"] + rank_names] + res_id_col = list(range(len(plddt_cols[0]))) + plddt_rows.extend(zip(res_id_col, *plddt_cols)) # Combine lists column-wise to make rows + write_tsv(f"{name}_plddt.tsv", plddt_rows) + +def read_pkl(name, pkl_files): + """ + Adapted from the Galaxy AlphaFold tool (https://github.com/usegalaxy-au/tools-au/blob/de94df520c8dc7b8652aedb92e90f6ebb312f95f/tools/alphafold/scripts/outputs.py), originally authored by @neoformit and @graceahall and funded by Australian Biocommons and QCIF Australia. + """ + ptm_data = {} + iptm_data = {} + for pkl_file in pkl_files: + print(f"Processing {pkl_file}") + data = pickle.load(open(pkl_file, "rb")) + + # Process MSA data + if pkl_file.endswith("final_features.pkl"): # HelixFold3 - This one must be first + write_tsv(f"{name}_msa.tsv", format_msa_rows(data["feat"]["msa"])) + elif pkl_file.endswith("features.pkl"): # AlphaFold2.3 + try: + N = data["num_alignments"][0] #monomer + except: + N = data["num_alignments"] #multimer + write_tsv(f"{name}_msa.tsv", format_msa_rows(data["msa"][:N])) + else: + model_info = os.path.basename(pkl_file).replace("result_", "").replace(".pkl", "") + #TODO: Make this explicit input + with open(os.path.join(os.path.dirname(pkl_file),"ranking_debug.json")) as f: + ranking_data = json.load(f)['order'] + model_id = ranking_data.index(model_info) + if 'predicted_aligned_error' not in data.keys(): + print(f"No PAE output in {pkl_file}, it was likely a monomer calculation") + else: + write_tsv(f"{name}_{model_id}_pae.tsv", format_pae_rows(data["predicted_aligned_error"])) + + if 'ptm' not in data.keys(): + print(f"No 
pTM/iPTM output in {pkl_file}, it was likely a monomer calculation") + else: + ptm_data[model_id] = f"{np.round(data['ptm'],3)}\n" + + if 'iptm' in data: + iptm_data[model_id] = f"{np.round(data['iptm'],3)}\n" + if ptm_data: + ptm_rows = sorted([[k, v.strip()] for k, v in ptm_data.items()], key=lambda x: x[0]) + write_tsv(f"{name}_ptm.tsv", ptm_rows) + + if iptm_data: + iptm_rows = sorted([[k, v.strip()] for k, v in iptm_data.items()], key=lambda x: x[0]) + write_tsv(f"{name}_iptm.tsv", iptm_rows) + +def read_paired_a3m(name, a3m_file): + msa_rows = a3m_to_int(a3m_file) + write_tsv(f"{name}_msa.tsv", format_msa_rows(msa_rows)) + +def read_a3m(name, a3m_files): + # RosettaFold-All-Atom + #TODO: DRY with unpaired below for Boltz + msa_rows = {} + for a3m_file in a3m_files: #Should already be alphabetical by chain + msa_rows[a3m_file] = a3m_to_int(a3m_file) + + final_rows = [] + temp_row = [] + for a3m_file in a3m_files: + temp_row.extend(msa_rows[a3m_file][0]) + final_rows.append(temp_row) + + # Un-paired TODO: get pairing code from RF-AA source + # https://github.com/baker-laboratory/RoseTTAFold-All-Atom/blob/main/rf2aa/data/parsers.py#L405 + msa_widths = [len(msa_rows[chain][0]) for chain in a3m_files] + msa_heights = [len(msa_rows[chain]) for chain in a3m_files] + + cum_total_rows = np.cumsum(msa_heights) + for row_idx in range(cum_total_rows[-1]): + temp_row = [] + + for i, chain in enumerate(a3m_files): + msa = msa_rows[chain] + width = msa_widths[i] + if i == 0: + minrow = 0 + else: + minrow = cum_total_rows[i-1] + maxrow = cum_total_rows[i] + + if minrow <= row_idx < maxrow: + msa_row_idx = row_idx - minrow + temp_row.extend(msa[msa_row_idx]) + else: + temp_row.extend(["21"] * width) #gap + final_rows.append(temp_row) + + write_tsv(f"{name}_msa.tsv", format_msa_rows(final_rows)) + +def read_npz(name, npz_files): + for idx, npz_file in enumerate(npz_files): + data = np.load(npz_file) + #Boltz PAE files if --write_full_pae is used + if 
npz_file.split('/')[-1].startswith('pae') and npz_file.endswith('.npz'): + model_id = os.path.basename(npz_file).split('_model_')[-1].split('.npz')[0] + write_tsv(f"{name}_{model_id}_pae.tsv", format_pae_rows(data["pae"])) + +# Boltz MSA processing +def read_csv(name, csv_files): + if not os.path.isfile(csv_files[0]): return #TODO: Fix temporary workaround + msa_rows = {} + unpaired_msa_rows = {} + for csv_file in sorted(csv_files, key=lambda x: int(x.split('_')[-1].split('.csv')[0])): + msa_lines = [] + unpaired_msa_lines = [] + with open(csv_file) as f: + f.readline() + for line in f: + if line.split(',')[0] == '-1' and len(csv_files)>1: #Server MSA appears as un-paired + unpaired_msa_lines.append(''.join(c for c in line.strip('\n').split(',')[1] if not c.islower())) + else: + msa_lines.append(''.join(c for c in line.strip('\n').split(',')[1] if not c.islower())) + msa_rows[csv_file.split('_')[-1].split('.csv')[0]] = [[str(AA_to_int.get(residue, 20)) for residue in line] for line in msa_lines] + unpaired_msa_rows[csv_file.split('_')[-1].split('.csv')[0]] = [[str(AA_to_int.get(residue, 20)) for residue in line] for line in unpaired_msa_lines] + + # Get Chain to MSA mapping (ie non-redundant for homomers) + # TODO: Make this explicit input + with open(f'boltz_results_{name}/processed/manifest.json') as f: + manifest = json.load(f) + + final_rows = [] + # Paired + for i in range(len(msa_rows["0"])): #The number of paired lines is common to all MSAs + temp_row = [] + #This needs to be fixed if inference is batched in future. 
+ for chain in manifest["records"][0]["chains"]: + j = chain["msa_id"].split("_")[-1] + temp_row.extend(msa_rows[j][i]) + final_rows.append(temp_row) + + # Un-paired + msa_widths = [len(msa_rows[chain["msa_id"].split("_")[-1]][0]) for chain in manifest["records"][0]["chains"]] + msa_heights = [len(unpaired_msa_rows[chain["msa_id"].split("_")[-1]]) for chain in manifest["records"][0]["chains"]] + + cum_total_rows = np.cumsum(msa_heights) + + for row_idx in range(cum_total_rows[-1]): + temp_row = [] + + for i, chain in enumerate(manifest["records"][0]["chains"]): + msa = unpaired_msa_rows[chain["msa_id"].split("_")[-1]] + width = msa_widths[i] + if i == 0: + minrow = 0 + else: + minrow = cum_total_rows[i-1] + maxrow = cum_total_rows[i] + + if minrow <= row_idx < maxrow: + msa_row_idx = row_idx - minrow + temp_row.extend(msa[msa_row_idx]) + else: + temp_row.extend(["21"] * width) #gap + final_rows.append(temp_row) + + write_tsv(f"{name}_msa.tsv", final_rows) + +def read_json(name, json_files): + ptm_data = {} + iptm_data = {} + chain_pair_iptm_data = {} # For iPTM data to be converted into formatted pairs with non-self elements + chain_pair_entries = {} + chainwise_ptms = {} + chain_ids = [] + + for idx, json_file in enumerate(json_files): + with open(json_file, 'r') as f: + data = json.load(f) + if json_file.endswith("_data.json"): #AF3 output with MSA info + # Can't just used format_msa_rows since there's FASTA headers in the json content + paired_msa_rows = [] + unpaired_msa_rows = [] + for chain in data['sequences']: + unpaired_MSA = chain['protein']['unpairedMsa'] + unpaired_msa_lines = [''.join(c for c in line if not c.islower()) for line in unpaired_MSA.split("\n") if line.strip() and not line.startswith(">")] + unpaired_msa_rows.append([[str(AA_to_int.get(residue, 20)) for residue in line] for line in unpaired_msa_lines]) + paired_MSA = chain['protein']['pairedMsa'] + paired_msa_lines = [''.join(c for c in line if not c.islower()) for line in 
paired_MSA.split("\n") if line.strip() and not line.startswith(">")] + paired_msa_rows.append([[str(AA_to_int.get(residue, 20)) for residue in line] for line in paired_msa_lines]) + + chains = len(data['sequences']) + final_rows = [] + # Paired + for i in range(len(paired_msa_rows[0])): #The number of paired lines is common to all MSAs + temp_row = [] + #This needs to be fixed if inference is batched in future. + for j in range(chains): + temp_row.extend(paired_msa_rows[j][i]) + final_rows.append(temp_row) + + # Un-paired + msa_widths = [len(paired_msa_rows[chain][0]) for chain in range(chains)] + msa_heights = [len(unpaired_msa_rows[chain]) for chain in range(chains)] + + cum_total_rows = np.cumsum(msa_heights) + + for row_idx in range(cum_total_rows[-1]): + temp_row = [] + + for i in range(chains): + msa = unpaired_msa_rows[i] + width = msa_widths[i] + if i == 0: + minrow = 0 + else: + minrow = cum_total_rows[i-1] + maxrow = cum_total_rows[i] + if minrow <= row_idx < maxrow: + msa_row_idx = row_idx - minrow + temp_row.extend(msa[msa_row_idx]) + else: + temp_row.extend(["21"] * width) #gap + final_rows.append(temp_row) + write_tsv(f"{name}_msa.tsv", final_rows) + #AF3 output with PAE info, or HF3 PAE data. TODO: Need to make sure the workflow points to [protein]/[protein]_rank1/all_results.json + + # TODO: I think I need to capture model_id and inference_id -- MUST FIX since this is so fragile and will be different for different programs. 
+ #if '_alphafold2_ptm_model_' in json_file: # ColabFold, multimer or monomer + ## Might want to cut more if I just want ${meta.id}_[metric].tsv + # model_id = os.path.basename(json_file) + # print(model_id) + if 'all_results' in json_file: # Individual predictions in HF3 + model_id = int(os.path.dirname(json_file).split('-rank')[-1]) #Use re-ranked output + if 'predictions' in json_file: # Boltz-1 confidences in predictions/[protein]/confidence_[protein]_model_*.json + # TODO: haven't tested this for multiple models with --diffusion_samples + model_id = os.path.basename(json_file).split('_model_')[-1].split('.json')[0] + #TODO: Fix this for AF3 - the top-ranked files are in the top-level directory + if 'confidences' in json_file: #Prevent crash when model_id is not defined + #model_id = os.path.basename(json_file).split('confidences_')[-1].split('.json')[0] + model_id = 0 + + if "pae" not in data.keys(): + print(f"No PAE output in {json_file}, it was likely a monomer calculation") + else: + write_tsv(f"{name}_{model_id}_pae.tsv", format_pae_rows(data["pae"])) + + if 'ptm' not in data.keys(): + print(f"No pTM/iPTM output in {json_file}, it was likely a monomer calculation") + #This message should change - currently called on boltz files not expected to contain ptm + else: + ptm_data[model_id] = f"{np.round(data['ptm'],3)}\n" + + if 'iptm' not in data.keys(): + print(f"No pTM/iPTM output in {json_file}, it was likely a monomer calculation") + else: + if data['iptm']: #ie not null + iptm_data[model_id] = f"{np.round(data['iptm'],3)}\n" + + if 'chain_pair_iptm' not in data.keys() and 'pair_chains_iptm' not in data.keys(): + print(f"No chain-wise iPTM output in {json_file}, it was likely a monomer calculation") + else: + if 'chain_pair_iptm' in data.keys(): + chain_pair_iptm_data = data['chain_pair_iptm'] + chain_iptm_matrix = np.array(chain_pair_iptm_data) + elif 'pair_chains_iptm' in data.keys(): #Boltz key + chain_pair_iptm_data = data['pair_chains_iptm'] + # 
casting to int works for sorting boltz - need to carefully check other modes + chain_iptm_matrix = np.array([[chain_pair_iptm_data[row][col] for col in sorted(chain_pair_iptm_data[row], key=int)] for row in sorted(chain_pair_iptm_data, key=int)]) + basename = os.path.basename(json_file) + dirname = os.path.dirname(json_file) + pdb_name = ".".join(basename[11:].split('.')[:-1])+'.pdb' #TODO: Fix magic number + chain_ids = get_chain_ids(os.path.join(dirname,pdb_name)) + else: + raise ValueError("No chain-wise iPTM data found in the JSON file.") + + chain_pair_entries[model_id] = chain_iptm_matrix_to_pairs(chain_iptm_matrix) + chainwise_ptms[model_id] = chainwise_iptm_matrix_to_ptms(chain_iptm_matrix) + + if chainwise_ptms: + write_tsv(f"{name}_chainwise_ptm.tsv", format_iptm_rows(chainwise_ptms, chain_ids=chain_ids)) + + if chain_pair_entries: + write_tsv(f"{name}_chainwise_iptm.tsv", format_iptm_rows(chain_pair_entries, chain_ids=chain_ids)) + + if ptm_data: + ptm_rows = [[k, v.strip()] for k, v in sorted(ptm_data.items(), key=lambda x: x[0])] + write_tsv(f"{name}_ptm.tsv", ptm_rows) + + if iptm_data: + iptm_rows = [[k, v.strip()] for k, v in sorted(iptm_data.items(), key=lambda x: x[0])] + write_tsv(f"{name}_iptm.tsv", iptm_rows) + + +def read_pt(name, pt_files): + import torch # moved to a conditional import since too bulky import if not used + #TODO: Handle this better when refactored - Is this just RFAA?? + for pt_file in pt_files: + with open(pt_file, 'rb') as f: # TODO: point to [protein]_aux.pt + data = torch.load(f, map_location="cpu") + if 'pae' in data: + # The pt file contains a tensor that needs to be cast as an array + # Squeeze leading dimension (batch?) 
+ write_tsv(f"{name}_0_pae.tsv", format_pae_rows(np.squeeze(data["pae"].numpy()))) + break + +def read_colabfold_metrics(name, colabfold_metrics_fns): + ptm_rows = [] + iptm_rows = [] + for fn in colabfold_metrics_fns: + with open(fn) as f: + data = json.load(f) + rank_id = int(fn.split("rank_")[1].split("_")[0])-1 + model_id = int(fn.split("model_")[1].split("_")[0]) + seed_id = int(fn.split("seed_")[1].split(".")[0]) + if "pae" in data: + write_tsv(f"{name}_{rank_id}_pae.tsv", format_pae_rows(data["pae"])) + if "ptm" in data: + ptm_rows.append((f"{rank_id}", data["ptm"])) + if "iptm" in data: + iptm_rows.append((f"{rank_id}", data["iptm"])) + if len(ptm_rows)>0: + write_tsv(f"{name}_ptm.tsv", sorted(ptm_rows, key = lambda x: x[0])) + if len(iptm_rows)>0: + write_tsv(f"{name}_iptm.tsv", sorted(iptm_rows, key = lambda x: x[0])) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--pkls", dest="pkls", required=False, nargs="+") # For reading both HelixFold3 and AlphaFold2 MSA formats + parser.add_argument("--npzs", dest="npzs", required=False, nargs="+") # For reading the Boltz-1 PAE formats. 
TODO: Boltz-1 MSA not implemented (go straight to .a3m file), implement + parser.add_argument("--a3ms", dest="a3ms", required=False, nargs="+") # For reading the RosettaFold-All-Atom MSA formats + parser.add_argument("--paired_a3m", dest="paired_a3m", required=False) # For reading the ColabFold MSA format + parser.add_argument("--csvs", dest="csvs", required=False, nargs="+") # For reading boltz csvs + parser.add_argument("--jsons", dest="jsons", required=False, nargs="+") # For reading the AF3 MSA & PAE, HF3 PAE + parser.add_argument("--colabfold_metrics_fns", required=False, nargs="+") + parser.add_argument("--pts", dest="pts", required=False, nargs="+") # For read RFAA pytorch model to get PAE data + parser.add_argument("--structs", dest="structs", required=False, nargs="+") + parser.add_argument("--name", default="untitled", dest="name") # might need a --name $meta.id + args = parser.parse_args() + + if args.pkls: + read_pkl(args.name, args.pkls) + if args.a3ms: + read_a3m(args.name, args.a3ms) + if args.paired_a3m: + read_paired_a3m(args.name, args.paired_a3m) + if args.csvs: + read_csv(args.name, args.csvs) + if args.npzs: + read_npz(args.name, args.npzs) + if args.jsons: + read_json(args.name, args.jsons) + if args.pts: + read_pt(args.name, args.pts) + if args.structs: + extract_structs_plddt_to_tsv(args.name, args.structs) + if args.colabfold_metrics_fns: + read_colabfold_metrics(args.name, args.colabfold_metrics_fns) + +if __name__ == "__main__": + main() diff --git a/bin/fasta_to_alphafold3_json.py b/bin/fasta_to_alphafold3_json.py new file mode 100755 index 000000000..b4640ed1f --- /dev/null +++ b/bin/fasta_to_alphafold3_json.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 + +import sys +import argparse +import json +import string +import re +from Bio import SeqIO + +def parse_args(args=None): + """ + Parse command line arguments for the script. 
+ + Required arguments: + FILE_IN: Input fasta file path + ID: Identifier for the protein sequence (will be used in output filename and JSON) + + Optional arguments: + -ms/--model_seed: AlphaFold3 model seed(s) to use (default: [11]) + """ + Description = "Convert fasta files to Alphafold3 json format." + Epilog = "Example usage: python fasta_to_alphafold3_json.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + + ## REQUIRED PARAMETERS + parser.add_argument( + "FILE_IN", + help="Input fasta file." + ) + parser.add_argument( + "ID", + help="ID for file name and for json id tag." + ) + + ## OPTIONAL PARAMETERS + parser.add_argument( + "-ms", + "--model_seed", + type=int, + nargs='+', + dest="MODEL_SEED", + default=[11], + help="Alphafold 3 model seed." + ) + + return parser.parse_args(args) + +def infer_entity_type(header, sequence): + """ + Infer the entity type from the header and sequence. + + Args: + header (str): Sequence header + sequence (str): Sequence + + Returns: + str: Entity type (one of "protein", "ccd", "smiles", "dna", "rna", or "unknown") + """ + ENTITY_TYPES = ["protein", "ccd", "smiles", "dna", "rna"] + + header_lower = header.lower() + + for entity in ENTITY_TYPES: + if entity in header_lower: + return entity + seq = sequence.strip() + seq_set = set(seq) + # RNA: only A,C,U,G,N + if len(seq_set - set("ACUGN")) == 0: + return "rna" + # DNA: only A,C,T,G,N + if len(seq_set - set("ACTGN")) == 0: + return "dna" + # Protein: only 20 AA, not just A,C,T,G,U,N + protein_letters = set("ACDEFGHIKLMNPQRSTVWY") + if len(seq_set - protein_letters) == 0 and not (seq_set <= set("ACUGTN")): + return "protein" + # SMILES: fallback + if re.fullmatch(r"[A-Za-z0-9@+\\-\\[\\]\\(\\)=#\\\$%]+", seq): + return "smiles" + return "unknown" + +def sanitised_name(id): + """ + Sanitize the input ID to create a valid filename. 
def fasta_to_alphafold3_json(file_in):
    """
    Convert a FASTA file to a list of entities in AlphaFold3 format.

    Each record is assigned the next chain ID from A-Z, a-z, 0-9 in file order and
    its type is inferred with infer_entity_type() from the record description
    and sequence.

    Args:
        file_in (str): Path to the input FASTA file

    Returns:
        list: List of (entity_type, chain_id, sequence) tuples, one per FASTA record

    Raises:
        IndexError: If the file holds more records than there are chain IDs (62)
    """
    VALID_CHAIN_IDS = list(string.ascii_uppercase) + list(string.ascii_lowercase) + [str(x) for x in range(0, 10)]
    entities = []

    for i, record in enumerate(SeqIO.parse(file_in, "fasta")):
        if i >= len(VALID_CHAIN_IDS):
            # Same exception type as the plain list lookup, but with a usable message.
            raise IndexError(
                f"'{file_in}' contains more than {len(VALID_CHAIN_IDS)} sequences; "
                "no chain IDs left to assign"
            )
        # str() is the public way to get the sequence text; the private _data
        # attribute is an implementation detail of Bio.Seq.
        sequence = str(record.seq)
        header = record.description
        entity_type = infer_entity_type(header, sequence)
        entities.append((entity_type, VALID_CHAIN_IDS[i], sequence))

    return entities
def main(args=None):
    """
    Main function to process FASTA files and create AlphaFold3 JSON files.

    The script:
    1. Parses command line arguments
    2. Strips a trailing ".json" from the ID and sanitizes it for filename use
    3. Reads and processes the FASTA file
    4. Creates the JSON structure
    5. Writes the output to a JSON file

    The output filename will be the sanitized ID with .json extension.

    Args:
        args (list, optional): Argument list handed to parse_args(); None means sys.argv
    """
    args = parse_args(args)
    sample_id = args.ID

    # Both the ".json" and the plain case sanitise the ID the same way, so only
    # the suffix removal needs to be conditional.
    if sample_id.endswith(".json"):
        sample_id = sample_id[:-5]
    reformatted_id = sanitised_name(sample_id)

    out_json = f"{reformatted_id}.json"

    entities = fasta_to_alphafold3_json(args.FILE_IN)
    json_dict = create_json_dict(reformatted_id, entities, args.MODEL_SEED)

    print ("json file " + out_json)
    with open(out_json, "w") as fout:
        json.dump(json_dict, fout, indent=4)
def _iter_fasta_records(lines):
    """Yield (header, sequence) for every '>' record; text before the first header is ignored."""
    header = None
    seq_lines = []
    for line in lines:
        line = line.strip()
        if line.startswith(">"):
            if header is not None:
                yield header, "".join(seq_lines)
            header = line
            seq_lines = []
        else:
            seq_lines.append(line)
    if header is not None:
        yield header, "".join(seq_lines)


def fasta_to_boltz(fasta_file, sample_id, msa_files):
    """
    Convert a FASTA file to Boltz format.

    Writes output_fasta/<sample_id>.fasta in the current working directory, with
    one ">CHAIN|entity_type[|msa_basename]" record per input record. Chain IDs are
    taken from A-Z, a-z, 0-9 in file order. When MSA files are given, each distinct
    protein sequence is paired with the next MSA file in order of first appearance;
    identical protein sequences share one MSA.

    Args:
        fasta_file (str): Path to the input FASTA file
        sample_id (str): Sample identifier for the output file
        msa_files (list): List of MSA file paths for protein sequences

    Raises:
        SystemExit: Exit status 1 when there are fewer MSA files than distinct
            protein sequences
        IndexError: If the file holds more records than there are chain IDs (62)
    """
    all_combinations = list(string.ascii_uppercase) + list(string.ascii_lowercase) + [str(x) for x in range(0, 10)]

    os.makedirs("output_fasta", exist_ok=True)

    with open(fasta_file, "r") as f:
        lines = f.readlines()

    # Maps each distinct protein sequence to the index of its MSA file.
    unique_proteins = {}
    records = []
    for counter, (header, sequence) in enumerate(_iter_fasta_records(lines)):
        if counter >= len(all_combinations):
            raise IndexError(
                f"'{fasta_file}' contains more than {len(all_combinations)} sequences; "
                "no chain IDs left to assign"
            )
        chain_id = all_combinations[counter]
        entity_type = infer_entity_type(header, sequence)
        msa = ""
        if entity_type == 'protein' and len(msa_files) > 0:
            # len() is evaluated before insertion, so a new sequence gets the next free index.
            this_msa = unique_proteins.setdefault(sequence, len(unique_proteins))
            # Check the index rather than comparing a basename with the supplied
            # paths: that comparison failed for any path containing a directory
            # and could never detect a missing file.
            if this_msa >= len(msa_files):
                print(
                    f"Can not find msa file for chain {chain_id}: {len(msa_files)} MSA file(s) "
                    f"supplied but at least {this_msa + 1} distinct protein sequences found"
                )
                sys.exit(1)
            msa = f"|{os.path.basename(msa_files[this_msa])}"
        records.append(f">{chain_id}|{entity_type}{msa}\n{sequence}\n")

    fasta_data = "".join(records)
    if len(fasta_data) > 0:
        with open(f"output_fasta/{sample_id}.fasta", "w") as outfile:
            outfile.write(fasta_data)
def fasta_to_json(file_in):
    """
    Read a FASTA file and describe its sequences as HelixFold3 entities.

    Header lines only act as separators; the lines between two headers are
    stripped and concatenated into one sequence. Runs that end up empty are
    skipped. Every entity is typed "protein" with a count of 1.

    Args:
        file_in (str): Path to the input FASTA file

    Returns:
        dict: {"entities": [...]} with one entry per non-empty sequence
    """
    collected = []
    pieces = []

    with open(file_in, "r") as handle:
        raw_lines = handle.readlines()

    for raw in raw_lines:
        text = raw.strip()
        if not text.startswith(">"):
            pieces.append(text)
            continue
        # A header closes whatever sequence was being accumulated.
        joined = "".join(pieces)
        if joined:
            collected.append(joined)
        pieces = []

    # Flush the sequence that follows the last header.
    trailing = "".join(pieces)
    if trailing:
        collected.append(trailing)

    return {
        "entities": [
            {"type": "protein", "sequence": residues, "count": 1}
            for residues in collected
        ]
    }
def infer_type(header, sequence):
    """
    Decide the one-letter chain type code for a FASTA entry.

    Resolution order:
      1. An explicit "type=", "entity=", "molecule=" or "mol=" token (":" also
         accepted) in the header whose value is a known alias.
      2. Any multi-letter alias appearing as a whole word in the header, tried in
         the order the aliases are declared.
      3. The sequence alphabet: only A/C/U/G/N -> "R"; only A/C/T/G/N -> "D";
         anything else -> "P".

    Args:
        header (str): FASTA header without the leading ">"
        sequence (str): Sequence text (callers in this file pass it upper-cased)

    Returns:
        str or None: "P" (protein), "R" (RNA), "D" (double-stranded DNA) or
        "S" (single-stranded DNA); None when the header is inconclusive and the
        sequence is empty.
    """
    type_aliases = {
        "protein": "P",
        "prot": "P",
        "aa": "P",
        "pep": "P",
        "peptide": "P",
        "p": "P",
        "rna": "R",
        "r": "R",
        "d": "D",
        "double": "D",
        "ds": "D",
        "dsdna": "D",
        "double_dna": "D",
        "s": "S",
        "single": "S",
        "ss": "S",
        "ssdna": "S",
        "single_dna": "S",
        "single-strand": "S",
        "singlestrand": "S",
    }
    header_lower = header.lower()
    match = re.search(
        r"(?:^|\s)(?:type|entity|molecule|mol)\s*[:=]\s*([A-Za-z0-9_-]+)",
        header_lower,
    )
    if match:
        candidate = match.group(1).lower()
        if candidate in type_aliases:
            return type_aliases[candidate]
    for alias, code in type_aliases.items():
        # Single-letter aliases are only honoured in an explicit type= token.
        if alias in {"p", "r", "d", "s"}:
            continue
        if re.search(r"\b" + re.escape(alias) + r"\b", header_lower):
            return code

    if not sequence:
        return None
    seq_set = set(sequence)
    if seq_set <= set("ACUGN"):
        return "R"
    # Default DNA to double-stranded unless explicitly marked single-strand.
    if seq_set <= set("ACTGN"):
        return "D"
    # Everything that is not purely nucleotide letters is treated as protein;
    # a separate protein-alphabet test would return the same code either way.
    return "P"
" + "Allowed types: protein (P), rna (R), double_dna (D), single_dna (S).\n" + ) + return 1 + safe_name = re.sub(r"[^A-Za-z0-9_.-]+", "_", header) or f"chain_{idx}" + filename = f"chain_{idx:03d}_{safe_name[:40]}.fa" + if filename in observed_files: + filename = f"chain_{idx:03d}_{idx}.fa" + observed_files.add(filename) + with open(output_dir / filename, "w") as fh: + fh.write(f">{header}\n") + for start in range(0, len(sequence), 80): + fh.write(sequence[start : start + 80] + "\n") + chain_records.append((chain_type, filename, header)) + + with open(output_dir / "chain_map.tsv", "w") as mapping: + mapping.write("type\tfilename\theader\n") + for chain_type, filename, header in chain_records: + mapping.write(f"{chain_type}\t{filename}\t{header}\n") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/fasta_to_yaml.py b/bin/fasta_to_yaml.py new file mode 100755 index 000000000..a3f928912 --- /dev/null +++ b/bin/fasta_to_yaml.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +import string + + +def parse_args(args=None): + """ + Parse command line arguments for the script. + + Required arguments: + FASTA: Input fasta file path + ID: Identifier for the protein sequence (will be used in output filename and YAML job_name) + """ + Description = "Convert fasta files to RoseTTAFold-All-Atom yaml format." + Epilog = "Example usage: python fasta_to_yaml.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + + ## REQUIRED PARAMETERS + parser.add_argument( + "FASTA", + help="Input fasta file." + ) + parser.add_argument( + "ID", + help="ID for file name and YAML job_name." + ) + + return parser.parse_args(args) + + +def fasta_to_yaml(file_in, id): + """ + Convert a FASTA file to a RoseTTAFold-All-Atom YAML config and individual chain FASTA files. 
+ + Creates: + - Individual chain FASTA files in out_fasta/ directory + + Returns: + str: YAML configuration string for RFAA + + Args: + file_in (str): Path to the input FASTA file + id (str): Identifier for the job name in the YAML config + """ + all_combinations = list(string.ascii_uppercase) + list(string.ascii_lowercase) + [str(x) for x in range(0, 10)] + yaml_template = f"defaults:\n - base\njob_name: \"{id}\"\nprotein_inputs:\n" + counter = 0 + fasta_data = "" + os.makedirs("out_fasta", exist_ok=True) + + with open(file_in, "r") as f: + lines = f.readlines() + + for line in lines: + line = line.strip() + if line.startswith(">"): + if len(fasta_data) > 0: + with open(f"out_fasta/{all_combinations[counter]}.fasta", "w") as fasta_file: + fasta_file.write(fasta_data + "\n") + yaml_template += f" {all_combinations[counter]}:\n fasta_file: {all_combinations[counter]}.fasta\n" + counter += 1 + fasta_data = f"{line}\n" + else: + fasta_data += f"{line}" + + if len(fasta_data) > 0: + with open(f"out_fasta/{all_combinations[counter]}.fasta", "w") as fasta_file: + fasta_file.write(fasta_data + "\n") + yaml_template += f" {all_combinations[counter]}:\n fasta_file: {all_combinations[counter]}.fasta\n" + + return yaml_template + + +def main(args=None): + """ + Main function to process FASTA files and create RoseTTAFold-All-Atom YAML files. 
def reset_residue_numbers(input_pdb, output_pdb):
    """
    Resets residue numbers (column 23-26) in a PDB file so the position starts from 1 for each chain
    and increment only when encountering a new residue.

    A residue is identified by its number together with its insertion code
    (column 27), so e.g. 10 and 10A count as two residues. Because the new
    numbering is sequential, a non-blank insertion code is replaced by a space
    in the output. Lines other than ATOM/HETATM are copied unchanged.

    Args:
        input_pdb (str): Path of the PDB file to read
        output_pdb (str): Path of the renumbered PDB file to write
    """
    with open(input_pdb, 'r') as infile, open(output_pdb, 'w') as outfile:
        current_residue_number = 1
        previous_residue_id = None
        previous_chain = None

        for line in infile:
            if not line.startswith(("ATOM", "HETATM")):
                # Write non-ATOM/HETATM lines (e.g., TER, PARENT) without changes
                outfile.write(line)
                continue

            chain = line[21]  # Extract the chain identifier (column 22)
            # Residue identity = number (column 23-26) + insertion code (column 27)
            residue_id = line[22:27].strip()

            # Reset residue numbering if the chain changes
            if chain != previous_chain:
                current_residue_number = 1
                previous_chain = chain
                previous_residue_id = None

            # Increment residue number if it's a new residue
            if residue_id != previous_residue_id:
                if previous_residue_id is not None:  # Only increment after the first residue
                    current_residue_number += 1
                previous_residue_id = residue_id

            # Blank a real insertion code; leave a space / line end untouched so
            # the line length never changes.
            if line[26:27].strip():
                remainder = " " + line[27:]
            else:
                remainder = line[26:]

            outfile.write(line[:22] + f"{current_residue_number:4}" + remainder)
def align_structures(structures):
    """
    Superimpose every structure onto the first one using the non-hydrogen atoms they all share.

    Args:
        structures (list[str]): Paths to the structure files. The parser (PDB or
            mmCIF) is chosen from the extension of the first path and is used for
            every file.

    Returns:
        list: Element 0 is the parsed reference structure object, left
        untransformed. Every later element is the path
        "aligned_structure_<idx>.pdb" of the superimposed structure, written to
        the current working directory.

    Raises:
        ValueError: If no structures are given, the first path ends in neither
            ".pdb" nor ".cif", or the structures have no atoms in common.
    """
    if not structures:
        raise ValueError("No structures provided for alignment.")

    first_path = structures[0]
    if first_path.endswith(".pdb"):
        parser = PDB.PDBParser(QUIET=True)
    elif first_path.endswith(".cif"):
        parser = PDB.MMCIFParser(QUIET=True)
    else:
        # Report the offending path; no loop variable exists yet at this point.
        raise ValueError(f"{first_path} is neither a PDB or mmCIF file!")

    parsed_structures = [parser.get_structure(f"structure-{idx}", structure) for idx, structure in enumerate(structures)]
    ref_structure = parsed_structures[0]

    def atom_key(atom):
        # (chain id, residue id, atom name) identifies an atom across structures
        residue = atom.get_parent()
        return (residue.get_parent().get_id(), residue.get_id(), atom.name)

    def get_atom_ids(structure):
        # Note: this is a *set* of atom keys; hydrogens are left out
        return {atom_key(atom) for atom in structure.get_atoms() if atom.element != 'H'}

    # Find atoms common to all structures for sub-alignment
    common_atoms = get_atom_ids(ref_structure)
    initial_atom_count = len(common_atoms)
    for structure in parsed_structures[1:]:
        common_atoms.intersection_update(get_atom_ids(structure))

    if len(common_atoms) < initial_atom_count:
        print(
            f"WARNING: Structures are not atomically identical. "
            f"Aligning using {len(common_atoms)} common atoms out of {initial_atom_count} "
            f"in the reference structure ({initial_atom_count - len(common_atoms)} atoms excluded)."
        )

    if not common_atoms:
        raise ValueError("No common atoms found between structures.")

    def extract_atoms(structure, atom_ids):
        # Returns the atom *objects* whose key is in atom_ids, in structure order
        return [atom for atom in structure.get_atoms() if atom_key(atom) in atom_ids]

    ref_atoms = extract_atoms(ref_structure, common_atoms)
    # The aligned structures will be the parsed structures aligned to the common atoms of the reference structure
    super_imposer = PDB.Superimposer()
    aligned_structures = []
    for idx, structure in enumerate(parsed_structures):
        # The reference structure doesn't need to be aligned so can be skipped
        if idx == 0:
            aligned_structures.append(structure)
            continue
        target_atoms = extract_atoms(structure, common_atoms)
        super_imposer.set_atoms(ref_atoms, target_atoms)
        super_imposer.apply(structure.get_atoms())

        io = PDB.PDBIO()
        io.set_structure(structure)
        io.save(f"aligned_structure_{idx}.pdb")
        aligned_structures.append(f"aligned_structure_{idx}.pdb")

    return aligned_structures
residue.get_atoms(): + res_atom_count +=1 + res_pLDDT_tot += atom.get_bfactor() + + # Residue-level mean for ESMfold atom-level pLDDT + res_pLDDT_ave = res_pLDDT_tot/res_atom_count + + if res_pLDDT_ave < 1.0: + res_pLDDT_ave *= 100 + plddt_values.append(res_pLDDT_ave) + + # Calculate the average PLDDT value for the current file + if plddt_values: + avg_plddt = sum(plddt_values) / len(plddt_values) + averages.append(round(avg_plddt, 3)) + else: + averages.append(0.0) + + if generate_tsv == "y": + output_file = f"{os.path.splitext(struct_file)[0]}_plddt.tsv" + with open(output_file, "w") as outfile: + outfile.write(" ".join(map(str, plddt_values)) + "\n") + output_lddt.append(output_file) + else: + plddt_values_string = " ".join(map(str, plddt_values)) + output_lddt.append(plddt_values_string) + + return output_lddt, averages + + +print("Starting...") + +version = "1.0.0" +parser = argparse.ArgumentParser() +parser.add_argument("--type", dest="in_type") +parser.add_argument( + "--generate_tsv", choices=["y", "n"], default="n", dest="generate_tsv" +) +parser.add_argument("--msa", dest="msa", required=True, nargs="+") +parser.add_argument("--pdb", dest="pdb", required=True, nargs="+") +parser.add_argument("--name", dest="name") +parser.add_argument("--output_dir", dest="output_dir") +parser.add_argument("--html_template", dest="html_template") +parser.add_argument("--version", action="version", version=f"{version}") +parser.set_defaults(output_dir="") +parser.set_defaults(in_type="comparison") +parser.set_defaults(name="") +args = parser.parse_args() + +lddt_data, lddt_averages = pdb_to_lddt(args.pdb, args.generate_tsv) + +generate_output(lddt_data, args.name, args.output_dir, args.generate_tsv, args.pdb) + +print("generating html report...") + +# Preprocess "esmfold" PDB files, to reset residues on additional chains +processed_pdbs = [ + pdb_file.replace(".pdb", "_aligned.pdb") for pdb_file in args.pdb +] + +for pdb_file in args.pdb: + print("Reseting", pdb_file, " 
into ", pdb_file.replace(".pdb", "_aligned.pdb")) + reset_residue_numbers(pdb_file, pdb_file.replace(".pdb", "_aligned.pdb")) + +structures = processed_pdbs # Use the final processed list +print("reference structure:", processed_pdbs[0]) +print("target structures:", ",".join(processed_pdbs[1:])) +aligned_structures = align_structures(structures) + +io = PDB.PDBIO() +ref_structure_path = "aligned_structure_0.pdb" +io.set_structure(aligned_structures[0]) +io.save(ref_structure_path) +aligned_structures[0] = ref_structure_path + +comparision_template = open(args.html_template, "r").read() +comparision_template = comparision_template.replace("*sample_name*", args.name) +comparision_template = comparision_template.replace("*prog_name*", args.in_type) + +args_pdb_array_js = ( + "const MODELS = [" + ",\n".join([f'"{model}"' for model in structures]) + "];" +) +comparision_template = comparision_template.replace("const MODELS = [];", args_pdb_array_js) + +seq_cov_imgs = [] +seq_cov_methods = [] +for msa, pdb in zip(args.msa, args.pdb): + if msa != "NO_FILE": + image_path = msa + method = pdb.split(".pdb")[0] + seq_cov_methods.append(method) + with open(image_path, "rb") as in_file: + encoded_image = base64.b64encode(in_file.read()).decode("utf-8") + seq_cov_imgs.append(f"data:image/png;base64,{encoded_image}") + +#MSA IMAGES +args_msa_array_js = ( + f"""const SEQ_COV_IMGS = [{", ".join([f'"{img}"' for img in seq_cov_imgs])}];""" +) +comparision_template = comparision_template.replace( + "const SEQ_COV_IMGS = [];", args_msa_array_js +) +#MSA IMAGE LABELS +args_msa_method_array_js = ( + f"""const SEQ_COV_METHODS = [{", ".join([f'"{method}"' for method in seq_cov_methods])}];""" +) +comparision_template = comparision_template.replace( + "const SEQ_COV_METHODS = [];", args_msa_method_array_js +) + +averages_js_array = f"const LDDT_AVERAGES = {lddt_averages};" +comparision_template = comparision_template.replace( + "const LDDT_AVERAGES = [];", averages_js_array +) + +i = 0 +for 
structure in aligned_structures: + comparision_template = comparision_template.replace( + f"*_data_ranked_{i}.pdb*", open(structure, "r").read().replace("\n", "\\n") + ) + i += 1 + +with open( + f"{args.output_dir}/{args.name + ('_' if args.name else '')}coverage_LDDT.html", + "r", +) as in_file: + lddt_html = in_file.read() + comparision_template = comparision_template.replace( + '
    ', lddt_html + ) + +with open( + f"{args.output_dir}/{args.name}_{args.in_type.lower()}_report.html", "w" +) as out_file: + out_file.write(comparision_template) diff --git a/bin/generate_report.py b/bin/generate_report.py new file mode 100755 index 000000000..a2c6e9db1 --- /dev/null +++ b/bin/generate_report.py @@ -0,0 +1,481 @@ +#!/usr/bin/env python + +import os +import argparse +from matplotlib import pyplot as plt +import numpy as np +from collections import OrderedDict +import base64 +import plotly.graph_objects as go +import re +from Bio import PDB + +def generate_pae_plot(pae_path, out_dir, name, save_image=False): + #save_image=False because plotly needs a local install of Google Chrome to save images..... + """ + Generate a Plotly heatmap for Predicted Aligned Error (PAE) data. + + Args: + pae (2D array): The PAE matrix. + Returns: + fig: A Plotly figure object of the PAE heatmap in green color scale + """ + pae = np.genfromtxt(pae_path, delimiter="\t") + fig = go.Figure() + + # Add heatmap + fig.add_trace( + go.Heatmap( + z=pae, + colorscale="Greens_r", + zmin=0, + zmax=30, + ) + ) + fig.update_layout( + xaxis=dict(title="Scored Residue", minallowed=0, maxallowed=pae.shape[0]-1), + yaxis=dict(title="Aligned Residue", minallowed=0, maxallowed=pae.shape[1]-1, autorange="reversed"), + width=600, + height=600, + ) + + if save_image: + image_path = f"{out_dir}/{name+('_' if name else '')}PAE.png" + fig.write_image(image_path, width=800, height=800) + + return fig + +def generate_output_images(msa_path, plddt_data, name, out_dir, in_type, generate_tsv, pdb): + msa = [] + if not msa_path.endswith("NO_FILE"): + with open(msa_path, "r") as in_file: + for line in in_file: + msa.append([int(x) for x in line.strip().split()]) + + # Pad jagged MSAs to avoid shape errors in downstream plotting + if msa: + max_len = max(len(row) for row in msa) + if any(len(row) != max_len for row in msa): + msa = [row + [21] * (max_len - len(row)) for row in msa] + + seqid = [] + 
for sequence in msa: + matches = [ + 1.0 if first == other else 0.0 for first, other in zip(msa[0], sequence) + ] + seqid.append(sum(matches) / len(matches)) + + seqid_sort = sorted(range(len(seqid)), key=seqid.__getitem__) + + non_gaps = [] + for sequence in msa: + non_gaps.append( + [float(num != 21) if num != 21 else float("nan") for num in sequence] + ) + + sorted_non_gaps = [non_gaps[i] for i in seqid_sort] + final = [] + for sorted_seq, identity in zip( + sorted_non_gaps, [seqid[i] for i in seqid_sort] + ): + final.append( + [ + value * identity if not isinstance(value, str) else value + for value in sorted_seq + ] + ) + + xaxis_size = len(final[0]) + yaxis_size = len(final) + + # ################################################################## + plt.figure(figsize=(16, 10), dpi=100) + # ################################################################## + plt.title("Sequence coverage", fontsize=30, pad=36) + plt.imshow( + final, + interpolation="nearest", + aspect="auto", + cmap="rainbow_r", + vmin=0, + vmax=1, + origin="lower", + extent=(0, xaxis_size, 0, yaxis_size) + ) + + column_counts = [0] * len(msa[0]) + for col in range(len(msa[0])): + for row in msa: + if row[col] != 21: + column_counts[col] += 1 + + plt.plot(column_counts, color="black") + plt.xlim(0, len(msa[0])) + plt.ylim(0, len(msa)) + + plt.tick_params(axis="both", which="both", labelsize=18) + + cbar = plt.colorbar() + cbar.set_label("Sequence identity to query", fontsize=24, labelpad=24) + cbar.ax.tick_params(labelsize=18) + plt.xlabel("Positions", fontsize=24, labelpad=24) + plt.ylabel("Sequences", fontsize=24, labelpad=36) + plt.savefig(f"{out_dir}/{name}_{in_type}_seq_coverage.png") + + # ################################################################## + + plddt_per_model = OrderedDict() + output_data = plddt_data + + if generate_tsv == "y": + for plddt_path in output_data: + with open(plddt_path, "r") as in_file: + plddt_per_model[os.path.basename(plddt_path)[:-4]] = [ + float(x) for 
x in in_file.read().strip().split() + ] + else: + for i, plddt_values_str in enumerate(output_data): + plddt_per_model[i] = [] + plddt_per_model[i] = [float(x) for x in plddt_values_str.strip().split()] + + fig = go.Figure() + for idx, (model_name, value_plddt) in enumerate(plddt_per_model.items()): + rank_label = os.path.splitext(pdb[idx])[0] + fig.add_trace( + go.Scatter( + x=list(range(len(value_plddt))), + y=value_plddt, + mode="lines", + name=rank_label, + text=[f"({i}, {value:.2f})" for i, value in enumerate(value_plddt)], + hoverinfo="text", + ) + ) + fig.update_layout( + title=dict(text="Predicted LDDT per position", x=0.5, xanchor="center"), + xaxis=dict( + title="Positions", showline=True, linecolor="black", gridcolor="WhiteSmoke", minallowed=0, maxallowed=len(value_plddt)-1 + ), + yaxis=dict( + title="Predicted LDDT", + range=[0, 100], + fixedrange=True, + showline=True, + linecolor="black", + gridcolor="WhiteSmoke", + ), + legend=dict(yanchor="bottom", y=0.02, xanchor="right", x=1, bordercolor="Black", borderwidth=1), + plot_bgcolor="white", + width=600, + height=600, + modebar_remove=["toImage", "zoomIn", "zoomOut"], + ) + html_content = fig.to_html( + full_html=False, + include_plotlyjs="cdn", + config={"displayModeBar": True, "displaylogo": False, "scrollZoom": True}, + ) + + with open( + f"{out_dir}/{name+('_' if name else '')}coverage_LDDT.html", "w" + ) as out_file: + out_file.write(html_content) + + if args.pae and not args.pae.endswith('NO_FILE_PAE'): + pae_fig = generate_pae_plot(args.pae, out_dir, name) + pae_html_content = pae_fig.to_html( + full_html=False, + include_plotlyjs="cdn", + config={"displayModeBar": True, "displaylogo": False, "scrollZoom": True}, + ) + with open( + f"{out_dir}/{name+('_' if name else '')}PAE.html", "w" + ) as pae_out_file: + pae_out_file.write(pae_html_content) + +def generate_plots(msa_path, plddt_paths, name, out_dir): + msa = [] + with open(msa_path, "r") as in_file: + for line in in_file: + msa.append([int(x) 
for x in line.strip().split()]) + + seqid = [] + for sequence in msa: + matches = [ + 1.0 if first == other else 0.0 for first, other in zip(msa[0], sequence) + ] + seqid.append(sum(matches) / len(matches)) + + seqid_sort = sorted(range(len(seqid)), key=seqid.__getitem__) + + non_gaps = [] + for sequence in msa: + non_gaps.append( + [float(num != 21) if num != 21 else float("nan") for num in sequence] + ) + + sorted_non_gaps = [non_gaps[i] for i in seqid_sort] + final = [] + for sorted_seq, identity in zip(sorted_non_gaps, [seqid[i] for i in seqid_sort]): + final.append( + [ + value * identity if not isinstance(value, str) else value + for value in sorted_seq + ] + ) + + # Plotting Sequence Coverage using Plotly + fig = go.Figure() + fig.add_trace( + go.Heatmap( + z=final, + colorscale="Rainbow", + zmin=0, + zmax=1, + ) + ) + fig.update_layout( + title="Sequence coverage", xaxis_title="Positions", yaxis_title="Sequences" + ) + # Save as interactive HTML instead of an image + fig.savefig(f"{out_dir}/{name+('_' if name else '')}seq_coverage.png") + + # Plotting Predicted LDDT per position using Plotly + plddt_per_model = OrderedDict() + plddt_paths.sort() + for plddt_path in plddt_paths: + with open(plddt_path, "r") as in_file: + plddt_per_model[os.path.basename(plddt_path)[:-4]] = [ + float(x) for x in in_file.read().strip().split() + ] + + i = 0 + for model_name, value_plddt in plddt_per_model.items(): + fig = go.Figure() + fig.add_trace( + go.Scatter( + x=list(range(len(value_plddt))), + y=value_plddt, + mode="lines", + name=model_name, + ) + ) + fig.update_layout(title="Predicted LDDT per Position") + fig.savefig(f"{out_dir}/{name+('_' if name else '')}coverage_LDDT_{i}.png") + i += 1 + + + +def align_structures(structures): + parser = PDB.PDBParser(QUIET=True) + structures = [ + parser.get_structure(f"Structure_{i}", pdb) for i, pdb in enumerate(structures) + ] + ref_structure = structures[0] + + common_atoms = set( + 
f"{atom.get_parent().get_parent().get_id()}-{atom.get_parent().get_id()[1]}-{atom.name}" + for atom in ref_structure.get_atoms() if not atom.element == 'H' + ) + #print(common_atoms) + for i, structure in enumerate(structures[1:], start=1): + common_atoms = common_atoms.intersection( + set( + f"{atom.get_parent().get_parent().get_id()}-{atom.get_parent().get_id()[1]}-{atom.name}" + for atom in structure.get_atoms() + ) + ) + + ref_atoms = [ + atom + for atom in ref_structure.get_atoms() + if f"{atom.get_parent().get_parent().get_id()}-{atom.get_parent().get_id()[1]}-{atom.name}" in common_atoms + ] + # print(ref_atoms) + super_imposer = PDB.Superimposer() + aligned_structures = [structures[0]] # Include the reference structure in the list + + for i, structure in enumerate(structures[1:], start=1): + target_atoms = [ + atom + for atom in structure.get_atoms() + if f"{atom.get_parent().get_parent().get_id()}-{atom.get_parent().get_id()[1]}-{atom.name}" in common_atoms + ] + + super_imposer.set_atoms(ref_atoms, target_atoms) + super_imposer.apply(structure.get_atoms()) + + aligned_structure = f"aligned_structure_{i}.pdb" + io = PDB.PDBIO() + io.set_structure(structure) + io.save(aligned_structure) + aligned_structures.append(aligned_structure) + + return aligned_structures + + +def pdb_to_lddt(struct_files, generate_tsv): + struct_files_sorted = struct_files + struct_files_sorted.sort() + + output_lddt = [] + averages = [] + + for struct_file in struct_files_sorted: + plddt_values = [] + + if struct_file.endswith('.pdb'): + parser = PDB.PDBParser(QUIET=True) + suffix = ".pdb" + elif struct_file.endswith('.cif'): + parser = PDB.MMCIFParser(QUIET=True) + suffix = ".cif" + else: + raise NotImplementedError("Reporting only supported for .pdb and .cif filetypes") + structure = parser.get_structure("", struct_file) + + for residue in structure.get_residues(): + res_pLDDT_tot = 0 + res_atom_count = 0 + + for atom in residue.get_atoms(): + res_atom_count +=1 + res_pLDDT_tot 
+= atom.get_bfactor() + + # Residue-level mean for ESMfold atom-level pLDDT + res_pLDDT_ave = res_pLDDT_tot/res_atom_count + + if res_pLDDT_ave < 1.0: + res_pLDDT_ave *= 100 + plddt_values.append(res_pLDDT_ave) + + # Calculate the average PLDDT value for the current file + if plddt_values: + avg_plddt = sum(plddt_values) / len(plddt_values) + averages.append(round(avg_plddt, 3)) + else: + averages.append(0.0) + + if generate_tsv == "y": + output_file = f"{struct_file.replace(suffix, '')}_plddt.tsv" + with open(output_file, "w") as outfile: + outfile.write(" ".join(map(str, plddt_values)) + "\n") + output_lddt.append(output_file) + else: + plddt_values_string = " ".join(map(str, plddt_values)) + output_lddt.append(plddt_values_string) + + return output_lddt, averages + + +print("Starting...") + +version = "1.0.0" +model_name = { + "esmfold": "ESMFold", + "alphafold2": "AlphaFold2", + "alphafold3": "Alphafold3", + "colabfold": "ColabFold", + "rosettafold_all_atom": "RosettaFold All-Atom", + "helixfold3": "HelixFold3", + "rosettafold2na": "RoseTTAFold2NA", + "boltz": "Boltz" +} + +parser = argparse.ArgumentParser() +parser.add_argument("--type", dest="in_type") +parser.add_argument( + "--generate_tsv", choices=["y", "n"], default="n", dest="generate_tsv" +) +parser.add_argument("--msa", dest="msa", default="NO_FILE") +parser.add_argument("--pdb", dest="pdb", required=True, nargs="+") +parser.add_argument("--pae", dest="pae", default="NO_FILE") +parser.add_argument("--name", dest="name") +parser.add_argument("--output_dir", dest="output_dir") +parser.add_argument("--html_template", dest="html_template") +parser.add_argument("--version", action="version", version=f"{version}") +parser.set_defaults(output_dir="") +parser.set_defaults(in_type="esmfold") +parser.set_defaults(name="") +args = parser.parse_args() + +lddt_data, lddt_averages = pdb_to_lddt(args.pdb, args.generate_tsv) + +generate_output_images( + args.msa, lddt_data, args.name, args.output_dir, args.in_type, 
args.generate_tsv, args.pdb +) + +print("generating html report...") +structures = args.pdb +structures.sort() +aligned_structures = align_structures(structures) + +io = PDB.PDBIO() +ref_structure_path = "aligned_structure_0.pdb" +io.set_structure(aligned_structures[0]) +io.save(ref_structure_path) +aligned_structures[0] = ref_structure_path + +proteinfold_template = open(args.html_template, "r").read() +proteinfold_template = proteinfold_template.replace("*sample_name*", args.name) +proteinfold_template = proteinfold_template.replace( + "*prog_name*", model_name[args.in_type.lower()] +) + +args_pdb_array_js = ",\n".join([f'"{model}"' for model in structures]) +proteinfold_template = re.sub( + r"const MODELS = \[.*?\];", # Match the existing MODELS array in HTML template + f"const MODELS = [\n {args_pdb_array_js}\n];", # Replace with the new array + proteinfold_template, + flags=re.DOTALL, +) + +averages_js_array = f"const LDDT_AVERAGES = {lddt_averages};" +proteinfold_template = proteinfold_template.replace( + "const LDDT_AVERAGES = [];", averages_js_array +) + +i = 0 +for structure in aligned_structures: + proteinfold_template = proteinfold_template.replace( + f"*_data_ranked_{i}.pdb*", open(structure, "r").read().replace("\n", "\\n") + ) + i += 1 + +if not args.msa.endswith("NO_FILE"): + image_path = f"{args.output_dir}/{args.name}_{args.in_type}_seq_coverage.png" + with open(image_path, "rb") as in_file: + proteinfold_template = proteinfold_template.replace( + "seq_coverage.png", + f"data:image/png;base64,{base64.b64encode(in_file.read()).decode('utf-8')}", + ) +else: + pattern = r'
    .*?(.*?)*?
    \s*\s*\s*' + proteinfold_template = re.sub(pattern, "", proteinfold_template, flags=re.DOTALL) + +with open( + f"{args.output_dir}/{args.name + ('_' if args.name else '')}coverage_LDDT.html", + "r", +) as in_file: + lddt_html = in_file.read() + proteinfold_template = proteinfold_template.replace( + '
    ', lddt_html + ) + +if not args.pae.endswith("NO_FILE_PAE"): + with open( + f"{args.output_dir}/{args.name + ('_' if args.name else '')}PAE.html", + "r", + ) as pae_in_file: + pae_html = pae_in_file.read() + proteinfold_template = proteinfold_template.replace( + '
    ', pae_html + ) +else: + pattern = r'
    .*?(.*?)*?
    \s*' + proteinfold_template = re.sub(pattern, "", proteinfold_template, flags=re.DOTALL) + +with open( + f"{args.output_dir}/{args.name}_{args.in_type.lower()}_report.html", "w" +) as out_file: + out_file.write(proteinfold_template) diff --git a/bin/mmcif_to_pdb.py b/bin/mmcif_to_pdb.py new file mode 100755 index 000000000..dafd63260 --- /dev/null +++ b/bin/mmcif_to_pdb.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python + +############################################################################### +############################################################################### +## Created on December 16th 2024 convert cif files to pdb +############################################################################### +############################################################################### + +import argparse +import sys +from Bio import PDB + +def parse_args(args=None): + Description = "Convert mmcif files to pdb format." + Epilog = """Example usage: python mmcif_to_pdb.py """ + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "MMCIF_IN", + help="Input mmcif file." + ) + parser.add_argument( + "-po", + "--pdb_out", + type=str, + dest="PDB_OUT", + default="", + help="Output pdb file." + ) + return parser.parse_args(args) + + +def mmcif_to_pdb(mmcif_file, pdb_file): + """ + Convert an mmCIF file to PDB format. 
+ """ + # Parse the mmCIF file + parser = PDB.MMCIFParser(QUIET=True) + structure = parser.get_structure("structure", mmcif_file) + + # Write to PDB format + io = PDB.PDBIO() + io.set_structure(structure) + io.save(pdb_file) + + return pdb_file + + +############################################ +############################################ +## MAIN FUNCTION +############################################ +############################################ + +def main(args=None): + args = parse_args(args) + + # Name output PDB file name + pdb_file = args.PDB_OUT + if not pdb_file: + pdb_file = args.MMCIF_IN.rsplit(".", 1)[0] + ".pdb" + + pdb_file = mmcif_to_pdb(args.MMCIF_IN, pdb_file) + print(f"Converted {args.MMCIF_IN} to {pdb_file}") + + +if __name__ == "__main__": + main() diff --git a/bin/msa_manager.py b/bin/msa_manager.py new file mode 100755 index 000000000..242f0a9cb --- /dev/null +++ b/bin/msa_manager.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +import os +import string +import argparse + + +MAX_MSA_SEQS = 16384 +MAX_PAIRED_SEQS = 8192 +ID_CHARS = list(string.ascii_uppercase) + list(string.ascii_lowercase) + [str(x) for x in range(10)] + + +def get_sub_sequences(seq_lengths, whole_seq): + out_seqs = [] + curr_seq = "" + curr_seq_itr = 0 + total_letters = 0 + for letter in whole_seq: + curr_seq += letter + if letter.isupper() or letter == "-": + total_letters += 1 + if total_letters == seq_lengths[curr_seq_itr]: + out_seqs.append(curr_seq) + curr_seq = "" + curr_seq_itr += 1 + total_letters = 0 + + if len(out_seqs) != len(seq_lengths): + print("Something wrong in the input file, could not generate the required number of sequences") + exit(1) + + return out_seqs + + +def parse_msa(msa_path, output_dir, meta_id): + os.makedirs(output_dir, exist_ok=True) + homolog = "" + section_index = 0 + + with open(msa_path, "r") as file: + first_line = file.readline() + if not first_line.startswith("#"): + homologs_lengths = [len(file.readline().strip('\n'))] + sequence_groups = 
[[[],[]]] + is_multimer = False + else: + homologs_lengths = [int(x.strip()) for x in first_line.replace('#',"").split()[0].split(",")] + sequence_groups = [[[], []] for _ in range(len(homologs_lengths))] + is_multimer = True + + with open(msa_path, "r") as file: + if is_multimer: + file.readline() + header_line = file.readline().strip()[1:] + expected_section_headers = [x.strip() for x in header_line.split()] + current_header = header_line + first_seq = False + for line in file: + line = line.strip() + if line.startswith(">"): + if homolog: + if first_seq and section_index > 0: + first_seq = False + else: + sub_sequences = get_sub_sequences(homologs_lengths, homolog) + for seq_index in range(len(homologs_lengths)): + if section_index == 0: + if len(sequence_groups[seq_index][0]) < MAX_PAIRED_SEQS: + sequence_groups[seq_index][0].append(sub_sequences[seq_index]) + else: + if seq_index == section_index - 1: + if len(sequence_groups[seq_index][1]) + len(sequence_groups[seq_index][0]) < MAX_MSA_SEQS: + sequence_groups[seq_index][1].append(sub_sequences[seq_index]) + + homolog = "" + current_header = line[1:].strip() + + if section_index < len(homologs_lengths) and current_header == expected_section_headers[section_index]: + section_index += 1 + first_seq = True + else: + homolog += line + if homolog: + if first_seq and section_index > 0: + first_seq = False + else: + sub_sequences = get_sub_sequences(homologs_lengths, homolog) + for seq_index in range(len(homologs_lengths)): + if section_index == 0: + if len(sequence_groups[seq_index][0]) < MAX_PAIRED_SEQS: + sequence_groups[seq_index][0].append(sub_sequences[seq_index]) + else: + if seq_index == section_index - 1: + if len(sequence_groups[seq_index][1]) + len(sequence_groups[seq_index][0]) < MAX_MSA_SEQS: + sequence_groups[seq_index][1].append(sub_sequences[seq_index]) + + for seq_index in range(len(homologs_lengths)): + filename = os.path.join(output_dir, f"{meta_id}_{seq_index}.csv") + with open(filename, "w") as 
out_file: + out_file.write("key,sequence\n") + if len(homologs_lengths)==1: #Homo-oligomer: all sequences are paired + paired_sequences = sequence_groups[seq_index][0]+sequence_groups[seq_index][1] + for i, seq in enumerate(paired_sequences): + out_file.write(f"{i},{seq}\n") + else: + paired_sequences = sequence_groups[seq_index][0] + for i, seq in enumerate(paired_sequences, start=1): + out_file.write(f"{i},{seq}\n") + + unpaired_sequences = sequence_groups[seq_index][1] + for seq in unpaired_sequences: + out_file.write(f"-1,{seq}\n") + + +def main(): + parser = argparse.ArgumentParser(description="Split multi-A3M file into CSV sequences per section.") + parser.add_argument("msa_path", help="Path to input .a3m file") + parser.add_argument("-o", "--output_dir", default="output_msa", help="Directory to write output CSVs") + parser.add_argument("--meta_id", default="default", help="Prefix for MSA files") + + args = parser.parse_args() + parse_msa(args.msa_path, args.output_dir, args.meta_id) + + +if __name__ == "__main__": + main() diff --git a/bin/utils.py b/bin/utils.py new file mode 100755 index 000000000..60f2f1cd8 --- /dev/null +++ b/bin/utils.py @@ -0,0 +1,112 @@ +import importlib.util +import numpy as np +bio_is_installed = importlib.util.find_spec("Bio") is not None + +def _convert_plddt_to_100(res_plddt): + if (res_plddt < 1): # Converting to a [0,100] range + res_plddt *= 100 + return res_plddt + + +def plddt_from_struct_b_factor_adhoc(struct_file): + """ + Uses ad hoc PDB parser to extract residue pLDDT values from the b-factor column. Iterates over PDB objects rather than processes raw file + """ + #NOTE: this is a temporary hack which is not robust as a general parser. 
+ #Should be temporary - need to check with non-protein entities + if not str(struct_file).endswith(".pdb"): + raise ValueError(f"{struct_file} must be a PDB file!") + + res_plddts = [] + resid_prev = -1 + atom_plddt_list = [] + + with open(struct_file) as f: + for line in f: + if not line.startswith('ATOM'): continue + resid = int(line[23:26].strip()) + if resid == resid_prev: + atom_plddt_list.append(float(line[61:66].strip())) + else: + if resid_prev == -1: + resid_prev = resid + continue + res_plddt = sum(atom_plddt_list)/len(atom_plddt_list) + res_plddt = _convert_plddt_to_100(res_plddt) + res_plddts.append(res_plddt) + resid_prev = resid + + # Reset atom tracking + atom_plddt_list = [] + atom_plddt_list.append(float(line[61:66].strip())) + + res_plddt = sum(atom_plddt_list)/len(atom_plddt_list) + res_plddt = _convert_plddt_to_100(res_plddt) + res_plddts.append(res_plddt) + + res_plddts = np.array(res_plddts) + res_plddts = np.round(res_plddts, 2) + + return res_plddts + +def get_chain_ids(struct_file): + from Bio import PDB + if str(struct_file).endswith(".pdb"): + parser = PDB.PDBParser(QUIET=True) + structure = parser.get_structure(id=id, file=struct_file) + elif str(struct_file).endswith(".cif"): + parser = PDB.MMCIFParser(QUIET=True) + structure = parser.get_structure(structure_id=id, filename=struct_file) + else: + raise ValueError(f"{struct_file} is neither a PDB or mmCIF file!") + + return [chain.id for chain in structure.get_chains()] + + +def plddt_from_struct_b_factor_biopython(struct_file): + """ + Uses the BioPython PDB package to extract residue pLDDT values from the b-factor column. 
Iterates over PDB objects rather than processes raw file + """ + from Bio import PDB + if str(struct_file).endswith(".pdb"): + parser = PDB.PDBParser(QUIET=True) + structure = parser.get_structure(id=id, file=struct_file) + elif str(struct_file).endswith(".cif"): + parser = PDB.MMCIFParser(QUIET=True) + structure = parser.get_structure(structure_id=id, filename=struct_file) + else: + raise ValueError(f"{struct_file} is neither a PDB or mmCIF file!") + +# res_list = [] + res_plddts = [] +# plddt_tot = 0 + + for model in structure: + for chain in model: + chain_res_list = chain.get_unpacked_list() +# res_list.extend(chain_res_list) + for residue in chain: + atom_list = residue.get_unpacked_list() + atom_plddt_tot = 0 + for atom in residue: # ESMFold and others have separate atom-wise values, so doing atom-wise to cover that and residue-wise + atom_plddt = atom.get_bfactor() + atom_plddt_tot += atom_plddt + + res_plddt = float(atom_plddt_tot / len(atom_list)) + + if (res_plddt < 1): # RFAA the multiplication of mean isn't failing. 
Anyway covering to a [0,100] range for any structure file1 + res_plddt *= 100 + res_plddt = _convert_plddt_to_100(res_plddt) + + res_plddts.append(res_plddt) +# plddt_tot += res_plddt + + res_plddts = np.array(res_plddts) + res_plddts = np.round(res_plddts, 2) + + return res_plddts + +if bio_is_installed: + plddt_from_struct_b_factor = plddt_from_struct_b_factor_biopython +else: + plddt_from_struct_b_factor = plddt_from_struct_b_factor_adhoc diff --git a/conf/base.config b/conf/base.config index 69ad41e9f..b867a7f88 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,47 +10,45 @@ process { - // TODO nf-core: Check the defaults for all processes - cpus = { check_max( 1 * task.attempt, 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { 1 * task.attempt } + memory = { 6.GB * task.attempt } + time = { 4.h * task.attempt } - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in ((130..145) + 104 + 175) ? 'retry' : 'finish' } maxRetries = 1 maxErrors = '-1' // Process-specific resource requirements - // NOTE - Please try and re-use the labels below as much as possible. + // NOTE - Please try and reuse the labels below as much as possible. // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. 
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { - cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { 1 } + memory = { 6.GB * task.attempt } + time = { 4.h * task.attempt } } withLabel:process_low { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 12.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { 2 * task.attempt } + memory = { 12.GB * task.attempt } + time = { 4.h * task.attempt } } withLabel:process_medium { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 36.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } + cpus = { 6 * task.attempt } + memory = { 36.GB * task.attempt } + time = { 8.h * task.attempt } } withLabel:process_high { - cpus = { check_max( 12 * task.attempt, 'cpus' ) } - memory = { check_max( 72.GB * task.attempt, 'memory' ) } - time = { check_max( 16.h * task.attempt, 'time' ) } + cpus = { 12 * task.attempt } + memory = { 72.GB * task.attempt } + time = { 16.h * task.attempt } } withLabel:process_long { - time = { check_max( 20.h * task.attempt, 'time' ) } + time = { 20.h * task.attempt } } withLabel:process_high_memory { - memory = { check_max( 200.GB * task.attempt, 'memory' ) } + memory = { 200.GB * task.attempt } } withLabel:error_ignore { errorStrategy = 'ignore' @@ -59,4 +57,8 @@ process { errorStrategy = 'retry' maxRetries = 2 } + withLabel:process_gpu { + ext.use_gpu = { workflow.profile.contains('gpu') } + accelerator = { workflow.profile.contains('gpu') ? 
1 : null } + } } diff --git a/conf/dbs.config b/conf/dbs.config index 9fd0ec9a1..807251534 100644 --- a/conf/dbs.config +++ b/conf/dbs.config @@ -1,46 +1,90 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for databases links + Nextflow config file for database links and paths ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines pointers to the DBS publicly available that store models parametrisations - and data. + Centralizes URLs and local paths for all external resources used in the workflow. + Update version numbers and base directories in one place for maintainability. ---------------------------------------------------------------------------------------- */ + params { + // Dated prefixes, can be modified for alternate versions + alphafold2_params_prefix = "alphafold_params_2022-12-06" + uniref30_prefix = "UniRef30_2023_02" + // AlphaFold2 links - bfd_link = 'https://storage.googleapis.com/alphafold-databases/casp14_versions/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz' - small_bfd_link = 'https://storage.googleapis.com/alphafold-databases/reduced_dbs/bfd-first_non_consensus_sequences.fasta.gz' - alphafold2_params_link = 'https://storage.googleapis.com/alphafold/alphafold_params_2022-12-06.tar' - mgnify_link = 'https://storage.googleapis.com/alphafold-databases/v2.3/mgy_clusters_2022_05.fa.gz' - pdb70_link = 'http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/old-releases/pdb70_from_mmcif_200916.tar.gz' - pdb_mmcif_link = 'rsync.rcsb.org::ftp_data/structures/divided/mmCIF/' //Other sources available: 'rsync.rcsb.org::ftp_data/structures/divided/mmCIF/' ftp.pdbj.org::ftp_data/structures/divided/mmCIF/ rsync.ebi.ac.uk::pub/databases/pdb/data/structures/divided/mmCIF/ - pdb_obsolete_link = 'https://files.wwpdb.org/pub/pdb/data/status/obsolete.dat' - uniref30_alphafold2_link = 
'https://storage.googleapis.com/alphafold-databases/v2.3/UniRef30_2021_03.tar.gz' - uniref90_link = 'https://ftp.ebi.ac.uk/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz' - pdb_seqres_link = 'https://files.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt' - uniprot_sprot_link = 'https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz' - uniprot_trembl_link = 'https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz' - - // Alphafold paths - bfd_path = "${params.alphafold2_db}/bfd/*" - small_bfd_path = "${params.alphafold2_db}/small_bfd/*" - alphafold2_params_path = "${params.alphafold2_db}/alphafold_params_*/*" - mgnify_path = "${params.alphafold2_db}/mgnify/*" - pdb70_path = "${params.alphafold2_db}/pdb70/**" - pdb_mmcif_path = "${params.alphafold2_db}/pdb_mmcif/*" - uniref30_alphafold2_path = "${params.alphafold2_db}/uniref30/*" - uniref90_path = "${params.alphafold2_db}/uniref90/*" - pdb_seqres_path = "${params.alphafold2_db}/pdb_seqres/*" - uniprot_path = "${params.alphafold2_db}/uniprot/*" + alphafold2_bfd_link = 'https://storage.googleapis.com/alphafold-databases/casp14_versions/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz' + alphafold2_small_bfd_link = 'https://storage.googleapis.com/alphafold-databases/reduced_dbs/bfd-first_non_consensus_sequences.fasta.gz' + alphafold2_params_link = 'https://storage.googleapis.com/alphafold/alphafold_params_2022-12-06.tar' + alphafold2_mgnify_link = 'https://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2024_04/mgy_clusters.fa.gz' + alphafold2_pdb70_link = 'https://wwwuser.gwdguser.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/pdb70_from_mmcif_220313.tar.gz' + alphafold2_pdb_mmcif_link = 'rsync.rcsb.org::ftp_data/structures/divided/mmCIF/' //Other sources available: 'rsync.rcsb.org::ftp_data/structures/divided/mmCIF/' ftp.pdbj.org::ftp_data/structures/divided/mmCIF/ 
rsync.ebi.ac.uk::pub/databases/pdb/data/structures/divided/mmCIF/ + alphafold2_pdb_obsolete_link = 'https://files.wwpdb.org/pub/pdb/data/status/obsolete.dat' + alphafold2_uniref30_link = 'https://wwwuser.gwdguser.de/~compbiol/uniclust/2023_02/UniRef30_2023_02_hhsuite.tar.gz' + alphafold2_uniref90_link = 'https://ftp.ebi.ac.uk/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz' + alphafold2_pdb_seqres_link = 'https://files.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt' + alphafold2_uniprot_sprot_link = 'https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz' + alphafold2_uniprot_trembl_link = 'https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz' + + // Alphafold2 paths + alphafold2_bfd_path = "${params.alphafold2_db}/bfd/*" + alphafold2_small_bfd_path = "${params.alphafold2_db}/small_bfd/*" + alphafold2_params_path = "${params.alphafold2_db}/params/${params.alphafold2_params_prefix}/*" + alphafold2_mgnify_path = "${params.alphafold2_db}/mgnify/*" + alphafold2_pdb70_path = "${params.alphafold2_db}/pdb70/**" + alphafold2_pdb_mmcif_path = "${params.alphafold2_db}/pdb_mmcif/mmcif_files" + alphafold2_pdb_obsolete_path = "${params.alphafold2_db}/pdb_mmcif/obsolete.dat" + alphafold2_uniref30_path = "${params.alphafold2_db}/uniref30/*" + alphafold2_uniref90_path = "${params.alphafold2_db}/uniref90/*" + alphafold2_pdb_seqres_path = "${params.alphafold2_db}/pdb_seqres/*" + alphafold2_uniprot_path = "${params.alphafold2_db}/uniprot/*" + + // Alphafold3 links + alphafold3_small_bfd_link = 'https://storage.googleapis.com/alphafold-databases/v3.0/bfd-first_non_consensus_sequences.fasta.zst' + alphafold3_mgnify_link = 'https://storage.googleapis.com/alphafold-databases/v3.0/mgy_clusters_2022_05.fa.zst' + alphafold3_pdb_mmcif_link = 'https://storage.googleapis.com/alphafold-databases/v3.0/pdb_2022_09_28_mmcif_files.tar.zst' + alphafold3_uniref90_link = 
'https://storage.googleapis.com/alphafold-databases/v3.0/uniref90_2022_05.fa.zst' + alphafold3_pdb_seqres_link = 'https://storage.googleapis.com/alphafold-databases/v3.0/pdb_seqres_2022_09_28.fasta.zst' + alphafold3_uniprot_link = 'https://storage.googleapis.com/alphafold-databases/v3.0/uniprot_all_2021_04.fa.zst' + alphafold3_rnacentral_link = 'https://storage.googleapis.com/alphafold-databases/v3.0/rnacentral_active_seq_id_90_cov_80_linclust.fasta.zst' + alphafold3_nt_rna_link = 'https://storage.googleapis.com/alphafold-databases/v3.0/nt_rna_2023_02_23_clust_seq_id_90_cov_80_rep_seq.fasta.zst' + alphafold3_rfam_link = 'https://storage.googleapis.com/alphafold-databases/v3.0/rfam_14_9_clust_seq_id_90_cov_80_rep_seq.fasta.zst' + + // Alphafold3 paths + alphafold3_small_bfd_path = "${params.alphafold3_db}/small_bfd/*" + alphafold3_params_path = params.alphafold3_params_path ?: (params.alphafold3_db ? "${params.alphafold3_db}/params/*" : null) + alphafold3_mgnify_path = "${params.alphafold3_db}/mgnify/*" + alphafold3_pdb_mmcif_path = "${params.alphafold3_db}/pdb_mmcif/mmcif_files" + alphafold3_uniref90_path = "${params.alphafold3_db}/uniref90/*" + alphafold3_pdb_seqres_path = "${params.alphafold3_db}/pdb_seqres/*" + alphafold3_uniprot_path = "${params.alphafold3_db}/uniprot/*" + alphafold3_rnacentral_path = "${params.alphafold3_db}/rnacentral/*" + alphafold3_nt_rna_path = "${params.alphafold3_db}/nt_rna/*" + alphafold3_rfam_path = "${params.alphafold3_db}/rfam/*" + + // Boltz links + boltz_ccd_link = 'https://huggingface.co/boltz-community/boltz-1/resolve/main/ccd.pkl' + boltz_model_link = 'https://huggingface.co/boltz-community/boltz-1/resolve/main/boltz1_conf.ckpt' + boltz2_aff_link = 'https://huggingface.co/boltz-community/boltz-2/resolve/main/boltz2_aff.ckpt' + boltz2_conf_link = 'https://huggingface.co/boltz-community/boltz-2/resolve/main/boltz2_conf.ckpt' + boltz2_mols_link = 'https://huggingface.co/boltz-community/boltz-2/resolve/main/mols.tar' + + // Boltz 
paths + boltz_ccd_path = "${params.boltz_db}/params/ccd.pkl" + boltz_model_path = "${params.boltz_db}/params/boltz1_conf.ckpt" + boltz2_aff_path = "${params.boltz_db}/params/boltz2_aff.ckpt" + boltz2_conf_path = "${params.boltz_db}/params/boltz2_conf.ckpt" + boltz2_mols_path = "${params.boltz_db}/params/mols/" // Colabfold links - colabfold_db_link = 'http://wwwuser.gwdg.de/~compbiol/colabfold/colabfold_envdb_202108.tar.gz' - uniref30_colabfold_link = 'https://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2302.tar.gz' + colabfold_db_link = 'https://opendata.mmseqs.org/colabfold/colabfold_envdb_202108.db.tar.gz' + colabfold_uniref30_link = 'https://opendata.mmseqs.org/colabfold/uniref30_2302.db.tar.gz' // Colabfold paths - colabfold_db_path = "${params.colabfold_db}/colabfold_envdb_202108" - uniref30_colabfold_path = "${params.colabfold_db}/uniref30_2302" + colabfold_envdb_path = "${params.colabfold_db}/colabfold_envdb/*" + colabfold_uniref30_path = "${params.colabfold_db}/colabfold_uniref30/*" + // Are all these params options needed? 
colabfold_alphafold2_params_tags = [ "alphafold2_multimer_v1" : "alphafold_params_colab_2021-10-27", "alphafold2_multimer_v2" : "alphafold_params_colab_2022-03-02", @@ -48,11 +92,75 @@ params { "alphafold2_ptm" : "alphafold_params_2021-07-14" ] + // RoseTTAFold_All_Atom links + rosettafold_all_atom_uniref30_link = 'https://wwwuser.gwdguser.de/~compbiol/uniclust/2023_02/UniRef30_2023_02_hhsuite.tar.gz' + rosettafold_all_atom_pdb100_link = 'https://files.ipd.uw.edu/pub/RoseTTAFold/pdb100_2021Mar03.tar.gz' + rosettafold_all_atom_bfd_link = 'https://bfd.mmseqs.com/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz' + rosettafold_all_atom_paper_weights_link = 'http://files.ipd.uw.edu/pub/RF-All-Atom/weights/RFAA_paper_weights.pt' + + // RoseTTAFold_All_Atom paths + rosettafold_all_atom_uniref30_path = "${params.rosettafold_all_atom_db}/uniref30/*" + rosettafold_all_atom_pdb100_path = "${params.rosettafold_all_atom_db}/pdb100/*" + rosettafold_all_atom_bfd_path = "${params.rosettafold_all_atom_db}/bfd/*" + rosettafold_all_atom_paper_weights_path = "${params.rosettafold_all_atom_db}/params/RFAA_paper_weights.pt" + + // Helixfold3 links + helixfold3_uniclust30_link = 'https://wwwuser.gwdguser.de/~compbiol/uniclust/2023_02/UniRef30_2023_02_hhsuite.tar.gz' + helixfold3_ccd_preprocessed_link = 'https://paddlehelix.bd.bcebos.com/HelixFold3/CCD/ccd_preprocessed_etkdg.pkl.gz' + helixfold3_rfam_link = 'https://paddlehelix.bd.bcebos.com/HelixFold3/MSA/Rfam-14.9_rep_seq.fasta' + helixfold3_init_models_link = 'https://paddlehelix.bd.bcebos.com/HelixFold3/params/HelixFold3-params-240814.zip' + helixfold3_bfd_link = 'https://storage.googleapis.com/alphafold-databases/casp14_versions/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz' + helixfold3_small_bfd_link = 'https://storage.googleapis.com/alphafold-databases/reduced_dbs/bfd-first_non_consensus_sequences.fasta.gz' + helixfold3_uniprot_sprot_link = 
'ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz' + helixfold3_uniprot_trembl_link = 'ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz' + helixfold3_pdb_seqres_link = 'https://files.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt' + helixfold3_uniref90_link = 'ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz' + helixfold3_mgnify_link = 'https://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2024_04/mgy_clusters.fa.gz' + helixfold3_pdb_mmcif_link = 'rsync.rcsb.org::ftp_data/structures/divided/mmCIF/' + helixfold3_obsolete_link = 'https://files.rcsb.org/pub/pdb/data/status/obsolete.dat' + helixfold3_maxit_src_link = 'https://proteinfold-dataset.s3.amazonaws.com/test-data/db/helixfold3/maxit-v11.200-prod-src.tar.gz' + + // Helixfold3 paths + helixfold3_uniclust30_path = "${params.helixfold3_db}/uniref30/*" + helixfold3_ccd_preprocessed_path = "${params.helixfold3_db}/params/ccd_preprocessed_etkdg.pkl.gz" + helixfold3_rfam_path = "${params.helixfold3_db}/rfam/Rfam-14.9_rep_seq.fasta" + helixfold3_init_models_path = "${params.helixfold3_db}/params/HelixFold3-240814.pdparams" + helixfold3_bfd_path = "${params.helixfold3_db}/bfd/*" + helixfold3_small_bfd_path = "${params.helixfold3_db}/small_bfd/*" + helixfold3_uniprot_path = "${params.helixfold3_db}/uniprot/*" + helixfold3_pdb_seqres_path = "${params.helixfold3_db}/pdb_seqres/*" + helixfold3_uniref90_path = "${params.helixfold3_db}/uniref90/*" + helixfold3_mgnify_path = "${params.helixfold3_db}/mgnify/*" + helixfold3_pdb_mmcif_path = "${params.helixfold3_db}/pdb_mmcif/mmcif_files" + helixfold3_obsolete_path = "${params.helixfold3_db}/pdb_mmcif/obsolete.dat" + helixfold3_maxit_src_path = "${params.helixfold3_db}/maxit-v11.200-prod-src" + + // RosettaFold2NA database download links + rosettafold2na_uniref30_link = 
"http://wwwuser.gwdg.de/~compbiol/uniclust/2020_06/UniRef30_2020_06_hhsuite.tar.gz" + rosettafold2na_bfd_link = "https://bfd.mmseqs.com/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz" + rosettafold2na_pdb100_link = "https://files.ipd.uw.edu/pub/RoseTTAFold/pdb100_2021Mar03.tar.gz" + rosettafold2na_weights_link = "https://files.ipd.uw.edu/dimaio/RF2NA_apr23.tgz" + + // RNA database links + rfam_full_region_link = "ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.full_region.gz" + rfam_cm_link = "ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz" + rnacentral_rfam_annotations_link = "ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/rfam/rfam_annotations.tsv.gz" + rnacentral_id_mapping_link = "ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/id_mapping/id_mapping.tsv.gz" + rnacentral_sequences_link = "ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/sequences/rnacentral_species_specific_ids.fasta.gz" + + // RosettaFold2NA database paths + rosettafold2na_uniref30_path = "${params.rosettafold2na_db}/UniRef30_2020_06/*" + rosettafold2na_bfd_path = "${params.rosettafold2na_db}/bfd/*" + rosettafold2na_pdb100_path = "${params.rosettafold2na_db}/pdb100/*" + rosettafold2na_weights_path = "${params.rosettafold2na_db}/params/network/weights/RF2NA_apr23.pt" + rosettafold2na_rna_path = "${params.rosettafold2na_db}/RNA/*" + // Esmfold links esmfold_3B_v1 = 'https://dl.fbaipublicfiles.com/fair-esm/models/esmfold_3B_v1.pt' esm2_t36_3B_UR50D = 'https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t36_3B_UR50D.pt' esm2_t36_3B_UR50D_contact_regression = 'https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t36_3B_UR50D-contact-regression.pt' // Esmfold paths - esmfold_params_path = "${params.esmfold_db}/*" + esmfold_params_path = "${params.esmfold_db}/params/*" + } diff --git a/conf/modules.config b/conf/modules.config index c12b372db..02f1c0048 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -30,13 +30,14 
@@ process { withName: 'UNTAR' { ext.args2 = '--no-same-owner' publishDir = [ - path: {"${params.outdir}/DBs/${params.mode}/${params.alphafold2_mode}"}, + path: {"${params.outdir}/DBs/${params.mode}"}, mode: 'symlink', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] } withName: 'ARIA2' { + time = { 20.h * task.attempt } publishDir = [ enabled: false ] @@ -51,4 +52,20 @@ process { ] } + withName: 'GENERATE_REPORT' { + publishDir = [ + path: { "${params.outdir}/reports" }, + mode: 'copy', + pattern: '*report.html' + ] + } + + withName: 'FOLDSEEK_EASYSEARCH' { + ext.args = { params.foldseek_easysearch_arg ? "$params.foldseek_easysearch_arg" : "--format-mode 3" } + publishDir = [ + path: { "${params.outdir}/foldseek_easysearch" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } } diff --git a/conf/modules_alphafold2.config b/conf/modules_alphafold2.config index 4aae2d301..a9eae8416 100644 --- a/conf/modules_alphafold2.config +++ b/conf/modules_alphafold2.config @@ -17,50 +17,170 @@ process { withName: 'GUNZIP|COMBINE_UNIPROT|DOWNLOAD_PDBMMCIF|ARIA2_PDB_SEQRES' { publishDir = [ - path: {"${params.outdir}/DBs/${params.mode}/${params.alphafold2_mode}"}, + path: {"${params.outdir}/DBs/alphafold2"}, mode: 'symlink', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] } -} + withName: 'NFCORE_PROTEINFOLD:POST_PROCESSING:MULTIQC' { + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : "alphafold2_$filename" } + ] + } + + // Configure UNTAR/GUNZIP processes to use correct directory names + withName: '.*ARIA2_ALPHAFOLD2_PARAMS:UNTAR' { + ext.prefix = { "${params.alphafold2_params_prefix}" } + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/params" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + withName: '.*PREPARE_ALPHAFOLD2_DBS:ARIA2_SMALL_BFD:GUNZIP' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/small_bfd" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*PREPARE_ALPHAFOLD2_DBS:ARIA2_BFD:UNTAR' { + ext.prefix = 'bfd' + } + withName: '.*PREPARE_ALPHAFOLD2_DBS:ARIA2_MGNIFY:GUNZIP' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/mgnify" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*ARIA2_PDB70:UNTAR' { + ext.prefix = 'pdb70' + } + withName: '.*PREPARE_ALPHAFOLD2_DBS:DOWNLOAD_PDBMMCIF' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/pdb_mmcif" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*PREPARE_ALPHAFOLD2_DBS:ARIA2_OBSOLETE:ARIA2' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/pdb_mmcif"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*.*PREPARE_ALPHAFOLD2_DBS:ARIA2_UNIREF30:UNTAR' { + ext.prefix = 'uniref30' + } + withName: '.*PREPARE_ALPHAFOLD2_DBS:ARIA2_UNIREF90:GUNZIP' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/uniref90" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*PREPARE_ALPHAFOLD2_DBS:ARIA2_PDB_SEQRES' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/pdb_seqres" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*PREPARE_ALPHAFOLD2_DBS:COMBINE_UNIPROT' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/uniprot" }, + pattern: 'uniprot.fasta', + ] + } -if (params.alphafold2_mode == 'standard') { - process { - withName: 'RUN_ALPHAFOLD2' { - if(params.use_gpu) { accelerator = 1 } - ext.args = [ - params.use_gpu ? 
'--use_gpu_relax=true' : '--use_gpu_relax=false', - params.max_template_date ? "--max_template_date ${params.max_template_date}" : '' - ].join(' ').trim() - publishDir = [ - path: { "${params.outdir}/${params.mode}/${params.alphafold2_mode}" }, + withName: 'RUN_ALPHAFOLD2' { + if (params.use_gpu) { + accelerator = 1 + } + ext.args = [ + params.use_gpu ? '--use_gpu_relax=true' : '--use_gpu_relax=false', + params.alphafold2_max_template_date ? "--max_template_date ${params.alphafold2_max_template_date}" : '', + params.alphafold2_random_seed ? "--random_seed=${params.alphafold2_random_seed}" : '' + ].join(' ').trim() + publishDir = [ + [ + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/${meta.id}" }, mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + saveAs: { filename -> + if(filename.endsWith('_pae.tsv')){ + "paes/$filename" + } else { filename } + }, + pattern: '*.tsv' + ], + [ + enabled: params.save_intermediates, + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/${meta.id}/" }, + mode: 'copy', + pattern: 'raw/**' + ], + [ + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_alphafold2.pdb' ] - } + ] } -} - -if (params.alphafold2_mode == 'split_msa_prediction') { - process { - withName: 'RUN_ALPHAFOLD2_MSA' { - ext.args = params.max_template_date ? "--max_template_date ${params.max_template_date}" : '' - publishDir = [ - path: { "${params.outdir}/${params.mode}/${params.alphafold2_mode}" }, + withName: 'RUN_ALPHAFOLD2_MSA' { + ext.args = params.alphafold2_max_template_date ? "--max_template_date ${params.alphafold2_max_template_date}" : '' + publishDir = [ + [ + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/${meta.id}/msa/" }, mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + pattern: 'raw/*.*', + saveAs: { filename -> + if(filename.equals('versions.yml')) { + null + } else { + filename.toString().replaceFirst(/^raw\//, '') + } + } + ], + [ + enabled: params.save_intermediates, + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/${meta.id}/raw/" }, + mode: 'copy', + pattern: 'raw/msas/**', + saveAs: { filename -> filename.toString().replaceFirst(/^raw\//, '') } ] - } + ] + } - withName: 'RUN_ALPHAFOLD2_PRED' { - if(params.use_gpu) { accelerator = 1 } - ext.args = params.use_gpu ? '--use_gpu_relax=true' : '--use_gpu_relax=false' - publishDir = [ - path: { "${params.outdir}/${params.mode}/${params.alphafold2_mode}" }, + withName: 'RUN_ALPHAFOLD2_PRED' { + if (params.use_gpu) { + accelerator = 1 + } + ext.args = [ + params.use_gpu ? '--use_gpu_relax=true' : '--use_gpu_relax=false', + params.alphafold2_random_seed ? "--random_seed=${params.alphafold2_random_seed}" : '' + ].join(' ').trim() + publishDir = [ + [ + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/${meta.id}" }, + mode: 'copy', + saveAs: { filename -> + if(filename.endsWith('_pae.tsv')){ + "paes/$filename" + } else { filename } + }, + pattern: '*.tsv' + ], + [ + enabled: params.save_intermediates, + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/${meta.id}/" }, mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + pattern: 'raw/**' + ], + [ + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_alphafold2.pdb' ] - } + ] } } diff --git a/conf/modules_alphafold3.config b/conf/modules_alphafold3.config new file mode 100644 index 000000000..34c1f2b9f --- /dev/null +++ b/conf/modules_alphafold3.config @@ -0,0 +1,150 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. +---------------------------------------------------------------------------------------- +*/ + +// +// General configuration options +// + +process { + withName: 'GUNZIP' { + publishDir = [ + path: {"${params.outdir}/DBs/alphafold3/"}, + mode: 'symlink', + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: 'ZSTD_DECOMPRESS' { + ext.args = '--force' + publishDir = [ + [ + path: {"${params.outdir}/DBs/${params.mode}/mgnify"}, + mode: 'symlink', + pattern: 'mgy_clusters_2022_05.fa' + ], + [ + path: {"${params.outdir}/DBs/${params.mode}/pdb_seqres"}, + mode: 'symlink', + pattern: 'pdb_seqres_*.fasta' + ], + [ + path: {"${params.outdir}/DBs/${params.mode}/small_bfd"}, + mode: 'symlink', + pattern: 'bfd-first_non_consensus_sequences.fasta' + ], + [ + path: {"${params.outdir}/DBs/${params.mode}/uniprot"}, + mode: 'symlink', + pattern: 'uniprot*.fa' + ], + [ + path: {"${params.outdir}/DBs/${params.mode}/uniref90"}, + mode: 'symlink', + pattern: 'uniref90*.fa' + ], + [ + path: {"${params.outdir}/DBs/${params.mode}/rnacentral"}, + mode: 'symlink', + pattern: 'rnacentral*.fasta' + ], + [ + path: {"${params.outdir}/DBs/${params.mode}/nt_rna"}, + mode: 'symlink', + pattern: 'nt_rna*.fasta' + ], + [ + path: {"${params.outdir}/DBs/${params.mode}/rfam"}, + mode: 'symlink', + pattern: 'rfam*.fasta' + ], + ] + } + withName: '.*PREPARE_ALPHAFOLD3_DBS:ARIA2_MMCIF:UNTAR' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/pdb_mmcif" }, + saveAs: { filename -> + if (filename.equals('versions.yml')) { + return null + } + return filename.replaceAll(/.*_mmcif_files/, 'mmcif_files') + }, + ] + } + withName: 'NFCORE_PROTEINFOLD:ALPHAFOLD3:MULTIQC' { + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? 
null : "alphafold3_$filename" } + ] + } +} + +// +// Module specific configuration options +// +process { + withName: 'RUN_ALPHAFOLD3' { + if (params.use_gpu) { + accelerator = 1 + } + publishDir = [ + [ + path: { "${params.outdir}/alphafold3/${meta.id}" }, + mode: 'copy', + pattern: '*_plddt.tsv' + ], + [ + path: { "${params.outdir}/alphafold3/${meta.id}" }, + mode: 'copy', + pattern: '*_ptm.tsv' + ], + [ + path: { "${params.outdir}/alphafold3/${meta.id}" }, + mode: 'copy', + pattern: '*_iptm.tsv' + ], + [ + path: { "${params.outdir}/alphafold3/${meta.id}" }, + mode: 'copy', + saveAs: { filename -> "paes/$filename" }, + pattern: '*_pae.tsv' + ], + [ + path: { "${params.outdir}/alphafold3/${meta.id}" }, + mode: 'copy', + pattern: '*_alphafold3_msa.tsv' + ], + [ + enabled: params.save_intermediates, + path: { "${params.outdir}/alphafold3/${meta.id}/raw" }, + mode: 'copy', + pattern: 'raw/**', + saveAs: { filename -> filename.toString().replaceFirst(/^raw\//, '') } + ], + [ + path: { "${params.outdir}/alphafold3/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.cif" }, + pattern: '*_alphafold3.cif' + ] + ] + } + withName: 'MMCIF2PDB_TOP_RANKED' { + publishDir = [ + [ + path: { "${params.outdir}/alphafold3/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*.pdb' + ] + ] + } +} diff --git a/conf/modules_boltz.config b/conf/modules_boltz.config new file mode 100644 index 000000000..4320859cf --- /dev/null +++ b/conf/modules_boltz.config @@ -0,0 +1,128 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). 
+ ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. +---------------------------------------------------------------------------------------- +*/ + +process { + // Provide args plus configure processes to use correct directory names + // for Boltz parameters and models, which are downloaded as part of the workflow + withName: '.*ARIA2_COLABFOLD_PARAMS:UNTAR' { + ext.prefix = { "${params.colabfold_alphafold2_params_tags[params.colabfold_model_preset] }" } + publishDir = [ + path: {"${params.outdir}/DBs/${params.mode}/params"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*ARIA2_COLABFOLD_DB:UNTAR' { + ext.prefix = 'colabfold_envdb' + } + withName: '.*PREPARE_COLABFOLD_DBS_BOLTZ:ARIA2_UNIREF30:UNTAR' { + ext.prefix = 'colabfold_uniref30' + } + withName: 'ARIA2_BOLTZ_CCD' { + ext.args = '-o ccd.pkl' + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/params" }, + pattern: 'ccd.pkl', + ] + } + withName: 'ARIA2_BOLTZ_MODEL' { + ext.args = '-o boltz1_conf.ckpt' + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/params" }, + pattern: 'boltz1_conf.ckpt', + ] + } + withName: 'ARIA2_BOLTZ2_AFF' { + ext.args = '-o boltz2_aff.ckpt' + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/params" }, + pattern: 'boltz2_aff.ckpt', + ] + } + withName: 'ARIA2_BOLTZ2_CONF' { + ext.args = '-o boltz2_conf.ckpt' + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/params" }, + pattern: 'boltz2_conf.ckpt', + ] + } + withName: '.*ARIA2_UNCOMPRESS:ARIA2' { + ext.args = '-o mols.tar' + } + withName: '.*PREPARE_BOLTZ_DBS:ARIA2_UNCOMPRESS:UNTAR' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/params" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + withName: 'RUN_BOLTZ' { + if (params.use_gpu) { + accelerator = 1 + } + ext.args = [ + params.boltz_model ? "--model ${params.boltz_model}" : "", + params.use_msa_server ? "--use_msa_server" : "", + params.msa_server_url ? "--msa_server_url ${params.msa_server_url}" : "", + params.boltz_use_potentials ? "--use_potentials" : "", + params.boltz_use_kernels ? "" : "--no_kernels", + "--write_full_pae", + ].findAll { arg -> arg }.join(' ').trim() + + publishDir = [ + [ + path: { "${params.outdir}/boltz/${meta.id}" }, + mode: 'copy', + pattern: '*_plddt.tsv' + ], + [ + path: { "${params.outdir}/boltz/${meta.id}" }, + mode: 'copy', + pattern: '*_boltz_msa.tsv' + ], + [ + path: { "${params.outdir}/boltz/${meta.id}" }, + mode: 'copy', + pattern: '*{ptm,iptm}.tsv' + ], + [ + path: { "${params.outdir}/boltz/${meta.id}/paes" }, + mode: 'copy', + pattern: '*_[0-5]_pae.tsv' + ], + [ + path: { "${params.outdir}/boltz/top_ranked_structures" }, + mode: 'copy', + saveAs: { _filename -> "${meta.id}.pdb" }, + pattern: '*_boltz.pdb' + ], + [ + enabled: params.save_intermediates, + path: { "${params.outdir}/boltz/${meta.id}" }, + mode: 'copy', + pattern: 'boltz_results_*', + ], + ] + } + + withName: 'NFCORE_PROTEINFOLD:BOLTZ:MULTIQC' { + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : "boltz_$filename" } + ] + } + + withName: 'BOLTZ_FASTA|MULTIFASTA_TO_CSV|SPLIT_MSA' { + cpus = 1 + memory = 2.GB + time = 1.h + } +} diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config index a7a719b0e..c713f7098 100644 --- a/conf/modules_colabfold.config +++ b/conf/modules_colabfold.config @@ -10,58 +10,87 @@ ---------------------------------------------------------------------------------------- */ -if (params.colabfold_server == 'webserver') { - process { - withName: 'COLABFOLD_BATCH' { - ext.args = [ - params.use_gpu ? '--use-gpu-relax' : '', - params.use_amber ? 
'--amber' : '', - params.use_templates ? '--templates' : '', - params.host_url ? "--host-url ${params.host_url}" : '' - ].join(' ').trim() - publishDir = [ - path: { "${params.outdir}/${params.mode}/${params.colabfold_server}" }, - mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.*' - ] - } +process { + withName: 'NFCORE_PROTEINFOLD:COLABFOLD:MULTIQC' { + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : "colabfold_$filename" } + ] + } + + // Configure UNTAR processes to use correct directory names + withName: '.*ARIA2_COLABFOLD_PARAMS:UNTAR' { + ext.prefix = { "${params.colabfold_alphafold2_params_tags[params.colabfold_model_preset] }" } + publishDir = [ + path: {"${params.outdir}/DBs/${params.mode}/params"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*ARIA2_COLABFOLD_DB:UNTAR' { + ext.prefix = 'colabfold_envdb' + } + withName: '.*PREPARE_COLABFOLD_DBS_COLABFOLD:ARIA2_UNIREF30:UNTAR' { + ext.prefix = 'colabfold_uniref30' } } -if (params.colabfold_server == 'local') { - process { - withName: '.*:MMSEQS_.*' { - publishDir = [ - enabled: false - ] +process { + withName: 'COLABFOLD_BATCH' { + if (params.use_gpu) { + accelerator = 1 } - withName: 'MMSEQS_CREATEINDEX' { - ext.args = '--remove-tmp-files 1' - ext.args2 = '*_seq.tsv' - publishDir = [ - enabled: false - ] - } - withName: 'MMSEQS_COLABFOLDSEARCH' { - ext.args = { params.db_load_mode ? "--db-load-mode ${params.db_load_mode}" : '' } - publishDir = [ - enabled: false - ] - } - withName: 'COLABFOLD_BATCH' { - if(params.use_gpu) { accelerator = 1 } - ext.args = [ - params.use_gpu ? '--use-gpu-relax' : '', - params.use_amber ? '--amber' : '', - params.use_templates ? 
'--templates' : '' - ].join(' ').trim() - publishDir = [ - path: { "${params.outdir}/${params.mode}/${params.colabfold_server}" }, + + ext.args = [ + params.colabfold_use_gpu_relax ? '--use-gpu-relax' : '', + params.colabfold_use_amber ? '--amber' : '', + params.colabfold_use_templates ? '--templates' : '', + params.use_msa_server && params.msa_server_url ? "--host-url ${params.msa_server_url}" : '' + ].join(' ').trim() + publishDir = [ + [ + path: { "${params.outdir}/colabfold/${meta.id}/" }, + mode: 'copy', + saveAs: { filename -> + if(filename.endsWith('_pae.tsv')){ + "paes/$filename" + } else { filename } + }, + pattern: '*.tsv' + ], + [ + enabled: params.save_intermediates, + path: { "${params.outdir}/colabfold/${meta.id}/" }, mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.*' + pattern: 'raw/**' + ], + [ + path: { "${params.outdir}/colabfold/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_colabfold.pdb' ] - } + ] + } +} + +process { + withName: '.*:MMSEQS_.*' { + publishDir = [ + enabled: false + ] + } + withName: 'MMSEQS_CREATEINDEX' { + ext.args = '--compressed 1 --remove-tmp-files 1' + ext.args2 = '*_seq.dbtype' + publishDir = [ + enabled: false + ] + } + withName: 'MMSEQS_COLABFOLDSEARCH' { + ext.args = { params.colabfold_db_load_mode ? "--db-load-mode ${params.colabfold_db_load_mode}" : '' } + publishDir = [ + enabled: false + ] } } diff --git a/conf/modules_esmfold.config b/conf/modules_esmfold.config index 81b3048fd..f76d4e35d 100644 --- a/conf/modules_esmfold.config +++ b/conf/modules_esmfold.config @@ -11,13 +11,44 @@ */ process { + // Configure processes to use correct directory names + withName: '.*PREPARE_ESMFOLD_DBS:ARIA2_.*' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/params" }, + pattern: '*.pt', + ] + } withName: 'RUN_ESMFOLD' { - ext.args = {params.use_gpu ? 
'' : '--cpu-only'} + if (params.use_gpu) { + accelerator = 1 + } + ext.args = { params.use_gpu ? '' : '--cpu-only' } + containerOptions = { + workflow.containerEngine in ['singularity', 'apptainer'] ? + '--nv --env TRITON_CACHE_DIR=/tmp/triton_cache --env XDG_CACHE_HOME=/tmp' : + '' + } publishDir = [ - path: { "${params.outdir}/${params.mode}" }, + [ + path: { "${params.outdir}/esmfold/${meta.id}" }, mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.*' + pattern: '*_plddt.tsv' + ], + [ + path: { "${params.outdir}/esmfold/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_esmfold.pdb' ] + ] } + + withName: 'NFCORE_PROTEINFOLD:ESMFOLD:MULTIQC' { + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : "esmfold_$filename" } + ] + } + } diff --git a/conf/modules_helixfold3.config b/conf/modules_helixfold3.config new file mode 100644 index 000000000..21f931c71 --- /dev/null +++ b/conf/modules_helixfold3.config @@ -0,0 +1,168 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. 
+---------------------------------------------------------------------------------------- +*/ + +process { + // Configure UNTAR/GUNZIP processes to use correct directory names + withName: '.*PREPARE_HELIXFOLD3_DBS:ARIA2_UNICLUST30:UNTAR' { + ext.prefix = 'uniref30' + } + withName: '.*PREPARE_HELIXFOLD3_DBS:ARIA2_INIT_MODELS:UNZIP' { + ext.prefix = '.' + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}" }, + saveAs: { filename -> + if (filename.equals('versions.yml')) { + return null + } + return filename.replaceAll(/HelixFold3-params-240814/, 'params/') + }, + ] + } + withName: '.*PREPARE_HELIXFOLD3_DBS:ARIA2_CCD_PREPROCESSED:ARIA2' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/params" }, + pattern: '*.pkl.gz', + ] + } + withName: '.*PREPARE_HELIXFOLD3_DBS:ARIA2_RFAM:ARIA2' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/rfam" }, + pattern: '*.fasta', + ] + } + withName: '.*PREPARE_HELIXFOLD3_DBS:ARIA2_SMALL_BFD:GUNZIP' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/small_bfd" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*PREPARE_HELIXFOLD3_DBS:ARIA2_BFD:UNTAR' { + ext.prefix = 'bfd' + } + withName: '.*PREPARE_HELIXFOLD3_DBS:COMBINE_UNIPROT' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/uniprot" }, + pattern: 'uniprot.fasta', + ] + } + withName: '.*PREPARE_HELIXFOLD3_DBS:ARIA2_PDB_SEQRES' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/pdb_seqres" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*PREPARE_HELIXFOLD3_DBS:ARIA2_UNIREF90:GUNZIP' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/uniref90" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + withName: '.*PREPARE_HELIXFOLD3_DBS:ARIA2_MGNIFY:GUNZIP' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/mgnify" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*PREPARE_HELIXFOLD3_DBS:DOWNLOAD_PDBMMCIF' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/pdb_mmcif" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*PREPARE_HELIXFOLD3_DBS:ARIA2_OBSOLETE:ARIA2' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/pdb_mmcif"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*PREPARE_HELIXFOLD3_DBS:ARIA2_RFAM:ARIA2' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/rfam" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*PREPARE_HELIXFOLD3_DBS:ARIA2_MAXIT:UNTAR' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: 'RUN_HELIXFOLD3' { + if (params.use_gpu) { + accelerator = 1 + } + ext.args = [ + params.helixfold3_max_template_date ? "--max_template_date=${params.helixfold3_max_template_date}" : "--max_template_date=2038-01-19", + "--model_name allatom_demo", + // params.helixfold3_full_dbs ? "--preset 'full_dbs'" : "--preset 'reduced_dbs'", // Not supported yet + "--preset 'reduced_dbs'", // Always use reduced_dbs for now + "--logging_level 'ERROR'", + params.helixfold3_precision ? "--precision ${params.helixfold3_precision}" : "--precision 'bf16'", + params.helixfold3_infer_times ? 
"--infer_times ${params.helixfold3_infer_times}" : "--infer_times 4" + ].join(' ').trim() + publishDir = [ + [ + path: { "${params.outdir}/helixfold3/${meta.id}" }, + mode: 'copy', + pattern: '*_plddt.tsv' + ], + [ + path: { "${params.outdir}/helixfold3/${meta.id}" }, + mode: 'copy', + pattern: '*_ptm.tsv' + ], + [ + path: { "${params.outdir}/helixfold3/${meta.id}" }, + mode: 'copy', + pattern: '*_iptm.tsv' + ], + [ + path: { "${params.outdir}/helixfold3/${meta.id}" }, + mode: 'copy', + saveAs: { filename -> "paes/$filename" }, + pattern: '*_[1-5]_pae.tsv' + ], + [ + path: { "${params.outdir}/helixfold3/${meta.id}" }, + mode: 'copy', + pattern: '*_helixfold3_msa.tsv' + ], + [ + enabled: params.save_intermediates, + path: { "${params.outdir}/helixfold3/${meta.id}/raw" }, + mode: 'copy', + pattern: 'raw/**', + saveAs: { filename -> filename.toString().replaceFirst(/^raw\//, '') } + ], + [ + path: { "${params.outdir}/helixfold3/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_helixfold3.pdb' + ], + [ + path: { "${params.outdir}/helixfold3/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.cif" }, + pattern: '*_helixfold3.cif' + ] + ] + } + + withName: 'NFCORE_PROTEINFOLD:HELIXFOLD3:MULTIQC' { + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? 
null : "helixfold3_$filename" } + ] + } +} diff --git a/conf/modules_rosettafold2na.config b/conf/modules_rosettafold2na.config new file mode 100644 index 000000000..cb7658640 --- /dev/null +++ b/conf/modules_rosettafold2na.config @@ -0,0 +1,87 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths for RF2NA +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. +---------------------------------------------------------------------------------------- +*/ + +process { + // Configure UNTAR/GUNZIP processes to use correct directory names + withName: '.*PREPARE_ROSETTAFOLD2NA_DBS:ARIA2_UNIREF30:UNTAR' { + ext.prefix = 'UniRef30_2020_06' + } + withName: '.*PREPARE_ROSETTAFOLD2NA_DBS:ARIA2_BFD:UNTAR' { + ext.prefix = 'bfd' + } + withName: '.*PREPARE_ROSETTAFOLD2NA_DBS:ARIA2_PDB100:UNTAR' { + ext.prefix = 'pdb100' + } + withName: '.*PREPARE_ROSETTAFOLD2NA_DBS:ARIA2_WEIGHTS:UNTAR' { + publishDir = [ + path: { "${params.outdir}/DBs/${params.mode}/params/network"}, + saveAs: { filename -> + if (filename.equals('versions.yml')) { + return null + } + return filename.replaceAll(/RF2NA_apr23/, 'weights') + }, + ] + } + withName: '.*PREPARE_ROSETTAFOLD2NA_DBS:DOWNLOAD_RNA_DATABASES' { + publishDir = [ + path: {"${params.outdir}/DBs/${params.mode}/"}, + mode: 'symlink', + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + + withName: 'RUN_ROSETTAFOLD2NA' { + if (params.use_gpu) { + accelerator = 1 + } + publishDir = [ + [ + path: { "${params.outdir}/rosettafold2na/${meta.id}" }, + mode: 'copy', + pattern: '*_plddt.tsv' + ], + [ + path: { "${params.outdir}/rosettafold2na/${meta.id}" }, + mode: 'copy', + saveAs: { filename -> "paes/$filename" }, + pattern: '*_pae.tsv' + ], + [ + path: { "${params.outdir}/rosettafold2na/${meta.id}" }, + mode: 'copy', + pattern: '*_rosettafold2na_msa.tsv' + ], + [ + enabled: params.save_intermediates, + path: { "${params.outdir}/rosettafold2na/${meta.id}/raw" }, + mode: 'copy', + pattern: 'raw/**', + saveAs: { filename -> filename.toString().replaceFirst(/^raw\//, '') } + ], + [ + path: { "${params.outdir}/rosettafold2na/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_rosettafold2na.pdb' + ] + ] + } + + withName: 'NFCORE_PROTEINFOLD:ROSETTAFOLD2NA:MULTIQC' { + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : "rosettafold2na_$filename" } + ] + } +} diff --git a/conf/modules_rosettafold_all_atom.config b/conf/modules_rosettafold_all_atom.config new file mode 100644 index 000000000..2a0f00751 --- /dev/null +++ b/conf/modules_rosettafold_all_atom.config @@ -0,0 +1,83 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. 
+---------------------------------------------------------------------------------------- +*/ + +process { + withName: 'GUNZIP|ARIA2_PDB_SEQRES' { + publishDir = [ + path: {"${params.outdir}/DBs/rosettafold_all_atom/"}, + mode: 'symlink', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + + withName: 'RUN_ROSETTAFOLD_ALL_ATOM' { + if (params.use_gpu) { + accelerator = 1 + } + publishDir = [ + [ + path: { "${params.outdir}/rosettafold_all_atom/${meta.id}" }, + mode: 'copy', + pattern: '*_plddt.tsv' + ], + [ + path: { "${params.outdir}/rosettafold_all_atom/${meta.id}" }, + mode: 'copy', + saveAs: { filename -> "paes/$filename" }, + pattern: '*_[0-5]_pae.tsv' + ], + [ + path: { "${params.outdir}/rosettafold_all_atom/${meta.id}" }, + mode: 'copy', + pattern: '*_rosettafold_all_atom_msa.tsv' + ], + [ + enabled: params.save_intermediates, + path: { "${params.outdir}/rosettafold_all_atom/${meta.id}/raw" }, + mode: 'copy', + pattern: 'raw/**', + saveAs: { filename -> filename.toString().replaceFirst(/^raw\//, '') } + ], + [ + path: { "${params.outdir}/rosettafold_all_atom/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_rosettafold_all_atom.pdb' + ] + ] + } + + withName: 'NFCORE_PROTEINFOLD:ROSETTAFOLD_ALL_ATOM:MULTIQC' { + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : "rosettafold_all_atom_$filename" } + ] + } + + // Configure UNTAR processes to use correct directory names + withName: '.*PREPARE_ROSETTAFOLD_ALL_ATOM_DBS:ARIA2_WEIGHTS:ARIA2' { + publishDir = [ + path: {"${params.outdir}/DBs/${params.mode}/params"}, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + withName: '.*PREPARE_ROSETTAFOLD_ALL_ATOM_DBS:ARIA2_UNIREF30:UNTAR' { + ext.prefix = 'uniref30' + } + withName: '.*PREPARE_ROSETTAFOLD_ALL_ATOM_DBS:ARIA2_PDB100:UNTAR' { + ext.prefix = 'pdb100' + } + withName: '.*PREPARE_ROSETTAFOLD_ALL_ATOM_DBS:ARIA2_BFD:UNTAR' { + ext.prefix = 'bfd' + } +} diff --git a/conf/test.config b/conf/test.config index e6e18ac2e..c7d7de85b 100644 --- a/conf/test.config +++ b/conf/test.config @@ -12,19 +12,23 @@ stubRun = true +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = '6.GB' - max_time = '6.h' - // Input data to test alphafold2 analysis mode = 'alphafold2' alphafold2_mode = 'standard' - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet.csv' + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' alphafold2_db = "${projectDir}/assets/dummy_db_dir" } diff --git a/conf/test_alphafold3_download.config b/conf/test_alphafold3_download.config new file mode 100644 index 000000000..11f10d8b2 --- /dev/null +++ b/conf/test_alphafold3_download.config @@ -0,0 +1,38 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/proteinfold -profile test_alphafold3_download, --outdir + +---------------------------------------------------------------------------------------- +*/ + +stubRun = true + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data to test alphafold3 analysis + mode = 'alphafold3' + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' + alphafold3_params_path = "${projectDir}/assets/dummy_db_dir" +} + +process { + withName: 'ARIA2|UNTAR|DOWNLOAD_PDBMMCIF_AF3|RUN_ALPHAFOLD3' { + container = 'biocontainers/gawk:5.1.0' + } +} diff --git a/conf/test_alphafold3_standard.config b/conf/test_alphafold3_standard.config new file mode 100644 index 000000000..d6cb70988 --- /dev/null +++ b/conf/test_alphafold3_standard.config @@ -0,0 +1,38 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test.
+ + Use as follows: + nextflow run nf-core/proteinfold -profile test_alphafold3_standard, --outdir + +---------------------------------------------------------------------------------------- +*/ + +stubRun = true + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data to test alphafold3 analysis + mode = 'alphafold3' + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' + alphafold3_db = "${projectDir}/assets/dummy_db_dir" +} + +process { + withName: 'RUN_ALPHAFOLD3' { + container = 'biocontainers/gawk:5.1.0' + } +} diff --git a/conf/test_alphafold_download.config b/conf/test_alphafold_download.config index 759ec61aa..287bbca0c 100644 --- a/conf/test_alphafold_download.config +++ b/conf/test_alphafold_download.config @@ -12,19 +12,23 @@ stubRun = true +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = '6.GB' - max_time = '6.h' - // Input data to test alphafold2 analysis mode = 'alphafold2' alphafold2_mode = 'standard' - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet.csv' + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' } process { diff --git a/conf/test_alphafold_split.config b/conf/test_alphafold_split.config index 47d4f5d62..c74f487be 100644 --- a/conf/test_alphafold_split.config +++ b/conf/test_alphafold_split.config @@ -12,19 +12,23 @@ stubRun = true +// Limit resources so that this can run on
GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = '6.GB' - max_time = '6.h' - // Input data to test alphafold2 splitting MSA from prediction analysis mode = 'alphafold2' alphafold2_mode = 'split_msa_prediction' - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet.csv' + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' alphafold2_db = "${projectDir}/assets/dummy_db_dir" } diff --git a/conf/test_boltz.config b/conf/test_boltz.config new file mode 100644 index 000000000..6e2441e1c --- /dev/null +++ b/conf/test_boltz.config @@ -0,0 +1,39 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test.
+ + Use as follows: + nextflow run nf-core/proteinfold -profile test_boltz, --outdir + +---------------------------------------------------------------------------------------- +*/ + +stubRun = true + +process { + resourceLimits = [ + cpus: 1, + memory: '2.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'stub test profile for boltz' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data for stub test of boltz + mode = 'boltz' + colabfold_model_preset = 'alphafold2_ptm' + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' + colabfold_db = "${projectDir}/assets/dummy_db_dir" + boltz_db = "${projectDir}/assets/dummy_db_dir" +} + +process { + withName: 'MMSEQS_COLABFOLDSEARCH|RUN_BOLTZ' { + container = 'biocontainers/gawk:5.1.0' + } +} diff --git a/conf/test_colabfold_download.config b/conf/test_colabfold_download.config index 843fa07f8..313a4ec68 100644 --- a/conf/test_colabfold_download.config +++ b/conf/test_colabfold_download.config @@ -12,19 +12,23 @@ stubRun = true +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = '6.GB' - max_time = '6.h' - // Input data to test colabfold analysis mode = 'colabfold' - colabfold_server = 'webserver' - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet.csv' + use_msa_server = true + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' } process { diff --git a/conf/test_colabfold_local.config b/conf/test_colabfold_local.config index b401c0aa5..2a896542d 100644 --- a/conf/test_colabfold_local.config +++
b/conf/test_colabfold_local.config @@ -10,20 +10,23 @@ stubRun = true +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = '6.GB' - max_time = '6.h' - // Input data to test colabfold with the colabfold webserver analysis mode = 'colabfold' - colabfold_server = 'local' colabfold_db = "${projectDir}/assets/dummy_db_dir" - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet.csv' + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' } process { @@ -31,4 +34,3 @@ process { container = 'biocontainers/gawk:5.1.0' } } - diff --git a/conf/test_colabfold_webserver.config b/conf/test_colabfold_webserver.config index 3cd74de7f..206a3aeba 100644 --- a/conf/test_colabfold_webserver.config +++ b/conf/test_colabfold_webserver.config @@ -10,20 +10,24 @@ stubRun = true +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = '6.GB' - max_time = '6.h' - - // Input data to test colabfold with a local server analysis + // Input data to test colabfold with a webserver analysis mode = 'colabfold' - colabfold_server = 'webserver' + use_msa_server = true colabfold_db = "${projectDir}/assets/dummy_db_dir" - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet.csv' + input = params.pipelines_testdata_base_path + 
'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' } process { diff --git a/conf/test_esmfold.config b/conf/test_esmfold.config index ad9847427..0bffe195d 100644 --- a/conf/test_esmfold.config +++ b/conf/test_esmfold.config @@ -10,19 +10,23 @@ stubRun = true +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = '6.GB' - max_time = '6.h' - // Input data to test esmfold mode = 'esmfold' esmfold_db = "${projectDir}/assets/dummy_db_dir" - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet.csv' + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' } process { @@ -30,4 +34,3 @@ process { container = 'quay.io/biocontainers/gawk:5.1.0' } } - diff --git a/conf/test_full.config b/conf/test_full.config index 18233938a..e59ffded6 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -17,6 +17,13 @@ params { // Input data for full test of alphafold standard mode mode = 'alphafold2' alphafold2_mode = 'standard' - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet.csv' - alphafold2_db = 's3://proteinfold-dataset/test-data/db/alphafold_mini' + use_gpu = true + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' + alphafold2_db = 's3://proteinfold-dataset/test-data/mini_dbs' +} + +process { + withName: 'RUN_ALPHAFOLD2' { + beforeScript = System.getenv('TOWER_WORKFLOW_ID') ? 
'unset LD_LIBRARY_PATH && export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/local/nvidia/lib64:/usr/local/nvidia/lib && export OPENMM_CUDA_COMPILER=/opt/conda/bin/nvcc && export TMPDIR=/tmp' : null + } } diff --git a/conf/test_full_alphafold_multimer.config b/conf/test_full_alphafold_multimer.config index 62e819667..76fb40238 100644 --- a/conf/test_full_alphafold_multimer.config +++ b/conf/test_full_alphafold_multimer.config @@ -18,6 +18,20 @@ params { mode = 'alphafold2' alphafold2_mode = 'standard' alphafold2_model_preset = 'multimer' - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv' - alphafold2_db = 's3://proteinfold-dataset/test-data/db/alphafold_mini' + use_gpu = true + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet_multimer.csv' + alphafold2_db = 's3://proteinfold-dataset/test-data/mini_dbs' +} + +process { + withName: 'RUN_ALPHAFOLD2' { + memory = '60 GB' + beforeScript = System.getenv('TOWER_WORKFLOW_ID') ? 
'unset LD_LIBRARY_PATH && export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/local/nvidia/lib64:/usr/local/nvidia/lib && export OPENMM_CUDA_COMPILER=/opt/conda/bin/nvcc && export TMPDIR=/tmp' : null + } +} + +process { + withName: 'RUN_ALPHAFOLD2_PRED' { + ext.args = "--num_multimer_predictions_per_model 1" + } } diff --git a/conf/test_full_alphafold_split.config b/conf/test_full_alphafold_split.config index 90df73f28..e23ecc2c6 100644 --- a/conf/test_full_alphafold_split.config +++ b/conf/test_full_alphafold_split.config @@ -17,6 +17,15 @@ params { // Input data to test colabfold with a local server analysis mode = 'alphafold2' alphafold2_mode = 'split_msa_prediction' - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet.csv' - alphafold2_db = 's3://proteinfold-dataset/test-data/db/alphafold_mini' + use_gpu = true + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' + alphafold2_db = 's3://proteinfold-dataset/test-data/mini_dbs' +} + +docker.pullStrategy = 'lazy' + +process { + withName: 'RUN_ALPHAFOLD2_PRED' { + beforeScript = System.getenv('TOWER_WORKFLOW_ID') ? 'unset LD_LIBRARY_PATH && export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/local/nvidia/lib64:/usr/local/nvidia/lib && export OPENMM_CUDA_COMPILER=/opt/conda/bin/nvcc && export TMPDIR=/tmp' : null + } } diff --git a/conf/test_full_boltz.config b/conf/test_full_boltz.config new file mode 100644 index 000000000..ebd100169 --- /dev/null +++ b/conf/test_full_boltz.config @@ -0,0 +1,33 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running full-size tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a full size pipeline test. 
+ + Use as follows: + nextflow run nf-core/proteinfold -profile test_full_boltz, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Full test profile for boltz' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data for full test of boltz + mode = 'boltz' + colabfold_model_preset = 'alphafold2_ptm' + use_gpu = true + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet_single_fasta.csv' + colabfold_db = 's3://proteinfold-dataset/test-data/mini_dbs' + boltz_db = 's3://proteinfold-dataset/test-data/mini_dbs/' +} + +docker.pullStrategy = 'lazy' + +process { + withName: 'RUN_BOLTZ' { + memory = '20 GB' + ext.args = '--output_format "pdb" --write_full_pae --cache ./ --recycling_steps 1' + } +} diff --git a/conf/test_full_colabfold_local.config b/conf/test_full_colabfold_local.config index ad91f5e0b..4f4da3f32 100644 --- a/conf/test_full_colabfold_local.config +++ b/conf/test_full_colabfold_local.config @@ -11,19 +11,28 @@ */ params { - config_profile_name = 'Full test profile for colabfold using colabfold server' + config_profile_name = 'Full test profile for colabfold using local colabfold' config_profile_description = 'Minimal test dataset to check pipeline function' // Input data to test colabfold with a local server analysis mode = 'colabfold' - colabfold_server = 'local' colabfold_model_preset = 'alphafold2_ptm' - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet.csv' - colabfold_db = 's3://proteinfold-dataset/test-data/db/colabfold_mini' + use_gpu = true + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' + colabfold_db = 's3://proteinfold-dataset/test-data/mini_dbs' + colabfold_use_gpu_relax = false + colabfold_use_amber = false + colabfold_use_templates = false } + +docker.pullStrategy =
'lazy' + process { withName:MMSEQS_COLABFOLDSEARCH { memory = 16.GB } + withName: 'COLABFOLD_BATCH' { + beforeScript = System.getenv('TOWER_WORKFLOW_ID') ? 'unset LD_LIBRARY_PATH && export LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/lib:/usr/local/targets/x86_64-linux/lib:/usr/local/cuda-12.9/targets/x86_64-linux/lib:/opt/conda/lib"' : null + } } diff --git a/conf/test_full_colabfold_webserver.config b/conf/test_full_colabfold_webserver.config index 7e296189c..9926ee29f 100644 --- a/conf/test_full_colabfold_webserver.config +++ b/conf/test_full_colabfold_webserver.config @@ -15,9 +15,18 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Input data for full test of colabfold with Colabfold server - mode = 'colabfold' - colabfold_server = 'webserver' - colabfold_model_preset = 'alphafold2_ptm' - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet.csv' - colabfold_db = 's3://proteinfold-dataset/test-data/db/colabfold_mini' + mode = 'colabfold' + use_msa_server = true + colabfold_model_preset = 'alphafold2_ptm' + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' + colabfold_db = 's3://proteinfold-dataset/test-data/mini_dbs' + colabfold_use_gpu_relax = false + colabfold_use_amber = false + colabfold_use_templates = false +} + +process { + withName: 'COLABFOLD_BATCH' { + beforeScript = System.getenv('TOWER_WORKFLOW_ID') ? 
'unset LD_LIBRARY_PATH && export LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/lib:/usr/local/targets/x86_64-linux/lib:/usr/local/cuda-12.9/targets/x86_64-linux/lib:/opt/conda/lib"' : null + } } diff --git a/conf/test_full_colabfold_webserver_multimer.config b/conf/test_full_colabfold_webserver_multimer.config index c8adca613..1a108bd2f 100644 --- a/conf/test_full_colabfold_webserver_multimer.config +++ b/conf/test_full_colabfold_webserver_multimer.config @@ -11,13 +11,23 @@ */ params { - config_profile_name = 'Full test profile for colabfold using colabfold server' + config_profile_name = 'Full test profile for colabfold multimer using colabfold server' config_profile_description = 'Minimal test dataset to check pipeline function' // Input data for full test of colabfold with Colabfold server - mode = 'colabfold' - colabfold_server = 'webserver' - colabfold_model_preset = 'alphafold2_multimer_v3' - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv' - colabfold_db = 's3://proteinfold-dataset/test-data/db/colabfold_mini' + mode = 'colabfold' + use_msa_server = true + colabfold_model_preset = 'alphafold2_multimer_v3' + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet_multimer.csv' + colabfold_db = 's3://proteinfold-dataset/test-data/mini_dbs' + colabfold_use_gpu_relax = false + colabfold_use_amber = false + colabfold_use_templates = false +} + +process { + withName: 'COLABFOLD_BATCH' { + ext.args = '' + beforeScript = System.getenv('TOWER_WORKFLOW_ID') ? 
'unset LD_LIBRARY_PATH && export LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/lib:/usr/local/targets/x86_64-linux/lib:/usr/local/cuda-12.9/targets/x86_64-linux/lib:/opt/conda/lib"' : null + } } diff --git a/conf/test_full_esmfold.config b/conf/test_full_esmfold.config index a0af69a46..c5c4c4be4 100644 --- a/conf/test_full_esmfold.config +++ b/conf/test_full_esmfold.config @@ -15,8 +15,15 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Input data for full test of esmfold monomer - mode = 'esmfold' - esmfold_model_preset = 'monomer' - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet.csv' - esmfold_db = 's3://proteinfold-dataset/db/esmfold' + mode = 'esmfold' + use_gpu = true + esmfold_model_preset = 'monomer' + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' + esmfold_db = 's3://proteinfold-dataset/test-data/mini_dbs' +} + +process { + withName: 'RUN_ESMFOLD' { + beforeScript = System.getenv('TOWER_WORKFLOW_ID') ? 
'unset LD_LIBRARY_PATH && export LD_LIBRARY_PATH="/conda/lib:/usr/local/cuda-11.7/lib64:/conda/lib/python3.9/site-packages/nvidia/cusparse/lib:/conda/lib:/usr/local/cuda/lib64"' : null + } } diff --git a/conf/test_full_esmfold_multimer.config b/conf/test_full_esmfold_multimer.config index 498ae0029..f81ff0a0a 100644 --- a/conf/test_full_esmfold_multimer.config +++ b/conf/test_full_esmfold_multimer.config @@ -17,6 +17,13 @@ params { // Input data for full test of esmfold multimer mode = 'esmfold' esmfold_model_preset = 'multimer' - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv' - esmfold_db = 's3://proteinfold-dataset/test-data/db/esmfold' + use_gpu = true + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet_multimer.csv' + esmfold_db = 's3://proteinfold-dataset/test-data/mini_dbs' +} + +process { + withName: 'RUN_ESMFOLD' { + beforeScript = System.getenv('TOWER_WORKFLOW_ID') ? 'unset LD_LIBRARY_PATH && export LD_LIBRARY_PATH="/conda/lib:/usr/local/cuda-11.7/lib64:/conda/lib/python3.9/site-packages/nvidia/cusparse/lib:/conda/lib:/usr/local/cuda/lib64"' : null + } } diff --git a/conf/test_full_helixfold3.config b/conf/test_full_helixfold3.config new file mode 100644 index 000000000..76e68f893 --- /dev/null +++ b/conf/test_full_helixfold3.config @@ -0,0 +1,28 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running full-size tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a full size pipeline test. 
+ + Use as follows: + nextflow run nf-core/proteinfold -profile test_full_helixfold3, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Full test profile for helixfold3' + config_profile_description = 'Full test dataset to check pipeline function' + + // Input data for full test of helixfold3 + mode = 'helixfold3' + use_gpu = true + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' + helixfold3_db = 's3://proteinfold-dataset/test-data/mini_dbs/' +} + +process { + withName: 'RUN_HELIXFOLD3' { + beforeScript = System.getenv('TOWER_WORKFLOW_ID') ? 'unset LD_LIBRARY_PATH && export LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda-12.8/lib64"' : null + } +} diff --git a/conf/test_full_rosettafold2na.config b/conf/test_full_rosettafold2na.config new file mode 100644 index 000000000..56ed2135a --- /dev/null +++ b/conf/test_full_rosettafold2na.config @@ -0,0 +1,22 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running full-size tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a full size pipeline test. 
+ + Use as follows: + nextflow run nf-core/proteinfold -profile test_full_rosettafold2na, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Full test profile for rosettafold2na' + config_profile_description = 'Full test dataset to check pipeline function' + + // Input data for full test of rosettafold2na + mode = 'rosettafold2na' + use_gpu = true + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/rna_complex_samplesheet.csv' + rosettafold2na_db = 's3://proteinfold-dataset/test-data/mini_dbs' +} diff --git a/conf/test_full_rosettafold_all_atom.config b/conf/test_full_rosettafold_all_atom.config new file mode 100644 index 000000000..12a638ee5 --- /dev/null +++ b/conf/test_full_rosettafold_all_atom.config @@ -0,0 +1,22 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running full-size tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a full size pipeline test. 
+ + Use as follows: + nextflow run nf-core/proteinfold -profile test_full_rosettafold_all_atom, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Full test profile for rosettafold_all_atom' + config_profile_description = 'Full test dataset to check pipeline function' + + // Input data for full test of rosettafold_all_atom + mode = 'rosettafold_all_atom' + use_gpu = true + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' + rosettafold_all_atom_db = 's3://proteinfold-dataset/test-data/mini_dbs/' +} diff --git a/conf/test_helixfold3.config b/conf/test_helixfold3.config new file mode 100644 index 000000000..99595b02c --- /dev/null +++ b/conf/test_helixfold3.config @@ -0,0 +1,36 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ Use as follows: + nextflow run nf-core/proteinfold -profile test_helixfold3, --outdir +---------------------------------------------------------------------------------------- +*/ + +stubRun = true + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data to test helixfold3 + mode = 'helixfold3' + helixfold3_db = "${projectDir}/assets/dummy_db_dir" + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' +} + +process { + withName: 'RUN_HELIXFOLD3' { + container = 'biocontainers/gawk:5.1.0' + } +} diff --git a/conf/test_rosettafold2na.config b/conf/test_rosettafold2na.config new file mode 100644 index 000000000..a3ab29b1b --- /dev/null +++ b/conf/test_rosettafold2na.config @@ -0,0 +1,36 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ Use as follows: + nextflow run nf-core/proteinfold -profile test_rosettafold2na, --outdir +---------------------------------------------------------------------------------------- +*/ + +stubRun = true + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '8.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data to test rosettafold2na + mode = 'rosettafold2na' + rosettafold2na_db = "${projectDir}/assets/dummy_db_dir" + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/rna_complex_samplesheet.csv' +} + +process { + withName: 'RUN_ROSETTAFOLD2NA' { + container = 'biocontainers/gawk:5.1.0' + } +} diff --git a/conf/test_rosettafold_all_atom.config b/conf/test_rosettafold_all_atom.config new file mode 100644 index 000000000..86805b9a8 --- /dev/null +++ b/conf/test_rosettafold_all_atom.config @@ -0,0 +1,36 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ Use as follows: + nextflow run nf-core/proteinfold -profile test_rosettafold_all_atom, --outdir +---------------------------------------------------------------------------------------- +*/ + +stubRun = true + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data to test rosettafold_all_atom + mode = 'rosettafold_all_atom' + rosettafold_all_atom_db = "${projectDir}/assets/dummy_db_dir" + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet.csv' +} + +process { + withName: 'RUN_ROSETTAFOLD_ALL_ATOM' { + container = 'biocontainers/gawk:5.1.0' + } +} diff --git a/conf/test_split_fasta.config b/conf/test_split_fasta.config new file mode 100644 index 000000000..5ee897804 --- /dev/null +++ b/conf/test_split_fasta.config @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ Use as follows: + nextflow run nf-core/proteinfold -profile test_split_fasta,<docker/singularity> --outdir <OUTDIR> +---------------------------------------------------------------------------------------- +*/ + +stubRun = true + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data to test colabfold with split_fasta enabled + mode = 'colabfold' + split_fasta = true + colabfold_db = "${projectDir}/assets/dummy_db_dir" + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/samplesheet_multimer.csv' +} + +process { + withName: 'MMSEQS_COLABFOLDSEARCH|COLABFOLD_BATCH' { + container = 'biocontainers/gawk:5.1.0' + } +} diff --git a/dockerfiles/Dockerfile_nfcore-proteinfold_alphafold2_msa b/dockerfiles/Dockerfile_nfcore-proteinfold_alphafold2_msa deleted file mode 100644 index 64baaa38f..000000000 --- a/dockerfiles/Dockerfile_nfcore-proteinfold_alphafold2_msa +++ /dev/null @@ -1,58 +0,0 @@ -FROM nvidia/cuda:11.4.3-cudnn8-runtime-ubuntu18.04 - -LABEL authors="Luisa Santus, Athanasios Baltzis, Jose Espinosa-Carrasco, Leila Mansouri" \ - title="nfcore/proteinfold_alphafold2_msa" \ - Version="1.1.0" \ - description="Docker image containing all software requirements to run the RUN_ALPHAFOLD2_MSA module using the nf-core/proteinfold pipeline" - -# Use bash to support string substitution. 
-SHELL ["/bin/bash", "-o", "pipefail", "-c"] - -# Add env variables -ENV LD_LIBRARY_PATH="/conda/lib:/usr/local/cuda-11.4/lib64:$LD_LIBRARY_PATH" -ENV PATH="/conda/bin:$PATH" - -RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ - build-essential \ - cmake \ - cuda-command-line-tools-11-1 \ - git \ - hmmer \ - kalign \ - tzdata \ - wget \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get autoremove -y \ - && apt-get clean - -# Clone AlphaFold2 -RUN git clone https://github.com/cbcrg/alphafold.git /app/alphafold && \ - cd /app/alphafold && \ - git checkout 1b3170e9409472ec8ad044f9935c92bedd7b4674 && \ - cd - - -# Compile HHsuite from source -RUN git clone --branch v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite \ - && mkdir /tmp/hh-suite/build \ - && cd /tmp/hh-suite/build \ - && cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite -DHAVE_AVX2=1 .. \ - && make -j 4 && make install \ - && ln -s /opt/hhsuite/bin/* /usr/bin \ - && cd - && rm -rf /tmp/hh-suite - -# Install Miniconda package manager -RUN wget -q -P /tmp \ - https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ - && bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b -p /conda \ - && rm /tmp/Miniconda3-latest-Linux-x86_64.sh - -# Install conda packages -RUN /conda/bin/conda install -y -c conda-forge \ - pip \ - python=3.10 \ - && conda clean --all --force-pkgs-dirs --yes - -# Install pip packages -RUN pip3 install --upgrade pip --no-cache-dir \ - && pip3 install -r /app/alphafold/requirements_msa.txt --no-cache-dir diff --git a/dockerfiles/Dockerfile_nfcore-proteinfold_alphafold2_split b/dockerfiles/Dockerfile_nfcore-proteinfold_alphafold2_split deleted file mode 100644 index 4f4c89b4f..000000000 --- a/dockerfiles/Dockerfile_nfcore-proteinfold_alphafold2_split +++ /dev/null @@ -1,79 +0,0 @@ -FROM nvidia/cuda:11.4.3-cudnn8-runtime-ubuntu18.04 - -LABEL 
authors="Athanasios Baltzis, Jose Espinosa-Carrasco, Leila Mansouri" \ - title="nfcore/proteinfold_alphafold2_split" \ - Version="1.1.0" \ - description="Docker image containing all software requirements to run the RUN_ALPHAFOLD2_PRED module using the nf-core/proteinfold pipeline" - -# Use bash to support string substitution. -SHELL ["/bin/bash", "-o", "pipefail", "-c"] - -# Add env variables -ENV LD_LIBRARY_PATH="/conda/lib:/usr/local/cuda-11.4/lib64:$LD_LIBRARY_PATH" -ENV PATH="/conda/bin:$PATH" - -RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ - build-essential \ - cmake \ - cuda-command-line-tools-11-1 \ - git \ - hmmer \ - kalign \ - tzdata \ - wget \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get autoremove -y \ - && apt-get clean - -# Clone AlphaFold2 -RUN git clone https://github.com/cbcrg/alphafold.git /app/alphafold && \ - cd /app/alphafold && \ - git checkout 1b3170e9409472ec8ad044f9935c92bedd7b4674 && \ - cd - - -# Compile HHsuite from source -RUN git clone --branch v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite \ - && mkdir /tmp/hh-suite/build \ - && cd /tmp/hh-suite/build \ - && cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite -DHAVE_AVX2=1 .. 
\ - && make -j 4 && make install \ - && ln -s /opt/hhsuite/bin/* /usr/bin \ - && cd - && rm -rf /tmp/hh-suite - -# Install Miniconda package manager -RUN wget -q -P /tmp \ - https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ - && bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b -p /conda \ - && rm /tmp/Miniconda3-latest-Linux-x86_64.sh - -# Install conda packages -RUN /conda/bin/conda install -y -c conda-forge \ - openmm=7.7.0 \ - cudatoolkit==11.1.1 \ - pdbfixer \ - pip \ - python=3.10 \ - && conda clean --all --force-pkgs-dirs --yes - -RUN wget -q -P /app/alphafold/alphafold/common/ \ - https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt - -# Install pip packages. -RUN pip3 install --upgrade pip --no-cache-dir \ - && pip3 install -r /app/alphafold/requirements.txt --no-cache-dir \ - && pip3 install --upgrade --no-cache-dir \ - jax==0.3.25 \ - jaxlib==0.3.25+cuda11.cudnn805 \ - -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html - -RUN sed -i "s|alphafold/common/stereo_chemical_props.txt|/app/alphafold/alphafold/common/stereo_chemical_props.txt|g" /app/alphafold/alphafold/common/residue_constants.py - -# Add SETUID bit to the ldconfig binary so that non-root users can run it. -RUN chmod u+s /sbin/ldconfig.real - -# We need to run `ldconfig` first to ensure GPUs are visible, due to some quirk -# with Debian. See https://github.com/NVIDIA/nvidia-docker/issues/1399 for -# details. 
-RUN cd /app/alphafold -RUN ldconfig diff --git a/dockerfiles/Dockerfile_nfcore-proteinfold_alphafold2_standard b/dockerfiles/Dockerfile_nfcore-proteinfold_alphafold2_standard deleted file mode 100644 index 774d89f6f..000000000 --- a/dockerfiles/Dockerfile_nfcore-proteinfold_alphafold2_standard +++ /dev/null @@ -1,79 +0,0 @@ -FROM nvidia/cuda:11.4.3-cudnn8-runtime-ubuntu18.04 - -LABEL authors="Athanasios Baltzis, Jose Espinosa-Carrasco, Leila Mansouri" \ - title="nfcore/proteinfold_alphafold2_standard" \ - Version="1.1.0" \ - description="Docker image containing all software requirements to run the RUN_ALPHAFOLD2 module using the nf-core/proteinfold pipeline" - -# Use bash to support string substitution. -SHELL ["/bin/bash", "-o", "pipefail", "-c"] - -# Add env variables -ENV LD_LIBRARY_PATH="/conda/lib:/usr/local/cuda-11.4/lib64:$LD_LIBRARY_PATH" -ENV PATH="/conda/bin:$PATH" - -RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ - build-essential \ - cmake \ - cuda-command-line-tools-11-4 \ - git \ - hmmer \ - kalign \ - tzdata \ - wget \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get autoremove -y \ - && apt-get clean - -# Clone AlphaFold2 -RUN git clone https://github.com/deepmind/alphafold.git /app/alphafold && \ - cd /app/alphafold && \ - git checkout 7c9114c8423ac9db981d8365168464bab09b3e54 && \ - cd - - -# Compile HHsuite from source -RUN git clone --branch v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite \ - && mkdir /tmp/hh-suite/build \ - && cd /tmp/hh-suite/build \ - && cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite -DHAVE_AVX2=1 .. 
\ - && make -j 4 && make install \ - && ln -s /opt/hhsuite/bin/* /usr/bin \ - && cd - && rm -rf /tmp/hh-suite - -# Install Miniconda package manager -RUN wget -q -P /tmp \ - https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ - && bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b -p /conda \ - && rm /tmp/Miniconda3-latest-Linux-x86_64.sh - -# Install conda packages -RUN /conda/bin/conda install -y -c conda-forge \ - openmm=7.7.0 \ - cudatoolkit==11.1.1 \ - pdbfixer \ - pip \ - python=3.10 \ - && conda clean --all --force-pkgs-dirs --yes - -RUN wget -q -P /app/alphafold/alphafold/common/ \ - https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt - -# Install pip packages. -RUN pip3 install --upgrade pip --no-cache-dir \ - && pip3 install -r /app/alphafold/requirements.txt --no-cache-dir \ - && pip3 install --upgrade --no-cache-dir \ - jax==0.3.25 \ - jaxlib==0.3.25+cuda11.cudnn805 \ - -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html - -RUN sed -i "s|alphafold/common/stereo_chemical_props.txt|/app/alphafold/alphafold/common/stereo_chemical_props.txt|g" /app/alphafold/alphafold/common/residue_constants.py - -# Add SETUID bit to the ldconfig binary so that non-root users can run it. -RUN chmod u+s /sbin/ldconfig.real - -# We need to run `ldconfig` first to ensure GPUs are visible, due to some quirk -# with Debian. See https://github.com/NVIDIA/nvidia-docker/issues/1399 for -# details. 
-RUN cd /app/alphafold -RUN ldconfig diff --git a/dockerfiles/Dockerfile_nfcore-proteinfold_colabfold b/dockerfiles/Dockerfile_nfcore-proteinfold_colabfold deleted file mode 100644 index 2ac1f851e..000000000 --- a/dockerfiles/Dockerfile_nfcore-proteinfold_colabfold +++ /dev/null @@ -1,37 +0,0 @@ -FROM nvidia/cuda:11.4.3-cudnn8-runtime-ubuntu18.04 - -LABEL authors="Athanasios Baltzis, Jose Espinosa-Carrasco, Leila Mansouri" \ - title="nfcore/proteinfold_colabfold" \ - Version="1.1.0" \ - description="Docker image containing all software requirements to run the COLABFOLD_BATCH module using the nf-core/proteinfold pipeline" - -ENV PATH="/localcolabfold/colabfold-conda/bin:$PATH" -ENV LD_LIBRARY_PATH="/localcolabfold/colabfold-conda/lib:/usr/local/cuda/lib64" -ENV PYTHONPATH="/localcolabfold/colabfold-conda/lib" -ENV PATH="/MMseqs2/build/bin:$PATH" - -# Use bash to support string substitution. -RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ - build-essential \ - cuda-command-line-tools-11-4 \ - git \ - hmmer \ - kalign \ - tzdata \ - wget \ - curl \ - cmake \ - && rm -rf /var/lib/apt/lists/* - -RUN cd / \ - && wget https://raw.githubusercontent.com/YoshitakaMo/localcolabfold/82a3635/install_colabbatch_linux.sh \ - && sed -i "/colabfold.download/d" install_colabbatch_linux.sh \ - && sed -i "s|cudatoolkit==.*\sopenmm|cudatoolkit==11.1.1 openmm|g" install_colabbatch_linux.sh \ - && bash install_colabbatch_linux.sh - -RUN /localcolabfold/colabfold-conda/bin/python3.10 -m pip install tensorflow-cpu==2.11.0 - -#Silence download of the AlphaFold2 params -RUN sed -i "s|download_alphafold_params(|#download_alphafold_params(|g" /localcolabfold/colabfold-conda/lib/python3.10/site-packages/colabfold/batch.py -RUN sed -i "s|if args\.num_models|#if args\.num_models|g" /localcolabfold/colabfold-conda/lib/python3.10/site-packages/colabfold/batch.py diff --git 
a/dockerfiles/Dockerfile_nfcore-proteinfold_esmfold b/dockerfiles/Dockerfile_nfcore-proteinfold_esmfold deleted file mode 100644 index af2cd9936..000000000 --- a/dockerfiles/Dockerfile_nfcore-proteinfold_esmfold +++ /dev/null @@ -1,48 +0,0 @@ -FROM nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04 -LABEL authors="Athanasios Baltzis" \ - title="nfcore/proteinfold_esmfold" \ - Version="1.1.0" \ - description="Docker image containing all software requirements to run ESMFold using the nf-core/proteinfold pipeline" - -# Add env variables -ENV LD_LIBRARY_PATH="/conda/lib:/usr/local/cuda-11.0/lib64:/conda/lib/python3.9/site-packages/nvidia/cusparse/lib:$LD_LIBRARY_PATH" -ENV PATH="/conda/bin:$PATH" - -RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ - build-essential \ - cuda-command-line-tools-11-0 \ - nvidia-cuda-dev \ - wget \ - git \ - && rm -rf /var/lib/apt/lists/* - -# Install Miniconda package manager -RUN wget -q -P /tmp \ - https://repo.anaconda.com/miniconda/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh \ - && bash /tmp/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh -b -p /conda \ - && rm /tmp/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh - -# Install ESMFold dependencies -RUN cd / && /conda/bin/conda update -qy conda \ - && /conda/bin/conda install -y -c conda-forge pip python -RUN /conda/bin/pip install --no-cache-dir git+https://github.com/facebookresearch/esm.git -RUN /conda/bin/pip install --no-cache-dir "fair-esm[esmfold]" -RUN /conda/bin/pip install --no-cache-dir \ - torch==1.13.1 \ - torchvision==0.14.1 \ - pytorch_lightning==1.5.10 \ - biopython==1.79 \ - deepspeed==0.5.9 \ - dm-tree==0.1.6 \ - ml-collections==0.1.0 \ - numpy==1.21.2 \ - PyYAML==5.4.1 \ - requests==2.26.0 \ - scipy==1.7.1 \ - tqdm==4.62.2 \ - typing-extensions==3.10.0.2 \ - wandb==0.12.21 -RUN /conda/bin/pip uninstall -y nvidia_cublas_cu11 -RUN /conda/bin/pip install --no-cache-dir 'dllogger @ 
git+https://github.com/NVIDIA/dllogger.git' -RUN /conda/bin/pip install --no-cache-dir 'openfold @ git+https://github.com/aqlaboratory/openfold.git@4b41059694619831a7db195b7e0988fc4ff3a307' diff --git a/docs/images/nf-core-proteinfold_metro_map.png b/docs/images/nf-core-proteinfold_metro_map.png deleted file mode 100644 index 5b1cfbae6..000000000 Binary files a/docs/images/nf-core-proteinfold_metro_map.png and /dev/null differ diff --git a/docs/images/nf-core-proteinfold_metro_map.svg b/docs/images/nf-core-proteinfold_metro_map.svg deleted file mode 100644 index ff895d9e9..000000000 --- a/docs/images/nf-core-proteinfold_metro_map.svg +++ /dev/null @@ -1,1267 +0,0 @@ - - - - - - - - - - - - fasta - - - - - fasta - - - - - DB - - - - - DB - - - - - DB - - - - - DB - - - - - params - - - - - params - - - - - - csv - - - - - PDB - - - - Samplesheet - ColabFoldWebserver - Standard AlphaFold2 (AF2) - AlphaFold2 (AF2) Split - ColabFold Webserver - ColabFold Local - INPUT CHECK - LEGEND - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - AF2 MSA - AF2 PRED - AF2 - - - - - PREPARE AF2 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - PREPARE COLABFOLD - COLABFOLD - COLABFOLD MSA - COLABFOLDPRED - - - - - diff --git a/docs/images/nf-core-proteinfold_metro_map_1.1.0.png b/docs/images/nf-core-proteinfold_metro_map_1.1.0.png deleted file mode 100644 index 503473566..000000000 Binary files a/docs/images/nf-core-proteinfold_metro_map_1.1.0.png and /dev/null differ diff --git a/docs/images/nf-core-proteinfold_metro_map_1.1.0.svg b/docs/images/nf-core-proteinfold_metro_map_1.1.0.svg deleted file mode 100644 index 372a70dcb..000000000 --- a/docs/images/nf-core-proteinfold_metro_map_1.1.0.svg +++ /dev/null @@ -1,1640 +0,0 @@ - - - - - - - - - - - - fasta 
- - - - - fasta - - - - DB - - - DB - - - - DB - - - - - DB - - - - - params - - - - - params - - - - params - - - - - csv - - - - - PDB - - - - Samplesheet - ColabFoldWebserver - Standard AlphaFold2 (AF2) - AlphaFold2 (AF2) Split - ColabFold Webserver - ColabFold Local - INPUT CHECK - v.1.1.0 - LEGEND - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ESMFold - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - AF2MSA - AF2 PRED - AF2 - - - - - PREPAREAF2 - - PREPAREESMFOLD - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - PREPARE COLABFOLD - COLABFOLD - - ESMFOLD - COLABFOLDMSA - COLABFOLDPRED - - - - - diff --git a/docs/images/nf-core-proteinfold_metro_map_1.1.0_transp.png b/docs/images/nf-core-proteinfold_metro_map_1.1.0_transp.png deleted file mode 100644 index 8db0e9b00..000000000 Binary files a/docs/images/nf-core-proteinfold_metro_map_1.1.0_transp.png and /dev/null differ diff --git a/docs/images/nf-core-proteinfold_metro_map_2.0.0.png b/docs/images/nf-core-proteinfold_metro_map_2.0.0.png new file mode 100644 index 000000000..ea4a51677 Binary files /dev/null and b/docs/images/nf-core-proteinfold_metro_map_2.0.0.png differ diff --git a/docs/images/nf-core-proteinfold_metro_map_2.0.0.svg b/docs/images/nf-core-proteinfold_metro_map_2.0.0.svg new file mode 100644 index 000000000..9ff996d6c --- /dev/null +++ b/docs/images/nf-core-proteinfold_metro_map_2.0.0.svg @@ -0,0 +1,253 @@ + + + + + + + + + + + + + ESMFold + + + + + + + + + + Boltz + ColabFold + MMSeqs2ColabFoldSearch + + + + + + + + + + + + + + + + + + + + + + + AlphaFold2MSA + AlphaFold2Pred + + + + + + AlphaFold3Boltz (web)HelixFold3 + AlphaFold2ColabFold (web)RoseTTAFold-AARoseTTAFold2NA + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SplitFasta + PrepareDBs + + + + + fasta + + + + + + json + + + + + + yaml + + + + + + + PDB + + + + + + TSV + + GenerateReport + + MultiQC + ComparisonReport + + Combined: MSA Search + Model Inference + Split: AlphaFold2 MSA Search + Model Inference + Split: ColabFold MSA Search + Model Inference + pLM: Protein Language Model + + + + + + HTML + + + + + + HTML + + + + + + HTML + + Foldseek + + + + + M8 + + + + + diff --git a/docs/images/pae_proteinfold-v2.png b/docs/images/pae_proteinfold-v2.png new file mode 100644 index 000000000..2f1d265f5 Binary files /dev/null and b/docs/images/pae_proteinfold-v2.png differ diff --git a/docs/images/plddt_proteinfold-v2.png b/docs/images/plddt_proteinfold-v2.png new file mode 100644 index 000000000..814b7ec71 Binary files /dev/null and b/docs/images/plddt_proteinfold-v2.png differ diff --git a/docs/images/sequence_coverage_proteinfold-v2.png b/docs/images/sequence_coverage_proteinfold-v2.png new file mode 100644 index 000000000..9e900b98a Binary files /dev/null and b/docs/images/sequence_coverage_proteinfold-v2.png differ diff --git a/docs/output.md b/docs/output.md index 29d2337c9..52b3cf738 100644 --- a/docs/output.md +++ b/docs/output.md @@ -2,180 +2,173 @@ ## Introduction -This document describes the output produced by the pipeline. - -Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. +This document describes the user-facing output produced by the pipeline. 
## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and predicts protein structures using the following methods: -- [AlphaFold2](https://github.com/deepmind/alphafold) -- [ColabFold](https://github.com/sokrypton/ColabFold) - MMseqs2 (API server or local search) followed by ColabFold +- [AlphaFold2](https://github.com/google-deepmind/alphafold) +- [AlphaFold3](https://github.com/google-deepmind/alphafold3) +- [Boltz](https://github.com/jwohlwend/boltz) +- [ColabFold](https://github.com/sokrypton/ColabFold) - [ESMFold](https://github.com/facebookresearch/esm) +- [RoseTTAFold2NA](https://github.com/uw-ipd/RoseTTAFold2NA) +- [RoseTTAFold-All-Atom](https://github.com/baker-laboratory/RoseTTAFold-All-Atom/) +- [HelixFold3](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3) See main [README.md](https://github.com/nf-core/proteinfold/blob/master/README.md) for a condensed overview of the steps in the pipeline, and the bioinformatics tools used at each step. The directories listed below will be created in the output directory after the pipeline has finished. All paths are relative to the top-level results directory. -### AlphaFold2 +Exact subdirectories depend on the selected mode(s). In a multi-mode run (for example `alphafold2,boltz,rosettafold_all_atom`) you will typically see top-level directories such as `alphafold2/`, `boltz/`, `rosettafold_all_atom/`, `multiqc/`, `reports/`, `compare/`, and `pipeline_info/`. + +### Prediction outputs (all modes) + +User-facing outputs are largely consistent across modes.
    -Output files +Common output patterns -- `AlphaFold2/` - - `/` that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings - - `.alphafold.pdb` that is the structure with the highest pLDDT score (ranked first) - - `_plddt_mqc.tsv` that presents the pLDDT scores per residue for each of the 5 predicted models -- `DBs/` that contains symbolic links to the downloaded database and parameter files +- `/top_ranked_structures/.pdb` +- `//_plddt.tsv` +- `//paes/__pae.tsv` (when available) +- `//__msa.tsv` (mode-specific MSA summary) +- `//_{ptm,iptm}.tsv` and chainwise summaries (where applicable)
    -Below you can find an indicative example of the TSV file with the pLDDT scores per residue for each of the 5 predicted models produced by AlphaFold2, which is included in the MultiQC report: - -| Positions | rank_0 | rank_1 | rank_2 | rank_3 | rank_4 | -| --------- | ------ | ------ | ------ | ------ | ------ | -| 1 | 66.17 | 60.61 | 60.32 | 64.20 | 65.31 | -| 2 | 78.01 | 74.20 | 73.11 | 77.36 | 78.46 | -| 3 | 82.16 | 78.16 | 76.70 | 80.20 | 80.68 | -| 4 | 86.03 | 82.78 | 81.88 | 82.19 | 83.93 | -| 5 | 88.08 | 84.38 | 84.73 | 85.58 | 87.70 | -| 6 | 89.37 | 86.06 | 86.31 | 86.84 | 88.52 | -| 7 | 91.27 | 88.27 | 88.09 | 87.01 | 88.67 | -| 8 | 91.28 | 89.42 | 90.17 | 87.47 | 90.07 | -| 9 | 93.10 | 90.09 | 92.86 | 90.70 | 93.41 | -| 10 | 93.23 | 91.42 | 93.07 | 90.13 | 92.91 | -| 11 | 94.12 | 92.44 | 93.00 | 89.90 | 92.97 | -| 12 | 95.15 | 93.63 | 94.25 | 92.66 | 94.38 | -| 13 | 95.09 | 93.75 | 94.36 | 92.54 | 94.95 | -| 14 | 94.08 | 92.72 | 93.43 | 90.31 | 93.63 | -| 15 | 94.34 | 93.77 | 93.31 | 91.72 | 93.57 | -| 16 | 95.56 | 94.62 | 94.46 | 93.55 | 95.20 | -| 17 | 95.54 | 94.75 | 94.65 | 93.61 | 95.37 | -| 18 | 93.91 | 93.89 | 93.30 | 91.33 | 92.95 | -| 19 | 95.48 | 95.78 | 94.48 | 93.95 | 95.05 | -| 20 | 95.96 | 95.46 | 95.14 | 94.01 | 95.83 | -| 21 | 94.06 | 94.06 | 93.13 | 91.69 | 93.54 | -| 22 | 92.98 | 93.28 | 91.14 | 88.80 | 91.25 | -| 23 | 95.28 | 95.13 | 93.39 | 91.48 | 93.56 | -| 24 | 93.41 | 93.38 | 92.32 | 89.85 | 92.40 | -| 25 | 90.88 | 91.40 | 88.60 | 85.67 | 87.65 | -| 26 | 89.30 | 88.90 | 84.58 | 83.11 | 84.52 | -| 27 | 91.96 | 90.95 | 89.04 | 86.42 | 87.77 | -| 28 | 91.20 | 90.68 | 88.71 | 86.43 | 87.62 | -| 29 | 88.01 | 87.53 | 85.83 | 83.11 | 84.95 | -| 30 | 81.29 | 83.72 | 77.75 | 75.76 | 74.84 | -| 31 | 87.14 | 86.92 | 82.10 | 82.32 | 78.74 | -| 32 | 92.34 | 90.13 | 89.04 | 88.31 | 86.49 | -| 33 | 91.70 | 88.94 | 85.52 | 85.94 | 81.75 | -| 34 | 90.11 | 88.23 | 84.33 | 85.47 | 80.01 | -| 35 | 93.35 | 91.49 | 90.60 | 89.40 | 87.10 | -| 36 | 
94.15 | 92.47 | 90.17 | 90.48 | 86.77 | -| 37 | 93.40 | 92.01 | 86.38 | 87.84 | 80.11 | -| 38 | 92.79 | 89.97 | 89.31 | 88.55 | 85.15 | -| 39 | 94.66 | 91.29 | 92.74 | 90.67 | 90.30 | -| 40 | 95.98 | 93.58 | 94.30 | 91.69 | 90.73 | -| 41 | 94.94 | 92.57 | 88.31 | 88.40 | 80.33 | -| 42 | 92.89 | 91.03 | 84.03 | 85.31 | 74.66 | -| 43 | 94.54 | 93.44 | 86.50 | 84.91 | 76.68 | -| 44 | 96.93 | 95.23 | 92.42 | 91.98 | 86.11 | -| 45 | 94.40 | 92.27 | 87.40 | 89.02 | 79.44 | -| 46 | 91.74 | 90.94 | 81.35 | 84.88 | 74.93 | -| 47 | 96.19 | 94.46 | 90.51 | 89.82 | 84.51 | -| 48 | 94.84 | 93.04 | 91.02 | 91.57 | 87.72 | -| 49 | 91.23 | 89.34 | 86.10 | 87.63 | 82.12 | -| 50 | 91.64 | 89.58 | 84.93 | 85.88 | 79.38 | - -### ColabFold +### pLDDT (`{meta.id}_plddt.tsv`) + +Confidence values per residue, rounded to 2 decimal places. Each ranked result gets its own column (for all-atom modules, atomic token confidences are processed to a naive mean value across the residue). + +``` +Positions rank_0 rank_1 rank_2 rank_3 rank_4 +0 83.58 85.27 88.41 86.22 84.91 +1 97.99 97.81 97.39 97.49 97.32 +2 98.22 98.42 98.16 97.88 97.81 +3 98.06 98.15 97.94 97.56 97.4 +4 98.67 98.56 98.3 98.38 98.29 +5 98.81 98.77 98.62 98.61 98.54 +6 98.79 98.74 98.57 98.59 98.52 +... +``` -
    -Output files +### MSA (`{meta.id}_{meta.mode}_msa.tsv`) -- `colabfold/webserver/` or `colabfold/local/` based on the selected mode that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs and scores, prediction metadata, logs and section timings -- `DBs/` that contains symbolic links to the downloaded database and parameter files +The amino acid characters are converted to integers `0-19`, unknown as 20, **integer `21`** represents the gap character. -
    +``` +19 5 5 4 10 16 15 3 8 15 13 16 12 9 17 16 9 4 8 11 0 7 7 8 11 0 19 8 8 5 3 +19 5 5 4 10 16 15 3 8 15 13 16 12 9 17 16 9 4 8 11 0 7 7 8 11 0 6 8 8 5 13 +19 5 5 4 10 5 15 13 14 0 14 16 12 9 17 16 9 4 14 11 0 7 5 8 15 4 5 8 3 5 21 +19 5 5 4 10 16 15 3 8 15 13 16 12 9 17 16 9 4 8 11 0 7 7 8 11 0 19 8 8 5 21 +19 5 5 4 10 16 15 3 8 15 13 16 12 9 17 16 9 4 8 11 0 7 7 8 11 0 19 8 8 5 13 +19 5 5 4 10 16 15 3 8 15 13 16 12 9 7 16 9 4 8 11 0 7 7 8 11 0 6 8 8 5 13 +``` + +This allows easy sequence identity calculation when processing as a `numpy` array. + +### (i)pTM (`{meta.id}_[i]ptm.tsv`) + +(i)pTM scores, rounded to 3 decimal places, listed by the rank number (currently unsorted - to reflect models and seeds where appropriate). + +``` +17 0.552 +22 0.529 +21 0.532 +20 0.541 +23 0.523 +3 0.606 +2 0.610 +4 0.606 +1 0.616 +0 0.617 +12 0.580 +9 0.588 +13 0.580 +11 0.583 +14 0.570 +15 0.565 +24 0.517 +16 0.560 +18 0.550 +19 0.550 +10 0.588 +5 0.600 +6 0.597 +7 0.596 +8 0.595 +``` + +### chain-wise (i)pTM (`{meta.id}_chainwise_[i]ptm.tsv`) + +(Asymmetrical) ipTM scores, rounded to 4 decimal places, with chain pair lettering as the row (`X:Y`), and the rank number as the column. A pTM value is a chain's own predicted Template Modelling score so lettering will be `X:X`. + +``` +0 1 2 +A:B 0.2880 0.2750 0.2900 +B:A 0.2904 0.2801 0.2915 +``` + +### PAE (`{meta.id}_{rank_number}_pae.tsv`) + +Predicted alignment error of residues `j` aligned by residue `i`, rounded to 4 decimal places. +The row number gives you the index of residue `i` and the column value within the row gives the index of residue `j` for the 2D PAE matrix. + +Each model prediction generates a separate file containing the rank number. The `_0_pae.tsv` file corresponds to the top ranked model; other ranked results are stored within the `paes/` folder. 
+ +``` +0.2500 1.5710 3.9037 6.2177 8.4471 11.4583 12.9679 15.1237 18.0263 18.3868 18.9381 20.5747 19.3314 20.1825 21.6145 23.2190 +2.2177 0.2500 1.5559 4.0327 6.3151 7.6372 10.1969 11.3626 14.9366 16.1303 17.9119 19.1877 21.2715 20.9531 20.1760 19.4087 +3.4270 1.5284 0.2500 2.1333 3.5351 5.1049 6.6521 8.2317 12.1379 13.7185 14.9523 16.6154 19.6988 21.7614 18.6592 17.9619 +6.1051 5.4206 2.5987 0.2500 2.0724 5.1454 6.7492 9.5538 9.6285 12.3868 13.8527 16.3586 17.2605 20.6381 19.9987 19.3295 +7.3512 6.4947 5.5435 2.6740 0.2500 1.7561 4.9041 6.3923 8.9735 8.9272 12.3419 14.6005 15.9820 17.6358 20.5190 19.1028 +7.4734 7.0899 5.8128 5.7512 2.0439 0.2500 1.8352 5.1064 6.4225 9.2098 10.5136 12.9404 14.3152 16.8122 18.6336 17.7382 +``` -Below you can find some indicative examples of the output images produced by ColabFold, which are included in the MultiQC report: +#### Example report plots -#### Sequence coverage +The report exports include key visualisations such as sequence coverage, predicted Local Distance Difference Test (pLDDT), and Predicted Aligned Error (PAE). -![Alt text](../docs/images/T1024_LmrP____408_residues__coverage_mqc.png?raw=true "T1024_coverage") +##### Sequence coverage -#### predicted Local Distance Difference Test (pLDDT) +![Sequence coverage](images/sequence_coverage_proteinfold-v2.png?raw=true "Example sequence coverage plot") -![Alt text](../docs/images/T1024_LmrP____408_residues__plddt_mqc.png?raw=true "T1024_coverage") +##### predicted Local Distance Difference Test (pLDDT) -#### Predicted Aligned Error (PAE) +![pLDDT](images/plddt_proteinfold-v2.png?raw=true "Example pLDDT plot") -![Alt text](../docs/images/T1024_LmrP____408_residues__PAE_mqc.png?raw=true "T1024_coverage") +##### Predicted Aligned Error (PAE) + +![PAE](images/pae_proteinfold-v2.png?raw=true "Example PAE plot") + +### Per-mode reports and comparisons -### ESMFold +
    +Output files + +- `reports/` + - `__report.html` (single-mode report per sequence/mode) +- `compare/` + - `_comparison_report.html` (present when running multiple modes) + +
    + +### Foldseek structural similarity search + +If Foldseek is enabled (`--skip_foldseek false`), results are written to:
    Output files -- `esmfold/` - - `.pdb` that is the structure with the highest pLDDT score (ranked first) - - `_plddt_mqc.tsv` that presents the pLDDT scores per residue for each of the 5 predicted models -- `DBs/` that contains symbolic links to the downloaded database and parameter files +- `foldseek_easysearch/` + - `__foldseek.html` (default output format) + - `.m8` (tabular output when `--foldseek_easysearch_arg` does not include `--format-mode 3`)
    -Below you can find an indicative example of the TSV file with the pLDDT scores per atom for predicted model produced by ESMFold, which is included in the MultiQC report: - -| Atom_serial_number | Atom_name | Residue_name | Residue_sequence_number | pLDDT | -| ------------------ | --------- | ------------ | ----------------------- | ----- | -| 1 | N | VAL | 1 | 44.77 | -| 2 | CA | VAL | 1 | 47.23 | -| 3 | C | VAL | 1 | 46.66 | -| 4 | CB | VAL | 1 | 41.88 | -| 5 | O | VAL | 1 | 45.75 | -| 6 | CG1 | VAL | 1 | 39.15 | -| 7 | CG2 | VAL | 1 | 39.59 | -| 8 | N | THR | 2 | 49.89 | -| 9 | CA | THR | 2 | 51.41 | -| 10 | C | THR | 2 | 50.21 | -| 11 | CB | THR | 2 | 43.84 | -| 12 | O | THR | 2 | 47.36 | -| 13 | CG2 | THR | 2 | 35.32 | -| 14 | OG1 | THR | 2 | 40.12 | -| 15 | N | VAL | 3 | 51.40 | -| 16 | CA | VAL | 3 | 54.38 | -| 17 | C | VAL | 3 | 52.10 | -| 18 | CB | VAL | 3 | 48.50 | -| 19 | O | VAL | 3 | 52.58 | -| 20 | CG1 | VAL | 3 | 38.75 | -| 21 | CG2 | VAL | 3 | 39.26 | -| 22 | N | ASP | 4 | 52.00 | -| 23 | CA | ASP | 4 | 53.92 | -| 24 | C | ASP | 4 | 52.33 | -| 25 | CB | ASP | 4 | 46.82 | -| 26 | O | ASP | 4 | 51.28 | -| 27 | CG | ASP | 4 | 42.89 | -| 28 | OD1 | ASP | 4 | 45.89 | -| 29 | OD2 | ASP | 4 | 53.61 | -| 30 | N | ASP | 5 | 56.10 | -| 31 | CA | ASP | 5 | 56.97 | -| 32 | C | ASP | 5 | 55.75 | -| 33 | CB | ASP | 5 | 50.34 | -| 34 | O | ASP | 5 | 54.18 | -| 35 | CG | ASP | 5 | 45.82 | -| 36 | OD1 | ASP | 5 | 50.03 | -| 37 | OD2 | ASP | 5 | 58.01 | -| 38 | N | LEU | 6 | 56.50 | -| 39 | CA | LEU | 6 | 58.34 | -| 40 | C | LEU | 6 | 55.81 | -| 41 | CB | LEU | 6 | 52.46 | -| 42 | O | LEU | 6 | 54.42 | -| 43 | CG | LEU | 6 | 49.17 | -| 44 | CD1 | LEU | 6 | 44.31 | -| 45 | CD2 | LEU | 6 | 47.07 | -| 46 | N | VAL | 7 | 57.23 | -| 47 | CA | VAL | 7 | 57.68 | -| 48 | C | VAL | 7 | 57.39 | -| 49 | CB | VAL | 7 | 52.74 | -| 50 | O | VAL | 7 | 56.46 | +Foldseek runs on top-ranked structures from each selected mode and sequence. 
By default, the pipeline uses `--format-mode 3` and publishes HTML reports. ### MultiQC report @@ -183,17 +176,14 @@ Below you can find an indicative example of the TSV file with the pLDDT scores p Output files - `multiqc` - - multiqc_report.html: A standalone HTML file that can be viewed in your web browser. - - multiqc_data/: Directory containing parsed statistics from the different tools used in the pipeline. - - multiqc_plots/: Directory containing static images from the report in various formats. + - `*_multiqc_report.html`: Standalone HTML report(s) that can be viewed in your web browser. + - `*_multiqc_report_data/`: Parsed report data for each corresponding MultiQC report. -[MultiQC](https://multiqc.info/docs/) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available within the report data directory. +[MultiQC](https://multiqc.info/docs/) is a visualisation tool that generates HTML report(s) summarising samples in your project. Most QC results are visualised in the report and further statistics are available within each corresponding `*_multiqc_report_data/` directory. -Results generated by MultiQC collate pipeline QC from AlphaFold2 or ColabFold. - -The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see http://multiqc.info. +Results generated by MultiQC collate QC metrics from the selected structure-prediction mode(s), and the software versions for traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>. 
### Pipeline information @@ -209,3 +199,36 @@ The pipeline has special steps which also allow the software versions to be repo [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. + +### Additional intermediate outputs + +Depending on the selected mode(s) and options, additional top-level directories may be present, for example: + +- `fasta2yaml/` (for YAML conversion inputs/outputs) +- `mmseqs/results/` (for MMseqs2 outputs such as `.a3m` files) +- `split/output_msa/` (for split-MSA intermediate CSV outputs) + +### `--save_intermediates` + +If `--save_intermediates` is enabled, extra raw intermediate files are published in mode-specific `raw/` directories. + +Examples include: + +- `alphafold2///raw/` +- `colabfold//raw/` +- `boltz//boltz_results_*/` +- `rosettafold_all_atom//raw/` +- `alphafold3//raw/` +- `helixfold3//raw/` +- `rosettafold2na//raw/` + +These raw outputs are intended for advanced debugging, reproducibility and method-specific downstream analyses. 
For detailed, canonical tool-specific native output specifications, see: + +- [AlphaFold2](https://github.com/google-deepmind/alphafold?tab=readme-ov-file#alphafold-output) +- [AlphaFold3](https://github.com/google-deepmind/alphafold3/blob/main/docs/output.md) +- [Boltz](https://github.com/jwohlwend/boltz/blob/main/docs/prediction.md#output) +- [ColabFold](https://www.ebi.ac.uk/training/online/courses/alphafold/advanced-modeling-and-applications-of-predicted-protein-structures/customising-alphafold-structure-predictions/outputs-from-colabfold/) +- [ESMFold](https://github.com/facebookresearch/esm) +- [RoseTTAFold2NA](https://github.com/uw-ipd/RoseTTAFold2NA?tab=readme-ov-file#expected-outputs) +- [RoseTTAFold-All-Atom](https://github.com/baker-laboratory/RoseTTAFold-All-Atom/?tab=readme-ov-file#understanding-model-outputs) +- [HelixFold3](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3#-understanding-model-output) diff --git a/docs/usage.md b/docs/usage.md index 12e475520..944f6ad58 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,8 +6,6 @@ ## Introduction - - ## Samplesheet input You will need to create a samplesheet with information about the sequences you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 2 columns, and a header row as shown in the examples below. 
@@ -21,408 +19,182 @@ You will need to create a samplesheet with information about the sequences you w A sample of the final samplesheet file for two sequences is shown below: ```csv title="samplesheet.csv" -sequence,fasta +id,fasta T1024,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1024.fasta T1026,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1026.fasta ``` The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 2 columns to match those defined in the table below: -| Column | Description | -| ---------- | --------------------------------------------------------------------------------------------------- | -| `sequence` | Custom sequence name. Spaces in sequence names are automatically converted to underscores (`_`). | -| `fasta` | Full path to fasta file for the provided sequence. File has to have the extension ".fasta" or "fa". | +| Column | Description | +| ------- | ---------------------------------------------------------------------------------------------------- | +| `id` | Custom sequence name. Spaces in sequence names are automatically converted to underscores (`_`). | +| `fasta` | Full path to fasta file for the provided sequence. File has to have the extension ".fasta" or ".fa". | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. -## Running the pipeline +Each FASTA file should contain a single protein sequence unless using multimer mode. To provide a FASTA file with multiple sequences for individual folding, you can use one or more FASTA files with the `--split_fasta` parameter. This will treat each sequence in the FASTA file as a separate entry, folding them individually and in parallel, as if each sequence were listed separately in the samplesheet. -The typical commands for running the pipeline on AlphaFold2, Colabfold and ESMFold modes are shown below. 
+## Running the pipeline -AlphaFold2 regular can be run using this command: +The typical command for running the pipeline is shown below. ```bash nextflow run nf-core/proteinfold \ - --input samplesheet.csv \ - --outdir \ - --mode alphafold2 \ - --alphafold2_db \ - --full_dbs \ - --alphafold2_model_preset monomer \ - --use_gpu \ - -profile + -profile \ + --input samplesheet.csv \ + --outdir \ + --mode \ + --db ``` -To run the AlphaFold2 that splits the MSA calculation from the model inference, you can use the `--alphafold2_mode split_msa_prediction` parameter, as shown below: +> You can run any combination of the models by providing them to the `--mode` parameter separated by a comma. For example: `--mode alphafold2,esmfold,colabfold` will run the three models in parallel. -```bash -nextflow run nf-core/proteinfold \ - --input samplesheet.csv \ - --outdir \ - --mode alphafold2 \ - --alphafold2_mode split_msa_prediction \ - --alphafold2_db \ - --full_dbs \ - --alphafold2_model_preset monomer \ - --use_gpu \ - -profile -``` - -To provide the predownloaded AlphaFold2 databases and parameters you can specify the `--alphafold2_db ` parameter and the directory structure of your path should be like this: +Each mode has specific reference data requirements. To support all modes the `--db` directory should conform to the following file structure:
    Directory structure -```console -├── alphafold_params_2022-12-06 -│   ├── LICENSE -│   ├── params_model_1_multimer.npz -│   ├── params_model_1_multimer_v2.npz -│   ├── params_model_1_multimer_v3.npz -│   ├── params_model_1.npz -│   ├── params_model_1_ptm.npz -│   ├── params_model_2_multimer.npz -│   ├── params_model_2_multimer_v2.npz -│   ├── params_model_2_multimer_v3.npz -│   ├── params_model_2.npz -│   ├── params_model_2_ptm.npz -│   ├── params_model_3_multimer.npz -│   ├── params_model_3_multimer_v2.npz -│   ├── params_model_3_multimer_v3.npz -│   ├── params_model_3.npz -│   ├── params_model_3_ptm.npz -│   ├── params_model_4_multimer.npz -│   ├── params_model_4_multimer_v2.npz -│   ├── params_model_4_multimer_v3.npz -│   ├── params_model_4.npz -│   ├── params_model_4_ptm.npz -│   ├── params_model_5_multimer.npz -│   ├── params_model_5_multimer_v2.npz -│   ├── params_model_5_multimer_v3.npz -│   ├── params_model_5.npz -│   └── params_model_5_ptm.npz + +``` +/ +├── bfd +│   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata +│   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffindex +│   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffdata +│   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffindex +│   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffdata +│   └── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffindex +├── colabfold_envdb +│   ├── colabfold_envdb_202108_db +│   ├── colabfold_envdb_202108_db_aln +│   ├── colabfold_envdb_202108_db_aln.dbtype +│   └── ... +├── colabfold_uniref30 +│   ├── uniref30_2302_db +│   ├── uniref30_2302_db_aln +│   ├── uniref30_2302_db_aln.dbtype +│   └── ... +├── maxit-v11.200-prod-src +│   ├── annotation-v1.0 +│   └── ... 
├── mgnify -│   └── mgy_clusters_2022_05.fa +│   └── mgy_clusters.fa +├── params +│ ├── af3.bin +│   ├── alphafold_params_2021-07-14 +│   ├── alphafold_params_2022-12-06 +│   ├── alphafold_params_colab_2022-12-06 +│   ├── boltz1_conf.ckpt +│   ├── boltz2_aff.ckpt +│   ├── boltz2_conf.ckpt +│   ├── ccd.pkl +│   ├── ccd_preprocessed_etkdg.pkl.gz +│   ├── esm2_t36_3B_UR50D-contact-regression.pt +│   ├── esm2_t36_3B_UR50D.pt +│   ├── esmfold_3B_v1.pt +│   ├── HelixFold3-240814.pdparams +│   ├── mols +│   └── RFAA_paper_weights.pt +├── pdb100 +│   ├── LICENSE +│   ├── pdb100_2021Mar03_a3m.ffdata +│   ├── pdb100_2021Mar03_a3m.ffindex +│   ├── pdb100_2021Mar03_cs219.ffdata +│   ├── pdb100_2021Mar03_cs219.ffindex +│   ├── pdb100_2021Mar03_hhm.ffdata +│   ├── pdb100_2021Mar03_hhm.ffindex +│   ├── pdb100_2021Mar03_pdb.ffdata +│   └── pdb100_2021Mar03_pdb.ffindex ├── pdb70 -│   └── pdb70_from_mmcif_200916 -│   ├── md5sum -│   ├── pdb70_a3m.ffdata -│   ├── pdb70_a3m.ffindex -│   ├── pdb70_clu.tsv -│   ├── pdb70_cs219.ffdata -│   ├── pdb70_cs219.ffindex -│   ├── pdb70_hhm.ffdata -│   ├── pdb70_hhm.ffindex -│   └── pdb_filter.dat +│   ├── md5sum +│   ├── pdb70_a3m.ffdata +│   ├── pdb70_a3m.ffindex +│   ├── pdb70_clu.tsv +│   ├── pdb70_cs219.ffdata +│   ├── pdb70_cs219.ffindex +│   ├── pdb70_hhm.ffdata +│   ├── pdb70_hhm.ffindex +│   └── pdb_filter.dat ├── pdb_mmcif │   ├── mmcif_files -│   │   ├── 1g6g.cif -│   │   ├── 1go4.cif -│   │   ├── 1isn.cif -│   │   ├── 1kuu.cif -│   │   ├── 1m7s.cif -│   │   ├── 1mwq.cif -│   │   ├── 1ni5.cif -│   │   ├── 1qgd.cif -│   │   ├── 1tp9.cif -│   │   ├── 1wa9.cif -│   │   ├── 1ye5.cif -│   │   ├── 1yhl.cif -│   │   ├── 2bjd.cif -│   │   ├── 2bo9.cif -│   │   ├── 2e7t.cif -│   │   ├── 2fyg.cif -│   │   ├── 2j0q.cif -│   │   ├── 2jcq.cif -│   │   ├── 2m4k.cif -│   │   ├── 2n9o.cif -│   │   ├── 2nsx.cif -│   │   ├── 2w4u.cif -│   │   ├── 2wd6.cif -│   │   ├── 2wh5.cif -│   │   ├── 2wji.cif -│   │   ├── 2yu3.cif -│   │   ├── 3cw2.cif -│   │   
├── 3d45.cif -│   │   ├── 3gnz.cif -│   │   ├── 3j0a.cif -│   │   ├── 3jaj.cif -│   │   ├── 3mzo.cif -│   │   ├── 3nrn.cif -│   │   ├── 3piv.cif -│   │   ├── 3pof.cif -│   │   ├── 3pvd.cif -│   │   ├── 3q45.cif -│   │   ├── 3qh6.cif -│   │   ├── 3rg2.cif -│   │   ├── 3sxe.cif -│   │   ├── 3uai.cif -│   │   ├── 3uid.cif -│   │   ├── 3wae.cif -│   │   ├── 3wt1.cif -│   │   ├── 3wtr.cif -│   │   ├── 3wy2.cif -│   │   ├── 3zud.cif -│   │   ├── 4bix.cif -│   │   ├── 4bzx.cif -│   │   ├── 4c1n.cif -│   │   ├── 4cej.cif -│   │   ├── 4chm.cif -│   │   ├── 4fzo.cif -│   │   ├── 4i1f.cif -│   │   ├── 4ioa.cif -│   │   ├── 4j6o.cif -│   │   ├── 4m9q.cif -│   │   ├── 4mal.cif -│   │   ├── 4nhe.cif -│   │   ├── 4o2w.cif -│   │   ├── 4pzo.cif -│   │   ├── 4qlx.cif -│   │   ├── 4uex.cif -│   │   ├── 4zm4.cif -│   │   ├── 4zv1.cif -│   │   ├── 5aj4.cif -│   │   ├── 5frs.cif -│   │   ├── 5hwo.cif -│   │   ├── 5kbk.cif -│   │   ├── 5odq.cif -│   │   ├── 5u5t.cif -│   │   ├── 5wzq.cif -│   │   ├── 5x9z.cif -│   │   ├── 5xe5.cif -│   │   ├── 5ynv.cif -│   │   ├── 5yud.cif -│   │   ├── 5z5c.cif -│   │   ├── 5zb3.cif -│   │   ├── 5zlg.cif -│   │   ├── 6a6i.cif -│   │   ├── 6az3.cif -│   │   ├── 6ban.cif -│   │   ├── 6g1f.cif -│   │   ├── 6ix4.cif -│   │   ├── 6jwp.cif -│   │   ├── 6ng9.cif -│   │   ├── 6ojj.cif -│   │   ├── 6s0x.cif -│   │   ├── 6sg9.cif -│   │   ├── 6vi4.cif -│   │   └── 7sp5.cif │   └── obsolete.dat ├── pdb_seqres │   └── pdb_seqres.txt +├── rfam +│   └── Rfam-14.9_rep_seq.fasta ├── small_bfd │   └── bfd-first_non_consensus_sequences.fasta ├── uniprot │   └── uniprot.fasta ├── uniref30 -│   ├── UniRef30_2021_03_a3m.ffdata -│   ├── UniRef30_2021_03_a3m.ffindex -│   ├── UniRef30_2021_03_cs219.ffdata -│   ├── UniRef30_2021_03_cs219.ffindex -| ├── UniRef30_2021_03_hhm.ffdata -│   └── UniRef30_2021_03_hhm.ffindex +│   ├── UniRef30_2023_02_a3m.ffdata +│   ├── UniRef30_2023_02_a3m.ffindex +│   ├── UniRef30_2023_02_cs219.ffdata +│   ├── UniRef30_2023_02_cs219.ffindex +│   ├── 
UniRef30_2023_02_hhm.ffdata +│   ├── UniRef30_2023_02_hhm.ffindex +│   └── UniRef30_2023_02.md5sums └── uniref90 - └── uniref90.fasta +    └── uniref90.fasta ``` +
    -Colabfold mode using use your own custom MMSeqs2 API server (`--colabfold_server local`) can be run using the following command: +Alternatively, the required data layout for each of the individual modes is described in the mode-specific usage documentation: -```bash -nextflow run nf-core/proteinfold \ - --input samplesheet.csv \ - --outdir \ - --mode colabfold \ - --colabfold_server local \ - --colabfold_db \ - --num_recycles_colabfold 3 \ - --use_amber \ - --colabfold_model_preset "AlphaFold2-ptm" \ - --use_gpu \ - --db_load_mode 0 \ - -profile -``` +- [AlphaFold2](./usage/alphafold2.md) +- [AlphaFold3](./usage/alphafold3.md) +- [Boltz](./usage/boltz.md) +- [ColabFold](./usage/colabfold.md) +- [ESMFold](./usage/esmfold.md) +- [HelixFold3](./usage/helixfold3.md) +- [RoseTTAFold-All-Atom](./usage/rosettafold_all_atom.md) +- [RoseTTAFold2NA](./usage/rosettafold2na.md) -The command to run run Colabfold, using the Colabfold webserver is shown below: +> Omitting the `--db` flag will allow the pipeline to download the reference data required to execute the selected modes. + +## Foldseek structural similarity search + +The pipeline can run an optional Foldseek `easy-search` step on the top-ranked structure for each input/mode combination. + +Foldseek is disabled by default (`--skip_foldseek true`). To enable it, set: ```bash -nextflow run nf-core/proteinfold \ - --input samplesheet.csv \ - --outdir \ - --mode colabfold - --colabfold_server webserver \ - --host_url \ - --colabfold_db \ - --num_recycles_colabfold 3 \ - --use_amber \ - --colabfold_model_preset "AlphaFold2-ptm" \ - --use_gpu \ - -profile +--skip_foldseek false +--foldseek_db +--foldseek_db_path ``` -If you specify the `--colabfold_db ` parameter, the directory structure of your path should be like this: +The database is resolved as: -
    -Directory structure -```console -├── colabfold_envdb_202108 -│   ├── colabfold_envdb_202108_db.0 -│   ├── colabfold_envdb_202108_db.1 -│   ├── colabfold_envdb_202108_db.10 -│   ├── colabfold_envdb_202108_db.11 -│   ├── colabfold_envdb_202108_db.12 -│   ├── colabfold_envdb_202108_db.13 -│   ├── colabfold_envdb_202108_db.14 -│   ├── colabfold_envdb_202108_db.15 -│   ├── colabfold_envdb_202108_db.2 -│   ├── colabfold_envdb_202108_db.3 -│   ├── colabfold_envdb_202108_db.4 -│   ├── colabfold_envdb_202108_db.5 -│   ├── colabfold_envdb_202108_db.6 -│   ├── colabfold_envdb_202108_db.7 -│   ├── colabfold_envdb_202108_db.8 -│   ├── colabfold_envdb_202108_db.9 -│   ├── colabfold_envdb_202108_db_aln.0 -│   ├── colabfold_envdb_202108_db_aln.1 -│   ├── colabfold_envdb_202108_db_aln.10 -│   ├── colabfold_envdb_202108_db_aln.11 -│   ├── colabfold_envdb_202108_db_aln.12 -│   ├── colabfold_envdb_202108_db_aln.13 -│   ├── colabfold_envdb_202108_db_aln.14 -│   ├── colabfold_envdb_202108_db_aln.15 -│   ├── colabfold_envdb_202108_db_aln.2 -│   ├── colabfold_envdb_202108_db_aln.3 -│   ├── colabfold_envdb_202108_db_aln.4 -│   ├── colabfold_envdb_202108_db_aln.5 -│   ├── colabfold_envdb_202108_db_aln.6 -│   ├── colabfold_envdb_202108_db_aln.7 -│   ├── colabfold_envdb_202108_db_aln.8 -│   ├── colabfold_envdb_202108_db_aln.9 -│   ├── colabfold_envdb_202108_db_aln.dbtype -│   ├── colabfold_envdb_202108_db_aln.index -│   ├── colabfold_envdb_202108_db.dbtype -│   ├── colabfold_envdb_202108_db_h -│   ├── colabfold_envdb_202108_db_h.dbtype -│   ├── colabfold_envdb_202108_db_h.index -│   ├── colabfold_envdb_202108_db.idx -│   ├── colabfold_envdb_202108_db.idx.dbtype -│   ├── colabfold_envdb_202108_db.idx.index -│   ├── colabfold_envdb_202108_db.index -│   ├── colabfold_envdb_202108_db_seq.0 -│   ├── colabfold_envdb_202108_db_seq.1 -│   ├── colabfold_envdb_202108_db_seq.10 -│   ├── colabfold_envdb_202108_db_seq.11 -│   ├── colabfold_envdb_202108_db_seq.12 -│   ├── 
colabfold_envdb_202108_db_seq.13 -│   ├── colabfold_envdb_202108_db_seq.14 -│   ├── colabfold_envdb_202108_db_seq.15 -│   ├── colabfold_envdb_202108_db_seq.2 -│   ├── colabfold_envdb_202108_db_seq.3 -│   ├── colabfold_envdb_202108_db_seq.4 -│   ├── colabfold_envdb_202108_db_seq.5 -│   ├── colabfold_envdb_202108_db_seq.6 -│   ├── colabfold_envdb_202108_db_seq.7 -│   ├── colabfold_envdb_202108_db_seq.8 -│   ├── colabfold_envdb_202108_db_seq.9 -│   ├── colabfold_envdb_202108_db_seq.dbtype -│   ├── colabfold_envdb_202108_db_seq_h -> colabfold_envdb_202108_db_h -│   ├── colabfold_envdb_202108_db_seq_h.dbtype -> colabfold_envdb_202108_db_h.dbtype -│   ├── colabfold_envdb_202108_db_seq_h.index -> colabfold_envdb_202108_db_h.index -│   ├── colabfold_envdb_202108_db_seq.index -├── params -│   ├── alphafold_params_2021-07-14 -│   │   ├── LICENSE -│   │   ├── params_model_1.npz -│   │   ├── params_model_1_ptm.npz -│   │   ├── params_model_2.npz -│   │   ├── params_model_2_ptm.npz -│   │   ├── params_model_3.npz -│   │   ├── params_model_3_ptm.npz -│   │   ├── params_model_4.npz -│   │   ├── params_model_4_ptm.npz -│   │   ├── params_model_5.npz -│   │   └── params_model_5_ptm.npz -│   └── alphafold_params_colab_2022-12-06 -│   ├── LICENSE -│   ├── params_model_1_multimer_v3.npz -│   ├── params_model_1.npz -│   ├── params_model_2_multimer_v3.npz -│   ├── params_model_2.npz -│   ├── params_model_2_ptm.npz -│   ├── params_model_3_multimer_v3.npz -│   ├── params_model_3.npz -│   ├── params_model_4_multimer_v3.npz -│   ├── params_model_4.npz -│   ├── params_model_5_multimer_v3.npz -│   └── params_model_5.npz -└── uniref30_2302 - ├── uniref30_2302_aln.tsv - ├── uniref30_2302_db.0 - ├── uniref30_2302_db.1 - ├── uniref30_2302_db.2 - ├── uniref30_2302_db.3 - ├── uniref30_2302_db.4 - ├── uniref30_2302_db.5 - ├── uniref30_2302_db.6 - ├── uniref30_2302_db.7 - ├── uniref30_2302_db_aln.0 - ├── uniref30_2302_db_aln.1 - ├── uniref30_2302_db_aln.2 - ├── uniref30_2302_db_aln.3 - ... 
- ├── uniref30_2302_db_aln.97 - ├── uniref30_2302_db_aln.98 - ├── uniref30_2302_db_aln.99 - ├── uniref30_2302_db_aln.dbtype - ├── uniref30_2302_db_aln.index - ├── uniref30_2302_db.dbtype - ├── uniref30_2302_db_h - ├── uniref30_2302_db_h.dbtype - ├── uniref30_2302_db_h.index - ├── uniref30_2302_db.idx - ├── uniref30_2302_db.idx.dbtype - ├── uniref30_2302_db.idx.index - ├── uniref30_2302_db.idx_mapping - ├── uniref30_2302_db.idx_taxonomy - ├── uniref30_2302_db.index - ├── uniref30_2302_db_mapping - ├── uniref30_2302_db_seq.0 - ├── uniref30_2302_db_seq.1 - ├── uniref30_2302_db_seq.2 - ├── uniref30_2302_db_seq.3 - ... - ├── uniref30_2302_db_seq.97 - ├── uniref30_2302_db_seq.98 - ├── uniref30_2302_db_seq.99 - ├── uniref30_2302_db_seq.dbtype - ├── uniref30_2302_db_seq_h -> uniref30_2302_db_h - ├── uniref30_2302_db_seq_h.dbtype -> uniref30_2302_db_h.dbtype - ├── uniref30_2302_db_seq_h.index -> uniref30_2302_db_h.index - └── uniref30_2302_db_seq.index +```bash +/ ``` -
    -```console +For example: + +```bash nextflow run nf-core/proteinfold \ - --input samplesheet.csv \ - --outdir \ - --mode esmfold - --esmfold_db \ - --num_recycles_esmfold 4 \ - --esmfold_model_preset \ - --use_gpu \ - -profile + -profile \ + --input samplesheet.csv \ + --outdir \ + --mode alphafold2,colabfold \ + --skip_foldseek false \ + --foldseek_db pdb100 \ + --foldseek_db_path /data/foldseek_db ``` -If you specify the `--esmfold_db ` parameter, the directory structure of your path should be like this: +By default, results are written in HTML format (`--format-mode 3`) to: -```console -└── checkpoints - ├── esm2_t36_3B_UR50D-contact-regression.pt - ├── esm2_t36_3B_UR50D.pt - └── esmfold_3B_v1.pt +```bash +/foldseek_easysearch/ ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +You can override Foldseek arguments with: + +```bash +--foldseek_easysearch_arg "" +``` Note that the pipeline will create the following files in your working directory: @@ -437,9 +209,8 @@ If you wish to repeatedly use the same parameters for multiple runs, rather than Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. -:::warning -Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). -::: +> [!WARNING] +> Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). 
The above pipeline run specified with a params file in yaml format: @@ -447,12 +218,11 @@ The above pipeline run specified with a params file in yaml format: nextflow run nf-core/proteinfold -profile docker -params-file params.yaml ``` -with `params.yaml` containing: +with: -```yaml +```yaml title="params.yaml" input: './samplesheet.csv' outdir: './results/' -genome: 'GRCh37' <...> ``` @@ -468,23 +238,21 @@ nextflow pull nf-core/proteinfold ### Reproducibility -It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. +It is a good idea to specify the pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. First, go to the [nf-core/proteinfold releases page](https://github.com/nf-core/proteinfold/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports. -To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. 
+To further assist in reproducibility, you can share and reuse [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. -:::tip -If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. -::: +> [!TIP] +> If you wish to share such a profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. ## Core Nextflow arguments -:::note -These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). -::: +> [!NOTE] +> These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). ### `-profile` @@ -492,16 +260,15 @@ Use this parameter to choose a configuration profile. Profiles can give configur Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. -:::info -We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. -::: +> [!IMPORTANT] +> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. -The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. 
For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). +The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to check if your system is supported, please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! They are loaded in sequence, so later profiles can overwrite earlier profiles. -If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer enviroment. +If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer environment. 
- `test` - A profile with a complete configuration for automated testing @@ -515,7 +282,7 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - `shifter` - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) - `charliecloud` - - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) + - A generic configuration profile to be used with [Charliecloud](https://charliecloud.io/) - `apptainer` - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) - `wave` @@ -537,13 +304,13 @@ Specify the path to a specific config file (this is a core Nextflow command). Se ### Resource requests -Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. +Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. 
For most of the pipeline steps, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher resource requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website. ### Custom Containers -In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version maybe out of date. +In some cases, you may wish to change the container or conda environment used by a pipeline step for a particular tool. By default, nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However, in some cases the pipeline specified version may be out of date. To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website. 
diff --git a/docs/usage/HOWTO_CONTRIBUTE_NEW_MODES.md b/docs/usage/HOWTO_CONTRIBUTE_NEW_MODES.md new file mode 100644 index 000000000..01fe7d83c --- /dev/null +++ b/docs/usage/HOWTO_CONTRIBUTE_NEW_MODES.md @@ -0,0 +1,47 @@ +--- +title: Contributing new modes +subtitle: Adding structure prediction modes to nf-core/proteinfold +weight: 100 +--- + +# Adding structure prediction modes to nf-core/proteinfold + +This section provides guidance on adding new structure prediction modes, implemented via the `--mode` option, to nf-core/proteinfold. + +## Contributing + +One of the great advantages of an `nf-core` pipeline is that the community can extend workflows to add new functionalities. In nf-core/proteinfold, this allows adding new protein structure prediction modules as they are released, while still leveraging the existing workflow infrastructure and reporting. + +Please consider writing some code to become a [nf-core contributor](https://nf-co.re/contributors) and expand the pipeline! Reach out to a maintainer or contributor for guidance: + +We are all contactable at the [#proteinfold_dev](https://nfcore.slack.com/archives/C08THK11CHX) nf-core Slack channel. That's the best place for person-to-person discussions over new additions to implement into the pipeline. + +## Locating pipeline sections + +- `main.nf`: This kicks off each `--mode`'s workflow once the databases have been prepared on the deployment infrastructure. Relevant parameters are passed from `params.[mode_name]` (largely populated from global `nextflow.config` `params` which inherits `dbs.config` database locations) through to the `[MODE_NAME]()` workflow. The channels returned contain the relevant `report_input` metrics, the `top_rank_model` (_i.e._ the best structure from all inference runs), and standard software versioning info. +- `subworkflows`: largely used for mode-specific smaller set-up workflows, except for the `post_processing` subworkflow which will be detailed later. 
+- `workflows/[mode_name].nf`: the `--mode`'s workflow handles input channels of relevant databases, passes them to the local module that does the prediction work (`RUN_[MODE_NAME]()`) and maps the output from the underlying structure prediction to emitted channels ingested by the reporting modules. +- `modules/local/run_[MODE_NAME]`: this is where the bulk of the compute work is done. Each underlying structure prediction module is bundled with its own Dockerfile to set up the software in a container, and a `/modules/local/run_[MODE_NAME]/main.nf` to execute the container from Nextflow. + - input: + - `meta` contains the metadata info of this sub-job, including the `id` column from the `samplesheet.csv` accessed by `{meta.id}`. + - `path(fasta)` (or more flexible yaml or json) locates the biomolecular input sequence file, where `fasta.baseName` gives the underlying input file name (not the `id` label). + - `path(features)` is used to pass through multiple sequence alignment (MSA) data, in line with AlphaFold2's [features.pkl](https://github.com/google-deepmind/alphafold?tab=readme-ov-file#alphafold-output) file. + - Other `path()`s largely locate the core [AlphaFold sequence databases](https://github.com/google-deepmind/alphafold?tab=readme-ov-file#genetic-databases) (or module specific variants thereof). + - output: + - Outputs are structured as a bundled `tuple` of two objects, the first is always `meta` containing the metadata labels, and then `path()` to various output data files useful to the end-user. The prediction module is called in a way that returns files to the process's current directory (`.`). + - `"""script block"""`: + - `program`: the script block calls the program from the Nextflow shell with the program's typical `--flags`, in whatever form (`binary` or `script.py`) the program is distributed from its codebase repository. 
+ - `extract_metrics.py`: accesses the canonical data output formats from the structure prediction program and returns a core set of plain text `.tsv` metric files. +- `bin/extract_metrics.py`: a globally accessible program to go from serialised data into `.tsv` plaintext. It currently applies format specific extraction logic for `.pkl`, `.json` and `.npz` files. However, as the community adds more `--mode`s to the pipeline, different programs could use the same compressed output format. In that case, `extract_metrics.py` should be refactored to select the extraction logic based on the `--mode` passed to `extract_metrics.py`. +- `subworkflows/local/post_processing.nf`: the `POST_PROCESSING{}` process sits after all possible `[MODE_NAME]()` workflows in the `main.nf`. It passes along visualisation options, metrics data files, and report templates (`single` or `comparison`). Those reports are created with the `GENERATE_REPORT()` or `COMPARE_STRUCTURES()` `/modules/local/` modules, respectively. +- `bin/generate_[comparison]_report.py` takes the HTML templates at `assets/[report|comparison]_template.html` and populates them with plots created inside these python scripts. + +## Process labelling + +At the top of a module's `RUN_[MODE_NAME]{}` process, there are a series of labels that allow the `nextflow.config` to pass the job to the appropriate resources on the compute cluster. `label 'process_gpu'` is very useful to specify the AI inference stages requiring GPU-intensive computation. Other processes can use default labels that request CPU resources and, once finished, will naturally cascade onto GPU-enabled steps due to Nextflow's dataflow paradigm. 
+ +## Processable structure prediction metrics + +Metrics from AlphaFold-inspired protein structure prediction programs are structured in two ways: tabular or as a matrix (PAE values). + +When contributing a new mode to `proteinfold`, functionality should be added to `extract_metrics.py` to access the canonical output files of the new program, and extract data into compliant `.tsv` files that can be easily processed by downstream plotting and MultiQC functions. diff --git a/docs/usage/alphafold2.md b/docs/usage/alphafold2.md new file mode 100644 index 000000000..b33ee843a --- /dev/null +++ b/docs/usage/alphafold2.md @@ -0,0 +1,144 @@ +--- +title: AlphaFold2 +weight: 10 +--- + +# AlphaFold2 + +| Mode | Protein | RNA | Small-molecule | PTM | Constraints | pLM | MSA server | Split MSA | +| :-------------------------------------------------- | :-----: | :-: | :------------: | :-: | :---------: | :-: | :--------: | :-------: | +| [AlphaFold2](https://github.com/deepmind/alphafold) | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | + +AlphaFold2 can be run using the command below: + +```bash +nextflow run nf-core/proteinfold \ + --input samplesheet.csv \ + --outdir \ + --mode alphafold2 \ + --alphafold2_db \ + --use_gpu \ + --alphafold2_model_preset \ + -profile +``` + +> [!NOTE] +> By default, this will run a fork of AlphaFold2 where MSA generation is split from the neural network inference. This enables more efficient utilization of resources by allowing the CPU-bound MSA generation to be executed without occupying an idle GPU. If you want to run the original implementation of AlphaFold2 you can use `--alphafold2_mode standard`. However, please be advised that this will cause the allocated GPU to idle while MSAs are generated. + +> [!WARNING] +> `--alphafold2_model_preset ` is used to infer how to handle multi-entry fasta files. 
Choosing `monomer_ptm`, `monomer` or `monomer_casp14` will result in a multi-entry fasta being processed as a series of monomer entries rather than as a single oligomeric complex. + +## File Structure + +The file structure of `--alphafold2_db` must be as follows: + +
    +Directory structure + +```console +/ +├── bfd +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffindex +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffdata +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffindex +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffdata +│ └── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffindex +├── params +│   └── alphafold_params_2022-12-06 +│   ├── LICENSE +│   ├── params_model_1_multimer_v3.npz +│   ├── params_model_1.npz +│   ├── params_model_1_ptm.npz +│   ├── params_model_2_multimer_v3.npz +│   ├── params_model_2.npz +│   ├── params_model_2_ptm.npz +│   ├── params_model_3_multimer_v3.npz +│   ├── params_model_3.npz +│   ├── params_model_3_ptm.npz +│   ├── params_model_4_multimer_v3.npz +│   ├── params_model_4.npz +│   ├── params_model_4_ptm.npz +│   ├── params_model_5_multimer_v3.npz +│   ├── params_model_5.npz +│   └── params_model_5_ptm.npz +├── mgnify +│   └── mgy_clusters.fa +├── pdb70 +│   ├── md5sum +│   ├── pdb70_a3m.ffdata +│   ├── pdb70_a3m.ffindex +│   ├── pdb70_clu.tsv +│   ├── pdb70_cs219.ffdata +│   ├── pdb70_cs219.ffindex +│   ├── pdb70_hhm.ffdata +│   ├── pdb70_hhm.ffindex +│   └── pdb_filter.dat +├── pdb_mmcif +│   ├── mmcif_files +│   │   ├── 1g6g.cif +│   │   ├── 1go4.cif +│   │   ├── 1isn.cif +│   │   ├── 1qgd.cif +│   │   ├── 1tp9.cif +│   │   ├── 4o2w.cif +│   │   ├── 6sg9.cif +│   │   ├── 6vi4.cif +│   │   ├── 7sp5.cif +│   │   └── ... 
+│   └── obsolete.dat +├── pdb_seqres +│   └── pdb_seqres.txt +├── small_bfd +│   └── bfd-first_non_consensus_sequences.fasta +├── uniprot +│   └── uniprot.fasta +├── uniref30 +│ ├── UniRef30_2023_02_a3m.ffdata +│ ├── UniRef30_2023_02_a3m.ffindex +│ ├── UniRef30_2023_02_cs219.ffdata +│ ├── UniRef30_2023_02_cs219.ffindex +│ ├── UniRef30_2023_02_hhm.ffdata +│ ├── UniRef30_2023_02_hhm.ffindex +│ └── UniRef30_2023_02.md5sums +└── uniref90 + └── uniref90.fasta +``` + +
    + +If individual components are available at different locations in the filesystem, they can be set using the following flags: + +```console +--alphafold2_bfd_path
    +--alphafold2_small_bfd_path
    +--alphafold2_params_path
    +--alphafold2_mgnify_path
    +--alphafold2_pdb70_path +--alphafold2_pdb_mmcif_path +--alphafold2_pdb_obsolete_path +--alphafold2_uniref30_path +--alphafold2_uniref90_path +--alphafold2_pdb_seqres_path +--alphafold2_uniprot_path +``` + +Without setting the `--alphafold2_db` flag, all of the required data files will be downloaded during the workflow execution. Database downloaded can be an extremely long process. The downloaded data can be used for future workflow executions. + +> [!WARNING] +> The AlphaFold2 reference databases require ~2TB of disk space. + +## Additional Arguments + +See the [AlphaFold2](https://github.com/google-deepmind/alphafold) documentation for a full description of additional arguments. The arguments supported by the proteinfold workflow are described briefly below: + +| Parameter | Default | Description | +| -------------------------------- | ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--alphafold2_full_dbs` | `false` | bfd is a large environmental sequence database used to identify homologs. small bfd is a redundancy recuced version of the bfd database which can reduce the execution time of homolog search but may reduce the depth of the resulting MSA in some cases. `--alphafold2_full_dbs` ensures that the full version of bfd is used for search. | +| `--alphafold2_random_seed` | `null` | AlphaFold2 model inference is a stochastic process. Fixing a numerical random seed ensures that results are reproducible between runs. | +| `--alphafold2_max_template_date` | `2038-01-19` | Structural templates from the PDB are used as additional context when making predictions. 
Molecules with solved structures in the PDB can be trivially predicted by using these structures as inputs. When benchmarking model performance it can be useful to restrict the use of templates to those deposited before a fixed date to ensure solved structures do not bias predictions. | + +> You can override any of these parameters via the command line or a params file. + +> [!NOTE] +> Check the versions of the PDB data available on the infrastructure used to run proteinfold to determine template availability. diff --git a/docs/usage/alphafold3.md b/docs/usage/alphafold3.md new file mode 100644 index 000000000..eec0aee4d --- /dev/null +++ b/docs/usage/alphafold3.md @@ -0,0 +1,80 @@ +--- +title: AlphaFold3 +weight: 20 +--- + +# AlphaFold3 + +| Mode | Protein | RNA | Small-molecule | PTM | Constraints | pLM | MSA server | Split MSA | +| :---------------------------------------------------------- | :-----: | :-: | :------------: | :-: | :---------: | :-: | :--------: | :-------: | +| [AlphaFold3](https://github.com/google-deepmind/alphafold3) | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | + +> [!WARNING] +> The AlphaFold3 weights are not provided by this pipeline. Users must obtain the weights directly from DeepMind according to their [terms of use](https://github.com/google-deepmind/alphafold3/blob/main/WEIGHTS_TERMS_OF_USE.md) and [prohibited use policy](https://github.com/google-deepmind/alphafold3/blob/main/WEIGHTS_PROHIBITED_USE_POLICY.md). Please ensure you comply with all terms and conditions before using AlphaFold3. For more information about AlphaFold3 usage and requirements, please refer to the [official AlphaFold3 repository](https://github.com/google-deepmind/alphafold3). + +AlphaFold3 can be run using the command below: + +```bash +nextflow run nf-core/proteinfold \ + --input samplesheet.csv \ + --outdir \ + --mode alphafold3 \ + --alphafold3_db \ + --use_gpu \ + -profile +``` + +The file structure of `--alphafold3_db` must be as follows: + +
    +Directory structure + +```console +├── mgnify +│ └── mgy_clusters_2022_05.fa +├── params +│ └── af3.bin +├── pdb_mmcif +│ └── mmcif_files +│ ├── 1g6g.cif +│ ├── 1go4.cif +│ └── ... +├── pdb_seqres +│ └── pdb_seqres_2022_09_28.fasta +├── small_bfd +│ └── bfd-first_non_consensus_sequences.fasta +├── uniprot +│ └── uniprot_all_2021_04.fa +└── uniref90 + └── uniref90_2022_05.fa +``` + +
    + +> [!NOTE] +> The reference databases used by the workflow are those hosted by the AlphaFold3 implementation but the pipeline can be run with different versions of the same datasets. + +If individual components are available at different locations in the filesystem, they can be set using the following flags: + +```console +--alphafold3_small_bfd_path +--alphafold3_params_path +--alphafold3_mgnify_path +--alphafold3_pdb_mmcif_path +--alphafold3_uniref90_path +--alphafold3_pdb_seqres_path +--alphafold3_uniprot_path +``` + +Note the following databases are only required to support RNA predictions: + +```console +--alphafold3_rnacentral_path +--alphafold3_nt_rna_path +--alphafold3_rfam_path +``` + +Without setting the `--alphafold3_db` flag, all of the required data files will be downloaded during the workflow execution. + +> [!WARNING] +> The AlphaFold3 reference databases require ~2TB of disk space. diff --git a/docs/usage/boltz.md b/docs/usage/boltz.md new file mode 100644 index 000000000..91865dcba --- /dev/null +++ b/docs/usage/boltz.md @@ -0,0 +1,149 @@ +--- +title: Boltz +weight: 30 +--- + +# Boltz + +| Mode | Protein | RNA | Small-molecule | PTM | Constraints | pLM | MSA server | Split MSA | +| :------------------------------------------- | :-----: | :-: | :------------: | :-: | :---------: | :-: | :--------: | :-------: | +| [Boltz](https://github.com/jwohlwend/boltz/) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | + +## General Use + +Boltz mode can be run using the command below: + +```console +nextflow run nf-core/proteinfold \ + --input samplesheet.csv \ + --outdir \ + --mode boltz \ + --boltz_db \ + --colabfold_db \ + --use_gpu \ + -profile +``` + +By default, `--mode boltz` will generate MSA files required for structure prediction using a local execution of the [ColabFold](https://github.com/sokrypton/ColabFold) search protocol. 
This protocol uses [MMseqs2](https://github.com/soedinglab/MMseqs2) to search a uniref30 expandable profile database and construct paired alignments using taxonomic labels. MSAs are enriched with additional unpaired sequences by searching an expandable profile database of environmental sequences. + +> [!NOTE] +> Local ColabFold search occurs in a separate module to model inference and the resulting MSA will be cached if downstream modules need to be re-run. + +## File Structure + +The file structure of `--boltz_db` must be as follows: + +
    +Directory structure + +``` +/ +└── params + ├── boltz1_conf.ckpt + ├── boltz2_aff.ckpt + ├── boltz2_conf.ckpt + ├── ccd.pkl + └── mols +``` + +
    + +If individual components are available at different locations in the filesystem, they can be set using the following flags: + +```console +# Boltz-1 +--boltz_ccd_path +--boltz_model_path + +# Boltz-2 +--boltz2_aff_path
    +--boltz2_conf_path
    +--boltz2_mols_path +``` + +Similarly, the `--colabfold_db` flag must be set to run the local execution of ColabFold search. The file structure of `--colabfold_db` must be: + +
    +Directory structure + +``` +/ +├── colabfold_envdb +│   ├── colabfold_envdb_202108_db +│   ├── colabfold_envdb_202108_db_aln +│   ├── colabfold_envdb_202108_db_aln.dbtype +│   └── ... +└── colabfold_uniref30 +    ├── uniref30_2302_db +    ├── uniref30_2302_db_aln +    ├── uniref30_2302_db_aln.dbtype +    └── ... +``` + +
    + +Without setting the `--boltz_db` and `--colabfold_db` flags, all of the required data files will be downloaded during the workflow execution. + +> [!WARNING] +> The colabfold reference sequence [databases](https://colabfold.mmseqs.com/) (uniref30_2302 and colabfold_envdb_202108) require ~1TB of disk space. + +As an alternative, Boltz MSAs can be generated without downloading the large reference sequence databases by calling the public MMSeqs API with the `--use_msa_server` argument. Users can also point to a private api endpoint using the `--msa_server_url` argument. + +```console +nextflow run nf-core/proteinfold \ + --input samplesheet.csv \ + --outdir \ + --mode boltz \ + --boltz_db \ + --use_msa_server \ + --use_gpu \ + -profile +``` + +> [!WARNING] +> If you aim to carry out a large amount of predictions, please use the local mmseqs search module or setup and use your own custom MMSeqs2 API Server. You can find instructions [here](https://github.com/sokrypton/ColabFold/tree/main/MsaServer). + +## General Molecules + +Boltz can support general molecular structure prediction. The most direct way to indicate molecular type is to format FASTA files with the molecular type indicated in the sequence header: + +``` +>A|protein +QLEDSEVEAVAKGLEEM +>B|rna +AUGC +>C|smiles +N[C@@H](Cc1ccc(O)cc1)C(=O)O +>D|ccd +ATP +>E|dna +ATGC +``` + +If the molecule type is not specified in the header of the input fasta, proteinfold will try to guess the expected molecule type based on the character composition. + +## YAML format + +Boltz allows specifying post-translational modifications and manual distance constraints to guide predictions. However, this input information is not supported in the FASTA format and must be specified in an input YAML file according to the boltz [specification](https://github.com/jwohlwend/boltz/blob/main/docs/prediction.md#yaml-format). 
+ +Boltz YAML files can be run with proteinfold in boltz mode by substituting the typical FASTA file in the input samplesheet. + +``` +id,fasta +T1024,T1024.yaml +``` + +> [!NOTE] +> Structures predicted from the Boltz YAML input will not be compatible with running multiple modes simultaneously. + +## Additional Arguments + +See the [Boltz](https://github.com/jwohlwend/boltz) documentation for a full description of additional arguments. The arguments supported by the proteinfold workflow are described briefly below: + +| Parameter | Default | Description | +| ------------------------ | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--boltz_model` | `boltz2` | The model to use for prediction (boltz1 or boltz2) | +| `--boltz_use_potentials` | `false` | Steering potentials are used by Boltz to improve the physical validity of output predictions (ie steric clashes, incorrect chirality etc). However, these potentials dramatically increase execution time and memory requirements. | +| `--boltz_use_kernels` | `true` | Use optimized Triton-based CUDA kernels for Boltz inference. These may be incompatible with some GPU types and can be disabled as a workaround. | + +> You can override any of these parameters via the command line or a params file. 
diff --git a/docs/usage/colabfold.md b/docs/usage/colabfold.md new file mode 100644 index 000000000..2f6e0bd17 --- /dev/null +++ b/docs/usage/colabfold.md @@ -0,0 +1,118 @@ +--- +title: ColabFold +weight: 40 +--- + +# ColabFold + +| Mode | Protein | RNA | Small-molecule | PTM | Constraints | pLM | MSA server | Split MSA | +| :-------------------------------------------------- | :-----: | :-: | :------------: | :-: | :---------: | :-: | :--------: | :-------: | +| [ColabFold](https://github.com/sokrypton/ColabFold) | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | + +## General Usage + +ColabFold mode can be run using the command below: + +```console +nextflow run nf-core/proteinfold \ + --input samplesheet.csv \ + --outdir \ + --mode colabfold \ + --colabfold_db \ + --colabfold_model_preset "" \ + --use_gpu \ + -profile +``` + +> [!WARNING] +> `--colabfold_model_preset` is used to infer how to handle multi-entry fasta files. Choosing `alphafold2_ptm` will result in a multi-entry fasta being processed as a series of monomer entries rather than as a single oligomeric complex. + +By default, `--mode colabfold` will generate MSA files required for structure prediction using a local execution of the [ColabFold](https://github.com/sokrypton/ColabFold) search protocol. This protocol uses [MMseqs2](https://github.com/soedinglab/MMseqs2) to search a uniref30 expandable profile database and construct paired alignments using taxonomic labels. MSAs are enriched with additional unpaired sequences by searching an expandable profile database of environmental sequences. + +> [!NOTE] +> Local ColabFold search occurs in a separate module to model inference and the resulting MSA will be cached if downstream modules need to be re-run. + +## File Structure + +The file structure of `--colabfold_db` must be as follows: + +
    +Directory structure + +``` +/ +├── colabfold_envdb +│   ├── colabfold_envdb_202108_db +│   ├── colabfold_envdb_202108_db_aln +│   ├── colabfold_envdb_202108_db_aln.dbtype +│   └── ... +├── colabfold_uniref30 +│   ├── uniref30_2302_db +│   ├── uniref30_2302_db_aln +│   ├── uniref30_2302_db_aln.dbtype +│   └── ... +└── params/ +    └── alphafold_params_colab_2022-12-06/ + ├── LICENSE + ├── params_model_1_multimer_v2.npz + ├── params_model_1_multimer_v3.npz + ├── params_model_1.npz + ├── params_model_2_multimer_v2.npz + ├── params_model_2_multimer_v3.npz + ├── params_model_2.npz + ├── params_model_3_multimer_v2.npz + ├── params_model_3_multimer_v3.npz + ├── params_model_3.npz + ├── params_model_4_multimer_v2.npz + ├── params_model_4_multimer_v3.npz + ├── params_model_4.npz + ├── params_model_5_multimer_v2.npz + ├── params_model_5_multimer_v3.npz + └── params_model_5.npz +``` + +
    + +If individual components are available at different locations in the filesystem, they can be set using the following flags: + +```console +--colabfold_envdb_path
    +--colabfold_uniref30_path +--colabfold_alphafold2_params_path +``` + +Without setting the `--colabfold_db` flag, all of the required data files will be downloaded during the workflow execution. + +> [!WARNING] +> The ColabFold reference sequence [databases](https://colabfold.mmseqs.com/) (uniref30_2302 and colabfold_envdb_202108) require ~1TB of disk space. + +As an alternative, ColabFold MSAs can be generated without downloading the large reference sequence databases by calling the public MMSeqs API with the `--use_msa_server` argument. Users can also point to a private api endpoint using the `--msa_server_url` argument. + +```console +nextflow run nf-core/proteinfold \ + --input samplesheet.csv \ + --outdir \ + --mode colabfold \ + --colabfold_db \ + --colabfold_model_preset \ + --use_msa_server \ + --use_gpu \ + -profile +``` + +> [!WARNING] +> If you aim to carry out a large number of predictions, please use the local mmseqs search module or setup and use your own custom MMSeqs2 API Server. You can find instructions [here](https://github.com/sokrypton/ColabFold/tree/main/MsaServer). + +## Additional Arguments + +See the [ColabFold](https://github.com/sokrypton/ColabFold) documentation for a full description of additional arguments. The arguments supported by the proteinfold workflow are described briefly below: + +| Parameter | Default | Description | +| --------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--colabfold_num_recycles` | `3` | The AlphaFold2 model used by ColabFold provides initial structure predictions as a recycled model input in an iterative refinement process. This parameter controls the number of times model outputs are recycled. 
Increasing the number of recycles has been found to improve performance for some challenging cases. | +| `--colabfold_use_amber` | `true` | ColabFold outputs will sometimes contain physical violations such as steric clashes. These clashes can be resolved by post-processing the outputs with a short relaxation using the Amber Force Field. Non-clashing atoms are pinned to starting coordinates such that the relaxation has a minimal impact on final structures. | +| `--colabfold_db_load_mode` | `0` | Specify the way that MMSeqs2 will load the required databases in memory | +| `--colabfold_use_templates` | `false` | Use PDB templates to support predictions. The ColabFold notebooks do not use templates by default. | +| `--colabfold_create_index` | `false` | Create index for ColabFold databases during setup. On network filesystems it can be more performant to re-compute the index on the fly | + +> You can override any of these parameters via the command line or a params file. diff --git a/docs/usage/esmfold.md b/docs/usage/esmfold.md new file mode 100644 index 000000000..03b207ca2 --- /dev/null +++ b/docs/usage/esmfold.md @@ -0,0 +1,57 @@ +--- +title: ESMFold +weight: 50 +--- + +# ESMFold + +| Mode | Protein | RNA | Small-molecule | PTM | Constraints | pLM | MSA server | Split MSA | +| :------------------------------------------------- | :-----: | :-: | :------------: | :-: | :---------: | :-: | :--------: | :-------: | +| [ESMFold](https://github.com/facebookresearch/esm) | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | + +## General Usage + +ESMFold mode can be run using the command below: + +```console +nextflow run nf-core/proteinfold \ + --input samplesheet.csv \ + --outdir \ + --mode esmfold \ + --esmfold_model_preset \ + --esmfold_db \ + --use_gpu \ + -profile +``` + +> [!NOTE] +> ESMFold does not require searching large sequence databases for sequences homologous to the prediction target and instead relies on a pre-trained protein language model (pLM) to inform predictions. 
+ +> [!WARNING] +> `--esmfold_model_preset` is used to infer how to handle multi-entry fasta files. Choosing `monomer` will result in a multi-entry fasta being processed as a series of monomer entries rather than as a single oligomeric complex. + +## File Structure + +The file structure of `--esmfold_db` must be as follows: + +
    +Directory structure + +``` +/params/ +├── esm2_t36_3B_UR50D-contact-regression.pt +├── esm2_t36_3B_UR50D.pt +└── esmfold_3B_v1.pt +``` + +
    + +## Additional Arguments + +See the [ESMFold](https://github.com/facebookresearch/esm) documentation for a full description of additional arguments. The arguments supported by the proteinfold workflow are described briefly below: + +| Parameter | Default | Description | +| ------------------------ | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--esmfold_num_recycles` | `4` | The ESMFold model provides initial structure predictions as a recycled model input in an iterative refinement process. This parameter controls the number of times model outputs are recycled. Increasing the number of recycles has been found to improve performance for some challening cases. | + +> You can override any of these parameters via the command line or a params file. diff --git a/docs/usage/helixfold3.md b/docs/usage/helixfold3.md new file mode 100644 index 000000000..3311c5906 --- /dev/null +++ b/docs/usage/helixfold3.md @@ -0,0 +1,122 @@ +--- +title: HelixFold3 +weight: 60 +--- + +# HelixFold3 + +| Mode | Protein | RNA | Small-molecule | PTM | Constraints | pLM | MSA server | Split MSA | +| :------------------------------------------------------------------------------------------------- | :-----: | :-: | :------------: | :-: | :---------: | :-: | :--------: | :-------: | +| [HelixFold3](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3) | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | + +## General Usage + +HelixFold3 mode can be run using the command below: + +```console +nextflow run nf-core/proteinfold \ + --input samplesheet.csv \ + --outdir \ + --mode helixfold3 \ + --helixfold3_db \ + --use_gpu \ + -profile +``` + +## File Structure + +The file structure of `--helixfold3_db` must be as follows: + +
    +Directory structure + +``` +/ +├── bfd +│   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata +│   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffindex +│   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffdata +│   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffindex +│   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffdata +│   └── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffindex +├── maxit-v11.200-prod-src +│   ├── annotation-v1.0 +│   └── ... +├── mgnify +│   └── mgy_clusters.fa +├── params +│   ├── ccd_preprocessed_etkdg.pkl.gz +│   └── HelixFold3-240814.pdparams +├── pdb_mmcif +│   ├── mmcif_files +│   └── obsolete.dat +├── pdb_seqres +│   └── pdb_seqres.txt +├── rfam +│   └── Rfam-14.9_rep_seq.fasta +├── small_bfd +│   └── bfd-first_non_consensus_sequences.fasta +├── uniprot +│   └── uniprot.fasta +├── uniref30 +│   ├── UniRef30_2023_02_a3m.ffdata +│   ├── UniRef30_2023_02_a3m.ffindex +│   ├── UniRef30_2023_02_cs219.ffdata +│   ├── UniRef30_2023_02_cs219.ffindex +│   ├── UniRef30_2023_02_hhm.ffdata +│   ├── UniRef30_2023_02_hhm.ffindex +│   └── UniRef30_2023_02.md5sums +└── uniref90 +    └── uniref90.fasta +``` + +
    + +If individual components are available at different locations in the filesystem, they can be set using the following flags: + +```console +--helixfold3_init_models_path
    +--helixfold3_ccd_preprocessed_path +--helixfold3_rfam_path +--helixfold3_maxit_src_path +--helixfold3_bfd_path +--helixfold3_small_bfd_path +--helixfold3_mgnify_path +--helixfold3_pdb_mmcif_path +--helixfold3_obsolete_path +--helixfold3_uniclust30_path +--helixfold3_uniref90_path +--helixfold3_pdb_seqres_path +--helixfold3_uniprot_path +``` + +Without setting the `--helixfold3_db` flag, all of the required data files will be downloaded during the workflow execution. + +> [!WARNING] +> The HelixFold3 reference sequence databases require ~2TB of disk space. + +## JSON format + +HelixFold3 supports modelling of general molecular structures. Currently, only protein entities are supported using the FASTA format. Non-protein entities must be specified in an input JSON file according to the HelixFold3 [specification](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3#-understanding-model-input). + +HelixFold3 JSON files can be run with proteinfold in helixfold3 mode by substituting the typical FASTA file in the input samplesheet. + +``` +id,fasta +T1024,T1024.json +``` + +> [!NOTE] +> Structures predicted from the helixfold3 json input will not be compatible with running multiple modes simultaneously. + +## Additional Parameters + +See the [HelixFold3](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3#-running-helixfold-for-inference) documentation for a full description of additional arguments. 
The arguments supported by the proteinfold workflow are described briefly below: + +| Parameter | Default | Description | +| -------------------------------- | ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--helixfold3_max_template_date` | `2038-01-19` | Structural templates from the PDB are used as additional context when making predictions. Molecules with solved structures in the PDB can be trivially predicted by using these structures as inputs. When benchmarking model performance it can be useful to restrict the use of templates to those deposited before a fixed date to ensure solved structures do not bias predictions. | +| `--helixfold3_precision` | `bf16` | Controls the numerical precision during neural network inference. bf16 is supported by GPU accelerators A100, H100 and higher, while others will require fp32 inference. (bf16/fp32) | +| `--helixfold3_infer_times` | `4` | The number of independent seeds used to generate structure predictions using the HelixFold3 model. | + +> You can override any of these parameters via the command line or a params file. 
diff --git a/docs/usage/rosettafold2na.md b/docs/usage/rosettafold2na.md new file mode 100644 index 000000000..aa8102552 --- /dev/null +++ b/docs/usage/rosettafold2na.md @@ -0,0 +1,113 @@ +--- +title: RoseTTAFold2NA +weight: 80 +--- + +# RoseTTAFold2NA + +| Mode | Split MSA | RNA | Small-molecule | PTM | Constraints | pLM | Protein | MSA server | +| :--------------------------------------------------------- | :-------: | :-: | :------------: | :-: | :---------: | :-: | :-----: | :--------: | +| [RoseTTAFold2NA](https://github.com/uw-ipd/RoseTTAFold2NA) | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | + +RoseTTAFold2NA can be run using the command below: + +```bash +nextflow run nf-core/proteinfold \ + --input samplesheet.csv \ + --outdir <OUTDIR> \ + --mode rosettafold2na \ + --rosettafold2na_db <DB_PATH> \ + --use_gpu \ + -profile <docker/singularity/.../institute> +``` + +> [!NOTE] +> RoseTTAFold2NA now expects each samplesheet row to reference a multi-chain FASTA that includes every interacting molecule. Add a `type=` hint to each header (for example `type=protein`, `type=rna`, `type=double_dna`, or `type=single_dna`) so the adaptor can tag chains with the correct RF2NA entity codes (`P`, `R`, `D`, `S`). If no hint is present, the chain type is inferred from sequence composition (pure `ACUGN` → RNA; pure `ACTGN` → DNA, which defaults to `D` unless explicitly tagged single-strand; otherwise protein). + +## File Structure + +The file structure of `--rosettafold2na_db` must be as follows: + +
    +Directory structure + +```console +/ +├── bfd +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffindex +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffdata +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffindex +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffdata +│ └── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffindex +├── params +│ └── network +│ └── weights +│ └── RF2NA_apr23.pt +├── pdb100 +│ ├── pdb100_2021Mar03_a3m.ffdata +│ ├── pdb100_2021Mar03_a3m.ffindex +│ ├── pdb100_2021Mar03_cs219.ffdata +│ ├── pdb100_2021Mar03_cs219.ffindex +│ ├── pdb100_2021Mar03_hhm.ffdata +│ ├── pdb100_2021Mar03_hhm.ffindex +│ ├── pdb100_2021Mar03_pdb.ffdata +│ └── pdb100_2021Mar03_pdb.ffindex +├── RNA +│ ├── Rfam.full_region +│ ├── Rfam.cm.* +│ ├── id_mapping.tsv.gz +│ ├── rfam_annotations.tsv.gz +│ ├── rnacentral.fasta.* +│ ├── nt.* +│ └── ... +└── UniRef30_2020_06 + ├── UniRef30_2020_06_a3m.ffdata + ├── UniRef30_2020_06_a3m.ffindex + ├── UniRef30_2020_06_cs219.ffdata + ├── UniRef30_2020_06_cs219.ffindex + ├── UniRef30_2020_06_hhm.ffdata + ├── UniRef30_2020_06_hhm.ffindex + └── UniRef30_2020_06.md5sums +``` + +
    + +If individual components are available at different locations in the filesystem, they can be set using the following flags: + +```console +--rosettafold2na_uniref30_path +--rosettafold2na_bfd_path +--rosettafold2na_pdb100_path +--rosettafold2na_rna_path +--rosettafold2na_weights_path +``` + +Without setting the `--rosettafold2na_db` flag, all required data files will be downloaded during workflow execution. + +> [!WARNING] +> RoseTTAFold2NA reference databases are large and require substantial local disk space. + +## Input Format + +RoseTTAFold2NA mode uses FASTA input from the samplesheet. Multi-entry FASTA files are supported. + +To avoid ambiguity, annotate each FASTA header with a molecule type: + +```console +>A type=protein +MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQ... +>B type=rna +AUGGCUACG... +>C type=double_dna +ATGCGT... +>D type=single_dna +ATTTGCA... +``` + +Supported entity types are: + +- `protein` (`P`) +- `rna` (`R`) +- `double_dna` (`D`) +- `single_dna` (`S`) diff --git a/docs/usage/rosettafold_all_atom.md b/docs/usage/rosettafold_all_atom.md new file mode 100644 index 000000000..c97c0723e --- /dev/null +++ b/docs/usage/rosettafold_all_atom.md @@ -0,0 +1,90 @@ +--- +title: RoseTTAFold-All-Atom +weight: 70 +--- + +# RoseTTAFold-All-Atom + +| Mode | Protein | RNA | Small-molecule | PTM | Constraints | pLM | MSA server | Split MSA | +| :-------------------------------------------------------------------------------- | :-----: | :-: | :------------: | :-: | :---------: | :-: | :--------: | :-------: | +| [RoseTTAFold-All-Atom](https://github.com/baker-laboratory/RoseTTAFold-All-Atom/) | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | + +RoseTTAFold-All-Atom can be run using the command below: + +```bash +nextflow run nf-core/proteinfold \ + --input samplesheet.csv \ + --outdir <OUTDIR> \ + --mode rosettafold_all_atom \ + --rosettafold_all_atom_db <DB_PATH> \ + --use_gpu \ + -profile <docker/singularity/.../institute> +``` + +## File Structure + +The file structure of `--rosettafold_all_atom_db` must be as follows: + +
    +Directory structure + +```console +/ +├── bfd +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffindex +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffdata +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffindex +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffdata +│ └── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffindex +├── params +│   └── RFAA_paper_weights.pt +├── pdb100 +│   ├── LICENSE +│   ├── pdb100_2021Mar03_a3m.ffdata +│   ├── pdb100_2021Mar03_a3m.ffindex +│   ├── pdb100_2021Mar03_cs219.ffdata +│   ├── pdb100_2021Mar03_cs219.ffindex +│   ├── pdb100_2021Mar03_hhm.ffdata +│   ├── pdb100_2021Mar03_hhm.ffindex +│   ├── pdb100_2021Mar03_pdb.ffdata +│   └── pdb100_2021Mar03_pdb.ffindex +└── uniref30 + ├── UniRef30_2023_02_a3m.ffdata + ├── UniRef30_2023_02_a3m.ffindex + ├── UniRef30_2023_02_cs219.ffdata + ├── UniRef30_2023_02_cs219.ffindex + ├── UniRef30_2023_02_hhm.ffdata + ├── UniRef30_2023_02_hhm.ffindex + └── UniRef30_2023_02.md5sums +``` + +
    + +If individual components are available at different locations in the filesystem, they can be set using the following flags: + +```console +--rosettafold_all_atom_bfd_path +--rosettafold_all_atom_paper_weights_path +--rosettafold_all_atom_uniref30_path +--rosettafold_all_atom_pdb100_path +``` + +Without setting the `--rosettafold_all_atom_db` flag, all of the required data files will be downloaded during the workflow execution. + +> [!WARNING] +> The RoseTTAFold-All-Atom reference databases require ~2TB of disk space. + +## YAML format + +RoseTTAFold-All-Atom allows modelling nucleic acids and small molecule ligands as well as specifying post-translational modifications. However, this input information is not supported in the FASTA format and must be specified in an input YAML file according to the RoseTTAFold-All-Atom [specification](https://github.com/baker-laboratory/RoseTTAFold-All-Atom?tab=readme-ov-file#predicting-protein-nucleic-acid-complexes). + +RoseTTAFold-All-Atom YAML files can be run with proteinfold in rosettafold_all_atom mode by substituting the typical FASTA file in the input samplesheet. + +``` +id,fasta +T1024,T1024.yaml +``` + +> [!NOTE] +> Structures predicted from the RoseTTAFold-All-Atom YAML input will not be compatible with running multiple modes simultaneously. 
diff --git a/main.nf b/main.nf index d6da0f09b..857415a56 100644 --- a/main.nf +++ b/main.nf @@ -9,29 +9,37 @@ ---------------------------------------------------------------------------------------- */ -nextflow.enable.dsl = 2 - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT FUNCTIONS / MODULES / SUBWORKFLOWS / WORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -if (params.mode == "alphafold2") { - include { PREPARE_ALPHAFOLD2_DBS } from './subworkflows/local/prepare_alphafold2_dbs' - include { ALPHAFOLD2 } from './workflows/alphafold2' -} else if (params.mode == "colabfold") { - include { PREPARE_COLABFOLD_DBS } from './subworkflows/local/prepare_colabfold_dbs' - include { COLABFOLD } from './workflows/colabfold' -} else if (params.mode == "esmfold") { - include { PREPARE_ESMFOLD_DBS } from './subworkflows/local/prepare_esmfold_dbs' - include { ESMFOLD } from './workflows/esmfold' -} +include { PREPARE_ALPHAFOLD2_DBS } from './subworkflows/local/prepare_alphafold2_dbs' +include { PREPARE_ALPHAFOLD3_DBS } from './subworkflows/local/prepare_alphafold3_dbs' +include { PREPARE_ESMFOLD_DBS } from './subworkflows/local/prepare_esmfold_dbs' +include { PREPARE_ROSETTAFOLD_ALL_ATOM_DBS } from './subworkflows/local/prepare_rosettafold_all_atom_dbs' +include { PREPARE_HELIXFOLD3_DBS } from './subworkflows/local/prepare_helixfold3_dbs' +include { PREPARE_BOLTZ_DBS } from './subworkflows/local/prepare_boltz_dbs' +include { PREPARE_ROSETTAFOLD2NA_DBS } from './subworkflows/local/prepare_rosettafold2na_dbs' + +include { PREPARE_COLABFOLD_DBS as PREPARE_COLABFOLD_DBS_COLABFOLD } from './subworkflows/local/prepare_colabfold_dbs' +include { PREPARE_COLABFOLD_DBS as PREPARE_COLABFOLD_DBS_BOLTZ } from './subworkflows/local/prepare_colabfold_dbs' + +include { ALPHAFOLD2 } from './workflows/alphafold2' +include { ALPHAFOLD3 } from './workflows/alphafold3' +include { COLABFOLD } from 
'./workflows/colabfold' +include { ESMFOLD } from './workflows/esmfold' +include { ROSETTAFOLD_ALL_ATOM } from './workflows/rosettafold_all_atom' +include { HELIXFOLD3 } from './workflows/helixfold3' +include { BOLTZ } from './workflows/boltz' +include { ROSETTAFOLD2NA } from './workflows/rosettafold2na' include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_proteinfold_pipeline' include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_proteinfold_pipeline' include { getColabfoldAlphafold2Params } from './subworkflows/local/utils_nfcore_proteinfold_pipeline' include { getColabfoldAlphafold2ParamsPath } from './subworkflows/local/utils_nfcore_proteinfold_pipeline' +include { POST_PROCESSING } from './subworkflows/local/post_processing' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -51,44 +59,58 @@ params.colabfold_alphafold2_params_path = getColabfoldAlphafold2ParamsPath() // // WORKFLOW: Run main analysis pipeline // + workflow NFCORE_PROTEINFOLD { + take: + samplesheet // channel: samplesheet read in from --input + main: - ch_multiqc = Channel.empty() - ch_versions = Channel.empty() + ch_samplesheet = samplesheet + ch_multiqc = channel.empty() + ch_versions = channel.empty() + ch_report_input = channel.empty() + ch_top_ranked_model = channel.empty() + requested_modes = params.mode.toLowerCase().split(",") + requested_modes_size = requested_modes.size() + + ch_dummy_file = channel.fromPath("$projectDir/assets/NO_FILE") + ch_dummy_file_pae = channel.fromPath("$projectDir/assets/NO_FILE_PAE") // // WORKFLOW: Run alphafold2 // - if(params.mode == "alphafold2") { + if(requested_modes.contains("alphafold2")) { + // // SUBWORKFLOW: Prepare Alphafold2 DBs // PREPARE_ALPHAFOLD2_DBS ( params.alphafold2_db, - params.full_dbs, - params.bfd_path, - params.small_bfd_path, + params.alphafold2_full_dbs, + params.alphafold2_bfd_path, + params.alphafold2_small_bfd_path, 
params.alphafold2_params_path, - params.mgnify_path, - params.pdb70_path, - params.pdb_mmcif_path, - params.uniref30_alphafold2_path, - params.uniref90_path, - params.pdb_seqres_path, - params.uniprot_path, - params.bfd_link, - params.small_bfd_link, + params.alphafold2_mgnify_path, + params.alphafold2_pdb70_path, + params.alphafold2_pdb_mmcif_path, + params.alphafold2_pdb_obsolete_path, + params.alphafold2_uniref30_path, + params.alphafold2_uniref90_path, + params.alphafold2_pdb_seqres_path, + params.alphafold2_uniprot_path, + params.alphafold2_bfd_link, + params.alphafold2_small_bfd_link, params.alphafold2_params_link, - params.mgnify_link, - params.pdb70_link, - params.pdb_mmcif_link, - params.pdb_obsolete_link, - params.uniref30_alphafold2_link, - params.uniref90_link, - params.pdb_seqres_link, - params.uniprot_sprot_link, - params.uniprot_trembl_link + params.alphafold2_mgnify_link, + params.alphafold2_pdb70_link, + params.alphafold2_pdb_mmcif_link, + params.alphafold2_pdb_obsolete_link, + params.alphafold2_uniref30_link, + params.alphafold2_uniref90_link, + params.alphafold2_pdb_seqres_link, + params.alphafold2_uniprot_sprot_link, + params.alphafold2_uniprot_trembl_link ) ch_versions = ch_versions.mix(PREPARE_ALPHAFOLD2_DBS.out.versions) @@ -96,64 +118,186 @@ workflow NFCORE_PROTEINFOLD { // WORKFLOW: Run nf-core/alphafold2 workflow // ALPHAFOLD2 ( + ch_samplesheet, ch_versions, - params.full_dbs, + params.alphafold2_full_dbs, params.alphafold2_mode, params.alphafold2_model_preset, + params.uniref30_prefix, PREPARE_ALPHAFOLD2_DBS.out.params, - PREPARE_ALPHAFOLD2_DBS.out.bfd.ifEmpty([]).first(), - PREPARE_ALPHAFOLD2_DBS.out.small_bfd.ifEmpty([]).first(), + PREPARE_ALPHAFOLD2_DBS.out.bfd, + PREPARE_ALPHAFOLD2_DBS.out.small_bfd, PREPARE_ALPHAFOLD2_DBS.out.mgnify, PREPARE_ALPHAFOLD2_DBS.out.pdb70, PREPARE_ALPHAFOLD2_DBS.out.pdb_mmcif, + PREPARE_ALPHAFOLD2_DBS.out.pdb_obsolete, PREPARE_ALPHAFOLD2_DBS.out.uniref30, PREPARE_ALPHAFOLD2_DBS.out.uniref90, 
PREPARE_ALPHAFOLD2_DBS.out.pdb_seqres, PREPARE_ALPHAFOLD2_DBS.out.uniprot ) - ch_multiqc = ALPHAFOLD2.out.multiqc_report - ch_versions = ch_versions.mix(ALPHAFOLD2.out.versions) + ch_multiqc = ch_multiqc.mix(ALPHAFOLD2.out.multiqc_report.collect()) + ch_versions = ch_versions.mix(ALPHAFOLD2.out.versions) + ch_report_input = ch_report_input + .mix(ALPHAFOLD2 + .out + .pdb + .map { it -> + [ it[0], + it[1].sort { path -> + def filename = path.name + def matcher = filename =~ /ranked_(\d+)\.pdb/ + if (matcher.matches()) { + return matcher[0][1].toInteger() + } else { + return 0 // fallback if no match + } + }.subList(0, Math.min(5, it[1].size() as int)) + ] + } + .join(ALPHAFOLD2.out.msa) + .join(ALPHAFOLD2.out.pae) + ) + + ch_top_ranked_model = ch_top_ranked_model.mix(ALPHAFOLD2.out.top_ranked_pdb) + } + + // + // WORKFLOW: Run alphafold3 + // + if(requested_modes.contains("alphafold3")) { + + // + // SUBWORKFLOW: Prepare Alphafold3 DBs + // + PREPARE_ALPHAFOLD3_DBS ( + params.alphafold3_db, + params.alphafold3_params_path, + params.alphafold3_small_bfd_path, + params.alphafold3_mgnify_path, + params.alphafold3_pdb_mmcif_path, + params.alphafold3_uniref90_path, + params.alphafold3_pdb_seqres_path, + params.alphafold3_uniprot_path, + params.alphafold3_rnacentral_path, + params.alphafold3_nt_rna_path, + params.alphafold3_rfam_path, + params.alphafold3_small_bfd_link, + params.alphafold3_mgnify_link, + params.alphafold3_pdb_mmcif_link, + params.alphafold3_uniref90_link, + params.alphafold3_pdb_seqres_link, + params.alphafold3_uniprot_link, + params.alphafold3_rnacentral_link, + params.alphafold3_nt_rna_link, + params.alphafold3_rfam_link + ) + ch_versions = ch_versions.mix(PREPARE_ALPHAFOLD3_DBS.out.versions) + + // + // WORKFLOW: Run nf-core/alphafold3 workflow + // + ALPHAFOLD3 ( + ch_samplesheet, + ch_versions, + PREPARE_ALPHAFOLD3_DBS.out.params, + PREPARE_ALPHAFOLD3_DBS.out.small_bfd, + PREPARE_ALPHAFOLD3_DBS.out.mgnify, + PREPARE_ALPHAFOLD3_DBS.out.pdb_mmcif, + 
PREPARE_ALPHAFOLD3_DBS.out.uniref90, + PREPARE_ALPHAFOLD3_DBS.out.pdb_seqres, + PREPARE_ALPHAFOLD3_DBS.out.uniprot + ) + + ch_multiqc = ch_multiqc.mix(ALPHAFOLD3.out.multiqc_report) + ch_versions = ch_versions.mix(ALPHAFOLD3.out.versions) + ch_report_input = ch_report_input + .mix( + ALPHAFOLD3 + .out + .pdb + .map { it -> + [ + it[0], + it[1].sort { path -> + def filename = path.name + def matcher = filename =~ /.*_ranked_(\d+)\.pdb/ + if (matcher.matches()) { + return matcher[0][1].toInteger() + } else { + return 0 // fallback if no match + } + }.subList(0, Math.min(5, it[1].size() as int)) + ] + } + .join(ALPHAFOLD3.out.msa) + .join(ALPHAFOLD3.out.pae) + ) + ch_top_ranked_model = ch_top_ranked_model.mix(ALPHAFOLD3.out.top_ranked_pdb) } // // WORKFLOW: Run colabfold // - else if(params.mode == "colabfold") { + if(requested_modes.contains("colabfold")) { + // // SUBWORKFLOW: Prepare Colabfold DBs // - PREPARE_COLABFOLD_DBS ( + PREPARE_COLABFOLD_DBS_COLABFOLD ( params.colabfold_db, - params.colabfold_server, + params.use_msa_server, params.colabfold_alphafold2_params_path, - params.colabfold_db_path, - params.uniref30_colabfold_path, + params.colabfold_envdb_path, + params.colabfold_uniref30_path, params.colabfold_alphafold2_params_link, params.colabfold_db_link, - params.uniref30_colabfold_link, - params.create_colabfold_index + params.colabfold_uniref30_link, + params.colabfold_create_index ) - ch_versions = ch_versions.mix(PREPARE_COLABFOLD_DBS.out.versions) + ch_versions = ch_versions.mix(PREPARE_COLABFOLD_DBS_COLABFOLD.out.versions) // // WORKFLOW: Run nf-core/colabfold workflow // COLABFOLD ( + ch_samplesheet, ch_versions, params.colabfold_model_preset, - PREPARE_COLABFOLD_DBS.out.params, - PREPARE_COLABFOLD_DBS.out.colabfold_db, - PREPARE_COLABFOLD_DBS.out.uniref30, - params.num_recycles_colabfold + PREPARE_COLABFOLD_DBS_COLABFOLD.out.params, + PREPARE_COLABFOLD_DBS_COLABFOLD.out.colabfold_db, + PREPARE_COLABFOLD_DBS_COLABFOLD.out.uniref30, + 
params.colabfold_num_recycles ) - ch_multiqc = COLABFOLD.out.multiqc_report - ch_versions = ch_versions.mix(COLABFOLD.out.versions) + + ch_multiqc = ch_multiqc.mix(COLABFOLD.out.multiqc_report) + ch_versions = ch_versions.mix(COLABFOLD.out.versions) + ch_report_input = ch_report_input + .mix(COLABFOLD.out.pdb.map { it -> + [ it[0], + it[1].sort { path -> + def filename = path.name + def matcher = filename =~ /_relaxed_rank_(\d+)\.pdb/ + if (matcher.matches()) { + return matcher[0][1].toInteger() + } else { + return 0 // fallback if no match + } + }.subList(0, Math.min(5, it[1].size() as int)) + ] + } + .join(COLABFOLD.out.msa) + .join(COLABFOLD.out.pae) + ) + + ch_top_ranked_model = ch_top_ranked_model.mix(COLABFOLD.out.top_ranked_pdb) } // // WORKFLOW: Run esmfold // - else if(params.mode == "esmfold") { + if(requested_modes.contains("esmfold")) { + // // SUBWORKFLOW: Prepare esmfold DBs // @@ -170,16 +314,286 @@ workflow NFCORE_PROTEINFOLD { // WORKFLOW: Run nf-core/esmfold workflow // ESMFOLD ( + ch_samplesheet, ch_versions, PREPARE_ESMFOLD_DBS.out.params, - params.num_recycles_esmfold + params.esmfold_num_recycles + ) + + ch_multiqc = ch_multiqc.mix(ESMFOLD.out.multiqc_report.collect()) + ch_versions = ch_versions.mix(ESMFOLD.out.versions) + ch_report_input = ch_report_input.mix( + ESMFOLD.out.pdb + .combine(ch_dummy_file) + .combine(ch_dummy_file_pae) + ) + ch_top_ranked_model = ch_top_ranked_model.mix(ESMFOLD.out.pdb) + } + + // + + // WORKFLOW: Run rosettafold_all_atom + // + if(requested_modes.contains("rosettafold_all_atom")) { + + // + // SUBWORKFLOW: Prepare Rosettafold-all-atom DBs + // + PREPARE_ROSETTAFOLD_ALL_ATOM_DBS ( + params.rosettafold_all_atom_db, + params.rosettafold_all_atom_bfd_path, + params.rosettafold_all_atom_uniref30_path, + params.rosettafold_all_atom_pdb100_path, + params.rosettafold_all_atom_paper_weights_path, + params.rosettafold_all_atom_bfd_link, + params.rosettafold_all_atom_uniref30_link, + 
params.rosettafold_all_atom_pdb100_link, + params.rosettafold_all_atom_paper_weights_link + ) + ch_versions = ch_versions.mix(PREPARE_ROSETTAFOLD_ALL_ATOM_DBS.out.versions) + + // + // WORKFLOW: Run nf-core/rosettafold_all_atom workflow + // + ROSETTAFOLD_ALL_ATOM ( + ch_samplesheet, + ch_versions, + params.uniref30_prefix, + PREPARE_ROSETTAFOLD_ALL_ATOM_DBS.out.bfd, + PREPARE_ROSETTAFOLD_ALL_ATOM_DBS.out.uniref30, + PREPARE_ROSETTAFOLD_ALL_ATOM_DBS.out.pdb100, + PREPARE_ROSETTAFOLD_ALL_ATOM_DBS.out.rfaa_paper_weights + ) + ch_multiqc = ch_multiqc.mix(ROSETTAFOLD_ALL_ATOM.out.multiqc_report.collect()) + ch_versions = ch_versions.mix(ROSETTAFOLD_ALL_ATOM.out.versions) + ch_report_input = ch_report_input.mix(ROSETTAFOLD_ALL_ATOM.out.pdb + .join(ROSETTAFOLD_ALL_ATOM.out.msa) + .join(ROSETTAFOLD_ALL_ATOM.out.pae) + ) + ch_top_ranked_model = ch_top_ranked_model.mix(ROSETTAFOLD_ALL_ATOM.out.pdb) + } + + // + // WORKFLOW: Run helixfold3 + // + if(requested_modes.contains("helixfold3")) { + + // + // SUBWORKFLOW: Prepare helixfold3 DBs + // + PREPARE_HELIXFOLD3_DBS ( + params.helixfold3_db, + params.helixfold3_uniclust30_link, + params.helixfold3_ccd_preprocessed_link, + params.helixfold3_rfam_link, + params.helixfold3_init_models_link, + params.helixfold3_bfd_link, + params.helixfold3_small_bfd_link, + params.helixfold3_uniprot_sprot_link, + params.helixfold3_uniprot_trembl_link, + params.helixfold3_pdb_seqres_link, + params.helixfold3_uniref90_link, + params.helixfold3_mgnify_link, + params.helixfold3_pdb_mmcif_link, + params.helixfold3_obsolete_link, + params.helixfold3_maxit_src_link, + params.helixfold3_uniclust30_path, + params.helixfold3_ccd_preprocessed_path, + params.helixfold3_rfam_path, + params.helixfold3_init_models_path, + params.helixfold3_bfd_path, + params.helixfold3_small_bfd_path, + params.helixfold3_uniprot_path, + params.helixfold3_pdb_seqres_path, + params.helixfold3_uniref90_path, + params.helixfold3_mgnify_path, + params.helixfold3_pdb_mmcif_path, + 
params.helixfold3_obsolete_path, + params.helixfold3_maxit_src_path + ) + ch_versions = ch_versions.mix(PREPARE_HELIXFOLD3_DBS.out.versions) + + // + // WORKFLOW: Run nf-core/helixfold3 workflow + // + HELIXFOLD3 ( + ch_samplesheet, + ch_versions, + params.uniref30_prefix, + PREPARE_HELIXFOLD3_DBS.out.helixfold3_uniclust30, + PREPARE_HELIXFOLD3_DBS.out.helixfold3_ccd_preprocessed, + PREPARE_HELIXFOLD3_DBS.out.helixfold3_rfam, + PREPARE_HELIXFOLD3_DBS.out.helixfold3_bfd, + PREPARE_HELIXFOLD3_DBS.out.helixfold3_small_bfd, + PREPARE_HELIXFOLD3_DBS.out.helixfold3_uniprot, + PREPARE_HELIXFOLD3_DBS.out.helixfold3_pdb_seqres, + PREPARE_HELIXFOLD3_DBS.out.helixfold3_uniref90, + PREPARE_HELIXFOLD3_DBS.out.helixfold3_mgnify, + PREPARE_HELIXFOLD3_DBS.out.helixfold3_mmcif_files, + PREPARE_HELIXFOLD3_DBS.out.helixfold3_obsolete, + PREPARE_HELIXFOLD3_DBS.out.helixfold3_init_models, + PREPARE_HELIXFOLD3_DBS.out.helixfold3_maxit_src + ) + ch_multiqc = ch_multiqc.mix(HELIXFOLD3.out.multiqc_report.collect()) + ch_versions = ch_versions.mix(HELIXFOLD3.out.versions) + ch_report_input = ch_report_input + .mix(HELIXFOLD3.out.pdb.map { it -> + [ it[0], + it[1].sort { path -> + def filename = path.name + def matcher = filename =~ /ranked_(\d+)\.pdb/ + if (matcher.matches()) { + return matcher[0][1].toInteger() + } else { + return 0 // fallback if no match + } + }.subList(0, Math.min(5, it[1].size() as int)) + ] + } + .join(HELIXFOLD3.out.msa) + .join(HELIXFOLD3.out.pae) + ) + ch_top_ranked_model = ch_top_ranked_model.mix(HELIXFOLD3.out.top_ranked_pdb) + } + + // + // WORKFLOW: Run rosettafold2na + // + if(requested_modes.contains("rosettafold2na")) { + + // + // SUBWORKFLOW: Prepare RosettaFold2NA DBs + // + PREPARE_ROSETTAFOLD2NA_DBS ( + params.rosettafold2na_db, + params.rosettafold2na_bfd_path, + params.rosettafold2na_uniref30_path, + params.rosettafold2na_pdb100_path, + params.rosettafold2na_rna_path, + params.rosettafold2na_weights_path, + params.rosettafold2na_bfd_link, + 
params.rosettafold2na_uniref30_link, + params.rosettafold2na_pdb100_link, + params.rosettafold2na_weights_link, + params.rfam_full_region_link, + params.rfam_cm_link, + params.rnacentral_rfam_annotations_link, + params.rnacentral_id_mapping_link, + params.rnacentral_sequences_link + ) + ch_versions = ch_versions.mix(PREPARE_ROSETTAFOLD2NA_DBS.out.versions) + + // + // WORKFLOW: Run nf-core/rosettafold2na workflow + // + ROSETTAFOLD2NA ( + ch_samplesheet, + ch_versions, + PREPARE_ROSETTAFOLD2NA_DBS.out.bfd, + PREPARE_ROSETTAFOLD2NA_DBS.out.uniref30, + PREPARE_ROSETTAFOLD2NA_DBS.out.pdb100, + PREPARE_ROSETTAFOLD2NA_DBS.out.rna, + PREPARE_ROSETTAFOLD2NA_DBS.out.rosettafold2na_weights ) - ch_multiqc = ESMFOLD.out.multiqc_report - ch_versions = ch_versions.mix(ESMFOLD.out.versions) + ch_multiqc = ch_multiqc.mix(ROSETTAFOLD2NA.out.multiqc_report.collect()) + ch_versions = ch_versions.mix(ROSETTAFOLD2NA.out.versions) + ch_report_input = ch_report_input + .mix( + ROSETTAFOLD2NA + .out + .pdb + .map { meta, pdb -> [ meta, [ pdb ] ] } + .join(ROSETTAFOLD2NA.out.msa) + .join(ROSETTAFOLD2NA.out.pae) + ) + ch_top_ranked_model = ch_top_ranked_model.mix(ROSETTAFOLD2NA.out.pdb) } + + // WORKFLOW: Run Boltz + // + if (requested_modes.contains("boltz")) { + + PREPARE_BOLTZ_DBS( + params.boltz_db, + params.boltz_ccd_path, + params.boltz_model_path, + params.boltz2_aff_path, + params.boltz2_conf_path, + params.boltz2_mols_path, + params.boltz_ccd_link, + params.boltz_model_link, + params.boltz2_aff_link, + params.boltz2_conf_link, + params.boltz2_mols_link + ) + ch_versions = ch_versions.mix(PREPARE_BOLTZ_DBS.out.versions) + + PREPARE_COLABFOLD_DBS_BOLTZ ( + params.colabfold_db, + params.use_msa_server, + params.colabfold_alphafold2_params_path, + params.colabfold_envdb_path, + params.colabfold_uniref30_path, + params.colabfold_alphafold2_params_link, + params.colabfold_db_link, + params.colabfold_uniref30_link, + params.colabfold_create_index + ) + ch_versions = 
ch_versions.mix(PREPARE_COLABFOLD_DBS_BOLTZ.out.versions) + + BOLTZ( + ch_samplesheet, + ch_versions, + PREPARE_BOLTZ_DBS.out.boltz_ccd, + PREPARE_BOLTZ_DBS.out.boltz_model, + PREPARE_BOLTZ_DBS.out.boltz2_aff, + PREPARE_BOLTZ_DBS.out.boltz2_conf, + PREPARE_BOLTZ_DBS.out.boltz2_mols, + PREPARE_COLABFOLD_DBS_BOLTZ.out.colabfold_db, + PREPARE_COLABFOLD_DBS_BOLTZ.out.uniref30, + params.use_msa_server + ) + ch_multiqc = ch_multiqc.mix(BOLTZ.out.multiqc_report) + ch_versions = ch_versions.mix(BOLTZ.out.versions) + ch_report_input = ch_report_input.mix( + BOLTZ.out.pdb + .join(BOLTZ.out.msa) + .join(BOLTZ.out.pae) + ) + ch_top_ranked_model = ch_top_ranked_model.mix(BOLTZ.out.top_ranked_pdb) + } + // + // POST PROCESSING: generate visualisation reports + // + ch_multiqc_config = channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true).first() + ch_multiqc_custom_config = params.multiqc_config ? channel.fromPath( params.multiqc_config ).first() : channel.empty() + ch_multiqc_logo = params.multiqc_logo ? channel.fromPath( params.multiqc_logo ).first() : channel.empty() + ch_multiqc_methods_description = params.multiqc_methods_description ? 
file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + ch_report_template = channel.value(file("$projectDir/assets/report_template.html", checkIfExists: true)) + ch_comparison_template = channel.value(file("$projectDir/assets/comparison_template.html", checkIfExists: true)) + + POST_PROCESSING( + params.skip_visualisation, + requested_modes_size, + ch_report_input, + ch_report_template, + ch_comparison_template, + params.skip_foldseek, + params.foldseek_db, + params.foldseek_db_path, + params.skip_multiqc, + params.outdir, + ch_versions, + ch_multiqc, + ch_multiqc_config, + ch_multiqc_custom_config, + ch_multiqc_logo, + ch_multiqc_methods_description, + ch_top_ranked_model + ) + emit: - multiqc_report = ch_multiqc // channel: /path/to/multiqc_report.html - versions = ch_versions // channel: [version1, version2, ...] + multiqc_report = ch_multiqc } /* @@ -196,17 +610,22 @@ workflow { // PIPELINE_INITIALISATION ( params.version, - params.help, params.validate_params, params.monochrome_logs, args, - params.outdir + params.outdir, + params.input, + params.help, + params.help_full, + params.show_hidden ) // // WORKFLOW: Run main workflow // - NFCORE_PROTEINFOLD () + NFCORE_PROTEINFOLD ( + PIPELINE_INITIALISATION.out.samplesheet + ) // // SUBWORKFLOW: Run completion tasks diff --git a/modules.json b/modules.json index cdb36bb6b..387857a74 100644 --- a/modules.json +++ b/modules.json @@ -11,6 +11,12 @@ "installed_by": ["modules"], "patch": "modules/nf-core/aria2/aria2.diff" }, + "foldseek/easysearch": { + "branch": "master", + "git_sha": "a02efd7783000a416d5d2f1b2bc86b8d41b9f439", + "installed_by": ["modules"], + "patch": "modules/nf-core/foldseek/easysearch/foldseek-easysearch.diff" + }, "gunzip": { "branch": "master", "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", @@ -18,7 +24,7 @@ }, "mmseqs/createindex": { "branch": "master", - "git_sha": 
"151460db852d636979d9ff3ee631e2268060d4c3", + "git_sha": "38697a933bef7041bb935c9b8374d9948ce6c794", "installed_by": ["modules"] }, "mmseqs/tsv2exprofiledb": { @@ -28,15 +34,20 @@ }, "multiqc": { "branch": "master", - "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", + "git_sha": "f0719ae309075ae4a291533883847c3f7c441dad", "installed_by": ["modules"], "patch": "modules/nf-core/multiqc/multiqc.diff" }, "untar": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", "installed_by": ["modules"], "patch": "modules/nf-core/untar/untar.diff" + }, + "unzip": { + "branch": "master", + "git_sha": "4dd9d8439a429c7ee566e0e2347f76ddeef27e66", + "installed_by": ["modules"] } } }, @@ -44,17 +55,17 @@ "nf-core": { "utils_nextflow_pipeline": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", + "git_sha": "65f5e638d901a51534c68fd5c1c19e8112fb4df1", "installed_by": ["subworkflows"] }, - "utils_nfvalidation_plugin": { + "utils_nfschema_plugin": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "fdc08b8b1ae74f56686ce21f7ea11ad11990ce57", "installed_by": ["subworkflows"] } } diff --git a/modules/local/boltz_fasta/environment.yml b/modules/local/boltz_fasta/environment.yml new file mode 100644 index 000000000..012de0929 --- /dev/null +++ b/modules/local/boltz_fasta/environment.yml @@ -0,0 +1,6 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.8.3 diff --git a/modules/local/boltz_fasta/main.nf b/modules/local/boltz_fasta/main.nf new file mode 100644 index 000000000..4c120aec8 --- 
/dev/null +++ b/modules/local/boltz_fasta/main.nf @@ -0,0 +1,42 @@ +process BOLTZ_FASTA { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'biocontainers/python:3.8.3' }" + + input: + tuple val(meta), path(fasta), path(msa) + + output: + tuple val(meta), path ("output_fasta/*.fasta"), path(msa), emit: formatted_fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def msa_files = msa ? "--msa " + msa.join(' ') : '' + """ + fasta_to_boltz.py ${fasta} ${meta.id} ${msa_files} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + """ + mkdir output_fasta + touch "output_fasta/${meta.id}.fasta" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/colabfold_batch.nf b/modules/local/colabfold_batch.nf deleted file mode 100644 index 5dab51fb8..000000000 --- a/modules/local/colabfold_batch.nf +++ /dev/null @@ -1,61 +0,0 @@ -process COLABFOLD_BATCH { - tag "$meta.id" - label 'process_medium' - - // Exit if running this module with -profile conda / -profile mamba - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - error("Local COLABFOLD_BATCH module does not support Conda. 
Please use Docker / Singularity / Podman instead.") - } - - container "nf-core/proteinfold_colabfold:1.1.1" - - input: - tuple val(meta), path(fasta) - val colabfold_model_preset - path ('params/*') - path ('colabfold_db/*') - path ('uniref30/*') - val numRec - - output: - path ("*") , emit: pdb - path ("*_mqc.png") , emit: multiqc - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def VERSION = '1.5.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. - - """ - ln -r -s params/alphafold_params_*/* params/ - colabfold_batch \\ - $args \\ - --num-recycle ${numRec} \\ - --data \$PWD \\ - --model-type ${colabfold_model_preset} \\ - ${fasta} \\ - \$PWD - for i in `find *_relaxed_rank_001*.pdb`; do cp \$i `echo \$i | sed "s|_relaxed_rank_|\t|g" | cut -f1`"_colabfold.pdb"; done - for i in `find *.png -maxdepth 0`; do cp \$i \${i%'.png'}_mqc.png; done - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - colabfold_batch: $VERSION - END_VERSIONS - """ - - stub: - def VERSION = '1.5.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. - """ - touch ./"${fasta.baseName}"_colabfold.pdb - touch ./"${fasta.baseName}"_mqc.png - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - colabfold_batch: $VERSION - END_VERSIONS - """ -} diff --git a/modules/local/colabfold_batch/Dockerfile b/modules/local/colabfold_batch/Dockerfile new file mode 100644 index 000000000..d886bfc7a --- /dev/null +++ b/modules/local/colabfold_batch/Dockerfile @@ -0,0 +1,49 @@ +ARG CUDA_VERSION=12.9.1 +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu24.04 + +# FROM directive resets ARGS, so we specify again (the value is retained if +# previously set). 
+ARG CUDA_VERSION + +LABEL org.opencontainers.image.title="nf-core/proteinfold_colabfold" \ + org.opencontainers.image.description="Docker image containing all software requirements to run the COLABFOLD_BATCH module using the nf-core/proteinfold pipeline" \ + org.opencontainers.image.version="2.0.0" \ + org.opencontainers.image.authors="Joshua Storm Caley " \ + org.opencontainers.image.source="https://github.com/nf-core/proteinfold" \ + org.opencontainers.image.licenses="MIT" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends --no-install-suggests \ + wget \ + git \ + cuda-nvcc-$(echo $CUDA_VERSION | cut -d'.' -f1,2 | tr '.' '-') && \ + \ + wget -qnc https://github.com/conda-forge/miniforge/releases/download/24.11.0-0/Miniforge3-24.11.0-0-Linux-x86_64.sh && \ + bash Miniforge3-24.11.0-0-Linux-x86_64.sh -bfp /usr/local && \ + rm -f Miniforge3-24.11.0-0-Linux-x86_64.sh && \ + \ + CONDA_OVERRIDE_CUDA=$(echo $CUDA_VERSION | cut -d'.' -f1,2) mamba install -c conda-forge -c nvidia -c bioconda \ + openmm pdbfixer kalign2=2.04 hhsuite=3.3.0 -y && \ + \ + pip install --no-cache-dir --no-warn-conflicts "colabfold[alphafold-minus-jax] @ git+https://github.com/sokrypton/ColabFold.git@e8ebd9a" && \ + pip install --no-cache-dir silence_tensorflow && \ + pip uninstall -y jax jaxlib || true && \ + pip install --no-cache-dir "jax[cuda12]==0.5.3" "jaxlib==0.5.3" && \ + \ + mamba clean -a -y && \ + rm -rf /usr/local/pkgs && \ + find /usr/local -name "*.pyc" -delete && \ + find /usr/local -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true && \ + rm -rf /usr/local/share/doc /usr/local/share/man /usr/local/share/info && \ + \ + apt-get remove -y wget git && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + pip cache purge + +ENV MPLBACKEND=Agg +ENV MPLCONFIGDIR=/tmp/mplconfig +ENV XDG_CACHE_HOME=/tmp/xdg_cache +ENV XLA_FLAGS="--xla_gpu_cuda_data_dir=/usr/local/cuda" +ENV 
CUDA_ROOT="/usr/local/cuda" diff --git a/modules/local/colabfold_batch/main.nf b/modules/local/colabfold_batch/main.nf new file mode 100644 index 000000000..ee567e2d8 --- /dev/null +++ b/modules/local/colabfold_batch/main.nf @@ -0,0 +1,99 @@ +process COLABFOLD_BATCH { + tag "$meta.id" + label 'process_medium' + label 'process_gpu' + + container "nf-core/proteinfold_colabfold:2.0.0" + + input: + tuple val(meta), path(fasta) + val colabfold_model_preset + path ('params/*') + path ('colabfold_db/*') + path ('uniref30/*') + val numRec + + output: + path ("raw/**") , emit: raw + tuple val(meta), path ("${meta.id}_colabfold.pdb") , emit: top_ranked_pdb + tuple val(meta), path ("raw/*relaxed_rank_*.pdb") , emit: pdb + tuple val(meta), path ("${meta.id}_colabfold_msa.tsv") , emit: msa + tuple val(meta), path ("${meta.id}_plddt.tsv") , emit: multiqc + tuple val(meta), path ("${meta.id}_*_pae.tsv") , optional: true, emit: paes + tuple val(meta), path ("${meta.id}_0_pae.tsv") , optional: true, emit: pae + tuple val(meta), path ("${meta.id}_ptm.tsv") , optional: true, emit: ptms + tuple val(meta), path ("${meta.id}_iptm.tsv") , optional: true, emit: iptms + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error("Local COLABFOLD_BATCH module does not support Conda. 
Please use Docker / Singularity / Podman instead.") + } + def args = task.ext.args ?: '' + + """ + if compgen -G "params/alphafold_params_*" >/dev/null; then + ln -s \$(realpath params/alphafold_params_*/*) params/ + fi + + touch params/download_finished.txt + touch params/download_complexes_multimer_v3_finished.txt + touch params/download_complexes_multimer_v2_finished.txt + touch params/download_complexes_multimer_v1_finished.txt + + colabfold_batch \\ + $args \\ + --num-recycle ${numRec} \\ + --data \$PWD \\ + --model-type ${colabfold_model_preset} \\ + ${fasta} \\ + raw/ + + if [ ! -e `find raw/*_relaxed_rank_001_*.pdb` ]; then + prefix=relaxed + cp raw/*_relaxed_rank_001*.pdb ${meta.id}_colabfold.pdb + else + prefix=unrelaxed + cp raw/*_unrelaxed_rank_001*.pdb ${meta.id}_colabfold.pdb + fi + + extract_metrics.py --name ${meta.id} \\ + --colabfold_metrics_fns raw/*scores_rank*.json \\ + --structs raw/*_\${prefix}_rank*.pdb \\ + --paired_a3m raw/${meta.id}.a3m + + cp raw/*_coverage.png ${meta.id}_seq_coverage.png + mv "${meta.id}_msa.tsv" "${meta.id}_colabfold_msa.tsv" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + alphafold_colabfold: \$(pip list | grep "^alphafold-colabfold" | awk '{print \$2}' 2>/dev/null || echo "unknown") + colabfold_batch: \$(pip list | grep "^colabfold" | awk '{print \$2}' 2>/dev/null || echo "unknown") + END_VERSIONS + """ + + stub: + """ + mkdir raw + touch ./"${meta.id}"_colabfold.pdb + touch ./raw/${meta.id}_relaxed_rank_001_model_1_seed_000.pdb + touch ./raw/${meta.id}_relaxed_rank_002_model_2_seed_000.pdb + touch ./raw/${meta.id}_relaxed_rank_003_model_3_seed_000.pdb + touch ./${meta.id}_seq_coverage.png + touch ./raw/${meta.id}_scores_rank.json + touch ./${meta.id}_0_pae.tsv + touch ./${meta.id}_ptm.tsv + touch ./${meta.id}_plddt.tsv + touch ./${meta.id}_colabfold_msa.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + alphafold_colabfold: \$(pip list | grep "^alphafold-colabfold" | awk '{print \$2}' 
2>/dev/null || echo "unknown") + colabfold_batch: \$(pip list | grep "^colabfold" | awk '{print \$2}' 2>/dev/null || echo "unknown") + END_VERSIONS + """ +} diff --git a/modules/local/combine_uniprot/environment.yml b/modules/local/combine_uniprot/environment.yml new file mode 100644 index 000000000..a5702139b --- /dev/null +++ b/modules/local/combine_uniprot/environment.yml @@ -0,0 +1,8 @@ +name: combine_uniprot +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::grep=3.11 + - conda-forge::sed=4.8 + - conda-forge::tar=1.34 diff --git a/modules/local/combine_uniprot.nf b/modules/local/combine_uniprot/main.nf similarity index 86% rename from modules/local/combine_uniprot.nf rename to modules/local/combine_uniprot/main.nf index 7f4637b33..cae8fa2c4 100644 --- a/modules/local/combine_uniprot.nf +++ b/modules/local/combine_uniprot/main.nf @@ -1,7 +1,7 @@ process COMBINE_UNIPROT { label 'process_single' - conda "conda-forge::sed=4.7" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : 'nf-core/ubuntu:20.04' }" @@ -18,7 +18,6 @@ process COMBINE_UNIPROT { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' """ set -e @@ -36,7 +35,7 @@ process COMBINE_UNIPROT { touch uniprot.fasta cat <<-END_VERSIONS > versions.yml "${task.process}": - awk: \$(gawk --version| head -1 | sed 's/GNU Awk //; s/, API:.*//') + sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') END_VERSIONS """ } diff --git a/modules/local/compare_structures/environment.yml b/modules/local/compare_structures/environment.yml new file mode 100644 index 000000000..9f657a6fd --- /dev/null +++ b/modules/local/compare_structures/environment.yml @@ -0,0 +1,9 @@ +name: compare_structures +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::biopython=1.84 + - conda-forge::matplotlib=3.9.2 + - conda-forge::pip=24.2 + - conda-forge::plotly=5.24.1 diff --git a/modules/local/compare_structures/main.nf b/modules/local/compare_structures/main.nf new file mode 100644 index 000000000..b5fb52fc1 --- /dev/null +++ b/modules/local/compare_structures/main.nf @@ -0,0 +1,52 @@ +process COMPARE_STRUCTURES { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/24/241f0746484727a3633f544c3747bfb77932e1c8c252e769640bd163232d9112/data' : + 'community.wave.seqera.io/library/biopython_matplotlib_pip_plotly:35975fa0fc54b2d3' }" + + input: + tuple val(meta), val(pdb) + tuple val(meta_msa), val(msa) + path (all_files) + path(template) + + output: + tuple val(meta), path ("*report.html"), emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + """ + generate_comparison_report.py \\ + --msa ${msa.join(' ')} \\ + --pdb ${pdb.join(' ')} \\ + --html_template ${template} \\ + --output_dir ./ \\ + --name ${meta.id} \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + generate_comparison_report.py: \$(python3 --version) + END_VERSIONS + """ + + stub: + """ + touch test_alphafold2_report.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + generate_comparison_report.py: \$(python3 --version) + END_VERSIONS + """ +} diff --git a/modules/local/download_pdbmmcif/environment.yml b/modules/local/download_pdbmmcif/environment.yml new file mode 100644 index 000000000..4a53be04d --- /dev/null +++ b/modules/local/download_pdbmmcif/environment.yml @@ -0,0 +1,7 @@ +name: download_pdbmmcif +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::aria2=1.36.0 + - conda-forge::rsync=3.3.0 diff --git a/modules/local/download_pdbmmcif.nf b/modules/local/download_pdbmmcif/main.nf similarity index 66% rename from modules/local/download_pdbmmcif.nf rename to modules/local/download_pdbmmcif/main.nf index 98ef831ed..64d6a8f45 100644 --- a/modules/local/download_pdbmmcif.nf +++ b/modules/local/download_pdbmmcif/main.nf @@ -2,28 +2,26 @@ * Download PDB MMCIF database */ process DOWNLOAD_PDBMMCIF { - tag 
"${source_url_pdb_mmcif}--${source_url_pdb_obsolete}" + tag "${source_url_pdb_mmcif}" label 'process_low' label 'error_retry' - conda "bioconda::aria2=1.36.0 conda-forge::rsync=3.2.7" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-4a7c46784ad871c48746744c6b8dbc5d0a97b9ca:33e61a87922824f8afcecf88a7717a2d4cb514e9-0' : - 'biocontainers/mulled-v2-4a7c46784ad871c48746744c6b8dbc5d0a97b9ca:33e61a87922824f8afcecf88a7717a2d4cb514e9-0' }" + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/3c/3c2e1079a0721851248bd2aa45f3d4cd32bfdb7395d609132567d772150965cc/data' : + 'community.wave.seqera.io/library/aria2_rsync:1627a7e9b559cfa0' }" input: val source_url_pdb_mmcif - val source_url_pdb_obsolete output: - path ('*') , emit: ch_db - path "versions.yml", emit: versions + path ('mmcif_files'), emit: ch_db + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' """ set -euo pipefail @@ -54,14 +52,10 @@ process DOWNLOAD_PDBMMCIF { # Delete empty download directory structure. 
find ./raw -type d -empty -delete - aria2c \\ - $source_url_pdb_obsolete - cat <<-END_VERSIONS > versions.yml "${task.process}": sed: \$(echo \$(sed --version 2>&1) | head -1 | sed 's/^.*GNU sed) //; s/ .*\$//') rsync: \$(rsync --version | head -1 | sed 's/^rsync version //; s/ protocol version [[:digit:]]*//') - aria2c: \$( aria2c -v | head -1 | sed 's/aria2 version //' ) END_VERSIONS """ @@ -72,7 +66,8 @@ process DOWNLOAD_PDBMMCIF { cat <<-END_VERSIONS > versions.yml "${task.process}": - awk: \$(gawk --version| head -1 | sed 's/GNU Awk //; s/, API:.*//') + sed: \$(echo \$(sed --version 2>&1) | head -1 | sed 's/^.*GNU sed) //; s/ .*\$//') + rsync: \$(rsync --version | head -1 | sed 's/^rsync version //; s/ protocol version [[:digit:]]*//') END_VERSIONS """ } diff --git a/modules/local/download_pdbmmcif_af3/environment.yml b/modules/local/download_pdbmmcif_af3/environment.yml new file mode 100644 index 000000000..b3e8b2ae9 --- /dev/null +++ b/modules/local/download_pdbmmcif_af3/environment.yml @@ -0,0 +1,6 @@ +name: download_pdbmmcif_af3 +channels: + - conda-forge +dependencies: + - conda-forge::zstd=1.5.6 + - conda-forge::wget=1.21.4 diff --git a/modules/local/download_pdbmmcif_af3/main.nf b/modules/local/download_pdbmmcif_af3/main.nf new file mode 100644 index 000000000..ca44ee385 --- /dev/null +++ b/modules/local/download_pdbmmcif_af3/main.nf @@ -0,0 +1,52 @@ +/* + * Download PDB MMCIF database + */ +process DOWNLOAD_PDBMMCIF_AF3 { + tag "${source_url_pdb_mmcif}" + label 'process_low' + label 'error_retry' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/fa/fa33501a8b3ff76af2b4a13c68af6255d120b1e9ff1b4c94bfb4e6de627bfd71/data' : + 'community.wave.seqera.io/library/wget_zstd:588693a86d59d291' }" + + input: + val source_url_pdb_mmcif + + output: + path ('mmcif_files/*.cif'), emit: ch_db + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + mkdir mmcif_files + + wget --quiet --output-document=- ${source_url_pdb_mmcif} | \\ + tar --use-compress-program=zstd \\ + --strip-components 1 \\ + -xf - \\ + --directory="./mmcif_files" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(echo \$(wget --version 2>&1) | grep 'GNU Wget' | cut -f3 -d ' ') + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + """ + mkdir mmcif_files + touch mmcif_files/stub.cif + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(echo \$(wget --version 2>&1) | grep 'GNU Wget' | cut -f3 -d ' ' || echo "unknown") + untar: \$(echo \$(tar --version 2>&1 | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//' | grep -m1 '^[0-9]' || echo unknown) + END_VERSIONS + """ +} diff --git a/modules/local/download_rna_rf2na/Dockerfile b/modules/local/download_rna_rf2na/Dockerfile new file mode 120000 index 000000000..1c91bc482 --- /dev/null +++ b/modules/local/download_rna_rf2na/Dockerfile @@ -0,0 +1 @@ +../run_rosettafold2na/Dockerfile \ No newline at end of file diff --git a/modules/local/download_rna_rf2na/main.nf b/modules/local/download_rna_rf2na/main.nf new file mode 100644 index 000000000..d6837521f --- /dev/null +++ b/modules/local/download_rna_rf2na/main.nf @@ -0,0 +1,77 @@ +process DOWNLOAD_RNA_DATABASES { + tag "Download and process RNA databases" + label 'process_medium' + + container "quay.io/nf-core/proteinfold_rosettafold2na:2.0.0" + + input: + val rfam_full_region_link + val rfam_cm_link + val rnacentral_rfam_annotations_link + val 
rnacentral_id_mapping_link + val rnacentral_sequences_link + + output: + path "RNA", emit: ch_db + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error("DOWNLOAD_RNA_DATABASES module does not support Conda. Please use Docker / Singularity / Podman instead.") + } + """ + mkdir -p RNA + cd RNA + + # Download and process Rfam + wget -O Rfam.full_region.gz ${rfam_full_region_link} + wget -O Rfam.cm.gz ${rfam_cm_link} + gunzip Rfam.full_region.gz + gunzip Rfam.cm.gz + cmpress Rfam.cm + + # Download and process RNAcentral + wget -O id_mapping.tsv.gz ${rnacentral_id_mapping_link} + wget -O rfam_annotations.tsv.gz ${rnacentral_rfam_annotations_link} + wget -O rnacentral_sequences.fasta.gz ${rnacentral_sequences_link} + + # Use the reprocess_rnac.pl script from the RoseTTAFold2NA repository + /app/RoseTTAFold2NA/input_prep/reprocess_rnac.pl id_mapping.tsv.gz rfam_annotations.tsv.gz + + gunzip -c rnacentral_sequences.fasta.gz | makeblastdb -in - -dbtype nucl -parse_seqids -out rnacentral.fasta -title "RNACentral" + + # Download nt database + update_blastdb.pl --decompress nt + + cd .. 
+ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cmpress: \$(cmpress -h | grep -oP 'INFERNAL \\K\\d+\\.\\d+') + makeblastdb: \$(makeblastdb -version | grep -oP 'makeblastdb: \\K\\d+\\.\\d+\\.\\d+') + update_blastdb: \$(update_blastdb.pl --version | grep -oP 'Update BLAST databases \\K\\d+\\.\\d+\\.\\d+') + perl: \$(perl --version | grep -oP 'This is perl.*\\K\\d+\\.\\d+\\.\\d+') + rf2na: \$(grep "version" /app/RoseTTAFold2NA/README.md | awk '{print \$2}') + END_VERSIONS + """ + + stub: + """ + mkdir -p RNA + touch RNA/Rfam.full_region RNA/Rfam.cm RNA/id_mapping.tsv RNA/rfam_annotations.tsv RNA/rnacentral.fasta + touch RNA/nt.00.nhr RNA/nt.00.nin RNA/nt.00.nsq + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cmpress: 1.1.4 + makeblastdb: 2.12.0 + update_blastdb: 2.12.0 + perl: 5.32.1 + rf2na: 1.0.0 + END_VERSIONS + """ +} diff --git a/modules/local/fasta2json/environment.yml b/modules/local/fasta2json/environment.yml new file mode 100644 index 000000000..012de0929 --- /dev/null +++ b/modules/local/fasta2json/environment.yml @@ -0,0 +1,6 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.8.3 diff --git a/modules/local/fasta2json/main.nf b/modules/local/fasta2json/main.nf new file mode 100644 index 000000000..3553bb683 --- /dev/null +++ b/modules/local/fasta2json/main.nf @@ -0,0 +1,39 @@ +process FASTA2JSON { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'biocontainers/python:3.8.3' }" + + input: + tuple val(meta), path(fasta) + output: + + tuple val(meta), path ("*.json"), emit: json + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + fasta_to_json.py ${fasta} ${meta.id} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + """ + touch "${meta.id}.json" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/fasta2yaml/environment.yml b/modules/local/fasta2yaml/environment.yml new file mode 100644 index 000000000..012de0929 --- /dev/null +++ b/modules/local/fasta2yaml/environment.yml @@ -0,0 +1,6 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.8.3 diff --git a/modules/local/fasta2yaml/main.nf b/modules/local/fasta2yaml/main.nf new file mode 100644 index 000000000..c3e1cc2ec --- /dev/null +++ b/modules/local/fasta2yaml/main.nf @@ -0,0 +1,43 @@ +process FASTA2YAML { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'biocontainers/python:3.8.3' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path ("*.yaml"), emit: yaml + tuple val(meta), path ("out_fasta/*.fasta"), emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + fasta_to_yaml.py ${fasta} ${meta.id} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + """ + touch "${meta.id}.yaml" + mkdir out_fasta + touch "out_fasta/A.fasta" + touch "out_fasta/B.fasta" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/fasta_to_alphafold3_json/environment.yml b/modules/local/fasta_to_alphafold3_json/environment.yml new file mode 100644 index 000000000..1df48e6e1 --- /dev/null +++ b/modules/local/fasta_to_alphafold3_json/environment.yml @@ -0,0 +1,6 @@ +name: fasta_to_alphafold3_json +channels: + - conda-forge +dependencies: + - conda-forge::python=3.13.7 + - conda-forge::biopython=1.84 diff --git a/modules/local/fasta_to_alphafold3_json/main.nf b/modules/local/fasta_to_alphafold3_json/main.nf new file mode 100644 index 000000000..ab4b7ec0a --- /dev/null +++ b/modules/local/fasta_to_alphafold3_json/main.nf @@ -0,0 +1,44 @@ +process FASTA_TO_ALPHAFOLD3_JSON { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c7/c7dabd3f132a613fb11ee27c66e9517eb7649eee64f4e4f63747841105883b40/data' : + 'community.wave.seqera.io/library/biopython_python:06582b7b722f3db3' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("*.json"), emit: json + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + fasta_to_alphafold3_json.py \\ + ${fasta} \\ + ${prefix} \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.json + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/generate_report/environment.yml b/modules/local/generate_report/environment.yml new file mode 100644 index 000000000..07a5b9f11 --- /dev/null +++ b/modules/local/generate_report/environment.yml @@ -0,0 +1,9 @@ +name: generate_report +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::biopython=1.84 + - conda-forge::matplotlib=3.9.2 + - conda-forge::pip=24.2 + - conda-forge::plotly=5.24.1 diff --git a/modules/local/generate_report/main.nf b/modules/local/generate_report/main.nf new file mode 100644 index 000000000..f33599828 --- /dev/null +++ b/modules/local/generate_report/main.nf @@ -0,0 +1,56 @@ +process GENERATE_REPORT { + tag "$meta.id-$meta.model" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/24/241f0746484727a3633f544c3747bfb77932e1c8c252e769640bd163232d9112/data' : + 'community.wave.seqera.io/library/biopython_matplotlib_pip_plotly:35975fa0fc54b2d3' }" + + input: + tuple val(meta), path(pdb), path(msa), path(pae) + path(template) + + output: + tuple val(meta), path ("*report.html") , emit: report + tuple val(meta), path ("*seq_coverage.png"), optional: true, emit: sequence_coverage + tuple val(meta), path ("*_LDDT.html") , emit: plddt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + """ + generate_report.py \\ + --type ${meta.model} \\ + --msa ${msa} \\ + --pae ${pae} \\ + --pdb ${pdb.join(' ')} \\ + --html_template ${template} \\ + --output_dir ./ \\ + --name ${meta.id} \\ + $args \\ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + generate_report.py: \$(python3 --version) + END_VERSIONS + """ + + stub: + """ + touch test_alphafold2_report.html + touch test_seq_coverage.png + touch test_LDDT.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + generate_report.py: \$(python3 --version) + END_VERSIONS + """ +} diff --git a/modules/local/mmcif2pdb/environment.yml b/modules/local/mmcif2pdb/environment.yml new file mode 100644 index 000000000..4b30ae2ef --- /dev/null +++ b/modules/local/mmcif2pdb/environment.yml @@ -0,0 +1,5 @@ +name: mmcif2pdb +channels: + - conda-forge +dependencies: + - conda-forge::biopython=1.84 diff --git a/modules/local/mmcif2pdb/main.nf b/modules/local/mmcif2pdb/main.nf new file mode 100644 index 000000000..e0a7c1375 --- /dev/null +++ b/modules/local/mmcif2pdb/main.nf @@ -0,0 +1,46 @@ +process MMCIF2PDB { + tag "$meta.id" + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && 
!task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/eb/eb3700531c7ec639f59f084ab64c05e881d654dcf829db163539f2f0b095e09d/data' : + 'community.wave.seqera.io/library/biopython:1.84--3318633dad0031e7' }" + + input: + tuple val(meta), path("*") + + output: + tuple val(meta), path("*.pdb"), emit: pdb + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + for mmcif in *.cif + do + pdb_out=\$(basename "\$mmcif" .cif) + mmcif_to_pdb.py \${mmcif} --pdb_out "\${pdb_out}.pdb" + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + """ + for mmcif in *.cif + do + pdb_out=\$(basename "\$mmcif") + touch \${pdb_out}.pdb + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/mmseqs_colabfoldsearch/Dockerfile b/modules/local/mmseqs_colabfoldsearch/Dockerfile new file mode 100644 index 000000000..1c57a9f16 --- /dev/null +++ b/modules/local/mmseqs_colabfoldsearch/Dockerfile @@ -0,0 +1,43 @@ +FROM ubuntu:24.04 +ARG MMSEQS2_VERSION=18-8cc5c + +LABEL org.opencontainers.image.title="nf-core/proteinfold_mmseqs_colabfoldsearch" \ + org.opencontainers.image.description="Lightweight Docker image containing all software requirements to run the MMSEQS_COLABFOLDSEARCH module using the nf-core/proteinfold pipeline" \ + org.opencontainers.image.version="2.0.0" \ + org.opencontainers.image.authors="Joshua Storm Caley " \ + org.opencontainers.image.source="https://github.com/nf-core/proteinfold" \ + org.opencontainers.image.licenses="MIT" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + python3 \ + python3-dev \ + python3-pip \ + wget \ + git \ + build-essential \ + cmake && \ + ln -sf /usr/bin/python3 /usr/bin/python && \ + \ + pip install 
--no-cache-dir --break-system-packages \ + "colabfold @ git+https://github.com/sokrypton/ColabFold.git@e8ebd9a" && \ + \ + wget -q https://github.com/soedinglab/MMseqs2/releases/download/${MMSEQS2_VERSION}/mmseqs-linux-sse41.tar.gz && \ + tar xzf mmseqs-linux-sse41.tar.gz && \ + cp mmseqs/bin/* /usr/local/bin/ && \ + rm -rf mmseqs mmseqs-linux-sse41.tar.gz && \ + \ + apt-get remove -y \ + python3-dev \ + wget \ + git \ + build-essential \ + cmake && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + pip cache purge + +ENV MPLBACKEND=Agg +ENV MPLCONFIGDIR=/tmp/mplconfig +ENV XDG_CACHE_HOME=/tmp/xdg_cache diff --git a/modules/local/mmseqs_colabfoldsearch.nf b/modules/local/mmseqs_colabfoldsearch/main.nf similarity index 59% rename from modules/local/mmseqs_colabfoldsearch.nf rename to modules/local/mmseqs_colabfoldsearch/main.nf index c2140c5b7..5cd2b362a 100644 --- a/modules/local/mmseqs_colabfoldsearch.nf +++ b/modules/local/mmseqs_colabfoldsearch/main.nf @@ -1,19 +1,14 @@ process MMSEQS_COLABFOLDSEARCH { tag "$meta.id" label 'process_high_memory' + label 'process_high' - // Exit if running this module with -profile conda / -profile mamba - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - error("Local MMSEQS_COLABFOLDSEARCH module does not support Conda. 
Please use Docker / Singularity / Podman instead.") - } - - container "nf-core/proteinfold_colabfold:1.1.1" + container "nf-core/proteinfold_mmseqs_colabfoldsearch:2.0.0" input: tuple val(meta), path(fasta) - path ('db/params') - path colabfold_db - path uniref30 + path ('db/*') + path ('db/*') output: tuple val(meta), path("**.a3m"), emit: a3m @@ -23,34 +18,36 @@ process MMSEQS_COLABFOLDSEARCH { task.ext.when == null || task.ext.when script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error("Local MMSEQS_COLABFOLDSEARCH module does not support Conda. Please use Docker / Singularity / Podman instead.") + } def args = task.ext.args ?: '' - def VERSION = '1.5.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ - ln -r -s $uniref30/uniref30_* ./db - ln -r -s $colabfold_db/colabfold_envdb* ./db - - /localcolabfold/colabfold-conda/bin/colabfold_search \\ + colabfold_search \\ $args \\ --threads $task.cpus ${fasta} \\ ./db \\ - "result/" + --af3-json \\ + "results/" cat <<-END_VERSIONS > versions.yml "${task.process}": - colabfold_search: $VERSION + colabfold_search: \$(pip list | grep "^colabfold" | awk '{print \$2}' 2>/dev/null || echo "unknown") + mmseqs: \$(mmseqs version) END_VERSIONS """ stub: - def VERSION = '1.5.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
""" mkdir results touch results/${meta.id}.a3m cat <<-END_VERSIONS > versions.yml "${task.process}": - colabfold_search: $VERSION + colabfold_search: \$(pip list | grep "^colabfold" | awk '{print \$2}' 2>/dev/null || echo "unknown") + mmseqs: \$(mmseqs version) END_VERSIONS """ } diff --git a/modules/local/multifasta_to_csv/environment.yml b/modules/local/multifasta_to_csv/environment.yml new file mode 100644 index 000000000..a0f9dc7d0 --- /dev/null +++ b/modules/local/multifasta_to_csv/environment.yml @@ -0,0 +1,8 @@ +name: multifasta_to_csv +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::grep=3.11 + - conda-forge::sed=4.8 + - conda-forge::tar=1.34 diff --git a/modules/local/multifasta_to_csv.nf b/modules/local/multifasta_to_csv/main.nf similarity index 96% rename from modules/local/multifasta_to_csv.nf rename to modules/local/multifasta_to_csv/main.nf index d5d68fbfe..5a737f781 100644 --- a/modules/local/multifasta_to_csv.nf +++ b/modules/local/multifasta_to_csv/main.nf @@ -2,7 +2,7 @@ process MULTIFASTA_TO_CSV { tag "$meta.id" label 'process_single' - conda "conda-forge::sed=4.7" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : 'nf-core/ubuntu:20.04' }" diff --git a/modules/local/multifasta_to_singlefasta/environment.yml b/modules/local/multifasta_to_singlefasta/environment.yml new file mode 100644 index 000000000..273e53dda --- /dev/null +++ b/modules/local/multifasta_to_singlefasta/environment.yml @@ -0,0 +1,8 @@ +name: multifasta_to_singlefasta +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::grep=3.11 + - conda-forge::sed=4.8 + - conda-forge::tar=1.34 diff --git a/modules/local/multifasta_to_singlefasta.nf b/modules/local/multifasta_to_singlefasta/main.nf similarity index 95% rename from modules/local/multifasta_to_singlefasta.nf rename to modules/local/multifasta_to_singlefasta/main.nf index e97444166..268f2629b 100644 --- a/modules/local/multifasta_to_singlefasta.nf +++ b/modules/local/multifasta_to_singlefasta/main.nf @@ -8,7 +8,7 @@ process MULTIFASTA_TO_SINGLEFASTA { 'nf-core/ubuntu:20.04' }" input: - tuple val(meta), path(fasta) + tuple val(meta), path(fasta, stageAs: 'query.fasta') output: tuple val(meta), path("${meta.id}.fasta"), emit: input_fasta diff --git a/modules/local/rosettafold2na_fasta/environment.yml b/modules/local/rosettafold2na_fasta/environment.yml new file mode 100644 index 000000000..238fcb6a5 --- /dev/null +++ b/modules/local/rosettafold2na_fasta/environment.yml @@ -0,0 +1,6 @@ +name: rosettafold2na_fasta +channels: + - conda-forge + - defaults +dependencies: + - python=3.8 diff --git a/modules/local/rosettafold2na_fasta/main.nf b/modules/local/rosettafold2na_fasta/main.nf new file mode 100644 index 000000000..3e435c49b --- /dev/null +++ b/modules/local/rosettafold2na_fasta/main.nf @@ -0,0 +1,40 @@ +process ROSETTAFOLD2NA_FASTA { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'biocontainers/python:3.8.3' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("rf2na_input", type: "dir"), emit: rf2na_input + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + fasta_to_rosettafold.py "${meta.id}" "${fasta}" + + cat <<'END_VERSIONS' > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + """ + mkdir -p rf2na_input + touch rf2na_input/chain_map.tsv + + cat <<'END_VERSIONS' > versions.yml + "${task.process}": + python: \$(python3 --version 2>/dev/null | sed 's/Python //g' || echo "unknown") + END_VERSIONS + """ +} diff --git a/modules/local/run_alphafold2.nf b/modules/local/run_alphafold2.nf deleted file mode 100644 index cb3527d39..000000000 --- a/modules/local/run_alphafold2.nf +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Run Alphafold2 - */ -process RUN_ALPHAFOLD2 { - tag "$meta.id" - label 'process_medium' - - // Exit if running this module with -profile conda / -profile mamba - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - error("Local RUN_ALPHAFOLD2 module does not support Conda. Please use Docker / Singularity / Podman instead.") - } - - container "nf-core/proteinfold_alphafold2_standard:1.1.1" - - input: - tuple val(meta), path(fasta) - val db_preset - val alphafold2_model_preset - path ('params/*') - path ('bfd/*') - path ('small_bfd/*') - path ('mgnify/*') - path ('pdb70/*') - path ('pdb_mmcif/*') - path ('uniref30/*') - path ('uniref90/*') - path ('pdb_seqres/*') - path ('uniprot/*') - - output: - path ("${fasta.baseName}*") - path "*_mqc.tsv", emit: multiqc - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def db_preset = db_preset ? 
"full_dbs --bfd_database_path=./bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt --uniref30_database_path=./uniref30/UniRef30_2021_03" : - "reduced_dbs --small_bfd_database_path=./small_bfd/bfd-first_non_consensus_sequences.fasta" - if (alphafold2_model_preset == 'multimer') { - alphafold2_model_preset += " --pdb_seqres_database_path=./pdb_seqres/pdb_seqres.txt --uniprot_database_path=./uniprot/uniprot.fasta " - } - else { - alphafold2_model_preset += " --pdb70_database_path=./pdb70/pdb70_from_mmcif_200916/pdb70 " - } - """ - if [ -f pdb_seqres/pdb_seqres.txt ] - then sed -i "/^\\w*0/d" pdb_seqres/pdb_seqres.txt - fi - if [ -d params/alphafold_params_* ]; then ln -r -s params/alphafold_params_*/* params/; fi - python3 /app/alphafold/run_alphafold.py \ - --fasta_paths=${fasta} \ - --model_preset=${alphafold2_model_preset} \ - --db_preset=${db_preset} \ - --output_dir=\$PWD \ - --data_dir=\$PWD \ - --uniref90_database_path=./uniref90/uniref90.fasta \ - --mgnify_database_path=./mgnify/mgy_clusters_2022_05.fa \ - --template_mmcif_dir=./pdb_mmcif/mmcif_files \ - --obsolete_pdbs_path=./pdb_mmcif/obsolete.dat \ - --random_seed=53343 \ - $args - - cp "${fasta.baseName}"/ranked_0.pdb ./"${fasta.baseName}".alphafold.pdb - cd "${fasta.baseName}" - awk '{print \$6"\\t"\$11}' ranked_0.pdb | uniq > ranked_0_plddt.tsv - for i in 1 2 3 4 - do awk '{print \$6"\\t"\$11}' ranked_\$i.pdb | uniq | awk '{print \$2}' > ranked_"\$i"_plddt.tsv - done - paste ranked_0_plddt.tsv ranked_1_plddt.tsv ranked_2_plddt.tsv ranked_3_plddt.tsv ranked_4_plddt.tsv > plddt.tsv - echo -e Positions"\\t"rank_0"\\t"rank_1"\\t"rank_2"\\t"rank_3"\\t"rank_4 > header.tsv - cat header.tsv plddt.tsv > ../"${fasta.baseName}"_plddt_mqc.tsv - cd .. 
- - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python3 --version | sed 's/Python //g') - END_VERSIONS - """ - - stub: - """ - touch ./"${fasta.baseName}".alphafold.pdb - touch ./"${fasta.baseName}"_mqc.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - awk: \$(gawk --version| head -1 | sed 's/GNU Awk //; s/, API:.*//') - END_VERSIONS - """ -} diff --git a/modules/local/run_alphafold2/Dockerfile b/modules/local/run_alphafold2/Dockerfile new file mode 100644 index 000000000..83a3a409e --- /dev/null +++ b/modules/local/run_alphafold2/Dockerfile @@ -0,0 +1,87 @@ +ARG CUDA_VERSION=12.2.2 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-runtime-ubuntu20.04 +# FROM directive resets ARGS, so we specify again (the value is retained if +# previously set). +ARG CUDA_VERSION + +LABEL org.opencontainers.image.title="nf-core/proteinfold_alphafold2_standard" \ + org.opencontainers.image.description="Docker image containing all software requirements to run the RUN_ALPHAFOLD2 module using the nf-core/proteinfold pipeline" \ + org.opencontainers.image.version="2.0.0" \ + org.opencontainers.image.authors="Athanasios Baltzis, Jose Espinosa-Carrasco , Leila Mansouri, Joshua Caley " \ + org.opencontainers.image.source="https://github.com/nf-core/proteinfold" \ + org.opencontainers.image.licenses="MIT" + +# Use bash to support string substitution. 
+SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC +RUN apt-get update --quiet \ + && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y --quiet \ + build-essential \ + cmake \ + cuda-command-line-tools-$(cut -f1,2 -d- <<< ${CUDA_VERSION//./-}) \ + git \ + hmmer \ + kalign \ + tzdata \ + wget \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get autoremove -y \ + && apt-get clean + +# Clone AlphaFold2 +RUN git clone https://github.com/deepmind/alphafold.git /app/alphafold && \ + cd /app/alphafold && \ + git checkout e9b68483fc8764e7a1906e74a5d1f0ea2137311d && \ + cd - + +# Compile HHsuite from source +RUN git clone --branch v3.3.0 --single-branch https://github.com/soedinglab/hh-suite.git /tmp/hh-suite \ + && mkdir /tmp/hh-suite/build \ + && pushd /tmp/hh-suite/build \ + && cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. \ + && make -j && make install \ + && ln -s /opt/hhsuite/bin/* /usr/bin \ + && popd \ + && rm -rf /tmp/hh-suite + +# Install Miniforge package manager +RUN wget -q -P /tmp \ + "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" \ + && bash /tmp/Miniforge3-$(uname)-$(uname -m).sh -b -p /opt/conda \ + && rm /tmp/Miniforge3-$(uname)-$(uname -m).sh + +# Install Conda packages. 
+ENV PATH="/opt/conda/bin:$PATH" +ENV LD_LIBRARY_PATH="/opt/conda/lib:$LD_LIBRARY_PATH" +ENV CONDA_PLUGINS_AUTO_ACCEPT_TOS="yes" +RUN conda install --quiet --yes conda==24.11.1 pip python=3.11 \ + && conda install --quiet --yes --channel nvidia cuda=${CUDA_VERSION} \ + && conda install --quiet --yes --channel conda-forge openmm=8.0.0 pdbfixer \ + && conda clean --all --force-pkgs-dirs --yes + +RUN mamba install --quiet --yes cuda-toolkit=${CUDA_VERSION} \ + && mamba clean --all --force-pkgs-dirs --yes + +RUN wget -q -P /app/alphafold/alphafold/common/ \ + https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt + +# Install pip packages. +RUN pip3 install --upgrade pip --no-cache-dir \ + && pip3 install -r /app/alphafold/requirements.txt --no-cache-dir \ + && pip3 install --upgrade --no-cache-dir \ + jax==0.4.26 \ + jaxlib==0.4.26+cuda12.cudnn89 \ + -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + +# Add SETUID bit to the ldconfig binary so that non-root users can run it. +RUN chmod u+s /sbin/ldconfig.real + +# Currently needed to avoid undefined_symbol error. +RUN ln -sf /usr/lib/x86_64-linux-gnu/libffi.so.7 /opt/conda/lib/libffi.so.7 + +# Run ldconfig to ensure GPUs are visible at runtime. 
+# See https://github.com/NVIDIA/nvidia-docker/issues/1399 +RUN ldconfig + +WORKDIR /app/alphafold diff --git a/modules/local/run_alphafold2/main.nf b/modules/local/run_alphafold2/main.nf new file mode 100644 index 000000000..eea900702 --- /dev/null +++ b/modules/local/run_alphafold2/main.nf @@ -0,0 +1,124 @@ +/* + * Run Alphafold2 + */ +process RUN_ALPHAFOLD2 { + tag "$meta.id" + label 'process_medium' + label 'process_gpu' + + container "nf-core/proteinfold_alphafold2_standard:2.0.0" + + input: + tuple val(meta), path(fasta) + val db_preset + val alphafold2_model_preset + val uniref30_prefix + path ('params/*') + path ('bfd/*') + path ('small_bfd/*') + path ('mgnify/*') + path ('pdb70/*') + path ('pdb_mmcif/mmcif_files') + path ('pdb_mmcif/*') + path ('uniref30/*') + path ('uniref90/*') + path ('pdb_seqres/*') + path ('uniprot/*') + + output: + path ("raw/**") , emit: raw + tuple val(meta), path ("${meta.id}_alphafold2.pdb") , emit: top_ranked_pdb + tuple val(meta), path ("raw/ranked*.pdb") , emit: pdb + tuple val(meta), path ("${meta.id}_plddt.tsv") , emit: multiqc + tuple val(meta), path ("${meta.id}_alphafold2_msa.tsv") , emit: msa + // Note: alphafold2_model_preset == "monomer" the pae file won't exist, thus the optional + tuple val(meta), path ("${meta.id}_*_pae.tsv") , optional: true, emit: paes + tuple val(meta), path ("${meta.id}_0_pae.tsv") , optional: true, emit: pae + tuple val(meta), path ("${meta.id}_ptm.tsv") , optional: true, emit: ptms + tuple val(meta), path ("${meta.id}_iptm.tsv") , optional: true, emit: iptms + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + // Note: --pkls ${fasta.baseName}/*.pkl redundantly processes the features.pkl file. 
Just providing conceptual reminder of file types for future refactor + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error("Local RUN_ALPHAFOLD2 module does not support Conda. Please use Docker / Singularity / Podman instead.") + } + def args = task.ext.args ?: '' + def db_preset_cmd = db_preset ? "full_dbs --bfd_database_path=./bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt --uniref30_database_path=./uniref30/${uniref30_prefix}" : + "reduced_dbs --small_bfd_database_path=./small_bfd/bfd-first_non_consensus_sequences.fasta" + def extra_dbs = "" + if (alphafold2_model_preset == 'multimer') { + extra_dbs = " --pdb_seqres_database_path=./pdb_seqres/pdb_seqres.txt --uniprot_database_path=./uniprot/uniprot.fasta " + } else { + extra_dbs = " --pdb70_database_path=./pdb70/pdb70 " + } + """ + fix_obsolete.py pdb_mmcif/obsolete.dat > clean_obsolete.dat + + ## Handles multiple versions of mgnify database and selects the latest version + mgnify_db_path=\$(ls -v ./mgnify/mgy_clusters*.fa | tail -n 1) + + python3 /app/alphafold/run_alphafold.py \ + --fasta_paths=${fasta} \ + --model_preset=${alphafold2_model_preset}${extra_dbs} \ + --db_preset=${db_preset_cmd} \ + --output_dir=\$PWD \ + --data_dir=\$PWD \ + --uniref90_database_path=./uniref90/uniref90.fasta \ + --mgnify_database_path=\$mgnify_db_path \ + --template_mmcif_dir=./pdb_mmcif/mmcif_files \ + --obsolete_pdbs_path=./clean_obsolete.dat \ + $args + + cp "${fasta.baseName}"/ranked_0.pdb ./"${meta.id}"_alphafold2.pdb + + extract_metrics.py --name ${meta.id} \\ + --pkls ${fasta.baseName}/features.pkl ${fasta.baseName}/*.pkl \\ + --structs ${fasta.baseName}/ranked*.pdb + + mv "${meta.id}_msa.tsv" "${meta.id}_alphafold2_msa.tsv" + + # Can't use fasta.baseName to batch outputs in publishDir + mv "${fasta.baseName}" raw/ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + alphafold2: \$(cd /app/alphafold && git 
rev-parse HEAD 2>/dev/null || echo "unknown") + jax: \$(python3 -c "import jax; print(jax.__version__)" 2>/dev/null || echo "unknown") + jaxlib: \$(python3 -c "import jaxlib; print(jaxlib.__version__)" 2>/dev/null || echo "unknown") + numpy: \$(python3 -c "import numpy; print(numpy.__version__)" 2>/dev/null || echo "unknown") + biopython: \$(python3 -c "import Bio; print(Bio.__version__)" 2>/dev/null || echo "unknown") + END_VERSIONS + """ + + stub: + """ + touch "${meta.id}_alphafold2.pdb" + touch "${meta.id}_plddt.tsv" + touch "${meta.id}_alphafold2_msa.tsv" + touch "${meta.id}_0_pae.tsv" + touch "${meta.id}_ptm.tsv" + touch "${meta.id}_iptm.tsv" + mkdir "raw" + touch "raw/ranked_0.pdb" + touch "raw/ranked_1.pdb" + touch "raw/ranked_2.pdb" + touch "raw/ranked_3.pdb" + touch "raw/ranked_4.pdb" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version 2>/dev/null | sed 's/Python //g' || echo "unknown") + alphafold2: \$(cd /app/alphafold && git rev-parse HEAD 2>/dev/null || echo "unknown") + jax: \$(python3 -c "import jax; print(jax.__version__)" 2>/dev/null || echo "unknown") + jaxlib: \$(python3 -c "import jaxlib; print(jaxlib.__version__)" 2>/dev/null || echo "unknown") + numpy: \$(python3 -c "import numpy; print(numpy.__version__)" 2>/dev/null || echo "unknown") + biopython: \$(python3 -c "import Bio; print(Bio.__version__)" 2>/dev/null || echo "unknown") + END_VERSIONS + """ +} diff --git a/modules/local/run_alphafold2_msa.nf b/modules/local/run_alphafold2_msa.nf deleted file mode 100644 index fdc67e885..000000000 --- a/modules/local/run_alphafold2_msa.nf +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Run Alphafold2 MSA - */ -process RUN_ALPHAFOLD2_MSA { - tag "$meta.id" - label 'process_medium' - - // Exit if running this module with -profile conda / -profile mamba - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - error("Local RUN_ALPHAFOLD2_MSA module does not support Conda. 
Please use Docker / Singularity / Podman instead.") - } - - container "nf-core/proteinfold_alphafold2_msa:1.1.1" - - input: - tuple val(meta), path(fasta) - val db_preset - val alphafold2_model_preset - path ('params/*') - path ('bfd/*') - path ('small_bfd/*') - path ('mgnify/*') - path ('pdb70/*') - path ('pdb_mmcif/*') - path ('uniref30/*') - path ('uniref90/*') - path ('pdb_seqres/*') - path ('uniprot/*') - - output: - path ("${fasta.baseName}*") - path ("${fasta.baseName}.features.pkl"), emit: features - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def db_preset = db_preset ? "full_dbs --bfd_database_path=./bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt --uniref30_database_path=./uniref30/UniRef30_2021_03" : - "reduced_dbs --small_bfd_database_path=./small_bfd/bfd-first_non_consensus_sequences.fasta" - if (alphafold2_model_preset == 'multimer') { - alphafold2_model_preset += " --pdb_seqres_database_path=./pdb_seqres/pdb_seqres.txt --uniprot_database_path=./uniprot/uniprot.fasta " - } - else { - alphafold2_model_preset += " --pdb70_database_path=./pdb70/pdb70_from_mmcif_200916/pdb70 " - } - """ - if [ -f pdb_seqres/pdb_seqres.txt ] - then sed -i "/^\\w*0/d" pdb_seqres/pdb_seqres.txt - fi - python3 /app/alphafold/run_msa.py \ - --fasta_paths=${fasta} \ - --model_preset=${alphafold2_model_preset} \ - --db_preset=${db_preset} \ - --output_dir=\$PWD \ - --data_dir=\$PWD \ - --uniref90_database_path=./uniref90/uniref90.fasta \ - --mgnify_database_path=./mgnify/mgy_clusters_2022_05.fa \ - --template_mmcif_dir=./pdb_mmcif/mmcif_files \ - --obsolete_pdbs_path=./pdb_mmcif/obsolete.dat \ - $args - - cp "${fasta.baseName}"/features.pkl ./"${fasta.baseName}".features.pkl - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python3 --version | sed 's/Python //g') - END_VERSIONS - """ - - stub: - """ - touch ./"${fasta.baseName}".features.pkl - - cat 
<<-END_VERSIONS > versions.yml - "${task.process}": - awk: \$(gawk --version| head -1 | sed 's/GNU Awk //; s/, API:.*//') - END_VERSIONS - """ -} diff --git a/modules/local/run_alphafold2_msa/Dockerfile b/modules/local/run_alphafold2_msa/Dockerfile new file mode 120000 index 000000000..54e1ef6a3 --- /dev/null +++ b/modules/local/run_alphafold2_msa/Dockerfile @@ -0,0 +1 @@ +../run_alphafold2_pred/Dockerfile \ No newline at end of file diff --git a/modules/local/run_alphafold2_msa/main.nf b/modules/local/run_alphafold2_msa/main.nf new file mode 100644 index 000000000..2004af816 --- /dev/null +++ b/modules/local/run_alphafold2_msa/main.nf @@ -0,0 +1,92 @@ +/* + * Run Alphafold2 MSA + */ +process RUN_ALPHAFOLD2_MSA { + tag "$meta.id" + label 'process_medium' + + container "nf-core/proteinfold_alphafold2_pred:2.0.0" + + input: + tuple val(meta), path(fasta) + val db_preset + val alphafold2_model_preset + val uniref30_prefix + path ('params/*') + path ('bfd/*') + path ('small_bfd/*') + path ('mgnify/*') + path ('pdb70/*') + path ('pdb_mmcif/mmcif_files') + path ('pdb_mmcif/*') + path ('uniref30/*') + path ('uniref90/*') + path ('pdb_seqres/*') + path ('uniprot/*') + + output: + path ("raw/**") , emit: raw + tuple val(meta), path ("raw/features.pkl"), emit: features + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error("Local RUN_ALPHAFOLD2_MSA module does not support Conda. Please use Docker / Singularity / Podman instead.") + } + def args = task.ext.args ?: '' + def db_preset_cmd = db_preset ? 
"full_dbs --bfd_database_path=./bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt --uniref30_database_path=./uniref30/${uniref30_prefix}" : + "reduced_dbs --small_bfd_database_path=./small_bfd/bfd-first_non_consensus_sequences.fasta" + def extra_dbs = "" + if (alphafold2_model_preset == 'multimer') { + extra_dbs = " --pdb_seqres_database_path=./pdb_seqres/pdb_seqres.txt --uniprot_database_path=./uniprot/uniprot.fasta " + } else { + extra_dbs = " --pdb70_database_path=./pdb70/pdb70 " + } + """ + fix_obsolete.py pdb_mmcif/obsolete.dat > clean_obsolete.dat + + ## Handles multiple versions of mgnify database and selects the latest version + mgnify_db_path=\$(ls -v ./mgnify/mgy_clusters*.fa | tail -n 1) + + python3 /app/alphafold/run_msa.py \ + --fasta_paths=${fasta} \ + --model_preset=${alphafold2_model_preset}${extra_dbs} \ + --db_preset=${db_preset_cmd} \ + --output_dir=\$PWD \ + --data_dir=\$PWD \ + --uniref90_database_path=./uniref90/uniref90.fasta \ + --mgnify_database_path=\$mgnify_db_path \ + --template_mmcif_dir=./pdb_mmcif/mmcif_files \ + --obsolete_pdbs_path=./clean_obsolete.dat \ + $args + + # Can't use fasta.baseName to batch outputs in publishDir + mv "${fasta.baseName}" raw/ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version 2>/dev/null | sed 's/Python //g' || echo "unknown") + alphafold2: \$(cd /app/alphafold && git rev-parse HEAD 2>/dev/null || echo "unknown") + numpy: \$(python3 -c "import numpy; print(numpy.__version__)" 2>/dev/null || echo "unknown") + biopython: \$(python3 -c "import Bio; print(Bio.__version__)" 2>/dev/null || echo "unknown") + END_VERSIONS + """ + + stub: + """ + mkdir ./raw + touch ./raw/features.pkl + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + alphafold2: \$(cd /app/alphafold && git rev-parse HEAD 2>/dev/null || echo "unknown") + numpy: \$(python3 -c "import numpy; print(numpy.__version__)" 2>/dev/null || echo 
"unknown") + biopython: \$(python3 -c "import Bio; print(Bio.__version__)" 2>/dev/null || echo "unknown") + END_VERSIONS + """ +} diff --git a/modules/local/run_alphafold2_pred.nf b/modules/local/run_alphafold2_pred.nf deleted file mode 100644 index 92b5d2a59..000000000 --- a/modules/local/run_alphafold2_pred.nf +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Run Alphafold2 PRED - */ -process RUN_ALPHAFOLD2_PRED { - tag "$meta.id" - label 'process_medium' - - // Exit if running this module with -profile conda / -profile mamba - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - error("Local RUN_ALPHAFOLD2_PRED module does not support Conda. Please use Docker / Singularity / Podman instead.") - } - - container "nf-core/proteinfold_alphafold2_split:1.1.1" - - input: - tuple val(meta), path(fasta) - val db_preset - val alphafold2_model_preset - path ('params/*') - path ('bfd/*') - path ('small_bfd/*') - path ('mgnify/*') - path ('pdb70/*') - path ('pdb_mmcif/*') - path ('uniref30/*') - path ('uniref90/*') - path ('pdb_seqres/*') - path ('uniprot/*') - path msa - - output: - path ("${fasta.baseName}*") - path "*_mqc.tsv", emit: multiqc - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - """ - if [ -d params/alphafold_params_* ]; then ln -r -s params/alphafold_params_*/* params/; fi - python3 /app/alphafold/run_predict.py \ - --fasta_paths=${fasta} \ - --model_preset=${alphafold2_model_preset} \ - --output_dir=\$PWD \ - --data_dir=\$PWD \ - --random_seed=53343 \ - --msa_path=${msa} \ - $args - - cp "${fasta.baseName}"/ranked_0.pdb ./"${fasta.baseName}".alphafold.pdb - cd "${fasta.baseName}" - awk '{print \$6"\\t"\$11}' ranked_0.pdb | uniq > ranked_0_plddt.tsv - for i in 1 2 3 4 - do awk '{print \$6"\\t"\$11}' ranked_\$i.pdb | uniq | awk '{print \$2}' > ranked_"\$i"_plddt.tsv - done - paste ranked_0_plddt.tsv ranked_1_plddt.tsv ranked_2_plddt.tsv ranked_3_plddt.tsv 
ranked_4_plddt.tsv > plddt.tsv - echo -e Positions"\\t"rank_0"\\t"rank_1"\\t"rank_2"\\t"rank_3"\\t"rank_4 > header.tsv - cat header.tsv plddt.tsv > ../"${fasta.baseName}"_plddt_mqc.tsv - cd .. - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python3 --version | sed 's/Python //g') - END_VERSIONS - """ - - stub: - """ - touch ./"${fasta.baseName}".alphafold.pdb - touch ./"${fasta.baseName}"_mqc.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - awk: \$(gawk --version| head -1 | sed 's/GNU Awk //; s/, API:.*//') - END_VERSIONS - """ -} diff --git a/modules/local/run_alphafold2_pred/Dockerfile b/modules/local/run_alphafold2_pred/Dockerfile new file mode 100644 index 000000000..c42668d28 --- /dev/null +++ b/modules/local/run_alphafold2_pred/Dockerfile @@ -0,0 +1,92 @@ +ARG CUDA_VERSION=12.2.2 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-runtime-ubuntu20.04 +# FROM directive resets ARGS, so we specify again (the value is retained if +# previously set). +ARG CUDA_VERSION + +LABEL org.opencontainers.image.title="nf-core/proteinfold_alphafold2_pred" \ + org.opencontainers.image.description="Docker image containing all software requirements to run the RUN_ALPHAFOLD2_PRED module using the nf-core/proteinfold pipeline" \ + org.opencontainers.image.version="2.0.0" \ + org.opencontainers.image.authors="Athanasios Baltzis, Jose Espinosa-Carrasco , Leila Mansouri" \ + org.opencontainers.image.source="https://github.com/nf-core/proteinfold" \ + org.opencontainers.image.licenses="MIT" + +# Use bash to support string substitution. 
+SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + build-essential \ + cmake \ + cuda-command-line-tools-$(cut -f1,2 -d- <<< ${CUDA_VERSION//./-}) \ + git \ + hmmer \ + kalign \ + tzdata \ + wget \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get autoremove -y \ + && apt-get clean + +# Clone AlphaFold2 +RUN git clone https://github.com/cbcrg/alphafold.git /app/alphafold && \ + cd /app/alphafold && \ + git checkout 1b3170e9409472ec8ad044f9935c92bedd7b4674 && \ + cd - + +# Compile HHsuite from source +RUN git clone --branch v3.3.0 --single-branch https://github.com/soedinglab/hh-suite.git /tmp/hh-suite \ + && mkdir /tmp/hh-suite/build \ + && pushd /tmp/hh-suite/build \ + && cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. \ + && make -j && make install \ + && ln -s /opt/hhsuite/bin/* /usr/bin \ + && popd \ + && rm -rf /tmp/hh-suite + +# Install Miniforge package manager +RUN wget -q -P /tmp \ + "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" \ + && bash /tmp/Miniforge3-$(uname)-$(uname -m).sh -b -p /opt/conda \ + && rm /tmp/Miniforge3-$(uname)-$(uname -m).sh + +# Install Conda packages. 
+ENV PATH="/opt/conda/bin:$PATH" +ENV LD_LIBRARY_PATH="/opt/conda/lib:$LD_LIBRARY_PATH" +ENV CONDA_PLUGINS_AUTO_ACCEPT_TOS="yes" +RUN conda install --quiet --yes conda==24.11.1 pip python=3.11 \ + && conda install --quiet --yes --channel nvidia cuda=${CUDA_VERSION} \ + && conda install --quiet --yes --channel conda-forge openmm=8.0.0 pdbfixer \ + && conda clean --all --force-pkgs-dirs --yes + +RUN mamba install --quiet --yes cuda-toolkit=${CUDA_VERSION} \ + && mamba clean --all --force-pkgs-dirs --yes + +RUN wget -q -P /app/alphafold/alphafold/common/ \ + https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt + +# Download updated requirements.txt +RUN wget -q -O /app/alphafold/updated_requirements.txt https://raw.githubusercontent.com/google-deepmind/alphafold/e1554305c03c0f002a13e8d5bccc2239927f63d6/requirements.txt +RUN echo -e "chex==0.0.7\ndm-tree==0.1.8\nimmutabledict==2.0.0\npandas==2.0.3" >> /app/alphafold/updated_requirements.txt + +# Install pip packages. +RUN pip3 install --upgrade pip --no-cache-dir \ + && pip3 install -r /app/alphafold/updated_requirements.txt --no-cache-dir \ + && pip3 install --upgrade --no-cache-dir \ + jax==0.4.26 \ + jaxlib==0.4.26+cuda12.cudnn89 \ + -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + +RUN sed -i "s|alphafold/common/stereo_chemical_props.txt|/app/alphafold/alphafold/common/stereo_chemical_props.txt|g" /app/alphafold/alphafold/common/residue_constants.py + +# Add SETUID bit to the ldconfig binary so that non-root users can run it. +RUN chmod u+s /sbin/ldconfig.real + +# Currently needed to avoid undefined_symbol error. +RUN ln -sf /usr/lib/x86_64-linux-gnu/libffi.so.7 /opt/conda/lib/libffi.so.7 + +# Run ldconfig to ensure GPUs are visible at runtime. 
+# See https://github.com/NVIDIA/nvidia-docker/issues/1399 +RUN ldconfig + +WORKDIR /app/alphafold diff --git a/modules/local/run_alphafold2_pred/main.nf b/modules/local/run_alphafold2_pred/main.nf new file mode 100644 index 000000000..30a581a32 --- /dev/null +++ b/modules/local/run_alphafold2_pred/main.nf @@ -0,0 +1,101 @@ +/* + * Run Alphafold2 PRED + */ +process RUN_ALPHAFOLD2_PRED { + tag "$meta.id" + label 'process_medium' + label 'process_gpu' + + container "nf-core/proteinfold_alphafold2_pred:2.0.0" + + input: + tuple val(meta), path(fasta), path(features) + val alphafold2_model_preset + path ('params/*') + path ('bfd/*') + path ('small_bfd/*') + path ('mgnify/*') + path ('pdb70/*') + path ('pdb_mmcif/mmcif_files') + path ('pdb_mmcif/*') + path ('uniref30/*') + path ('uniref90/*') + path ('pdb_seqres/*') + path ('uniprot/*') + + output: + path ("raw/**") , emit: raw + tuple val(meta), path ("${meta.id}_alphafold2.pdb") , emit: top_ranked_pdb + tuple val(meta), path ("raw/ranked*.pdb") , emit: pdb + tuple val(meta), path ("${meta.id}_alphafold2_msa.tsv") , emit: msa + tuple val(meta), path ("${meta.id}_plddt.tsv") , emit: multiqc + //Note: alphafold2_model_preset == "monomer" the pae file won't exist. + tuple val(meta), path ("${meta.id}_*_pae.tsv") , optional: true, emit: paes + tuple val(meta), path ("${meta.id}_0_pae.tsv") , optional: true, emit: pae + tuple val(meta), path ("${meta.id}_ptm.tsv") , optional: true, emit: ptms + tuple val(meta), path ("${meta.id}_iptm.tsv") , optional: true, emit: iptms + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error("Local RUN_ALPHAFOLD2_PRED module does not support Conda. 
Please use Docker / Singularity / Podman instead.") + } + def args = task.ext.args ?: '' + """ + python3 /app/alphafold/run_predict.py \\ + --fasta_paths=${fasta} \\ + --model_preset=${alphafold2_model_preset} \\ + --output_dir=\$PWD \\ + --data_dir=\$PWD \\ + --msa_path=${features} $args + + cp "${fasta.baseName}"/ranked_0.pdb ./"${meta.id}"_alphafold2.pdb + + extract_metrics.py --name ${meta.id} \\ + --pkls ${features} ${fasta.baseName}/*.pkl \\ + --structs ${fasta.baseName}/ranked*.pdb + + mv "${meta.id}_msa.tsv" "${meta.id}_alphafold2_msa.tsv" + + # Can't use fasta.baseName to batch outputs in publishDir + mv "${fasta.baseName}" raw/ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version 2>/dev/null | sed 's/Python //g' || echo "unknown") + alphafold2: \$(cd /app/alphafold && git rev-parse HEAD 2>/dev/null || echo "unknown") + jax: \$(python3 -c "import jax; print(jax.__version__)" 2>/dev/null || echo "unknown") + jaxlib: \$(python3 -c "import jaxlib; print(jaxlib.__version__)" 2>/dev/null || echo "unknown") + numpy: \$(python3 -c "import numpy; print(numpy.__version__)" 2>/dev/null || echo "unknown") + biopython: \$(python3 -c "import Bio; print(Bio.__version__)" 2>/dev/null || echo "unknown") + END_VERSIONS + """ + + stub: + """ + touch "${meta.id}_alphafold2.pdb" + touch "${meta.id}_plddt.tsv" + touch "${meta.id}_alphafold2_msa.tsv" + touch "${meta.id}_0_pae.tsv" + mkdir "raw/" + touch "raw/ranked_0.pdb" + touch "raw/ranked_1.pdb" + touch "raw/ranked_2.pdb" + touch "raw/ranked_3.pdb" + touch "raw/ranked_4.pdb" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + alphafold2: \$(cd /app/alphafold && git rev-parse HEAD 2>/dev/null || echo "unknown") + jax: \$(python3 -c "import jax; print(jax.__version__)" 2>/dev/null || echo "unknown") + jaxlib: \$(python3 -c "import jaxlib; print(jaxlib.__version__)" 2>/dev/null || echo "unknown") + numpy: \$(python3 -c "import 
numpy; print(numpy.__version__)" 2>/dev/null || echo "unknown") + biopython: \$(python3 -c "import Bio; print(Bio.__version__)" 2>/dev/null || echo "unknown") + END_VERSIONS + """ +} diff --git a/modules/local/run_alphafold3/Dockerfile b/modules/local/run_alphafold3/Dockerfile new file mode 100644 index 000000000..a58af1945 --- /dev/null +++ b/modules/local/run_alphafold3/Dockerfile @@ -0,0 +1,61 @@ +FROM nvidia/cuda:12.6.0-base-ubuntu22.04 + +LABEL org.opencontainers.image.title="nf-core/proteinfold_alphafold3" \ + org.opencontainers.image.description="Docker image containing all software requirements to run AlphaFold3 using the nf-core/proteinfold pipeline" \ + org.opencontainers.image.version="2.0.0" \ + org.opencontainers.image.authors="Jose Espinosa-Carrasco " \ + org.opencontainers.image.source="https://github.com/nf-core/proteinfold" \ + org.opencontainers.image.licenses="MIT" + +# Some RUN statements are combined together to make Docker build run faster. +# Get latest package listing, install software-properties-common, git and wget. +# git is required for pyproject.toml toolchain's use of CMakeLists.txt. +RUN apt update --quiet \ + && apt install --yes --quiet software-properties-common \ + && apt install --yes --quiet git wget jq \ + && rm -rf /var/lib/apt/lists/* + +# Get apt repository of specific Python versions. Then install Python. Tell APT +# this isn't an interactive TTY to avoid timezone prompt when installing. +RUN add-apt-repository ppa:deadsnakes/ppa \ + && DEBIAN_FRONTEND=noninteractive apt install --yes --quiet python3.11 python3-pip python3.11-venv python3.11-dev +RUN python3.11 -m venv /alphafold3_venv +ENV PATH="/hmmer/bin:/alphafold3_venv/bin:$PATH" + +# Install HMMER. Do so before copying the source code, so that docker can cache +# the image layer containing HMMER. 
+RUN mkdir /hmmer_build /hmmer ; \ + wget http://eddylab.org/software/hmmer/hmmer-3.4.tar.gz --directory-prefix /hmmer_build ; \ + (cd /hmmer_build && tar zxf hmmer-3.4.tar.gz && rm hmmer-3.4.tar.gz) ; \ + (cd /hmmer_build/hmmer-3.4 && ./configure --prefix /hmmer) ; \ + (cd /hmmer_build/hmmer-3.4 && make -j8) ; \ + (cd /hmmer_build/hmmer-3.4 && make install) ; \ + (cd /hmmer_build/hmmer-3.4/easel && make install) ; \ + rm -R /hmmer_build + +# Clone AlphaFold3 +RUN git clone https://github.com/google-deepmind/alphafold3.git /app/alphafold && \ + cd /app/alphafold && \ + git checkout 3c27a149a25b6c8930ce68cfeb2a6cbb9afb3359 && \ + cd - + +# # Install the Python dependencies AlphaFold 3 needs. +# RUN pip3 install -r dev-requirements.txt +# RUN pip3 install --no-deps . + +# Install pip packages. +RUN pip3 install --upgrade pip --no-cache-dir \ + && pip3 install -r /app/alphafold/dev-requirements.txt --no-cache-dir \ + && pip3 install --no-deps /app/alphafold/ --no-cache-dir \ + && pip3 install --no-deps biopython + +# # Build chemical components database (this binary was installed by pip). +RUN build_data + +# To work around a known XLA issue causing the compilation time to greatly +# increase, the following environment variable setting XLA flags must be enabled +# when running AlphaFold 3: +ENV XLA_FLAGS="--xla_gpu_enable_triton_gemm=false" +# Memory settings used for folding up to 5,120 tokens on A100 80 GB. 
+ENV XLA_PYTHON_CLIENT_PREALLOCATE=true +ENV XLA_CLIENT_MEM_FRACTION=0.95 diff --git a/modules/local/run_alphafold3/main.nf b/modules/local/run_alphafold3/main.nf new file mode 100644 index 000000000..48e38815e --- /dev/null +++ b/modules/local/run_alphafold3/main.nf @@ -0,0 +1,150 @@ +/* + * Run Alphafold3 + */ +process RUN_ALPHAFOLD3 { + tag "$meta.id" + label 'process_medium' + label 'process_gpu' + container "nf-core/proteinfold_alphafold3_standard:2.0.0" + + input: + tuple val(meta), path(json) + path "params/*" + path "small_bfd/*" + path "mgnify/*" + path "mmcif_files" + path "uniref90/*" + path "pdb_seqres/*" + path "uniprot/*" + + output: + path ("raw/**") , emit: raw + tuple val(meta), path ("${meta.id}_alphafold3.cif") , emit: top_ranked_cif + tuple val(meta), path ("raw/*ranked_*.cif") , emit: cif + tuple val(meta), path ("${meta.id}_plddt.tsv") , emit: multiqc + tuple val(meta), path ("${meta.id}_alphafold3_msa.tsv") , emit: msa + tuple val(meta), path ("${meta.id}_0_pae.tsv") , emit: pae + tuple val(meta), path ("${meta.id}_ptm.tsv") , emit: ptms + tuple val(meta), path ("${meta.id}_iptm.tsv") , optional: true, emit: iptms + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error("Local RUN_ALPHAFOLD3 module does not support Conda. 
Please use Docker / Singularity / Podman instead.") + } + + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def af3_id = meta.id.toLowerCase() + """ + # Check database files exist and set variables + pdb_seqres=\$(ls -v ./pdb_seqres/pdb_seqres.txt ./pdb_seqres/pdb_seqres_2022_09_28.fasta 2>/dev/null | tail -n 1 || echo "") + if [[ -z "\$pdb_seqres" ]]; then + echo "ERROR: No pdb_seqres file found" + exit 1 + fi + + uniref90=\$(ls -v ./uniref90/uniref90*.fa ./uniref90/uniref90*.fasta 2>/dev/null | tail -n 1 || echo "") + if [[ -z "\$uniref90" ]]; then + echo "ERROR: No uniref90 file found" + exit 1 + fi + + mgnify=\$(ls -v ./mgnify/mgy_clusters*.fa ./mgnify/mgnify_clusters*.fasta 2>/dev/null | tail -n 1 || echo "") + if [[ -z "\$mgnify" ]]; then + echo "ERROR: No mgnify file found" + exit 1 + fi + + uniprot=\$(ls -v ./uniprot/uniprot.fasta ./uniprot/uniprot*.fa 2>/dev/null | tail -n 1 || echo "") + if [[ -z "\$uniprot" ]]; then + echo "ERROR: No uniprot file found" + exit 1 + fi + + python3 /app/alphafold/run_alphafold.py \\ + --json_path=${json} \\ + --model_dir=./params \\ + --uniref90_database_path=\$uniref90 \\ + --mgnify_database_path=\$mgnify \\ + --pdb_database_path=./mmcif_files \\ + --small_bfd_database_path=./small_bfd/bfd-first_non_consensus_sequences.fasta \\ + --uniprot_cluster_annot_database_path=\$uniprot \\ + --seqres_database_path=\$pdb_seqres \\ + --output_dir=\$PWD \\ + $args + + ### Move the rest of the models and rename them according to their rank + name=\$(jq -r '.name' ${json}) + + ## Copy top ranked model to root + cp -n "\${name}/\${name}_model.cif" "${prefix}_alphafold3.cif" + + ## Sort the rows by ranking_score in descending order + sorted_csv=\$(head -n 1 "\${name}/ranking_scores.csv"; tail -n +2 "\${name}/ranking_scores.csv" | sort -t, -k3 -nr) + rank=0 + + ## Create raw directory for intermediate files + mkdir -p raw + + ## Generate files with rank tag in raw directory + echo "\$sorted_csv" | tail -n +2 
| while IFS=',' read -r seed sample ranking_score; do + cp -n "\${name}/seed-\${seed}_sample-\${sample}/model.cif" "raw/seed_\${seed}_sample_\${sample}_ranked_\${rank}.cif" + rank=\$((rank + 1)) + done + + extract_metrics.py --name ${prefix} \\ + --jsons ${af3_id}/${af3_id}_data.json ${af3_id}/${af3_id}_summary_confidences.json ${af3_id}/${af3_id}_confidences.json \\ + --structs raw/*ranked_*.cif + + mv "${prefix}_msa.tsv" "${meta.id}_alphafold3_msa.tsv" + + ## Move alphafold3 output directory to raw for save_intermediates + mv \${name}/* raw/ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + alphafold3: \$(cd /app/alphafold && git rev-parse HEAD 2>/dev/null || echo "unknown") + jax: \$(python3 -c "import jax; print(jax.__version__)" 2>/dev/null || echo "unknown") + jaxlib: \$(python3 -c "import jaxlib; print(jaxlib.__version__)" 2>/dev/null || echo "unknown") + numpy: \$(python3 -c "import numpy; print(numpy.__version__)" 2>/dev/null || echo "unknown") + biopython: \$(python3 -c "import Bio; print(Bio.__version__)" 2>/dev/null || echo "unknown") + hmmer: \$(hmmsearch -h | grep -o '^# HMMER [0-9.]*' | sed 's/^# HMMER //' || echo "unknown") + rdkit: \$(python3 -c "import rdkit; print(rdkit.__version__)" 2>/dev/null || echo "unknown") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p raw + touch ${prefix}_alphafold3.cif + touch raw/${prefix}_ranked_1.cif + touch raw/${prefix}_ranked_2.cif + touch raw/${prefix}_ranked_3.cif + touch raw/${prefix}_ranked_4.cif + touch raw/${prefix}_ranked_5.cif + touch ${prefix}_plddt.tsv + touch ${prefix}_alphafold3_msa.tsv + touch ${prefix}_0_pae.tsv + touch ${prefix}_ptm.tsv + touch ${prefix}_iptm.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version 2>/dev/null | sed 's/Python //g' || echo "unknown") + alphafold3: \$(cd /app/alphafold && git rev-parse HEAD 2>/dev/null || echo 
"unknown") + jax: \$(python3 -c "import jax; print(jax.__version__)" 2>/dev/null || echo "unknown") + jaxlib: \$(python3 -c "import jaxlib; print(jaxlib.__version__)" 2>/dev/null || echo "unknown") + numpy: \$(python3 -c "import numpy; print(numpy.__version__)" 2>/dev/null || echo "unknown") + biopython: \$(python3 -c "import Bio; print(Bio.__version__)" 2>/dev/null || echo "unknown") + hmmer: \$(hmmsearch -h | grep -o '^# HMMER [0-9.]*' | sed 's/^# HMMER //') + rdkit: \$(python3 -c "import rdkit; print(rdkit.__version__)" 2>/dev/null || echo "unknown") + END_VERSIONS + """ +} diff --git a/modules/local/run_boltz/Dockerfile b/modules/local/run_boltz/Dockerfile new file mode 100644 index 000000000..9ebd9e49f --- /dev/null +++ b/modules/local/run_boltz/Dockerfile @@ -0,0 +1,26 @@ +FROM python:3.12-slim + +LABEL org.opencontainers.image.title="nf-core/proteinfold_boltz" \ + org.opencontainers.image.description="Docker image containing all software requirements to run Boltz using the nf-core/proteinfold pipeline" \ + org.opencontainers.image.version="2.0.0" \ + org.opencontainers.image.authors="Jose Espinosa-Carrasco , Ziad Al-Bkhetan " \ + org.opencontainers.image.source="https://github.com/nf-core/proteinfold" \ + org.opencontainers.image.licenses="MIT" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + procps \ + && apt-get autoremove -y \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir boltz==2.2.1 + +# The update to 2.2.1, was complaining about this missing dependencies +RUN pip install --no-cache-dir \ + cuequivariance_ops_cu12==0.8.1 \ + cuequivariance_ops_torch_cu12==0.8.1 \ + cuequivariance_torch==0.8.1 \ + triton==3.3.0 + +CMD ["boltz"] diff --git a/modules/local/run_boltz/main.nf b/modules/local/run_boltz/main.nf new file mode 100644 index 000000000..0960aa4e9 --- /dev/null +++ b/modules/local/run_boltz/main.nf @@ -0,0 +1,109 @@ +/* + * Run Boltz + */ +process RUN_BOLTZ { + tag "$meta.id" + 
label 'process_medium' + label 'process_gpu' + + container "nf-core/proteinfold_boltz:2.0.0" + + input: + tuple val(meta), path(fasta), path(files) + path ('boltz1_conf.ckpt') + path ('ccd.pkl') + path ('boltz2_aff.ckpt') + path ('boltz2_conf.ckpt') + path ('mols') + + output: + tuple val(meta), path ("boltz_results_${meta.id}") , optional: true, emit: intermediates + tuple val(meta), path ("boltz_results_*/processed/msa/*.npz") , emit: msa + tuple val(meta), path ("boltz_results_*/processed/structures/*.npz") , emit: structures + tuple val(meta), path ("boltz_results_*/predictions/*/confidence*.json") , emit: confidence + tuple val(meta), path ("${meta.id}_plddt.tsv") , emit: multiqc + tuple val(meta), path ("${meta.id}_boltz.pdb") , emit: top_ranked_pdb + tuple val(meta), path ("boltz_results_*/predictions/*/*.pdb") , emit: pdb + tuple val(meta), path ("boltz_results_*/predictions/*/plddt_*model_0.npz") , emit: plddt + tuple val(meta), path ("boltz_results_*/predictions/*/pae_*model_0.npz") , emit: pae + tuple val(meta), path ("${meta.id}_plddt.tsv") , emit: plddt_raw + tuple val(meta), path ("${meta.id}_boltz_msa.tsv") , emit: msa_raw + tuple val(meta), path ("${meta.id}_*_pae.tsv") , emit: pae_raw + tuple val(meta), path ("${meta.id}_ptm.tsv") , emit: ptm_raw + tuple val(meta), path ("${meta.id}_iptm.tsv") , optional: true, emit: iptm_raw + tuple val(meta), path ("${meta.id}_chainwise_ptm.tsv") , emit: summary_chainwise_ptm_raw + tuple val(meta), path ("${meta.id}_chainwise_iptm.tsv") , optional: true, emit: chainwise_iptm_raw + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error("Local RUN_BOLTZ module does not support Conda. 
Please use Docker / Singularity / Podman instead.") + } + def args = task.ext.args ?: '' + """ + mkdir -p ./home + export HOME=./home + + [ ! -f mols.tar ] && touch mols.tar + + if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L | grep -q "MIG"; then + echo ">>> MIG mode detected. Mocking pynvml.nvmlDeviceGetNumGpuCores to avoid errors in Boltz. See https://github.com/nf-core/proteinfold/issues/417" + boltz_wrapper.py predict "${fasta}" --output_format "pdb" ${args} --cache ./ + else + boltz predict "${fasta}" --output_format "pdb" ${args} --cache ./ + fi + + cp boltz_results_*/predictions/${meta.id}/*_0.pdb ./${meta.id}_boltz.pdb + if [ -f boltz_results_*/msa/${meta.id}_0.csv ]; then + cp boltz_results_*/msa/${meta.id}_*.csv ./ + fi + + extract_metrics.py --name ${meta.id} \\ + --structs boltz_results_*/predictions/${meta.id}/*.pdb \\ + --jsons boltz_results_*/predictions/${meta.id}/confidence_*_model_*.json \\ + --npzs boltz_results_*/predictions/${meta.id}/pae_*_model_*.npz \\ + --csvs ${meta.id}_*.csv + + mv "${meta.id}_msa.tsv" "${meta.id}_boltz_msa.tsv" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + boltz: \$(pip list | grep -i boltz | awk '{print \$2}' 2>/dev/null || echo "unknown") + END_VERSIONS + """ + + stub: + """ + mkdir -p ./home + export HOME=./home + + mkdir -p boltz_results_${meta.id}/processed/msa/ + mkdir -p boltz_results_${meta.id}/processed/structures/ + mkdir -p boltz_results_${meta.id}/predictions/${meta.id}/ + + touch boltz_results_${meta.id}/processed/msa/${meta.id}.npz + touch boltz_results_${meta.id}/processed/structures/${meta.id}.npz + touch boltz_results_${meta.id}/predictions/${meta.id}/confidence_${meta.id}.json + touch boltz_results_${meta.id}/predictions/${meta.id}/${meta.id}.pdb + touch boltz_results_${meta.id}/predictions/${meta.id}/plddt_${meta.id}_model_0.npz + touch boltz_results_${meta.id}/predictions/${meta.id}/pae_${meta.id}_model_0.npz + + touch "${meta.id}_boltz.pdb" + touch "${meta.id}_plddt.tsv" + 
touch "${meta.id}_boltz_msa.tsv" + touch "${meta.id}_0_pae.tsv" + touch "${meta.id}_ptm.tsv" + touch "${meta.id}_iptm.tsv" + touch "${meta.id}_chainwise_ptm.tsv" + touch "${meta.id}_chainwise_iptm.tsv" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + boltz: \$(pip list | grep -i boltz | awk '{print \$2}' 2>/dev/null || echo "unknown") + END_VERSIONS + """ +} diff --git a/modules/local/run_esmfold.nf b/modules/local/run_esmfold.nf deleted file mode 100644 index 66c5bbc79..000000000 --- a/modules/local/run_esmfold.nf +++ /dev/null @@ -1,57 +0,0 @@ -process RUN_ESMFOLD { - tag "$meta.id" - label 'process_medium' - - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - error("Local RUN_ESMFOLD module does not support Conda. Please use Docker / Singularity / Podman instead.") - } - - container "nf-core/proteinfold_esmfold:1.1.1" - - input: - tuple val(meta), path(fasta) - path ('./checkpoints/') - val numRec - - output: - path ("${fasta.baseName}*.pdb"), emit: pdb - path ("${fasta.baseName}_plddt_mqc.tsv"), emit: multiqc - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def VERSION = '1.0.3' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. - - """ - esm-fold \ - -i ${fasta} \ - -o \$PWD \ - -m \$PWD \ - --num-recycles ${numRec} \ - $args - - awk '{print \$2"\\t"\$3"\\t"\$4"\\t"\$6"\\t"\$11}' "${fasta.baseName}"*.pdb | grep -v 'N/A' | uniq > plddt.tsv - echo -e Atom_serial_number"\\t"Atom_name"\\t"Residue_name"\\t"Residue_sequence_number"\\t"pLDDT > header.tsv - cat header.tsv plddt.tsv > "${fasta.baseName}"_plddt_mqc.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - esm-fold: $VERSION - END_VERSIONS - """ - - stub: - def VERSION = '1.0.3' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
- """ - touch ./"${fasta.baseName}".pdb - touch ./"${fasta.baseName}"_plddt_mqc.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - esm-fold: $VERSION - END_VERSIONS - """ -} diff --git a/modules/local/run_esmfold/Dockerfile b/modules/local/run_esmfold/Dockerfile new file mode 100644 index 000000000..5dd8f5d3e --- /dev/null +++ b/modules/local/run_esmfold/Dockerfile @@ -0,0 +1,74 @@ +ARG CUDA_VERSION=11.7.1 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04 AS builder +# FROM directive resets ARGS, so we specify again (the value is retained if +# previously set). +ARG CUDA_VERSION + +LABEL org.opencontainers.image.title="nf-core/proteinfold_esmfold" \ + org.opencontainers.image.description="Docker image containing all software requirements to run ESMFold using the nf-core/proteinfold pipeline" \ + org.opencontainers.image.version="2.0.0" \ + org.opencontainers.image.authors="Athanasios Baltzis, Jose Espinosa-Carrasco " \ + org.opencontainers.image.source="https://github.com/nf-core/proteinfold" \ + org.opencontainers.image.licenses="MIT" + +# Add env variables +ENV LD_LIBRARY_PATH="/conda/lib:/usr/local/cuda-11.7/lib64:/conda/lib/python3.9/site-packages/nvidia/cusparse/lib:$LD_LIBRARY_PATH" \ + PATH="/conda/bin:$PATH" + +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + wget \ + git \ + && rm -rf /var/lib/apt/lists/* + +# Install Miniconda package manager +# Avoids using defaults channel +RUN wget -q -P /tmp "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" && \ + bash /tmp/Miniforge3-$(uname)-$(uname -m).sh -b -p /conda && \ + rm -rf /tmp/Miniforge3-$(uname)-$(uname -m).sh /var/lib/apt/lists/* && \ + apt-get autoremove -y && apt-get clean -y + +RUN /conda/bin/conda install --quiet -y conda==24.11.1 pip python=3.9 \ + && conda clean --all --force-pkgs-dirs --yes + +RUN 
cd / && /conda/bin/conda update -qy conda \ + && /conda/bin/conda install -y -c conda-forge pip python + +RUN /conda/bin/pip install --no-cache-dir git+https://github.com/facebookresearch/esm.git +RUN /conda/bin/pip install --no-cache-dir "fair-esm[esmfold]" +RUN /conda/bin/pip install --no-cache-dir torch==2.0.0 torchvision==0.15.0 --upgrade --force-reinstall --index-url https://download.pytorch.org/whl/cu118 +RUN /conda/bin/pip install --no-cache-dir \ + pytorch_lightning==1.9.5 \ + biopython==1.85 \ + ## https://github.com/facebookresearch/esm/issues/621#issuecomment-1741684707 + deepspeed==0.10.3 \ + dm-tree==0.1.8 \ + ml-collections==0.1.0 \ + numpy==1.23.5 \ + PyYAML==6.0.2 \ + requests==2.26.0 \ + scipy==1.13.1 \ + tqdm==4.62.2 \ + typing-extensions==4.12.2 \ + wandb==0.12.21 + +RUN /conda/bin/pip install --no-cache-dir 'dllogger @ git+https://github.com/NVIDIA/dllogger.git' + +## https://github.com/aqlaboratory/insilico_design_pipeline/blob/main/README.md#additional-notes +RUN git clone https://github.com/aqlaboratory/openfold.git && \ + cd openfold && \ + git checkout 4b41059694619831a7db195b7e0988fc4ff3a307 && \ + sed -i 's|deepspeed.utils.is_initialized|deepspeed.comm.comm.is_initialized|g' openfold/model/primitives.py && \ + /conda/bin/pip install . && \ + cd .. && rm -rf openfold + +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04 +# FROM directive resets ARGS, so we specify again (the value is retained if +# previously set). +ARG CUDA_VERSION + +COPY --from=builder /conda /conda + +ENV LD_LIBRARY_PATH="/conda/lib:/usr/local/cuda-$(cut -f1,2 -d. 
<<< ${CUDA_VERSION})/lib64:/conda/lib/python3.9/site-packages/nvidia/cusparse/lib:$LD_LIBRARY_PATH" \ + PATH="/conda/bin:$PATH" diff --git a/modules/local/run_esmfold/main.nf b/modules/local/run_esmfold/main.nf new file mode 100644 index 000000000..dc4394c75 --- /dev/null +++ b/modules/local/run_esmfold/main.nf @@ -0,0 +1,71 @@ +process RUN_ESMFOLD { + tag "$meta.id" + label 'process_medium' + label 'process_gpu' + + container "nf-core/proteinfold_esmfold:2.0.0" + + input: + tuple val(meta), path(fasta) + path ('./checkpoints/') + val numRec + + output: + tuple val(meta), path ("${meta.id}_esmfold.pdb") , emit: top_ranked_pdb + tuple val(meta), path ("*.pdb") , emit: pdb + tuple val(meta), path ("${meta.id}_plddt.tsv") , emit: multiqc + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error("Local RUN_ESMFOLD module does not support Conda. Please use Docker / Singularity / Podman instead.") + } + def args = task.ext.args ?: '' + def VERSION = '1.0.3' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + + // KR - note: removed the *.pdb -> tmp.pdb, tmp.pdb -> esmfold.pdb. Why not just take directly? 
+ // Only one .pdb per ESMFold run + """ + esm-fold \ + -i ${fasta} \ + -o \$PWD \ + -m \$PWD \ + --num-recycles ${numRec} \ + $args + + mv *.pdb ${meta.id}_esmfold.pdb + + extract_metrics.py --name ${meta.id} \\ + --structs ${meta.id}_esmfold.pdb + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + esm-fold: $VERSION + python: \$(python3 --version | sed 's/Python //g') + pytorch: \$(python3 -c "import torch; print(torch.__version__)" 2>/dev/null || echo "unknown") + openfold: \$(python -m pip show openfold | grep "^Version" | sed 's/.*Version: //' 2>/dev/null || echo "unknown") + numpy: \$(python3 -c "import numpy; print(numpy.__version__)" 2>/dev/null || echo "unknown") + biopython: \$(python3 -c "import Bio; print(Bio.__version__)" 2>/dev/null || echo "unknown") + END_VERSIONS + """ + + stub: + def VERSION = '1.0.3' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + touch "${meta.id}_esmfold.pdb" + touch "${meta.id}_plddt.tsv" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + esm-fold: $VERSION + python: \$(python3 --version 2>/dev/null | sed 's/Python //g' || echo "unknown") + pytorch: \$(python3 -c "import torch; print(torch.__version__)" 2>/dev/null || echo "unknown") + openfold: \$(python -m pip show openfold 2>/dev/null | grep "^Version" | sed 's/.*Version: //' || echo "unknown") + numpy: \$(python3 -c "import numpy; print(numpy.__version__)" 2>/dev/null || echo "unknown") + biopython: \$(python3 -c "import Bio; print(Bio.__version__)" 2>/dev/null || echo "unknown") + END_VERSIONS + """ +} diff --git a/modules/local/run_helixfold3/Dockerfile b/modules/local/run_helixfold3/Dockerfile new file mode 100644 index 000000000..c697cc9e2 --- /dev/null +++ b/modules/local/run_helixfold3/Dockerfile @@ -0,0 +1,41 @@ +ARG CUDA_VERSION=12.8.1 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 +# FROM directive resets ARGS, so we specify again (the value is retained if 
+# previously set). +ARG CUDA_VERSION + +LABEL org.opencontainers.image.title="nf-core/proteinfold_helixfold3" \ + org.opencontainers.image.description="Docker image containing all software requirements to run the RUN_HELIXFOLD3 module using the nf-core/proteinfold pipeline" \ + org.opencontainers.image.version="2.0.0" \ + org.opencontainers.image.authors="Joshua Caley , Jose Espinosa-Carrasco " \ + org.opencontainers.image.source="https://github.com/nf-core/proteinfold" \ + org.opencontainers.image.licenses="MIT" + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y wget git && \ + wget -q -P /tmp "https://github.com/conda-forge/miniforge/releases/download/24.11.0-0/Miniforge3-$(uname)-$(uname -m).sh" && \ + bash /tmp/Miniforge3-$(uname)-$(uname -m).sh -b -p /conda && \ + git clone --filter=blob:none --no-checkout https://github.com/PaddlePaddle/PaddleHelix.git /app/helixfold3 && \ + cd /app/helixfold3 && \ + git sparse-checkout init --cone && \ + git sparse-checkout set apps/protein_folding/helixfold3 && \ + git checkout 705c2974a833cdc3a4420f4e3379da596091c97f && \ + mv apps/protein_folding/helixfold3/* . 
&& \ + rm -rf /tmp/Miniforge3-$(uname)-$(uname -m).sh /var/lib/apt/lists/* /root/.cache apps && \ + apt-get remove --purge -y wget git && apt-get autoremove -y && apt-get clean -y + +COPY environment.yaml /app/helixfold3/environment.yaml + +ENV PATH="/conda/bin:$PATH" + +RUN mamba env create --file=/app/helixfold3/environment.yaml && \ + mamba install -y -c bioconda aria2 hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 -n helixfold && \ + mamba install -y -c conda-forge openbabel -n helixfold && \ + mamba clean --all --force-pkgs-dirs -y && \ + rm -rf /root/.cache + +ENV PATH="/conda/bin:/app/helixfold3:./maxit_src/bin:/conda/envs/helixfold/bin:$PATH" \ + RCSBROOT="./maxit_src" \ + MAXIT_SRC="./maxit_src" \ + PYTHON_BIN="/conda/envs/helixfold/bin/python3.9" \ + ENV_BIN="/conda/envs/helixfold/bin" \ + OBABEL_BIN="/conda/envs/helixfold/bin" diff --git a/modules/local/run_helixfold3/environment.yaml b/modules/local/run_helixfold3/environment.yaml new file mode 100644 index 000000000..82833a68b --- /dev/null +++ b/modules/local/run_helixfold3/environment.yaml @@ -0,0 +1,35 @@ +name: helixfold +channels: + - conda-forge + - bioconda + - nvidia + - biocore + +dependencies: + - python=3.9 + - cuda-toolkit=12.0 + - cudnn=8.4.0 + - nccl=2.14 + - libgcc=14.2.0 + - libgomp=14.2.0 + - pip=25.0.1 + - aria2=1.37.0 + - hmmer=3.4 + - kalign2=2.04 + - hhsuite=3.3.0 + - openbabel=3.1.1 + - pip: + - paddlepaddle-gpu==2.6.1 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html + - absl-py==0.13.0 + - biopython==1.79 + - chex==0.0.7 + - dm-haiku==0.0.4 + - dm-tree==0.1.6 + - docker==5.0.0 + - immutabledict==2.0.0 + - jax==0.2.14 + - ml-collections==0.1.0 + - pandas==1.3.4 + - scipy==1.9.0 + - rdkit-pypi==2022.9.5 + - posebusters==0.3.6 diff --git a/modules/local/run_helixfold3/main.nf b/modules/local/run_helixfold3/main.nf new file mode 100644 index 000000000..b22417775 --- /dev/null +++ b/modules/local/run_helixfold3/main.nf @@ -0,0 +1,134 @@ +/* + * Run HelixFold3 + */ +process 
RUN_HELIXFOLD3 { + tag "$meta.id" + label 'process_medium' + label 'process_gpu' + + container "nf-core/proteinfold_helixfold3:2.0.0" + + input: + tuple val(meta), path(fasta) + val uniref30_prefix + path ('uniref30/*') + path ('ccd_preprocessed_etkdg.pkl.gz') + path ('Rfam-14.9_rep_seq.fasta') + path ('bfd/*') + path ('small_bfd/*') + path ('uniprot/*') + path ('pdb_seqres/*') + path ('uniref90/*') + path ('mgnify/*') + path ('mmcif_files') + path ('obsolete.dat') + path ('init_models/*') + path ('maxit_src') + + output: + path ("raw/**") , emit: raw + tuple val(meta), path ("${meta.id}_helixfold3.pdb") , emit: top_ranked_pdb + tuple val(meta), path ("${meta.id}_helixfold3.cif") , emit: main_cif + tuple val(meta), path ("raw/ranked*.pdb") , emit: pdb + tuple val(meta), path ("${meta.id}_plddt.tsv") , emit: multiqc + tuple val(meta), path ("${meta.id}_helixfold3_msa.tsv") , emit: msa + // If ${meta.id}-rank*/all_results.json" doesn't have PAE vales in the key, this will be empty + tuple val(meta), path ("${meta.id}_1_pae.tsv") , emit: pae + tuple val(meta), path ("${meta.id}_*_pae.tsv") , emit: paes + tuple val(meta), path ("${meta.id}_ptm.tsv") , emit: ptms + tuple val(meta), path ("${meta.id}_iptm.tsv") , optional: true, emit: iptms + path ("versions.yml") , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error("Local RUN_HELIXFOLD3 module does not support Conda. 
Please use Docker / Singularity / Podman / Apptainer instead.") + } + def args = task.ext.args ?: '' + def VERSION = '705c2974a833cdc3a4420f4e3379da596091c97f' + """ + init_model_path=\$(ls ./init_models/*.pdparams | head -n 1) + mgnify_db_path=\$(ls -v ./mgnify/mgy_clusters*.fa | tail -n 1) + + [ -x ./maxit_src/bin/maxit ] || chmod +x ./maxit_src/bin/maxit + + mamba run --name helixfold python3.9 /app/helixfold3/inference.py \\ + --maxit_binary "./maxit_src/bin/maxit" \\ + --jackhmmer_binary_path "jackhmmer" \\ + --hhblits_binary_path "hhblits" \\ + --hhsearch_binary_path "hhsearch" \\ + --kalign_binary_path "kalign" \\ + --hmmsearch_binary_path "hmmsearch" \\ + --hmmbuild_binary_path "hmmbuild" \\ + --nhmmer_binary_path "nhmmer" \\ + --bfd_database_path="./bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \\ + --small_bfd_database_path="./small_bfd/bfd-first_non_consensus_sequences.fasta" \\ + --uniclust30_database_path="./uniref30/${uniref30_prefix}" \\ + --uniprot_database_path="./uniprot/uniprot.fasta" \\ + --pdb_seqres_database_path="./pdb_seqres/pdb_seqres.txt" \\ + --rfam_database_path="./Rfam-14.9_rep_seq.fasta" \\ + --template_mmcif_dir="./mmcif_files" \\ + --obsolete_pdbs_path="./obsolete.dat" \\ + --ccd_preprocessed_path="./ccd_preprocessed_etkdg.pkl.gz" \\ + --uniref90_database_path "./uniref90/uniref90.fasta" \\ + --mgnify_database_path "\$mgnify_db_path" \\ + --input_json="${fasta}" \\ + --output_dir="\$PWD" \\ + --init_model "\$init_model_path" \\ + $args + + cp "${fasta.baseName}/${fasta.baseName}-rank1/predicted_structure.pdb" "./${meta.id}_helixfold3.pdb" + cp "${fasta.baseName}/${fasta.baseName}-rank1/predicted_structure.cif" "./${meta.id}_helixfold3.cif" + + mamba run --name helixfold extract_metrics.py --name ${meta.id} \\ + --structs ${fasta.baseName}/${fasta.baseName}-rank*/predicted_structure.pdb \\ + --pkls "${fasta.baseName}/final_features.pkl" \\ + --jsons ${fasta.baseName}/${fasta.baseName}-rank*/all_results.json + + mkdir 
-p raw + for i in 1 2 3 4 5; do + cp "${fasta.baseName}/${fasta.baseName}-rank\$i/predicted_structure.pdb" "raw/ranked_\$i.pdb" + done + + mv "${meta.id}_msa.tsv" "${meta.id}_helixfold3_msa.tsv" + mv "${fasta.baseName}" raw/ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version 2>&1 | sed 's/Python //g') + helixfold3: "${VERSION}" + hmmer: \$(hmmsearch -h 2>&1 | grep -o 'HMMER [0-9.]*' | sed 's/HMMER //') + hhsuite: \$(hhblits -h 2>&1 | head -1 | awk '{print \$2}' | tr -d ':') + END_VERSIONS + """ + + stub: + def VERSION = '705c2974a833cdc3a4420f4e3379da596091c97f' + """ + touch "${meta.id}_helixfold3.cif" + touch "${meta.id}_helixfold3.pdb" + touch "${meta.id}_plddt.tsv" + touch "${meta.id}_helixfold3_msa.tsv" + touch "${meta.id}_ptm.tsv" + touch "${meta.id}_iptm.tsv" + touch "${meta.id}_1_pae.tsv" + touch "${meta.id}_2_pae.tsv" + touch "${meta.id}_3_pae.tsv" + touch "${meta.id}_4_pae.tsv" + touch "${meta.id}_5_pae.tsv" + mkdir -p raw + touch "raw/ranked_1.pdb" + touch "raw/ranked_2.pdb" + touch "raw/ranked_3.pdb" + touch "raw/ranked_4.pdb" + touch "raw/ranked_5.pdb" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version 2>/dev/null | sed 's/Python //g' || echo "unknown") + END_VERSIONS + """ +} diff --git a/modules/local/run_rosettafold2na/Dockerfile b/modules/local/run_rosettafold2na/Dockerfile new file mode 100644 index 000000000..702a05159 --- /dev/null +++ b/modules/local/run_rosettafold2na/Dockerfile @@ -0,0 +1,94 @@ +######################################################################## +# Dockerfile for RoseTTAFold2NA (CUDA 12.2), auto-activating conda env +######################################################################## +ARG CUDA_VERSION=12.2.2 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-runtime-ubuntu20.04 AS builder +# FROM directive resets ARGS, so we specify again (the value is retained if +# previously set). 
+ARG CUDA_VERSION + +LABEL org.opencontainers.image.title="nf-core/proteinfold_rosettafold2na" \ + org.opencontainers.image.description="Docker image containing all software requirements to run the RUN_ROSETTAFOLD2NA module using the nf-core/proteinfold pipeline" \ + org.opencontainers.image.version="2.0.0" \ + org.opencontainers.image.authors="Patri Bota , Jose Espinosa-Carrasco " \ + org.opencontainers.image.source="https://github.com/nf-core/proteinfold" \ + org.opencontainers.image.licenses="MIT" + +# Switch default shell to bash for subsequent RUN commands +SHELL ["/bin/bash", "-c"] + +# Add env variables +ENV PYTHONPATH="/app/RoseTTAFold2NA" \ + PATH="/conda/bin:/app/RoseTTAFold2NA:/conda/envs/RF2NA/bin:$PATH" \ + DGLBACKEND="pytorch" \ + LD_LIBRARY_PATH="/usr/local/cuda-$(cut -f1,2 -d. <<< ${CUDA_VERSION})/lib64:$LD_LIBRARY_PATH" \ + LANG=en_US.UTF-8 \ + LC_ALL=en_US.UTF-8 + +######################################################################## +# 1. Install minimal system dependencies and Miniforge (conda/mamba) +######################################################################## +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + wget \ + git \ + curl \ + ncbi-blast+ \ + libopenblas0 \ + libopenblas-dev \ + locales \ + && sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen \ + && locale-gen en_US.UTF-8 \ + && wget -q -P /tmp \ + "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" \ + && bash /tmp/Miniforge3-$(uname)-$(uname -m).sh -b -p /conda \ + && rm -rf /tmp/Miniforge3-$(uname)-$(uname -m).sh /var/lib/apt/lists/* \ + && apt-get autoremove -y && apt-get clean -y + +######################################################################## +# 2. 
Clone RoseTTAFold2NA repo, create conda environment, and pin DGL +######################################################################## + +## Using an add-hoc environment file to update dependencies +COPY environment.yaml ./environment.yaml +RUN /conda/bin/mamba env create -f environment.yaml + +RUN git clone --depth 1 https://github.com/uw-ipd/RoseTTAFold2NA.git /app/RoseTTAFold2NA && \ + cd /app/RoseTTAFold2NA && \ + git checkout f761af286729ea08a6ddab149023c1b73458fbe2 && \ + /conda/bin/mamba run -n RF2NA bash -c "\ + cd SE3Transformer && \ + pip install --no-cache-dir -r requirements.txt && \ + python setup.py install" && \ + /conda/bin/mamba clean --all --force-pkgs-dirs -y + +# Install pip packages for extract_metrics.py +RUN pip install --upgrade pip --no-cache-dir \ + && pip install --no-cache-dir \ + numpy + +# ######################################################################## +# # 3. Final clean-up: remove build tools and caches +# ######################################################################## +RUN apt-get autoremove -y && \ + apt-get remove --purge -y wget git && \ + apt-get clean -y && \ + rm -rf /root/.cache + +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 +# FROM directive resets ARGS, so we specify again (the value is retained if +# previously set). +ARG CUDA_VERSION + +COPY --from=builder /app /app +COPY --from=builder /conda /conda + +# Add env variables +ENV PYTHONPATH="/app/RoseTTAFold2NA" \ + PATH="/conda/bin:/app/RoseTTAFold2NA:/conda/envs/RF2NA/bin:$PATH" \ + DGLBACKEND="pytorch" \ + LD_LIBRARY_PATH="/usr/local/cuda-$(cut -f1,2 -d. 
<<< ${CUDA_VERSION})/lib64:$LD_LIBRARY_PATH" \ + LANG=en_US.UTF-8 \ + LC_ALL=en_US.UTF-8 + +# Make curl available where update_blastdbp.pl expects it +RUN ln -s /conda/envs/RF2NA/bin/curl /usr/bin/curl diff --git a/modules/local/run_rosettafold2na/environment.yaml b/modules/local/run_rosettafold2na/environment.yaml new file mode 100644 index 000000000..3ff9eab39 --- /dev/null +++ b/modules/local/run_rosettafold2na/environment.yaml @@ -0,0 +1,29 @@ +name: RF2NA +dependencies: + - python==3.10.14 + - pip + - bioconda::mafft + - bioconda::hhsuite + - bioconda::blast + - bioconda::hmmer>=3.3 + - bioconda::infernal + - bioconda::cd-hit + - bioconda::csblast + - conda-forge::llvm-openmp + - pip: + - pandas + - torch==2.2.1 + - torchvision + - torchaudio + - torchdata + - torch_geometric + - psutil + - tqdm + - torchdata + - pydantic + - e3nn + - wandb + - pynvml + - git+https://github.com/NVIDIA/dllogger#egg=dllogger + - https://data.dgl.ai/wheels/cu121/dgl-2.0.0%2Bcu121-cp310-cp310-manylinux1_x86_64.whl + - numpy==1.26.4 diff --git a/modules/local/run_rosettafold2na/main.nf b/modules/local/run_rosettafold2na/main.nf new file mode 100644 index 000000000..2650610e7 --- /dev/null +++ b/modules/local/run_rosettafold2na/main.nf @@ -0,0 +1,119 @@ +/* + * Run RF2NA (RoseTTAFold 2 for Nucleic Acids) + */ +process RUN_ROSETTAFOLD2NA { + tag "$meta.id" + label 'process_medium' + label 'process_gpu' + + container "nf-core/proteinfold_rosettafold2na:2.0.0" + + input: + tuple val(meta), path(rf2na_input) + path ('bfd/*') + path ('UniRef30_2020_06/*') + path ('pdb100_2021Mar03/*') + path ('RNA/*') + path ('network/weights/*') + + output: + path ("raw/**") , emit: raw + tuple val(meta), path("${meta.id}_rosettafold2na.pdb") , emit: top_ranked_pdb + tuple val(meta), path("raw/*.pdb") , emit: pdb + tuple val(meta), path("${meta.id}_plddt.tsv") , emit: multiqc + tuple val(meta), path("${meta.id}_rosettafold2na_msa.tsv") , emit: msa + tuple val(meta), path("${meta.id}_0_pae.tsv") , emit: 
pae + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error("Local RUN_RF2NA module does not support Conda. Please use Docker / Singularity / Podman instead.") + } + + def VERSION = 'v0.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + # Otherwise will through error when running .command.{sh,run} for debugging + if [ ! -e "\$PWD/run_RF2NA.sh" ]; then + ln -s /app/RoseTTAFold2NA/run_RF2NA.sh ./ + mkdir ./input_prep + ln -s /app/RoseTTAFold2NA/input_prep/* ./input_prep + ln -s /app/RoseTTAFold2NA/network/* ./network + fi + + rf2na_input_dir="\${rf2na_input:-rf2na_input}" + + chain_map="\${rf2na_input_dir}/chain_map.tsv" + if [ ! -s "\$chain_map" ]; then + echo "[ROSETTAFOLD2NA] Missing chain_map.tsv produced by ROSETTAFOLD2NA_FASTA." >&2 + exit 1 + fi + + chain_args=() + while IFS=\$'\\t' read -r chain_type chain_file _; do + [ -z "\$chain_type" ] && continue + case "\${chain_type}" in + P|R|D|S) ;; + *) echo "[ROSETTAFOLD2NA] Unsupported chain type '\${chain_type}'. Allowed types: P, R, D, S." >&2; exit 1 ;; + esac + chain_args+=( "\${chain_type}:\${rf2na_input_dir}/\${chain_file}" ) + done < <(tail -n +2 "\$chain_map") + + if [ "\${#chain_args[@]}" -eq 0 ]; then + echo "[ROSETTAFOLD2NA] No valid chain specifications found in chain_map.tsv." 
>&2 + exit 1 + fi + + ./run_RF2NA.sh ${meta.id}_rf2na_output "\${chain_args[@]}" + + ## Create raw directory for intermediate files + mkdir -p raw + + ## Copy top ranked model to root and raw + cp ${meta.id}_rf2na_output/models/model_00.pdb ./${meta.id}_rosettafold2na.pdb + cp ${meta.id}_rf2na_output/models/*.pdb raw/ + + # Extract PAE matrix from NPZ and save as TSV for reporting + /conda/envs/RF2NA/bin/python3 - <<'PY' "${meta.id}_rf2na_output/models/model_00.npz" "${meta.id}_0_pae.tsv" +import numpy as np, sys +npz, out = sys.argv[1], sys.argv[2] +d = np.load(npz) +np.savetxt(out, d["pae"], fmt="%.3f", delimiter="\t") +PY + + A3M_ARGS="${'$'}(find "${meta.id}_rf2na_output" -maxdepth 1 -name "*.a3m" -print | sed 's/^/ --a3ms /' | tr -d '\n')" + extract_metrics.py --name ${meta.id} \ + --structs "${meta.id}_rf2na_output/models/model_00.pdb" ${'$'}A3M_ARGS + + mv "${meta.id}_msa.tsv" "${meta.id}_rosettafold2na_msa.tsv" + + ## Move rf2na output directory to raw for save_intermediates + mv ${meta.id}_rf2na_output/* raw/ + +cat <<-END_VERSIONS > versions.yml +"${task.process}": + python: \$(python3 --version | sed 's/Python //g') + rosettafold2na: "${VERSION}" +END_VERSIONS + """ + + stub: + def VERSION = 'v0.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
+ """ + mkdir -p raw + touch "${meta.id}_rosettafold2na.pdb" + touch raw/model_00.pdb + touch "${meta.id}_plddt.tsv" + touch "${meta.id}_0_pae.tsv" + touch "${meta.id}_rosettafold2na_msa.tsv" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version 2>/dev/null | sed 's/Python //g' || echo "unknown") + rosettafold2na: "${VERSION}" + END_VERSIONS + """ +} diff --git a/modules/local/run_rosettafold_all_atom/Dockerfile b/modules/local/run_rosettafold_all_atom/Dockerfile new file mode 100644 index 000000000..fea2e2065 --- /dev/null +++ b/modules/local/run_rosettafold_all_atom/Dockerfile @@ -0,0 +1,53 @@ +ARG CUDA_VERSION=12.6.0 +FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu24.04 AS builder +# FROM directive resets ARGS, so we specify again (the value is retained if +# previously set). +ARG CUDA_VERSION + +LABEL org.opencontainers.image.title="nf-core/proteinfold_rosettafold_all_atom" \ + org.opencontainers.image.description="Docker image containing all software requirements to run the RUN_ROSETTAFOLD_ALL_ATOM module using the nf-core/proteinfold pipeline" \ + org.opencontainers.image.version="2.0.0" \ + org.opencontainers.image.authors="Joshua Caley " \ + org.opencontainers.image.source="https://github.com/nf-core/proteinfold" \ + org.opencontainers.image.licenses="MIT" + +ENV PYTHONPATH="/app/RoseTTAFold-All-Atom" \ + PATH="/conda/bin:/app/RoseTTAFold-All-Atom:$PATH" \ + DGLBACKEND="pytorch" \ + LD_LIBRARY_PATH="/conda/lib:/usr/local/cuda-$(cut -f1,2 -d. 
<<< ${CUDA_VERSION})/lib64:$LD_LIBRARY_PATH" + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y wget git && \ + wget -q -P /tmp "https://github.com/conda-forge/miniforge/releases/download/24.11.0-0/Miniforge3-$(uname)-$(uname -m).sh" && \ + bash /tmp/Miniforge3-$(uname)-$(uname -m).sh -b -p /conda && \ + rm -rf /tmp/Miniforge3-$(uname)-$(uname -m).sh /var/lib/apt/lists/* /root/.cache && \ + git clone --single-branch --depth 1 https://github.com/Australian-Structural-Biology-Computing/RoseTTAFold-All-Atom.git /app/RoseTTAFold-All-Atom && \ + cd /app/RoseTTAFold-All-Atom && \ + git fetch --depth 1 origin e8f94d6d6ddfb07da2119bcfa94359bc6912fd29 && \ + git checkout e8f94d6d6ddfb07da2119bcfa94359bc6912fd29 && \ + /conda/bin/mamba env create --file=environment.yaml && \ + /conda/bin/mamba run -n RFAA bash -c \ + "python /app/RoseTTAFold-All-Atom/rf2aa/SE3Transformer/setup.py install && \ + bash /app/RoseTTAFold-All-Atom/install_dependencies.sh" && \ + /conda/bin/mamba clean --all --force-pkgs-dirs -y && \ + cd /app/RoseTTAFold-All-Atom && \ + wget https://ftp.ncbi.nlm.nih.gov/blast/executables/legacy.NOTSUPPORTED/2.2.26/blast-2.2.26-x64-linux.tar.gz && \ + mkdir -p blast-2.2.26 && \ + tar -xf blast-2.2.26-x64-linux.tar.gz -C blast-2.2.26 && \ + cp -r blast-2.2.26/blast-2.2.26/ blast-2.2.26_bk && \ + rm -r blast-2.2.26 && \ + mv blast-2.2.26_bk/ blast-2.2.26 && \ + rm -rf /root/.cache *.tar.gz && \ + apt-get remove --purge -y wget git && apt-get autoremove -y && apt-get clean -y + +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu24.04 +# FROM directive resets ARGS, so we specify again (the value is retained if +# previously set). +ARG CUDA_VERSION + +COPY --from=builder /app /app +COPY --from=builder /conda /conda + +ENV PYTHONPATH="/app/RoseTTAFold-All-Atom" \ + PATH="/conda/bin:/app/RoseTTAFold-All-Atom:$PATH" \ + DGLBACKEND="pytorch" \ + LD_LIBRARY_PATH="/conda/lib:/usr/local/cuda-$(cut -f1,2 -d. 
<<< ${CUDA_VERSION})/lib64:$LD_LIBRARY_PATH" diff --git a/modules/local/run_rosettafold_all_atom/main.nf b/modules/local/run_rosettafold_all_atom/main.nf new file mode 100644 index 000000000..2c1147fa0 --- /dev/null +++ b/modules/local/run_rosettafold_all_atom/main.nf @@ -0,0 +1,88 @@ +/* + * Run RoseTTAFold_All_Atom + */ +process RUN_ROSETTAFOLD_ALL_ATOM { + tag "$meta.id" + label 'process_medium' + label 'process_gpu' + + container "nf-core/proteinfold_rosettafold_all_atom:2.0.0" + + input: + tuple val(meta), path(yaml) + val uniref30_prefix + path ('bfd/*') + path ('uniref30/*') + path ('pdb100_2021Mar03/*') + path ('RFAA_paper_weights.pt') + path (fasta_files) + + output: + path ("raw/**") , emit: raw + tuple val(meta), path ("${meta.id}_rosettafold_all_atom.pdb") , emit: pdb + tuple val(meta), path ("${meta.id}_plddt.tsv") , emit: multiqc + tuple val(meta), path ("${meta.id}_rosettafold_all_atom_msa.tsv") , emit: msa + // I think there should always be PAE from the .pt PyTorch model. extract_metrics.py has condition import torch to handle this + tuple val(meta), path ("${meta.id}_*_pae.tsv") , emit: paes + tuple val(meta), path ("${meta.id}_0_pae.tsv") , emit: pae + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error("Local RUN_ROSETTAFOLD_ALL_ATOM module does not support Conda. Please use Docker / Singularity / Podman instead.") + } + def args = task.ext.args ?: '' + """ + export DB_UR30="uniref30/${uniref30_prefix}" + mamba run --name RFAA python /app/RoseTTAFold-All-Atom/rf2aa/run_inference.py \\ + --config-dir /app/RoseTTAFold-All-Atom/rf2aa/config/inference \\ + --config-name "${yaml}" $args + + # Temporary hack - maybe better to sanitize YAML - job_name -> meta.id? 
+ yaml_name="\$(grep ^job_name ${yaml} | awk '{print \$2}' | sed 's/\"//g')" + + cp "\$yaml_name".pdb "${meta.id}"_rosettafold_all_atom.pdb + + mamba run --name RFAA extract_metrics.py --name ${meta.id} \\ + --structs "${meta.id}_rosettafold_all_atom.pdb" \\ + --a3ms "\$yaml_name"/*/t000_.msa0.a3m \\ + --pts "\$yaml_name"_aux.pt + + mv "${meta.id}_msa.tsv" "${meta.id}_rosettafold_all_atom_msa.tsv" + + mkdir -p raw + if [[ -d "\$yaml_name" ]]; then + mv "\$yaml_name" raw/ + fi + if [[ -f "\${yaml_name}_aux.pt" ]]; then + mv "\${yaml_name}_aux.pt" raw/ + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + rosettafold-all-atom: \$(cd /app/RoseTTAFold-All-Atom && git rev-parse HEAD 2>/dev/null || echo "unknown") + END_VERSIONS + """ + + stub: + """ + touch "${meta.id}_rosettafold_all_atom.pdb" + touch "${meta.id}.pdb" + touch "${meta.id}_plddt.tsv" + touch "${meta.id}_rosettafold_all_atom_msa.tsv" + touch "${meta.id}_0_pae.tsv" + mkdir -p raw + touch raw/${meta.id}_aux.pt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version 2>/dev/null | sed 's/Python //g' || echo "unknown") + rosettafold-all-atom: \$(cd /app/RoseTTAFold-All-Atom && git rev-parse HEAD 2>/dev/null || echo "unknown") + END_VERSIONS + """ +} diff --git a/modules/local/split_msa/environment.yml b/modules/local/split_msa/environment.yml new file mode 100644 index 000000000..012de0929 --- /dev/null +++ b/modules/local/split_msa/environment.yml @@ -0,0 +1,6 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.8.3 diff --git a/modules/local/split_msa/main.nf b/modules/local/split_msa/main.nf new file mode 100644 index 000000000..f207fbdbc --- /dev/null +++ b/modules/local/split_msa/main.nf @@ -0,0 +1,40 @@ +process SPLIT_MSA { + tag "$meta.id" + label 
'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'quay.io/biocontainers/python:3.8.3' }" + + input: + tuple val(meta), path(msa) + output: + tuple val(meta), path ("output_msa/*.csv"), emit: msa_csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + msa_manager.py ${msa} -o output_msa --meta_id ${meta.id} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + """ + mkdir output_msa + touch "output_msa/A.csv" + touch "output_msa/B.csv" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/zstd_decompress/environment.yml b/modules/local/zstd_decompress/environment.yml new file mode 100644 index 000000000..f7762a422 --- /dev/null +++ b/modules/local/zstd_decompress/environment.yml @@ -0,0 +1,5 @@ +name: zstd_decompress +channels: + - conda-forge +dependencies: + - conda-forge::zstd=1.5.6 diff --git a/modules/local/zstd_decompress/main.nf b/modules/local/zstd_decompress/main.nf new file mode 100644 index 000000000..eb3c3be54 --- /dev/null +++ b/modules/local/zstd_decompress/main.nf @@ -0,0 +1,45 @@ +process ZSTD_DECOMPRESS { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0a/0a27033ae5d8add5059f44c62a6004bfcd061d33020edee095fbb204e6f32fee/data' : + 'community.wave.seqera.io/library/zstd:b5faa75d5b75be7f' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$prefix"), emit: decompressed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.baseName.toString().replaceFirst(/\.zst$/, "")) + """ + zstd \\ + --decompress \\ + $args \\ + $archive + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + zstd: \$(echo \$(zstd --version 2>&1) | grep -o 'v[0-9]\\+\\.[0-9]\\+\\.[0-9]\\+') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.baseName.toString().replaceFirst(/\.zst$/, "")) + """ + touch ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + zstd: \$(echo \$(zstd --version 2>&1) | grep -o 'v[0-9]\\+\\.[0-9]\\+\\.[0-9]\\+') + END_VERSIONS + """ +} diff --git a/modules/nf-core/aria2/aria2.diff b/modules/nf-core/aria2/aria2.diff index e22fe2cfc..15f7de50f 100644 --- a/modules/nf-core/aria2/aria2.diff +++ b/modules/nf-core/aria2/aria2.diff @@ -6,7 +6,7 @@ Changes in module 'nf-core/aria2' - tag "$meta.id" + tag "$source_url" label 'process_single' - + conda "${moduleDir}/environment.yml" ************************************************************ diff --git a/modules/nf-core/aria2/environment.yml b/modules/nf-core/aria2/environment.yml index 5dc58a072..50c54a6e9 100644 --- a/modules/nf-core/aria2/environment.yml +++ b/modules/nf-core/aria2/environment.yml @@ -2,6 +2,5 @@ name: aria2 channels: - conda-forge - bioconda - - defaults dependencies: - conda-forge::aria2=1.36.0 diff --git a/modules/nf-core/aria2/tests/main.nf.test.snap b/modules/nf-core/aria2/tests/main.nf.test.snap index 96911f636..6af00d29a 100644 --- 
a/modules/nf-core/aria2/tests/main.nf.test.snap +++ b/modules/nf-core/aria2/tests/main.nf.test.snap @@ -57,4 +57,4 @@ ], "timestamp": "2023-12-14T17:34:22.216677" } -} \ No newline at end of file +} diff --git a/modules/nf-core/foldseek/easysearch/environment.yml b/modules/nf-core/foldseek/easysearch/environment.yml new file mode 100644 index 000000000..b9dcc6e2e --- /dev/null +++ b/modules/nf-core/foldseek/easysearch/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::foldseek=9.427df8a diff --git a/modules/nf-core/foldseek/easysearch/foldseek-easysearch.diff b/modules/nf-core/foldseek/easysearch/foldseek-easysearch.diff new file mode 100644 index 000000000..26eebe6f8 --- /dev/null +++ b/modules/nf-core/foldseek/easysearch/foldseek-easysearch.diff @@ -0,0 +1,46 @@ +Changes in component 'nf-core/foldseek/easysearch' +'modules/nf-core/foldseek/easysearch/environment.yml' is unchanged +'modules/nf-core/foldseek/easysearch/meta.yml' is unchanged +Changes in 'foldseek/easysearch/main.nf': +--- modules/nf-core/foldseek/easysearch/main.nf ++++ modules/nf-core/foldseek/easysearch/main.nf +@@ -12,7 +12,8 @@ + tuple val(meta_db), path(db) + + output: +- tuple val(meta), path("${meta.id}.m8"), emit: aln ++ tuple val(meta), path("${meta.id}.m8"), emit: aln, optional: true ++ tuple val(meta), path("${meta.id}_${meta.model.toLowerCase()}_foldseek.html"), emit: report, optional: true + path "versions.yml" , emit: versions + + when: +@@ -21,12 +22,16 @@ + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" ++ def output_file = "${prefix}.m8" ++ if (args.contains("--format-mode 3")){ ++ output_file = "${meta.id}_${meta.model.toLowerCase()}_foldseek.html" ++ } + """ + foldseek \\ + easy-search \\ + ${pdb} \\ + ${db}/${meta_db.id} \\ +- ${prefix}.m8 \\ ++ ${output_file} \\ + 
tmpFolder \\ + ${args} + +@@ -40,6 +45,7 @@ + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.m8 ++ touch ${prefix}.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +'modules/nf-core/foldseek/easysearch/tests/main.nf.test.snap' is unchanged +'modules/nf-core/foldseek/easysearch/tests/main.nf.test' is unchanged +************************************************************ diff --git a/modules/nf-core/foldseek/easysearch/main.nf b/modules/nf-core/foldseek/easysearch/main.nf new file mode 100644 index 000000000..eab305db7 --- /dev/null +++ b/modules/nf-core/foldseek/easysearch/main.nf @@ -0,0 +1,55 @@ +process FOLDSEEK_EASYSEARCH { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/foldseek:9.427df8a--pl5321hb365157_0': + 'biocontainers/foldseek:9.427df8a--pl5321hb365157_0' }" + + input: + tuple val(meta) , path(pdb) + tuple val(meta_db), path(db) + + output: + tuple val(meta), path("${meta.id}.m8"), emit: aln, optional: true + tuple val(meta), path("${meta.id}_${meta.model.toLowerCase()}_foldseek.html"), emit: report, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def output_file = "${prefix}.m8" + if (args.contains("--format-mode 3")){ + output_file = "${meta.id}_${meta.model.toLowerCase()}_foldseek.html" + } + """ + foldseek \\ + easy-search \\ + ${pdb} \\ + ${db}/${meta_db.id} \\ + ${output_file} \\ + tmpFolder \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + foldseek: \$(foldseek --help | grep Version | sed 's/.*Version: //') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.m8 + touch ${prefix}.html + + cat 
<<-END_VERSIONS > versions.yml + "${task.process}": + foldseek: \$(foldseek --help | grep Version | sed 's/.*Version: //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/foldseek/easysearch/meta.yml b/modules/nf-core/foldseek/easysearch/meta.yml new file mode 100644 index 000000000..772f135b7 --- /dev/null +++ b/modules/nf-core/foldseek/easysearch/meta.yml @@ -0,0 +1,62 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "foldseek_easysearch" +description: Search for protein structural hits against a foldseek database of protein + structures +keywords: + - protein + - structure + - comparisons +tools: + - "foldseek": + description: "Foldseek: fast and accurate protein structure search" + homepage: "https://search.foldseek.com/search" + documentation: "https://github.com/steineggerlab/foldseek" + tool_dev_url: "https://github.com/steineggerlab/foldseek" + doi: "10.1038/s41587-023-01773-0" + licence: ["GPL v3"] + identifier: biotools:foldseek +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - pdb: + type: file + description: Protein structure(s) in PDB, mmCIF or mmJSON format to compare + against a foldseek database (also works with folder input) + pattern: "*.{pdb,mmcif,mmjson}" + ontologies: [] + - - meta_db: + type: map + description: | + Groovy Map containing sample information for the foldseek db + e.g. `[ id:'test', single_end:false ]` + - db: + type: directory + description: foldseek database from protein structures + pattern: "*" +output: + aln: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test', single_end:false ]` + - ${meta.id}.m8: + type: file + description: | + Structural comparisons file output + Query, Target, Identity, Alignment length, Mismatches, Gap openings, + Query start, Query end, Target start, Target end, E-value, Bit score + pattern: "*.{m8}" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@vagkaratzas" diff --git a/modules/nf-core/foldseek/easysearch/tests/main.nf.test b/modules/nf-core/foldseek/easysearch/tests/main.nf.test new file mode 100644 index 000000000..c71e2743e --- /dev/null +++ b/modules/nf-core/foldseek/easysearch/tests/main.nf.test @@ -0,0 +1,66 @@ +nextflow_process { + + name "Test Process FOLDSEEK_EASYSEARCH" + script "../main.nf" + process "FOLDSEEK_EASYSEARCH" + tag "modules" + tag "modules_nfcore" + tag "foldseek" + tag "foldseek/createdb" + tag "foldseek/easysearch" + + setup { + run("FOLDSEEK_CREATEDB") { + script "../../createdb/main.nf" + process { + """ + input[0] = [ [ id:'test_db' ], [ file(params.modules_testdata_base_path + 'proteomics/pdb/1tim.pdb', checkIfExists: true) ] ] + """ + } + } + } + + test("proteomics - pdb") { + + when { + process { + """ + input[0] = [ [ id:'test_search' ], [ file(params.modules_testdata_base_path + 'proteomics/pdb/8tim.pdb', checkIfExists: true) ] ] + input[1] = FOLDSEEK_CREATEDB.out.db + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.aln.get(0).get(1)).readLines().contains("8tim_A\t1tim_A\t0.967\t247\t8\t0\t1\t247\t1\t247\t1.152E-43\t1523") }, + { assert process.out.versions } + ) + } + + } + + test("proteomics - pdb -stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [ id:'test_search' ], [ file(params.modules_testdata_base_path + 'proteomics/pdb/8tim.pdb', checkIfExists: true) ] ] + input[1] = FOLDSEEK_CREATEDB.out.db + """ + } + 
} + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/foldseek/easysearch/tests/main.nf.test.snap b/modules/nf-core/foldseek/easysearch/tests/main.nf.test.snap new file mode 100644 index 000000000..819648dd0 --- /dev/null +++ b/modules/nf-core/foldseek/easysearch/tests/main.nf.test.snap @@ -0,0 +1,31 @@ +{ + "proteomics - pdb -stub": { + "content": [ + { + "0": [ + [ + { + "id": "test_search" + }, + "test_search.m8:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,ddc75b2e08b63a7082ecad353073fd3b" + ], + "aln": [ + [ + { + "id": "test_search" + }, + "test_search.m8:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,ddc75b2e08b63a7082ecad353073fd3b" + ] + } + ], + "timestamp": "2024-07-02T13:55:57.915188646" + } +} \ No newline at end of file diff --git a/modules/nf-core/mmseqs/createindex/environment.yml b/modules/nf-core/mmseqs/createindex/environment.yml index 71ebe34c5..072223f2a 100644 --- a/modules/nf-core/mmseqs/createindex/environment.yml +++ b/modules/nf-core/mmseqs/createindex/environment.yml @@ -1,7 +1,7 @@ -name: mmseqs_createindex +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - conda-forge - bioconda - - defaults dependencies: - - bioconda::mmseqs2=15.6f452 + - bioconda::mmseqs2=18.8cc5c diff --git a/modules/nf-core/mmseqs/createindex/main.nf b/modules/nf-core/mmseqs/createindex/main.nf index 4e9c82a34..fbff56fe2 100644 --- a/modules/nf-core/mmseqs/createindex/main.nf +++ b/modules/nf-core/mmseqs/createindex/main.nf @@ -1,18 +1,19 @@ process MMSEQS_CREATEINDEX { tag "${meta.id}" label 'process_high' + label 'process_high_memory' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mmseqs2:15.6f452--pl5321h6a68c12_0': - 'biocontainers/mmseqs2:15.6f452--pl5321h6a68c12_0' }" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/fe/fe49c17754753d6cd9a31e5894117edaf1c81e3d6053a12bf6dc8f3af1dffe23/data' + : 'community.wave.seqera.io/library/mmseqs2:18.8cc5c--af05c9a98d9f6139'}" input: tuple val(meta), path(db) output: - tuple val(meta), path(db) , emit: db_indexed - path "versions.yml" , emit: versions + tuple val(meta), path(db), emit: db_indexed + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when @@ -20,18 +21,16 @@ process MMSEQS_CREATEINDEX { script: def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: "*.dbtype" - def prefix = task.ext.prefix ?: "${meta.id}" - """ - DB_INPUT_PATH_NAME=\$(find -L "$db/" -maxdepth 1 -name "$args2" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' ) + DB_INPUT_PATH_NAME=\$(find -L "${db}/" -maxdepth 1 -name "${args2}" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' ) mmseqs \\ createindex \\ \${DB_INPUT_PATH_NAME} \\ tmp1 \\ - $args \\ + ${args} \\ --threads ${task.cpus} \\ - --compressed 1 + --split-memory-limit ${(task.memory.toGiga() * 0.8) as int}G cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -40,10 +39,11 @@ process MMSEQS_CREATEINDEX { """ stub: + def args2 = task.ext.args2 ?: "*.dbtype" """ - DB_INPUT_PATH_NAME=\$(find -L "$db/" -maxdepth 1 -name "$args2" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' ) + DB_INPUT_PATH_NAME=\$(find -L "${db}/" -maxdepth 1 -name "${args2}" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' ) - touch "\${DB_PATH_NAME}.idx" + touch "\${DB_INPUT_PATH_NAME}.idx" cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git 
a/modules/nf-core/mmseqs/createindex/meta.yml b/modules/nf-core/mmseqs/createindex/meta.yml index 0e34e5bb5..58b65c7f9 100644 --- a/modules/nf-core/mmseqs/createindex/meta.yml +++ b/modules/nf-core/mmseqs/createindex/meta.yml @@ -8,34 +8,45 @@ keywords: - indexing tools: - "mmseqs": - description: "MMseqs2: ultra fast and sensitive sequence search and clustering suite" + description: "MMseqs2: ultra fast and sensitive sequence search and clustering + suite" homepage: "https://github.com/soedinglab/MMseqs2" documentation: "https://mmseqs.com/latest/userguide.pdf" tool_dev_url: "https://github.com/soedinglab/MMseqs2" doi: "10.1093/bioinformatics/btw006" licence: ["GPL v3"] + identifier: biotools:mmseqs input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. `[ id:'test', single_end:false ]` - - db: - type: directory - description: | - Directory containing the DB to be indexed - pattern: "*" + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test', single_end:false ]` + - db: + type: directory + description: | + Directory containing the DB to be indexed + pattern: "*" output: - - versions: - type: file - description: | - File containing software versions - pattern: "versions.yml" - - db_indexed: - type: directory - description: | - Directory containing the DB and the generated indexes - pattern: "*" + db_indexed: + - - meta: + type: directory + description: | + Directory containing the DB and the generated indexes + pattern: "*" + - db: + type: directory + description: | + Directory containing the DB and the generated indexes + pattern: "*" + versions: + - versions.yml: + type: file + description: | + File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML authors: - "@JoseEspinosa" maintainers: diff --git a/modules/nf-core/mmseqs/createindex/tests/main.nf.test b/modules/nf-core/mmseqs/createindex/tests/main.nf.test index f47ccc375..30bcc9276 100644 --- a/modules/nf-core/mmseqs/createindex/tests/main.nf.test +++ b/modules/nf-core/mmseqs/createindex/tests/main.nf.test @@ -8,6 +8,7 @@ nextflow_process { tag "modules_nfcore" tag "mmseqs" tag "mmseqs/createindex" + tag "untar" test("sars-cov-2 - mmseqs.tar.gz") { @@ -16,7 +17,7 @@ nextflow_process { script "../../../untar/main.nf" process { """ - input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['mmseqs_tar_gz'], checkIfExists: true) ] + input[0] = [ [id:'test'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/mmseqs.tar.gz', checkIfExists: true) ] """ } } diff --git a/modules/nf-core/mmseqs/createindex/tests/main.nf.test.snap b/modules/nf-core/mmseqs/createindex/tests/main.nf.test.snap index b0fde15d6..499f8ab9c 100644 --- a/modules/nf-core/mmseqs/createindex/tests/main.nf.test.snap +++ b/modules/nf-core/mmseqs/createindex/tests/main.nf.test.snap @@ -2,10 +2,14 @@ "versions": { "content": [ [ - 
"versions.yml:md5,63073370e7d20afd4773f5e6e7581582" + "versions.yml:md5,35132b6bcef6fb9a674a5f346c57282d" ] ], - "timestamp": "2023-11-28T11:59:51.777561277" + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.0" + }, + "timestamp": "2025-11-01T16:18:37.261481775" }, "createindex_filtered_files": { "content": [ @@ -14,7 +18,7 @@ "mmseqs.dbtype:md5,f1d3ff8443297732862df21dc4e57262", "mmseqs.fasta:md5,b40600ad3be77f076df716e6cf99c64c", "mmseqs.idx.dbtype:md5,9a198d4f48144e20661df7fd2dc41bf7", - "mmseqs.idx.index:md5,deac42b3a049a1c8a251d81b6b9538c9", + "mmseqs.idx.index:md5,3451d87bd8a5a7182d36d53edd213c59", "mmseqs.index:md5,c012bdab1c61eeafcb99d1b26650f3d0", "mmseqs.lookup:md5,fa898551a6b303614ae6e29c237b7fc6", "mmseqs.source:md5,16bef02c30aadbfa8d035596502f0aa2", @@ -49,6 +53,10 @@ "mmseqs_seq_h.index:md5,0040e6a02964914a87ef1efbe9011cbf" ] ], - "timestamp": "2023-11-28T11:59:51.728054679" + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.0" + }, + "timestamp": "2025-11-01T16:18:37.253778342" } } \ No newline at end of file diff --git a/modules/nf-core/mmseqs/createindex/tests/tags.yml b/modules/nf-core/mmseqs/createindex/tests/tags.yml deleted file mode 100644 index 773b4adcd..000000000 --- a/modules/nf-core/mmseqs/createindex/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -mmseqs/createindex: - - modules/nf-core/mmseqs/createindex/** diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml index ca39fb67e..d02016a00 100644 --- a/modules/nf-core/multiqc/environment.yml +++ b/modules/nf-core/multiqc/environment.yml @@ -1,7 +1,7 @@ -name: multiqc +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - conda-forge - bioconda - - defaults dependencies: - - bioconda::multiqc=1.21 + - bioconda::multiqc=1.32 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index bef8f50b2..8f2086b4f 100644 --- 
a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -1,16 +1,18 @@ process MULTIQC { label 'process_single' - + tag "$meta.model" conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.21--pyhdfd78af_0' : - 'biocontainers/multiqc:1.21--pyhdfd78af_0' }" + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/8c/8c6c120d559d7ee04c7442b61ad7cf5a9e8970be5feefb37d68eeaa60c1034eb/data' : + 'community.wave.seqera.io/library/multiqc:1.32--d58f60e4deb769bf' }" input: - path multiqc_files + tuple val(meta), path(multiqc_files) path(multiqc_config) path(extra_multiqc_config) path(multiqc_logo) + path(replace_names) + path(sample_names) output: path "*multiqc_report.html", emit: report @@ -23,16 +25,22 @@ process MULTIQC { script: def args = task.ext.args ?: '' + def prefix = task.ext.prefix ? "--filename ${task.ext.prefix}.html" : "--filename ${meta.model}_multiqc_report.html" def config = multiqc_config ? "--config $multiqc_config" : '' def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' - def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' + def logo = multiqc_logo ? "--cl-config 'custom_logo: \"${multiqc_logo}\"'" : '' + def replace = replace_names ? "--replace-names ${replace_names}" : '' + def samples = sample_names ? "--sample-names ${sample_names}" : '' """ multiqc \\ --force \\ $args \\ $config \\ + $prefix \\ $extra_config \\ $logo \\ + $replace \\ + $samples \\ . 
cat <<-END_VERSIONS > versions.yml @@ -44,7 +52,7 @@ process MULTIQC { stub: """ mkdir multiqc_data - touch multiqc_plots + mkdir multiqc_plots touch multiqc_report.html cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index 45a9bc35e..ce30eb732 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,5 +1,6 @@ name: multiqc -description: Aggregate results from bioinformatics analyses across many samples into a single report +description: Aggregate results from bioinformatics analyses across many samples into + a single report keywords: - QC - bioinformatics tools @@ -12,40 +13,73 @@ tools: homepage: https://multiqc.info/ documentation: https://multiqc.info/docs/ licence: ["GPL-3.0-or-later"] + identifier: biotools:multiqc input: - multiqc_files: type: file description: | List of reports / files recognised by MultiQC, for example the html and zip output of FastQC + ontologies: [] - multiqc_config: type: file description: Optional config yml for MultiQC pattern: "*.{yml,yaml}" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML - extra_multiqc_config: type: file - description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. + description: Second optional config yml for MultiQC. Will override common sections + in multiqc_config. pattern: "*.{yml,yaml}" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML - multiqc_logo: type: file description: Optional logo file for MultiQC pattern: "*.{png}" -output: - - report: - type: file - description: MultiQC report file - pattern: "multiqc_report.html" - - data: - type: directory - description: MultiQC data dir - pattern: "multiqc_data" - - plots: + ontologies: [] + - replace_names: type: file - description: Plots created by MultiQC - pattern: "*_data" - - versions: + description: | + Optional two-column sample renaming file. 
First column a set of + patterns, second column a set of corresponding replacements. Passed via + MultiQC's `--replace-names` option. + pattern: "*.{tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + - sample_names: type: file - description: File containing software versions - pattern: "versions.yml" + description: | + Optional TSV file with headers, passed to the MultiQC --sample_names + argument. + pattern: "*.{tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV +output: + report: + - "*multiqc_report.html": + type: file + description: MultiQC report file + pattern: "multiqc_report.html" + ontologies: [] + data: + - "*_data": + type: directory + description: MultiQC data dir + pattern: "multiqc_data" + plots: + - "*_plots": + type: file + description: Plots created by MultiQC + pattern: "*_data" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML authors: - "@abhi18av" - "@bunop" diff --git a/modules/nf-core/multiqc/multiqc.diff b/modules/nf-core/multiqc/multiqc.diff index d0efc03ac..d1b0c7d1c 100644 --- a/modules/nf-core/multiqc/multiqc.diff +++ b/modules/nf-core/multiqc/multiqc.diff @@ -1,14 +1,272 @@ Changes in module 'nf-core/multiqc' +--- modules/nf-core/multiqc/meta.yml ++++ modules/nf-core/multiqc/meta.yml +@@ -1,5 +1,6 @@ + name: multiqc +-description: Aggregate results from bioinformatics analyses across many samples into a single report ++description: Aggregate results from bioinformatics analyses across many samples into ++ a single report + keywords: + - QC + - bioinformatics tools +@@ -12,40 +13,59 @@ + homepage: https://multiqc.info/ + documentation: https://multiqc.info/docs/ + licence: ["GPL-3.0-or-later"] ++ identifier: biotools:multiqc + input: +- - multiqc_files: +- type: file +- description: | +- List of reports / files recognised by MultiQC, for example 
the html and zip output of FastQC +- - multiqc_config: +- type: file +- description: Optional config yml for MultiQC +- pattern: "*.{yml,yaml}" +- - extra_multiqc_config: +- type: file +- description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. +- pattern: "*.{yml,yaml}" +- - multiqc_logo: +- type: file +- description: Optional logo file for MultiQC +- pattern: "*.{png}" ++ - - multiqc_files: ++ type: file ++ description: | ++ List of reports / files recognised by MultiQC, for example the html and zip output of FastQC ++ - - multiqc_config: ++ type: file ++ description: Optional config yml for MultiQC ++ pattern: "*.{yml,yaml}" ++ - - extra_multiqc_config: ++ type: file ++ description: Second optional config yml for MultiQC. Will override common sections ++ in multiqc_config. ++ pattern: "*.{yml,yaml}" ++ - - multiqc_logo: ++ type: file ++ description: Optional logo file for MultiQC ++ pattern: "*.{png}" ++ - - replace_names: ++ type: file ++ description: | ++ Optional two-column sample renaming file. First column a set of ++ patterns, second column a set of corresponding replacements. Passed via ++ MultiQC's `--replace-names` option. ++ pattern: "*.{tsv}" ++ - - sample_names: ++ type: file ++ description: | ++ Optional TSV file with headers, passed to the MultiQC --sample_names ++ argument. 
++ pattern: "*.{tsv}" + output: + - report: +- type: file +- description: MultiQC report file +- pattern: "multiqc_report.html" ++ - "*multiqc_report.html": ++ type: file ++ description: MultiQC report file ++ pattern: "multiqc_report.html" + - data: +- type: directory +- description: MultiQC data dir +- pattern: "multiqc_data" ++ - "*_data": ++ type: directory ++ description: MultiQC data dir ++ pattern: "multiqc_data" + - plots: +- type: file +- description: Plots created by MultiQC +- pattern: "*_data" ++ - "*_plots": ++ type: file ++ description: Plots created by MultiQC ++ pattern: "*_data" + - versions: +- type: file +- description: File containing software versions +- pattern: "versions.yml" ++ - versions.yml: ++ type: file ++ description: File containing software versions ++ pattern: "versions.yml" + authors: + - "@abhi18av" + - "@bunop" + --- modules/nf-core/multiqc/main.nf +++ modules/nf-core/multiqc/main.nf -@@ -7,7 +7,7 @@ - 'biocontainers/multiqc:1.21--pyhdfd78af_0' }" - +@@ -3,14 +3,16 @@ + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? +- 'https://depot.galaxyproject.org/singularity/multiqc:1.21--pyhdfd78af_0' : +- 'biocontainers/multiqc:1.21--pyhdfd78af_0' }" ++ 'https://depot.galaxyproject.org/singularity/multiqc:1.25.1--pyhdfd78af_0' : ++ 'biocontainers/multiqc:1.25.1--pyhdfd78af_0' }" + input: - path multiqc_files, stageAs: "?/*" -+ path multiqc_files ++ tuple val(meta), path(multiqc_files) path(multiqc_config) path(extra_multiqc_config) path(multiqc_logo) ++ path(replace_names) ++ path(sample_names) + + output: + path "*multiqc_report.html", emit: report +@@ -23,16 +25,22 @@ + + script: + def args = task.ext.args ?: '' ++ def prefix = task.ext.prefix ? "--filename ${task.ext.prefix}.html" : "--filename ${meta.model}_multiqc_report.html" + def config = multiqc_config ? "--config $multiqc_config" : '' + def extra_config = extra_multiqc_config ? 
"--config $extra_multiqc_config" : '' +- def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' ++ def logo = multiqc_logo ? "--cl-config 'custom_logo: \"${multiqc_logo}\"'" : '' ++ def replace = replace_names ? "--replace-names ${replace_names}" : '' ++ def samples = sample_names ? "--sample-names ${sample_names}" : '' + """ + multiqc \\ + --force \\ + $args \\ + $config \\ ++ $prefix \\ + $extra_config \\ + $logo \\ ++ $replace \\ ++ $samples \\ + . + + cat <<-END_VERSIONS > versions.yml +@@ -44,7 +52,7 @@ + stub: + """ + mkdir multiqc_data +- touch multiqc_plots ++ mkdir multiqc_plots + touch multiqc_report.html + + cat <<-END_VERSIONS > versions.yml + +--- modules/nf-core/multiqc/environment.yml ++++ modules/nf-core/multiqc/environment.yml +@@ -1,7 +1,5 @@ +-name: multiqc + channels: + - conda-forge + - bioconda +- - defaults + dependencies: +- - bioconda::multiqc=1.21 ++ - bioconda::multiqc=1.25.1 + +--- modules/nf-core/multiqc/tests/main.nf.test.snap ++++ modules/nf-core/multiqc/tests/main.nf.test.snap +@@ -2,14 +2,14 @@ + "multiqc_versions_single": { + "content": [ + [ +- "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" ++ "versions.yml:md5,41f391dcedce7f93ca188f3a3ffa0916" + ] + ], + "meta": { +- "nf-test": "0.8.4", +- "nextflow": "23.10.1" ++ "nf-test": "0.9.0", ++ "nextflow": "24.04.4" + }, +- "timestamp": "2024-02-29T08:48:55.657331" ++ "timestamp": "2024-10-02T17:51:46.317523" + }, + "multiqc_stub": { + "content": [ +@@ -17,25 +17,25 @@ + "multiqc_report.html", + "multiqc_data", + "multiqc_plots", +- "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" ++ "versions.yml:md5,41f391dcedce7f93ca188f3a3ffa0916" + ] + ], + "meta": { +- "nf-test": "0.8.4", +- "nextflow": "23.10.1" ++ "nf-test": "0.9.0", ++ "nextflow": "24.04.4" + }, +- "timestamp": "2024-02-29T08:49:49.071937" ++ "timestamp": "2024-10-02T17:52:20.680978" + }, + "multiqc_versions_config": { + "content": [ + [ +- "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" ++ 
"versions.yml:md5,41f391dcedce7f93ca188f3a3ffa0916" + ] + ], + "meta": { +- "nf-test": "0.8.4", +- "nextflow": "23.10.1" ++ "nf-test": "0.9.0", ++ "nextflow": "24.04.4" + }, +- "timestamp": "2024-02-29T08:49:25.457567" ++ "timestamp": "2024-10-02T17:52:09.185842" + } + } +--- modules/nf-core/multiqc/tests/main.nf.test ++++ modules/nf-core/multiqc/tests/main.nf.test +@@ -8,6 +8,8 @@ + tag "modules_nfcore" + tag "multiqc" + ++ config "./nextflow.config" ++ + test("sarscov2 single-end [fastqc]") { + + when { +@@ -17,6 +19,8 @@ + input[1] = [] + input[2] = [] + input[3] = [] ++ input[4] = [] ++ input[5] = [] + """ + } + } +@@ -41,6 +45,8 @@ + input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) + input[2] = [] + input[3] = [] ++ input[4] = [] ++ input[5] = [] + """ + } + } +@@ -66,6 +72,8 @@ + input[1] = [] + input[2] = [] + input[3] = [] ++ input[4] = [] ++ input[5] = [] + """ + } + } + +--- /dev/null ++++ modules/nf-core/multiqc/tests/nextflow.config +@@ -0,0 +1,5 @@ ++process { ++ withName: 'MULTIQC' { ++ ext.prefix = null ++ } ++} ************************************************************ diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test index f1c4242ef..33316a7dd 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -8,6 +8,8 @@ nextflow_process { tag "modules_nfcore" tag "multiqc" + config "./nextflow.config" + test("sarscov2 single-end [fastqc]") { when { @@ -17,6 +19,8 @@ nextflow_process { input[1] = [] input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } @@ -41,6 +45,8 @@ nextflow_process { input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } @@ -66,6 +72,8 @@ nextflow_process { input[1] = [] 
input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap index bfebd8029..f5af2416c 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -2,14 +2,14 @@ "multiqc_versions_single": { "content": [ [ - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" + "versions.yml:md5,737bb2c7cad54ffc2ec020791dc48b8f" ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.3", + "nextflow": "24.10.4" }, - "timestamp": "2024-02-29T08:48:55.657331" + "timestamp": "2025-10-27T13:33:24.356715" }, "multiqc_stub": { "content": [ @@ -17,25 +17,25 @@ "multiqc_report.html", "multiqc_data", "multiqc_plots", - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" + "versions.yml:md5,737bb2c7cad54ffc2ec020791dc48b8f" ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.3", + "nextflow": "24.10.4" }, - "timestamp": "2024-02-29T08:49:49.071937" + "timestamp": "2025-10-27T13:34:11.103619" }, "multiqc_versions_config": { "content": [ [ - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" + "versions.yml:md5,737bb2c7cad54ffc2ec020791dc48b8f" ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.3", + "nextflow": "24.10.4" }, - "timestamp": "2024-02-29T08:49:25.457567" + "timestamp": "2025-10-27T13:34:04.615233" } -} \ No newline at end of file +} diff --git a/modules/nf-core/multiqc/tests/nextflow.config b/modules/nf-core/multiqc/tests/nextflow.config new file mode 100644 index 000000000..c537a6a3e --- /dev/null +++ b/modules/nf-core/multiqc/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: 'MULTIQC' { + ext.prefix = null + } +} diff --git a/modules/nf-core/multiqc/tests/tags.yml b/modules/nf-core/multiqc/tests/tags.yml deleted file mode 100644 index bea6c0d37..000000000 --- a/modules/nf-core/multiqc/tests/tags.yml +++ /dev/null @@ 
-1,2 +0,0 @@ -multiqc: - - modules/nf-core/multiqc/** diff --git a/modules/nf-core/untar/environment.yml b/modules/nf-core/untar/environment.yml index 0c9cbb101..9b926b1ff 100644 --- a/modules/nf-core/untar/environment.yml +++ b/modules/nf-core/untar/environment.yml @@ -1,11 +1,12 @@ -name: untar - +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - conda-forge - bioconda - - defaults - dependencies: + - conda-forge::coreutils=9.5 - conda-forge::grep=3.11 - - conda-forge::sed=4.7 + - conda-forge::gzip=1.13 + - conda-forge::lbzip2=2.5 + - conda-forge::sed=4.8 - conda-forge::tar=1.34 diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf index 8a75bb957..4f1913e7b 100644 --- a/modules/nf-core/untar/main.nf +++ b/modules/nf-core/untar/main.nf @@ -1,46 +1,47 @@ process UNTAR { - tag "$archive" + tag "${archive}" label 'process_single' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'nf-core/ubuntu:20.04' }" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/d5/d5d18ee243d97f4627bf9a5211058b8beeabd215273bf7f772d6422ba91c4844/data' + : 'community.wave.seqera.io/library/coreutils_grep_gzip_lbzip2_pruned:49568e208231bddc'}" input: tuple val(meta), path(archive) output: - tuple val(meta), path("$prefix"), emit: untar - path "versions.yml" , emit: versions + tuple val(meta), path("${prefix}"), emit: untar + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' + def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' - prefix = task.ext.prefix ?: ( meta.id ? 
"${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + def tar_opts = archive.toString().endsWith('tar.gz')? '-xzvf' : '-xvf' """ - mkdir $prefix + mkdir ${prefix} ## Ensures --strip-components only applied when top level of tar contents is a directory ## If just files or multiple directories, place all in prefix if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then tar \\ - -C $prefix --strip-components 1 \\ - -xavf \\ - $args \\ - $archive \\ - $args2 + -C ${prefix} --strip-components 1 \\ + $tar_opts \\ + ${args} \\ + ${archive} \\ + ${args2} else tar \\ - -C $prefix \\ - -xavf \\ - $args \\ - $archive \\ - $args2 + -C ${prefix} \\ + $tar_opts \\ + ${args} \\ + ${archive} \\ + ${args2} fi cat <<-END_VERSIONS > versions.yml @@ -50,11 +51,11 @@ process UNTAR { """ stub: - prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) + prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) """ mkdir $prefix touch ${prefix}/file.txt - + cat <<-END_VERSIONS > versions.yml "${task.process}": untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml index a9a2110f5..1b6bf491e 100644 --- a/modules/nf-core/untar/meta.yml +++ b/modules/nf-core/untar/meta.yml @@ -10,30 +10,41 @@ tools: Extract tar.gz files. documentation: https://www.gnu.org/software/tar/manual/ licence: ["GPL-3.0-or-later"] + identifier: "" input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - archive: - type: file - description: File to be untar - pattern: "*.{tar}.{gz}" + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - archive: + type: file + description: File to be untar + pattern: "*.{tar}.{gz}" + ontologies: + - edam: http://edamontology.org/format_3981 # TAR format + - edam: http://edamontology.org/format_3989 # GZIP format output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - untar: - type: directory - description: Directory containing contents of archive - pattern: "*/" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + untar: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + pattern: "*/" + - ${prefix}: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + pattern: "*/" + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML authors: - "@joseespinosa" - "@drpatelh" diff --git a/modules/nf-core/untar/tests/main.nf.test b/modules/nf-core/untar/tests/main.nf.test index 2a7c97bf8..c957517aa 100644 --- a/modules/nf-core/untar/tests/main.nf.test +++ b/modules/nf-core/untar/tests/main.nf.test @@ -6,6 +6,7 @@ nextflow_process { tag "modules" tag "modules_nfcore" tag "untar" + test("test_untar") { when { @@ -19,10 +20,9 @@ nextflow_process { then { assertAll ( { assert process.success }, - { assert snapshot(process.out.untar).match("test_untar") }, + { assert snapshot(process.out).match() }, ) } - } test("test_untar_onlyfiles") { @@ -38,10 +38,48 @@ nextflow_process { then { assertAll ( { assert process.success }, - { assert snapshot(process.out.untar).match("test_untar_onlyfiles") }, + { assert snapshot(process.out).match() }, ) } + } + + test("test_untar - stub") { + + options "-stub" + when { + process { + """ + input[0] = [ [], 
file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } } + test("test_untar_onlyfiles - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + } } diff --git a/modules/nf-core/untar/tests/main.nf.test.snap b/modules/nf-core/untar/tests/main.nf.test.snap index 64550292f..ceb91b792 100644 --- a/modules/nf-core/untar/tests/main.nf.test.snap +++ b/modules/nf-core/untar/tests/main.nf.test.snap @@ -1,42 +1,158 @@ { "test_untar_onlyfiles": { "content": [ - [ - [ + { + "0": [ [ - - ], + [ + + ], + [ + "hello.txt:md5,e59ff97941044f85df5297e1c302d260" + ] + ] + ], + "1": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" + ], + "untar": [ + [ + [ + + ], + [ + "hello.txt:md5,e59ff97941044f85df5297e1c302d260" + ] + ] + ], + "versions": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-10T12:04:28.231047" + }, + "test_untar_onlyfiles - stub": { + "content": [ + { + "0": [ + [ + [ + + ], + [ + "hello.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" + ], + "untar": [ [ - "hello.txt:md5,e59ff97941044f85df5297e1c302d260" + [ + + ], + [ + "hello.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] ] + ], + "versions": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" ] - ] + } ], "meta": { "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nextflow": "24.04.3" }, - "timestamp": "2024-02-28T11:49:41.320643" + "timestamp": "2024-07-10T12:04:45.773103" + }, + "test_untar - stub": { + "content": [ + { + "0": [ 
+ [ + [ + + ], + [ + "hash.k2d:md5,d41d8cd98f00b204e9800998ecf8427e", + "opts.k2d:md5,d41d8cd98f00b204e9800998ecf8427e", + "taxo.k2d:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" + ], + "untar": [ + [ + [ + + ], + [ + "hash.k2d:md5,d41d8cd98f00b204e9800998ecf8427e", + "opts.k2d:md5,d41d8cd98f00b204e9800998ecf8427e", + "taxo.k2d:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-10T12:04:36.777441" }, "test_untar": { "content": [ - [ - [ + { + "0": [ [ - - ], + [ + + ], + [ + "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9", + "opts.k2d:md5,a033d00cf6759407010b21700938f543", + "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c" + ] + ] + ], + "1": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" + ], + "untar": [ [ - "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9", - "opts.k2d:md5,a033d00cf6759407010b21700938f543", - "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c" + [ + + ], + [ + "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9", + "opts.k2d:md5,a033d00cf6759407010b21700938f543", + "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c" + ] ] + ], + "versions": [ + "versions.yml:md5,6063247258c56fd271d076bb04dd7536" ] - ] + } ], "meta": { "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nextflow": "24.04.3" }, - "timestamp": "2024-02-28T11:49:33.795172" + "timestamp": "2024-07-10T12:04:19.377674" } } \ No newline at end of file diff --git a/modules/nf-core/untar/tests/tags.yml b/modules/nf-core/untar/tests/tags.yml deleted file mode 100644 index feb6f15c0..000000000 --- a/modules/nf-core/untar/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -untar: - - modules/nf-core/untar/** diff --git a/modules/nf-core/untar/untar.diff b/modules/nf-core/untar/untar.diff index 0fac85e4f..30827625d 100644 --- a/modules/nf-core/untar/untar.diff +++ 
b/modules/nf-core/untar/untar.diff @@ -1,42 +1,80 @@ -Changes in module 'nf-core/untar' +Changes in component 'nf-core/untar' +'modules/nf-core/untar/environment.yml' is unchanged +'modules/nf-core/untar/meta.yml' is unchanged +Changes in 'untar/main.nf': --- modules/nf-core/untar/main.nf +++ modules/nf-core/untar/main.nf -@@ -20,8 +20,8 @@ - script: - def args = task.ext.args ?: '' +@@ -4,8 +4,8 @@ + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container +- ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/52/52ccce28d2ab928ab862e25aae26314d69c8e38bd41ca9431c67ef05221348aa/data' +- : 'community.wave.seqera.io/library/coreutils_grep_gzip_lbzip2_pruned:838ba80435a629f8'}" ++ ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/d5/d5d18ee243d97f4627bf9a5211058b8beeabd215273bf7f772d6422ba91c4844/data' ++ : 'community.wave.seqera.io/library/coreutils_grep_gzip_lbzip2_pruned:49568e208231bddc'}" + + input: + tuple val(meta), path(archive) +@@ -21,6 +21,7 @@ + def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' -- untar = archive.toString() - '.tar.gz' -- + prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + def tar_opts = archive.toString().endsWith('tar.gz')? '-xzvf' : '-xvf' -+ untar = archive.toString().endsWith('tar.gz')? 
archive.toString() - '.tar.gz' : archive.toString() - '.tar' - """ - mkdir output -@@ -30,14 +30,14 @@ - if [[ \$(tar -tzf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + """ + mkdir ${prefix} +@@ -30,14 +31,14 @@ + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then tar \\ - -C output --strip-components 1 \\ -- -xzvf \\ + -C ${prefix} --strip-components 1 \\ +- -xavf \\ + $tar_opts \\ - $args \\ - $archive \\ - $args2 + ${args} \\ + ${archive} \\ + ${args2} else tar \\ - -C output \\ -- -xzvf \\ + -C ${prefix} \\ +- -xavf \\ + $tar_opts \\ - $args \\ - $archive \\ - $args2 -@@ -52,7 +52,7 @@ - """ - + ${args} \\ + ${archive} \\ + ${args2} +@@ -52,30 +53,9 @@ stub: -- untar = archive.toString() - '.tar.gz' -+ untar = archive.toString().endsWith('tar.gz')? archive.toString() - '.tar.gz' : archive.toString() - '.tar' + prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) """ - touch $untar - +- mkdir ${prefix} +- ## Dry-run untaring the archive to get the files and place all in prefix +- if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then +- for i in `tar -tf ${archive}`; +- do +- if [[ \$(echo "\${i}" | grep -E "/\$") == "" ]]; +- then +- touch \${i} +- else +- mkdir -p \${i} +- fi +- done +- else +- for i in `tar -tf ${archive}`; +- do +- if [[ \$(echo "\${i}" | grep -E "/\$") == "" ]]; +- then +- touch ${prefix}/\${i} +- else +- mkdir -p ${prefix}/\${i} +- fi +- done +- fi +- ++ mkdir $prefix ++ touch ${prefix}/file.txt ++ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') -************************************************************ \ No newline at end of file +'modules/nf-core/untar/tests/main.nf.test.snap' is unchanged +'modules/nf-core/untar/tests/main.nf.test' is unchanged 
+************************************************************ diff --git a/modules/nf-core/unzip/environment.yml b/modules/nf-core/unzip/environment.yml new file mode 100644 index 000000000..246158953 --- /dev/null +++ b/modules/nf-core/unzip/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::p7zip=16.02 diff --git a/modules/nf-core/unzip/main.nf b/modules/nf-core/unzip/main.nf new file mode 100644 index 000000000..b977ff6d9 --- /dev/null +++ b/modules/nf-core/unzip/main.nf @@ -0,0 +1,48 @@ +process UNZIP { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/p7zip:16.02' : + 'biocontainers/p7zip:16.02' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("${prefix}/"), emit: unzipped_archive + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + if ( archive instanceof List && archive.name.size > 1 ) { error "[UNZIP] error: 7za only accepts a single archive as input. Please check module input." } + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.baseName) + """ + 7za \\ + x \\ + -o"${prefix}"/ \\ + $args \\ + $archive + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + 7za: \$(echo \$(7za --help) | sed 's/.*p7zip Version //; s/(.*//') + END_VERSIONS + """ + + stub: + if ( archive instanceof List && archive.name.size > 1 ) { error "[UNZIP] error: 7za only accepts a single archive as input. Please check module input." } + prefix = task.ext.prefix ?: ( meta.id ? 
"${meta.id}" : archive.baseName) + """ + mkdir "${prefix}" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + 7za: \$(echo \$(7za --help) | sed 's/.*p7zip Version //; s/(.*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/unzip/meta.yml b/modules/nf-core/unzip/meta.yml new file mode 100644 index 000000000..ba1eb9129 --- /dev/null +++ b/modules/nf-core/unzip/meta.yml @@ -0,0 +1,50 @@ +name: unzip +description: Unzip ZIP archive files +keywords: + - unzip + - decompression + - zip + - archiving +tools: + - unzip: + description: p7zip is a quick port of 7z.exe and 7za.exe (command line version + of 7zip, see www.7-zip.org) for Unix. + homepage: https://sourceforge.net/projects/p7zip/ + documentation: https://sourceforge.net/projects/p7zip/ + tool_dev_url: https://sourceforge.net/projects/p7zip" + licence: ["LGPL-2.1-or-later"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: ZIP file + pattern: "*.zip" + ontologies: + - edam: http://edamontology.org/format_3987 # ZIP format +output: + unzipped_archive: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - ${prefix}/: + type: directory + description: Directory contents of the unzipped archive + pattern: "${archive.baseName}/" + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/unzip/tests/main.nf.test b/modules/nf-core/unzip/tests/main.nf.test new file mode 100644 index 000000000..238b68d8b --- /dev/null +++ b/modules/nf-core/unzip/tests/main.nf.test @@ -0,0 +1,54 @@ +nextflow_process { + + name "Test Process UNZIP" + script "../main.nf" + process "UNZIP" + + tag "modules" + tag "modules_nfcore" + tag "unzip" + + test("generic [tar] [tar_gz]") { + + when { + process { + """ + input[0] = [ + [ id: 'hello' ], + file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("generic [tar] [tar_gz] stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id: 'hello' ], + file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/unzip/tests/main.nf.test.snap b/modules/nf-core/unzip/tests/main.nf.test.snap new file mode 100644 index 000000000..cdd2ab164 --- /dev/null +++ b/modules/nf-core/unzip/tests/main.nf.test.snap @@ -0,0 +1,76 @@ +{ + "generic [tar] [tar_gz] stub": { + "content": [ + { + "0": [ + [ + { + "id": "hello" + }, + [ + + ] + ] + ], + "1": [ + "versions.yml:md5,52c55ce814e8bc9edc5a6c625ed794b8" + ], + "unzipped_archive": [ + [ + { + "id": "hello" + }, + [ + + ] + ] + ], + "versions": [ + "versions.yml:md5,52c55ce814e8bc9edc5a6c625ed794b8" + ] + } 
+ ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-30T19:16:37.11550986" + }, + "generic [tar] [tar_gz]": { + "content": [ + { + "0": [ + [ + { + "id": "hello" + }, + [ + "hello.tar:md5,80c66db79a773bc87b3346035ff9593e" + ] + ] + ], + "1": [ + "versions.yml:md5,52c55ce814e8bc9edc5a6c625ed794b8" + ], + "unzipped_archive": [ + [ + { + "id": "hello" + }, + [ + "hello.tar:md5,80c66db79a773bc87b3346035ff9593e" + ] + ] + ], + "versions": [ + "versions.yml:md5,52c55ce814e8bc9edc5a6c625ed794b8" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-30T19:16:25.120242571" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 7a0c5c4ec..2a0e1e40d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,76 +11,213 @@ params { // Input options input = null - mode = 'alphafold2' // {alphafold2, colabfold, esmfold} + mode = 'alphafold2' // {alphafold2, colabfold, esmfold, rosettafold_all_atom, alphafold3, helixfold3, boltz, rosettafold2na} use_gpu = false + save_intermediates = false + split_fasta = false + db = null + full_dbs = false // true/false, globally sets full_dbs if not independently set + use_msa_server = false + msa_server_url = null + uniref30_prefix = null // Alphafold2 parameters - alphafold2_mode = "standard" - max_template_date = "2020-05-14" - full_dbs = false // true full_dbs, false reduced_dbs - alphafold2_model_preset = "monomer" // for AF2 {monomer (default), monomer_casp14, monomer_ptm, multimer} - alphafold2_db = null + alphafold2_mode = 'split_msa_prediction' // {standard, split_msa_prediction} + alphafold2_max_template_date = '2038-01-19' + alphafold2_full_dbs = null // true full_dbs, false reduced_dbs + alphafold2_model_preset = 'monomer_ptm' // for AF2 {monomer, monomer_casp14, monomer_ptm, multimer} + alphafold2_db = null + alphafold2_random_seed = null // Alphafold2 links - bfd_link = null - small_bfd_link = null - 
alphafold2_params_link = null - mgnify_link = null - pdb70_link = null - pdb_mmcif_link = null - pdb_obsolete_link = null - uniref30_alphafold2_link = null - uniref90_link = null - pdb_seqres_link = null - uniprot_sprot_link = null - uniprot_trembl_link = null + alphafold2_bfd_link = null + alphafold2_small_bfd_link = null + alphafold2_params_link = null + alphafold2_mgnify_link = null + alphafold2_pdb70_link = null + alphafold2_pdb_mmcif_link = null + alphafold2_pdb_obsolete_link = null + alphafold2_uniref30_link = null + alphafold2_uniref90_link = null + alphafold2_pdb_seqres_link = null + alphafold2_uniprot_sprot_link = null + alphafold2_uniprot_trembl_link = null // Alphafold2 paths - bfd_path = null - small_bfd_path = null - alphafold2_params_path = null - mgnify_path = null - pdb70_path = null - pdb_mmcif_path = null - uniref30_alphafold2_path = null - uniref90_path = null - pdb_seqres_path = null - uniprot_path = null + alphafold2_bfd_path = null + alphafold2_small_bfd_path = null + alphafold2_params_path = null + alphafold2_mgnify_path = null + alphafold2_pdb70_path = null + alphafold2_pdb_mmcif_path = null + alphafold2_pdb_obsolete_path = null + alphafold2_uniref30_path = null + alphafold2_uniref90_path = null + alphafold2_pdb_seqres_path = null + alphafold2_uniprot_path = null + + // Alphafold3 parameters + alphafold3_db = null + + // Alphafold3 links + alphafold3_small_bfd_link = null + alphafold3_mgnify_link = null + alphafold3_pdb_mmcif_link = null + alphafold3_uniref90_link = null + alphafold3_pdb_seqres_link = null + alphafold3_uniprot_link = null + alphafold3_rnacentral_link = null + alphafold3_nt_rna_link = null + alphafold3_rfam_link = null + + // Alphafold3 paths + alphafold3_small_bfd_path = null + alphafold3_params_path = null + alphafold3_mgnify_path = null + alphafold3_pdb_mmcif_path = null + alphafold3_uniref90_path = null + alphafold3_pdb_seqres_path = null + alphafold3_uniprot_path = null + alphafold3_rnacentral_path = null + 
alphafold3_nt_rna_path = null + alphafold3_rfam_path = null + + // Boltz parameters + boltz_model = null + boltz_use_potentials = false + boltz_use_kernels = true + + // Boltz links + boltz_ccd_link = null + boltz_model_link = null + boltz2_aff_link = null + boltz2_conf_link = null + boltz2_mols_link = null + + // Boltz paths + boltz_db = null + boltz_ccd_path = null + boltz_model_path = null + boltz2_aff_path = null + boltz2_conf_path = null + boltz2_mols_path = null // Colabfold parameters - colabfold_server = "webserver" - colabfold_model_preset = "alphafold2_ptm" // {'auto', 'alphafold2', 'alphafold2_ptm', 'alphafold2_multimer_v1', 'alphafold2_multimer_v2', 'alphafold2_multimer_v3'} - num_recycles_colabfold = 3 - use_amber = true + colabfold_model_preset = "alphafold2_ptm" // {'alphafold2_ptm', 'alphafold2_multimer_v1', 'alphafold2_multimer_v2', 'alphafold2_multimer_v3'} + colabfold_num_recycles = 3 + colabfold_use_amber = true + colabfold_use_gpu_relax = false colabfold_db = null - db_load_mode = 0 - host_url = null - use_templates = true - create_colabfold_index = false + colabfold_db_load_mode = 0 + colabfold_use_templates = false + colabfold_create_index = false // Colabfold links - colabfold_db_link = null - uniref30_colabfold_link = null + colabfold_db_link = null + colabfold_uniref30_link = null // Colabfold paths - colabfold_db_path = null - uniref30_colabfold_path = null + colabfold_envdb_path = null + colabfold_uniref30_path = null // Esmfold parameters - esmfold_db = null - esmfold_model_preset = "monomer" - num_recycles_esmfold = 4 + esmfold_db = null + esmfold_model_preset = "monomer" + esmfold_num_recycles = 4 // Esmfold links - esmfold_3B_v1 = null - esm2_t36_3B_UR50D = null - esm2_t36_3B_UR50D_contact_regression = null + esmfold_3B_v1 = null + esm2_t36_3B_UR50D = null + esm2_t36_3B_UR50D_contact_regression = null // Esmfold paths - esmfold_params_path = null + esmfold_params_path = null + + // RoseTTAFold_All_Atom parameters + 
rosettafold_all_atom_db = null + + // RoseTTAFold_All_Atom links + rosettafold_all_atom_uniref30_link = null + rosettafold_all_atom_pdb100_link = null + rosettafold_all_atom_bfd_link = null + rosettafold_all_atom_paper_weights_link = null + + // RoseTTAFold_All_Atom paths + rosettafold_all_atom_uniref30_path = null + rosettafold_all_atom_pdb100_path = null + rosettafold_all_atom_bfd_path = null + rosettafold_all_atom_paper_weights_path = null + + // Helixfold3 parameters + helixfold3_db = null + // helixfold3_full_dbs = null // true full_dbs, false reduced_dbs + helixfold3_precision = "bf16" + helixfold3_infer_times = 4 + helixfold3_max_template_date = "2038-01-19" + + // Helixfold3 links + helixfold3_uniclust30_link = null + helixfold3_ccd_preprocessed_link = null + helixfold3_rfam_link = null + helixfold3_init_models_link = null + helixfold3_bfd_link = null + helixfold3_small_bfd_link = null + helixfold3_uniprot_sprot_link = null + helixfold3_uniprot_trembl_link = null + helixfold3_pdb_seqres_link = null + helixfold3_uniref90_link = null + helixfold3_mgnify_link = null + helixfold3_pdb_mmcif_link = null + helixfold3_obsolete_link = null + helixfold3_maxit_src_link = null + + // Helixfold3 paths + helixfold3_uniclust30_path = null + helixfold3_ccd_preprocessed_path = null + helixfold3_rfam_path = null + helixfold3_init_models_path = null + helixfold3_bfd_path = null + helixfold3_small_bfd_path = null + helixfold3_uniprot_path = null + helixfold3_pdb_seqres_path = null + helixfold3_uniref90_path = null + helixfold3_mgnify_path = null + helixfold3_pdb_mmcif_path = null + helixfold3_obsolete_path = null + helixfold3_maxit_src_path = null + + // RosettaFold2NA parameters + rosettafold2na_db = null + + // RosettaFold2NA links + rosettafold2na_uniref30_link = null + rosettafold2na_bfd_link = null + rosettafold2na_pdb100_link = null + rosettafold2na_weights_link = null + rfam_full_region_link = null + rfam_cm_link = null + rnacentral_rfam_annotations_link = null + 
rnacentral_id_mapping_link = null + rnacentral_sequences_link = null + + // RosettaFold2NA paths + rosettafold2na_uniref30_path = null + rosettafold2na_bfd_path = null + rosettafold2na_pdb100_path = null + rosettafold2na_weights_path = null + rosettafold2na_rna_path = null + + // Foldseek params + skip_foldseek = true + foldseek_easysearch_arg = null + + // Foldseek databases paths + foldseek_db = null + foldseek_db_path = null // Process skipping options - skip_multiqc = false + skip_multiqc = false + skip_visualisation = false // MultiQC options multiqc_config = null @@ -96,50 +233,31 @@ params { email_on_fail = null plaintext_email = false monochrome_logs = false - hook_url = null + hook_url = System.getenv('HOOK_URL') help = false + help_full = false + show_hidden = false version = false pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/' + trace_report_suffix = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') // Config options config_profile_name = null config_profile_description = null + custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" config_profile_contact = null config_profile_url = null - // Max resource options - // Defaults only, expecting to be overwritten - max_memory = '128.GB' - max_cpus = 16 - max_time = '240.h' - // Schema validation default options - validationFailUnrecognisedParams = false - validationLenientMode = false - validationSchemaIgnoreParams = '' - validationShowHiddenParams = false - validate_params = true + validate_params = true } // Load base.config by default for all pipelines includeConfig 'conf/base.config' -// Load nf-core custom profiles from different Institutions -try { - includeConfig "${params.custom_config_base}/nfcore_custom.config" -} catch (Exception e) { - System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") -} - -// Load 
nf-core/proteinfold custom profiles from different institutions. -try { - includeConfig "${params.custom_config_base}/pipeline/proteinfold.config" -} catch (Exception e) { - System.err.println("WARNING: Could not load nf-core/config/proteinfold profiles: ${params.custom_config_base}/pipeline/proteinfold.config") -} profiles { debug { dumpHashes = true @@ -154,7 +272,7 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false - conda.channels = ['conda-forge', 'bioconda', 'defaults'] + conda.channels = ['conda-forge', 'bioconda'] apptainer.enabled = false } mamba { @@ -169,12 +287,7 @@ profiles { } docker { docker.enabled = true - docker.userEmulation = true - if (params.use_gpu) { - docker.runOptions = '--gpus all' - } else { - docker.runOptions = '-u $(id -u):$(id -g)' - } + docker.runOptions = params.use_gpu ? '--gpus all' : '-u $(id -u):$(id -g)' conda.enabled = false singularity.enabled = false podman.enabled = false @@ -182,23 +295,33 @@ profiles { charliecloud.enabled = false apptainer.enabled = false } - arm { - if (params.use_gpu) { - docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64 --gpus all' - } else { - docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' - } + arm64 { + process.arch = 'arm64' + // TODO https://github.com/nf-core/modules/issues/6694 + // For now if you're using arm64 you have to use wave for the sake of the maintainers + // wave profile + apptainer.ociAutoPull = true + singularity.ociAutoPull = true + wave.enabled = true + wave.freeze = true + wave.strategy = 'conda,container' + } + emulate_amd64 { + docker.runOptions = params.use_gpu ? 
+ '-u $(id -u):$(id -g) --platform=linux/amd64 --gpus all' : + '-u $(id -u):$(id -g) --platform=linux/amd64' } singularity { - singularity.enabled = true - singularity.autoMounts = true - if (params.use_gpu) { singularity.runOptions = '--nv' } - conda.enabled = false - docker.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + singularity.enabled = true + singularity.autoMounts = true + singularity.pullTimeout = '40m' + singularity.runOptions = params.use_gpu ? '--nv' : '' + conda.enabled = false + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } podman { podman.enabled = true @@ -230,6 +353,8 @@ profiles { apptainer { apptainer.enabled = true apptainer.autoMounts = true + apptainer.pullTimeout = '40m' + apptainer.runOptions = params.use_gpu ? '--nv' : '' conda.enabled = false docker.enabled = false singularity.enabled = false @@ -244,41 +369,58 @@ profiles { wave.freeze = true wave.strategy = 'conda,container' } - gitpod { - executor.name = 'local' - executor.cpus = 4 - executor.memory = 8.GB + gpu { + docker.runOptions = '-u $(id -u):$(id -g) --gpus all' + apptainer.runOptions = '--nv' + singularity.runOptions = '--nv' } - test { includeConfig 'conf/test.config' } - test_alphafold2_split { includeConfig 'conf/test_alphafold_split.config' } - test_alphafold2_download { includeConfig 'conf/test_alphafold_download.config' } - test_colabfold_local { includeConfig 'conf/test_colabfold_local.config' } - test_colabfold_webserver { includeConfig 'conf/test_colabfold_webserver.config' } - test_colabfold_download { includeConfig 'conf/test_colabfold_download.config' } - test_esmfold { includeConfig 'conf/test_esmfold.config' } - test_full { includeConfig 'conf/test_full.config' } - test_full_alphafold2_standard { includeConfig 'conf/test_full.config' } - test_full_alphafold2_split { includeConfig 
'conf/test_full_alphafold_split.config' } - test_full_alphafold2_multimer { includeConfig 'conf/test_full_alphafold_multimer.config' } - test_full_colabfold_local { includeConfig 'conf/test_full_colabfold_local.config' } - test_full_colabfold_webserver { includeConfig 'conf/test_full_colabfold_webserver.config' } - test_full_colabfold_multimer { includeConfig 'conf/test_full_colabfold_webserver_multimer.config' } - test_full_esmfold { includeConfig 'conf/test_full_esmfold.config' } - test_full_esmfold_multimer { includeConfig 'conf/test_full_esmfold_multimer.config' } + test { includeConfig 'conf/test.config' } + test_alphafold2_split { includeConfig 'conf/test_alphafold_split.config' } + test_alphafold2_download { includeConfig 'conf/test_alphafold_download.config' } + test_alphafold3_standard { includeConfig 'conf/test_alphafold3_standard.config' } + test_alphafold3_download { includeConfig 'conf/test_alphafold3_download.config' } + test_colabfold_local { includeConfig 'conf/test_colabfold_local.config' } + test_colabfold_webserver { includeConfig 'conf/test_colabfold_webserver.config' } + test_colabfold_download { includeConfig 'conf/test_colabfold_download.config' } + test_esmfold { includeConfig 'conf/test_esmfold.config' } + test_split_fasta { includeConfig 'conf/test_split_fasta.config' } + test_full { includeConfig 'conf/test_full.config' } + test_full_alphafold2_standard { includeConfig 'conf/test_full.config' } + test_full_alphafold2_split { includeConfig 'conf/test_full_alphafold_split.config' } + test_full_alphafold2_multimer { includeConfig 'conf/test_full_alphafold_multimer.config' } + test_full_colabfold_local { includeConfig 'conf/test_full_colabfold_local.config' } + test_full_colabfold_webserver { includeConfig 'conf/test_full_colabfold_webserver.config' } + test_full_colabfold_multimer { includeConfig 'conf/test_full_colabfold_webserver_multimer.config' } + test_full_esmfold { includeConfig 'conf/test_full_esmfold.config' } + 
test_full_esmfold_multimer { includeConfig 'conf/test_full_esmfold_multimer.config' } + test_full_helixfold3 { includeConfig 'conf/test_full_helixfold3.config' } + test_full_boltz { includeConfig 'conf/test_full_boltz.config' } + test_full_rosettafold_all_atom { includeConfig 'conf/test_full_rosettafold_all_atom.config' } + test_full_rosettafold2na { includeConfig 'conf/test_full_rosettafold2na.config' } + test_rosettafold_all_atom { includeConfig 'conf/test_rosettafold_all_atom.config' } + test_helixfold3 { includeConfig 'conf/test_helixfold3.config' } + test_rosettafold2na { includeConfig 'conf/test_rosettafold2na.config' } + test_full_boltz { includeConfig 'conf/test_full_boltz.config' } + test_boltz { includeConfig 'conf/test_boltz.config' } } -// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile -// Will not be used unless Apptainer / Docker / Podman / Singularity are enabled -// Set to your registry if you have a mirror of containers -apptainer.registry = 'quay.io' -docker.registry = 'quay.io' -podman.registry = 'quay.io' -singularity.registry = 'quay.io' +// Load nf-core custom profiles from different institutions -// Nextflow plugins -plugins { - id 'nf-validation@1.1.3' // Validation of pipeline parameters and creation of an input channel from a sample sheet -} + +// If params.custom_config_base is set AND either the NXF_OFFLINE environment variable is not set or params.custom_config_base is a local path, the nfcore_custom.config file from the specified base path is included. +// This include pulls in the shared nf-core institutional profiles (nfcore_custom.config), not the pipeline-specific ones. +includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null" + +// Load nf-core/proteinfold custom profiles from different institutions.
+includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? "${params.custom_config_base}/pipeline/proteinfold.config" : "/dev/null" + +// Set default registry for Apptainer, Docker, Podman, Charliecloud and Singularity independent of -profile +// Will not be used unless Apptainer / Docker / Podman / Charliecloud / Singularity are enabled +// Set to your registry if you have a mirror of containers +apptainer.registry = 'quay.io' +docker.registry = 'quay.io' +podman.registry = 'quay.io' +singularity.registry = 'quay.io' +charliecloud.registry = 'quay.io' // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. @@ -291,88 +433,238 @@ env { JULIA_DEPOT_PATH = "/usr/local/share/julia" } -// Capture exit codes from upstream processes when piping -process.shell = ['/bin/bash', '-euo', 'pipefail'] +// Set bash options +process.shell = [ + "bash", + "-C", // No clobber - prevent output redirection from overwriting files. + "-e", // Exit if a tool returns a non-zero status/exit code + "-u", // Treat unset variables and parameters as an error + "-o", // Returns the status of the last command to exit.. + "pipefail" // ..with a non-zero status or zero if all successfully execute +] // Disable process selector warnings by default. Use debug profile to enable warnings. 
nextflow.enable.configProcessNamesValidation = false -def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.outdir}/pipeline_info/execution_timeline_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_timeline_${params.trace_report_suffix}.html" } report { enabled = true - file = "${params.outdir}/pipeline_info/execution_report_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_report_${params.trace_report_suffix}.html" } trace { enabled = true - file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" + file = "${params.outdir}/pipeline_info/execution_trace_${params.trace_report_suffix}.txt" } dag { enabled = true - file = "${params.outdir}/pipeline_info/pipeline_dag_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/pipeline_dag_${params.trace_report_suffix}.html" } manifest { name = 'nf-core/proteinfold' - author = """Athanasios Baltzis, Jose Espinosa-Carrasco, Harshil Patel""" + contributors = [ + [ + name: 'Athanasios Baltzis', + affiliation: 'Centre for Genomic Regulation, Spain', + github: 'athbaltzis', + contribution: ['author', 'maintainer'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: '0000-0002-7495-1218' + ], + [ + name: ' Jose Espinosa-Carrasco', + affiliation: 'Centre for Genomic Regulation, Spain', + github: 'joseespinosa', + contribution: ['author', 'maintainer'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: '0000-0002-1541-042X' + ], + [ + name: 'Luisa Santus', + affiliation: 'Centre for Genomic Regulation, Spain', + github: 'luisas', + contribution: ['author', 'contributor'], + orcid: '0000-0002-5992-0771' + ], + [ + name: 'Leila Mansouri', + affiliation: 'Centre for Genomic Regulation, Spain', + github: 'l-mansouri', + contribution: ['author', 'contributor'], + orcid: '0000-0001-8442-9709' + ], + [ + name: 'Harshil Patel', + 
affiliation: 'Seqera', + github: 'drpatelh', + contribution: ['contributor'], + orcid: '0000-0003-2707-7940' + ], + [ + name: 'Joshua Caley', + affiliation: 'UNSW Structural Biology Facility, Australia', + github: 'jscgh', + contribution: ['maintainer', 'contributor'], + orcid: '0000-0002-9374-0969' + ], + [ + name: 'Keiran Rowell', + affiliation: 'UNSW Structural Biology Facility, Australia', + github: 'keiran-rowell-unsw', + contribution: ['maintainer', 'contributor'], + orcid: '0000-0001-6955-1167' + ], + [ + name: 'Patricia Bota', + affiliation: 'Pompeu Fabra University, Spain', + github: 'abotlp', + contribution: ['maintainer','contributor'], + orcid: '0000-0001-7034-3744' + ], + [ + name: 'Thomas Liftin', + affiliation: ['UNSW Structural Biology Facility, Australia', 'Australian BioCommons'], + github: 'tlitfin', + contribution: ['maintainer', 'contributor'], + orcid: '0000-0002-4863-3865' + ], + [ + name: 'Ziad Al-Bkhetan', + affiliation: 'Australian BioCommons', + github: 'ziadbkh', + contribution: ['contributor'], + orcid: '0000-0002-4032-5331' + ], + [ + name: 'Nathan Glades', + affiliation: 'UNSW Structural Biology Facility, Australia', + github: 'nbtm-sh', + contribution: ['contributor'], + orcid: '' + ], + [ + name: 'Evangelos Karatzas', + affiliation: 'EMBL-EBI, UK', + github: 'vagkaratzas', + contribution: ['contributor'], + orcid: '0000-0001-9132-8981' + ], + [ + name: 'Júlia Mir-Pedrol', + affiliation: 'Centre for Genomic Regulation, Spain', + github: 'mirpedrol', + contribution: ['contributor'], + orcid: '0000-0001-6104-9260' + ], + [ + name: "Mitchell J O Brien", + affiliation: ['Sydney Informatics Hub, The University of Sydney, Australia', 'Australian BioCommons'], + github: 'mitchob', + contribution: ['contributor'], + orcid: '0000-0003-0662-9101' + ], + ] homePage = 'https://github.com/nf-core/proteinfold' description = """Protein 3D structure prediction pipeline""" mainScript = 'main.nf' - nextflowVersion = '!>=23.04.0' - version = '1.1.1' + 
defaultBranch = 'master' + nextflowVersion = '!>=25.10.2' + version = '2.0.0' doi = '10.5281/zenodo.7629996' } +// Nextflow plugins +plugins { + id 'nf-schema@2.6.1' // Validation of pipeline parameters and creation of an input channel from a sample sheet +} + +validation { + defaultIgnoreParams = ["genomes"] + monochromeLogs = params.monochrome_logs +} + +// Load DBs parameters +params.alphafold2_db = params.mode.toLowerCase().split(",").contains("alphafold2") ? + (params.alphafold2_db ?: params.db) : params.alphafold2_db +params.alphafold2_full_dbs = params.mode.toLowerCase().split(",").contains("alphafold2") ? + (params.alphafold2_full_dbs ?: params.full_dbs) : params.alphafold2_full_dbs +params.alphafold3_db = params.mode.toLowerCase().split(",").contains("alphafold3") ? + (params.alphafold3_db ?: params.db) : params.alphafold3_db +params.colabfold_db = (params.mode.toLowerCase().split(",").contains("colabfold") || params.mode.toLowerCase().split(",").contains("boltz")) ? + (params.colabfold_db ?: params.db) : params.colabfold_db +params.esmfold_db = params.mode.toLowerCase().split(",").contains("esmfold") ? + (params.esmfold_db ?: params.db) : params.esmfold_db +params.rosettafold_all_atom_db = params.mode.toLowerCase().split(",").contains("rosettafold_all_atom") ? + (params.rosettafold_all_atom_db ?: params.db) : params.rosettafold_all_atom_db +params.rosettafold2na_db = params.mode.toLowerCase().split(",").contains("rosettafold2na") ? + (params.rosettafold2na_db ?: params.db) : params.rosettafold2na_db +params.helixfold3_db = params.mode.toLowerCase().split(",").contains("helixfold3") ? + (params.helixfold3_db ?: params.db) : params.helixfold3_db +// Not supported yet +//params.helixfold3_full_dbs = params.mode.toLowerCase().split(",").contains("helixfold3") ? +// (params.helixfold3_full_dbs ?: params.full_dbs) : params.helixfold3_full_dbs +params.boltz_db = params.mode.toLowerCase().split(",").contains("boltz") ? 
(params.boltz_db ?: params.db) : params.boltz_db + // Load modules.config for DSL2 module specific options includeConfig 'conf/modules.config' // Load modules config for pipeline specific modes -if (params.mode == 'alphafold2') { - includeConfig 'conf/modules_alphafold2.config' -} else if (params.mode == 'colabfold') { - includeConfig 'conf/modules_colabfold.config' -} else if (params.mode == 'esmfold') { - includeConfig 'conf/modules_esmfold.config' -} +includeConfig ({ + if (params.mode.toLowerCase().split(",").contains("alphafold2")) { + return 'conf/modules_alphafold2.config' + } + return '/dev/null' +}()) -// Load links to DBs and parameters -includeConfig 'conf/dbs.config' +includeConfig ({ + if (params.mode.toLowerCase().split(",").contains("alphafold3")) { + return 'conf/modules_alphafold3.config' + } + return '/dev/null' +}()) -// Function to ensure that resource requirements don't go beyond -// a maximum limit -def check_max(obj, type) { - if (type == 'memory') { - try { - if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) - return params.max_memory as nextflow.util.MemoryUnit - else - return obj - } catch (all) { - println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" - return obj - } - } else if (type == 'time') { - try { - if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) - return params.max_time as nextflow.util.Duration - else - return obj - } catch (all) { - println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" - return obj - } - } else if (type == 'cpus') { - try { - return Math.min( obj, params.max_cpus as int ) - } catch (all) { - println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! 
Using default value: $obj" - return obj - } +includeConfig ({ + if (params.mode.toLowerCase().split(",").contains("colabfold")) { + return 'conf/modules_colabfold.config' } -} + return '/dev/null' +}()) +includeConfig ({ + if (params.mode.toLowerCase().split(",").contains("esmfold")) { + return 'conf/modules_esmfold.config' + } + return '/dev/null' +}()) +includeConfig ({ + if (params.mode.toLowerCase().split(",").contains("rosettafold_all_atom")) { + return 'conf/modules_rosettafold_all_atom.config' + } + return '/dev/null' +}()) + +includeConfig ({ + if (params.mode.toLowerCase().split(",").contains("helixfold3")) { + return 'conf/modules_helixfold3.config' + } + return '/dev/null' +}()) +includeConfig ({ + if (params.mode.toLowerCase().split(",").contains("boltz")) { + return 'conf/modules_boltz.config' + } + return '/dev/null' +}()) + +includeConfig ({ + if (params.mode.toLowerCase().split(",").contains("rosettafold2na")) { + return 'conf/modules_rosettafold2na.config' + } + return '/dev/null' +}()) + +// Load links to DBs and parameters +includeConfig 'conf/dbs.config' diff --git a/nextflow_schema.json b/nextflow_schema.json index df0bbfe3a..7f2dc2575 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,10 +1,10 @@ { - "$schema": "http://json-schema.org/draft-07/schema", + "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/nf-core/proteinfold/master/nextflow_schema.json", "title": "nf-core/proteinfold pipeline parameters", "description": "Protein 3D structure prediction pipeline", "type": "object", - "definitions": { + "$defs": { "input_output_options": { "title": "Global options", "type": "object", @@ -21,108 +21,148 @@ "pattern": "^\\S+\\.csv$", "description": "Path to comma-separated file containing information about the samples in the experiment.", "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. 
Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/proteinfold/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" + "fa_icon": "fas fa-file-csv", + "errorMessage": "Input samplesheet must be a CSV file that exists. Please check the file path and format." }, "outdir": { "type": "string", "format": "directory-path", "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", - "fa_icon": "fas fa-folder-open" + "fa_icon": "fas fa-folder-open", + "errorMessage": "Output directory path must be specified" }, "mode": { "type": "string", "default": "alphafold2", - "description": "Specifies the mode in which the pipeline will be run", - "enum": ["alphafold2", "colabfold", "esmfold"], - "fa_icon": "fas fa-cogs" + "description": "Specifies the mode in which the pipeline will be run. mode can be any combination of ['alphafold2', 'alphafold3', 'colabfold', 'esmfold', 'rosettafold_all_atom', 'boltz', 'helixfold3', 'rosettafold2na'] separated by a comma (',') with no spaces.", + "fa_icon": "fas fa-cogs", + "pattern": "^(alphafold2|alphafold3|colabfold|esmfold|rosettafold_all_atom|helixfold3|boltz|rosettafold2na|)(,(alphafold2|alphafold3|colabfold|esmfold|rosettafold_all_atom|helixfold3|boltz|rosettafold2na)?,?)*(?\n \n \n \"nf-core/proteinfold\"\n \n\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/proteinfold)\n[![GitHub Actions CI Status](https://github.com/nf-core/proteinfold/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/proteinfold/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/proteinfold/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/proteinfold/actions/workflows/linting.yml)[![AWS 
CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/proteinfold/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.13135393-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.13135393)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.10.2-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/proteinfold)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23proteinfold-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/proteinfold)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/proteinfold** is a bioinformatics best-practice analysis pipeline for 
Protein 3D structure prediction.\n\nThe pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!\n\nOn release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/proteinfold/results).\n\n## Pipeline summary\n\n![Alt text](docs/images/nf-core-proteinfold_metro_map_1.1.0.png?raw=true \"nf-core-proteinfold 1.1.0 metro map\")\n\n1. Choice of protein structure prediction method:\n\n i. [AlphaFold2](https://github.com/deepmind/alphafold) - Regular AlphaFold2 (MSA computation and model inference in the same process)\n\n ii. [AlphaFold2 split](https://github.com/luisas/alphafold_split) - AlphaFold2 MSA computation and model inference in separate processes\n\n iii. [AlphaFold3](https://github.com/deepmind/alphafold) - Regular AlphaFold3 (MSA computation and model inference in the same process)\n\n iv. [ColabFold](https://github.com/sokrypton/ColabFold) - MMseqs2 API server followed by ColabFold\n\n v. 
[ColabFold](https://github.com/sokrypton/ColabFold) - MMseqs2 local search followed by ColabFold\n\n vi. [ESMFold](https://github.com/facebookresearch/esm) - Regular ESM\n\n vii. [RoseTTAFold-All-Atom](https://github.com/baker-laboratory/RoseTTAFold-All-Atom/) - Regular RFAA\n\n viii. [HelixFold3](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3) - Regular HF3\n\n ix. [Boltz](https://github.com/jwohlwend/boltz/) - Regular Boltz-1\n\n x. [RosettaFold2NA](https://github.com/uw-ipd/RoseTTAFold2NA) - Regular RF2NA\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/proteinfold \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\nThe pipeline takes care of downloading the databases and parameters required by AlphaFold2, ColabFold, ESMFold, RoseTTAFold-All-Atom or RosettaFold2NA. In case you have already downloaded the required files, you can skip this step by providing the path to the databases using the corresponding parameter [`--alphafold2_db`], [`--colabfold_db`], [`--esmfold_db`] or [`--rosettafold_all_atom_db`]. 
Please refer to the [usage documentation](https://nf-co.re/proteinfold/usage) to check the directory structure you must provide for each database.\n\n- The typical command to run AlphaFold2 mode is shown below:\n\n ```console\n nextflow run nf-core/proteinfold \\\n --input samplesheet.csv \\\n --outdir \\\n --mode alphafold2 \\\n --alphafold2_db \\\n --alphafold2_full_dbs \\\n --alphafold2_model_preset monomer \\\n --use_gpu \\\n -profile \n ```\n\n- Here is the command to run AlphaFold2 splitting the MSA from the prediction execution:\n\n ```console\n nextflow run nf-core/proteinfold \\\n --input samplesheet.csv \\\n --outdir \\\n --mode alphafold2 \\\n --alphafold2_mode split_msa_prediction \\\n --alphafold2_db \\\n --alphafold2_full_dbs \\\n --alphafold2_model_preset monomer \\\n --use_gpu \\\n -profile \n ```\n\n- The AlphaFold3 mode can be run using the command below:\n\n ```console\n nextflow run nf-core/proteinfold \\\n --input samplesheet.csv \\\n --outdir \\\n --mode alphafold3 \\\n --alphafold3_db \\\n --use_gpu \\\n -profile \n ```\n\n > [!WARNING]\n > The AlphaFold3 weights are not provided by this pipeline. Users must obtain the weights directly from DeepMind according to their [terms of use](https://github.com/deepmind/alphafold/blob/main/WEIGHTS_TERMS_OF_USE.md) and [prohibited use policy](https://github.com/deepmind/alphafold/blob/main/WEIGHTS_PROHIBITED_USE_POLICY.md). Please ensure you comply with all terms and conditions before using AlphaFold3. 
For more information about AlphaFold3 usage and requirements, please refer to the [official AlphaFold3 repository](https://github.com/deepmind/alphafold).\n\n- Below, the command to run colabfold_local mode:\n\n ```console\n nextflow run nf-core/proteinfold \\\n --input samplesheet.csv \\\n --outdir \\\n --mode colabfold \\\n --colabfold_db \\\n --num_recycles_colabfold 3 \\\n --use_amber \\\n --colabfold_model_preset \"alphafold2_ptm\" \\\n --use_gpu \\\n --db_load_mode 0 \\\n -profile \n ```\n\n- The typical command to run colabfold_webserver mode would be:\n\n ```console\n nextflow run nf-core/proteinfold \\\n --input samplesheet.csv \\\n --outdir \\\n --mode colabfold \\\n --use_msa_server \\\n --msa_server_url \\\n --colabfold_db \\\n --num_recycles_colabfold 3 \\\n --use_amber \\\n --colabfold_model_preset \"alphafold2_ptm\" \\\n --use_gpu \\\n -profile \n ```\n\n > [!WARNING]\n > If you aim to carry out a large number of predictions using the colabfold_webserver mode, please set up and use your own custom MMSeqs2 API Server. 
You can find instructions [here](https://github.com/sokrypton/ColabFold/tree/main/MsaServer).\n\n- The esmfold mode can be run using the command below:\n\n ```console\n nextflow run nf-core/proteinfold \\\n --input samplesheet.csv \\\n --outdir \\\n --mode esmfold \\\n --esmfold_model_preset \\\n --esmfold_db \\\n --num_recycles_esmfold 4 \\\n --use_gpu \\\n -profile \n ```\n\n- The rosettafold_all_atom mode can be run using the command below:\n\n ```console\n nextflow run nf-core/proteinfold \\\n --input samplesheet.csv \\\n --outdir \\\n --mode rosettafold_all_atom \\\n --rosettafold_all_atom_db \\\n --use_gpu \\\n -profile \n ```\n\n- The helixfold3 mode can be run using the command below:\n\n ```console\n nextflow run nf-core/proteinfold \\\n --input samplesheet.csv \\\n --outdir \\\n --mode helixfold3 \\\n --helixfold3_db \\\n --use_gpu \\\n -profile \n ```\n\n- The RosettaFold2NA mode can be run using the command below:\n\n ```console\n nextflow run nf-core/proteinfold \\\n --input samplesheet.csv \\\n --outdir \\\n --mode rosettafold2na \\\n --rosettafold2na_db \\\n --use_gpu \\\n -profile \n ```\n\n- The boltz mode can be run using the command below:\n\n ```console\n nextflow run nf-core/proteinfold \\\n --input samplesheet.csv \\\n --outdir \\\n --mode boltz \\\n --boltz_ccd_path \\\n --boltz_model_path \\\n --use_gpu \\\n -profile \n ```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/proteinfold/usage) and the [parameter documentation](https://nf-co.re/proteinfold/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/proteinfold/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/proteinfold/output).\n\n## Adding new modes to the pipeline\n\nFor details on how to contribute new modes to the pipeline please refer to the [Howto contribute new modes](https://nf-co.re/proteinfold/usage/HOWTO_CONTRIBUTE_NEW_MODES).\n\n## Credits\n\nnf-core/proteinfold was originally written by Athanasios Baltzis ([@athbaltzis](https://github.com/athbaltzis)), Jose Espinosa-Carrasco ([@JoseEspinosa](https://github.com/JoseEspinosa)), Luisa Santus ([@luisas](https://github.com/luisas)) and Leila Mansouri ([@l-mansouri](https://github.com/l-mansouri)) from [The Comparative Bioinformatics Group](https://www.crg.eu/en/cedric_notredame) at [The Centre for Genomic Regulation, Spain](https://www.crg.eu/) under the umbrella of the [BovReg project](https://www.bovreg.eu/) and Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [Seqera Labs, Spain](https://seqera.io/).\n\nMany thanks to others who have helped out and contributed along the way too, including (but not limited to): Norman Goodacre and Waleed Osman from Interline Therapeutics ([@interlinetx](https://github.com/interlinetx)), Martin Steinegger ([@martin-steinegger](https://github.com/martin-steinegger)) and Raoul J.P. 
Bonnal ([@rjpbonnal](https://github.com/rjpbonnal))\n\nWe would also like to thank the AWS Open Data Sponsorship Program for generously providing the resources necessary to host the data utilized in the testing, development, and deployment of nf-core proteinfold.\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#proteinfold` channel](https://nfcore.slack.com/channels/proteinfold) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\nIf you use nf-core/proteinfold for your analysis, please cite it using the following doi: [10.5281/zenodo.7437038](https://doi.org/10.5281/zenodo.7437038)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. 
doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "hasPart": [ + { + "@id": "main.nf" + }, + { + "@id": "docs/images/nf-core-proteinfold_metro_map_1.1.0_transp.png" + }, + { + "@id": "assets/" + }, + { + "@id": "bin/" + }, + { + "@id": "conf/" + }, + { + "@id": "docs/" + }, + { + "@id": "docs/images/" + }, + { + "@id": "modules/" + }, + { + "@id": "modules/local/" + }, + { + "@id": "modules/nf-core/" + }, + { + "@id": "workflows/" + }, + { + "@id": "subworkflows/" + }, + { + "@id": "nextflow.config" + }, + { + "@id": "README.md" + }, + { + "@id": "nextflow_schema.json" + }, + { + "@id": "CHANGELOG.md" + }, + { + "@id": "LICENSE" + }, + { + "@id": "CODE_OF_CONDUCT.md" + }, + { + "@id": "CITATIONS.md" + }, + { + "@id": "modules.json" + }, + { + "@id": "docs/usage.md" + }, + { + "@id": "docs/output.md" + }, + { + "@id": ".nf-core.yml" + }, + { + "@id": ".pre-commit-config.yaml" + }, + { + "@id": ".prettierignore" + } + ], + "isBasedOn": "https://github.com/nf-core/proteinfold", + "license": "MIT", + "mainEntity": { + "@id": "main.nf" + }, + "mentions": [ + { + "@id": "#ff04bf6f-9ba6-45a4-ba22-677f4255a7d2" + } + ], + "name": "nf-core/proteinfold" + }, + { + "@id": "ro-crate-metadata.json", + "@type": "CreativeWork", + "about": { + "@id": "./" + }, + "conformsTo": [ + { + "@id": "https://w3id.org/ro/crate/1.1" + }, + { + "@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0" + } + ] + }, + { + "@id": "main.nf", + "@type": [ + "File", + "SoftwareSourceCode", + "ComputationalWorkflow" + ], + "creator": [ + { + "@id": "https://orcid.org/0000-0002-4863-3865" + }, + { + "@id": "https://orcid.org/0000-0002-4032-5331" + }, + { + "@id": "#petrislp@gmail.com" + }, + { + "@id": "#tomlitfin@gmail.com" + }, + { + "@id": "https://orcid.org/0000-0001-6955-1167" + }, + { + "@id": "https://orcid.org/0000-0002-1541-042X" + }, + { + "@id": "https://orcid.org/0000-0002-4032-5331" + }, + { + "@id": "https://orcid.org/0000-0002-7495-1218" + }, + { + 
"@id": "#drpatelhh@gmail.com" + } + ], + "dateCreated": "", + "dateModified": "2026-03-12T12:05:10Z", + "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", + "image": { + "@id": "docs/images/nf-core-proteinfold_metro_map_1.1.0_transp.png" + }, + "keywords": [ + "nf-core", + "nextflow", + "alphafold2", + "colabfold", + "esmfold", + "protein-fold-prediction", + "protein-folding", + "protein-sequences", + "protein-structure" + ], + "license": [ + "MIT" + ], + "maintainer": [ + { + "@id": "#petrislp@gmail.com" + }, + { + "@id": "https://orcid.org/0000-0001-6955-1167" + }, + { + "@id": "https://orcid.org/0000-0002-4032-5331" + }, + { + "@id": "https://orcid.org/0000-0002-7495-1218" + }, + { + "@id": "#drpatelhh@gmail.com" + } + ], + "name": [ + "nf-core/proteinfold" + ], + "programmingLanguage": { + "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow" + }, + "sdPublisher": { + "@id": "https://nf-co.re/" + }, + "url": [ + "https://github.com/nf-core/proteinfold", + "https://nf-co.re/proteinfold/2.0.0/" + ], + "version": [ + "2.0.0" + ] + }, + { + "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow", + "@type": "ComputerLanguage", + "identifier": { + "@id": "https://www.nextflow.io/" + }, + "name": "Nextflow", + "url": { + "@id": "https://www.nextflow.io/" + }, + "version": "!>=25.10.2" + }, + { + "@id": "docs/images/nf-core-proteinfold_metro_map_1.1.0_transp.png", + "@type": [ + "File", + "ImageObject" + ], + "name": "Workflow diagram" + }, + { + "@id": "#ff04bf6f-9ba6-45a4-ba22-677f4255a7d2", + "@type": "TestSuite", + "instance": [ + { + "@id": "#78aecd07-24c2-47d1-905a-ef8e4fadb1e1" + } + ], + "mainEntity": { + "@id": "main.nf" + }, + "name": "Test suite for nf-core/proteinfold" + }, + { + "@id": "#78aecd07-24c2-47d1-905a-ef8e4fadb1e1", + "@type": "TestInstance", + "name": "GitHub Actions workflow for testing nf-core/proteinfold", + "resource": "repos/nf-core/proteinfold/actions/workflows/nf-test.yml", + 
"runsOn": { + "@id": "https://w3id.org/ro/terms/test#GithubService" + }, + "url": "https://api.github.com" + }, + { + "@id": "https://w3id.org/ro/terms/test#GithubService", + "@type": "TestService", + "name": "Github Actions", + "url": { + "@id": "https://github.com" + } + }, + { + "@id": "assets/", + "@type": "Dataset", + "description": "Additional files" + }, + { + "@id": "bin/", + "@type": "Dataset", + "description": "Scripts that must be callable from a pipeline process" + }, + { + "@id": "conf/", + "@type": "Dataset", + "description": "Configuration files" + }, + { + "@id": "docs/", + "@type": "Dataset", + "description": "Markdown files for documenting the pipeline" + }, + { + "@id": "docs/images/", + "@type": "Dataset", + "description": "Images for the documentation files" + }, + { + "@id": "modules/", + "@type": "Dataset", + "description": "Modules used by the pipeline" + }, + { + "@id": "modules/local/", + "@type": "Dataset", + "description": "Pipeline-specific modules" + }, + { + "@id": "modules/nf-core/", + "@type": "Dataset", + "description": "nf-core modules" + }, + { + "@id": "workflows/", + "@type": "Dataset", + "description": "Main pipeline workflows to be executed in main.nf" + }, + { + "@id": "subworkflows/", + "@type": "Dataset", + "description": "Smaller subworkflows" + }, + { + "@id": "nextflow.config", + "@type": "File", + "description": "Main Nextflow configuration file" + }, + { + "@id": "README.md", + "@type": "File", + "description": "Basic pipeline usage information" + }, + { + "@id": "nextflow_schema.json", + "@type": "File", + "description": "JSON schema for pipeline parameter specification" + }, + { + "@id": "CHANGELOG.md", + "@type": "File", + "description": "Information on changes made to the pipeline" + }, + { + "@id": "LICENSE", + "@type": "File", + "description": "The license - should be MIT" + }, + { + "@id": "CODE_OF_CONDUCT.md", + "@type": "File", + "description": "The nf-core code of conduct" + }, + { + "@id": "CITATIONS.md", + 
"@type": "File", + "description": "Citations needed when using the pipeline" + }, + { + "@id": "modules.json", + "@type": "File", + "description": "Version information for modules from nf-core/modules" + }, + { + "@id": "docs/usage.md", + "@type": "File", + "description": "Usage documentation" + }, + { + "@id": "docs/output.md", + "@type": "File", + "description": "Output documentation" + }, + { + "@id": ".nf-core.yml", + "@type": "File", + "description": "nf-core configuration file, configuring template features and linting rules" + }, + { + "@id": ".pre-commit-config.yaml", + "@type": "File", + "description": "Configuration file for pre-commit hooks" + }, + { + "@id": ".prettierignore", + "@type": "File", + "description": "Ignore file for prettier" + }, + { + "@id": "https://nf-co.re/", + "@type": "Organization", + "name": "nf-core", + "url": "https://nf-co.re/" + }, + { + "@id": "https://orcid.org/0000-0002-4863-3865", + "@type": "Person", + "email": "t.litfin@unsw.edu.au", + "name": "Thomas Litfin" + }, + { + "@id": "https://orcid.org/0000-0002-4032-5331", + "@type": "Person", + "email": "ziadbkh@users.noreply.github.com", + "name": "Ziad Al-Bkhetan" + }, + { + "@id": "#petrislp@gmail.com", + "@type": "Person", + "email": "petrislp@gmail.com", + "name": "Patricia Bota" + }, + { + "@id": "#tomlitfin@gmail.com", + "@type": "Person", + "email": "tomlitfin@gmail.com", + "name": "Tom Litfin" + }, + { + "@id": "https://orcid.org/0000-0001-6955-1167", + "@type": "Person", + "email": "54380465+keiran-rowell-unsw@users.noreply.github.com", + "name": "Keiran Rowell" + }, + { + "@id": "https://orcid.org/0000-0002-1541-042X", + "@type": "Person", + "email": "kadomu@gmail.com", + "name": "Jose Espinosa-Carrasco" + }, + { + "@id": "https://orcid.org/0000-0002-7495-1218", + "@type": "Person", + "email": "baltzis.athanasios@gmail.com", + "name": "Athanasios Baltzis" + }, + { + "@id": "#drpatelhh@gmail.com", + "@type": "Person", + "email": "drpatelhh@gmail.com", + "name": 
"Harshil Patel" + } + ] +} \ No newline at end of file diff --git a/subworkflows/local/aria2_uncompress.nf b/subworkflows/local/aria2_uncompress.nf index 09a27ff0d..323d91534 100644 --- a/subworkflows/local/aria2_uncompress.nf +++ b/subworkflows/local/aria2_uncompress.nf @@ -1,11 +1,11 @@ // // Download with aria2 and uncompress the data if needed // - -include { UNTAR } from '../../modules/nf-core/untar/main' -include { GUNZIP } from '../../modules/nf-core/gunzip/main' -include { ARIA2 } from '../../modules/nf-core/aria2/main' - +include { UNTAR } from '../../modules/nf-core/untar/main' +include { GUNZIP } from '../../modules/nf-core/gunzip/main' +include { ARIA2 } from '../../modules/nf-core/aria2/main' +include { UNZIP } from '../../modules/nf-core/unzip/main' +include { ZSTD_DECOMPRESS } from '../../modules/local/zstd_decompress/main.nf' workflow ARIA2_UNCOMPRESS { take: @@ -18,16 +18,38 @@ workflow ARIA2_UNCOMPRESS { source_url ] ) - ch_db = Channel.empty() + ch_db = channel.empty() - if (source_url.toString().endsWith('.tar') || source_url.toString().endsWith('.tar.gz')) { - ch_db = UNTAR ( ARIA2.out.downloaded_file ).untar.map{ it[1] } + if (source_url.toString().endsWith('.pkl.gz')) { + ch_db = ARIA2.out.downloaded_file.map { it -> it[1] } + } else if (source_url.toString().endsWith('.tar') || + source_url.toString().endsWith('.tar.gz') || + source_url.toString().endsWith('.tar.zst')|| + source_url.toString().endsWith('.tgz')) { + ch_db = UNTAR (ARIA2.out.downloaded_file).untar.map { it -> it[1] } } else if (source_url.toString().endsWith('.gz')) { - ch_db = GUNZIP ( ARIA2.out.downloaded_file ).gunzip.map { it[1] } + ch_db = GUNZIP (ARIA2.out.downloaded_file).gunzip.map { it -> it[1] } + } else if (source_url.toString().endsWith('.zst')) { + ch_db = ZSTD_DECOMPRESS (ARIA2.out.downloaded_file).decompressed.map { it -> it[1] } + } else if (source_url.toString().endsWith('.zip')) { + ch_db = UNZIP (ARIA2.out.downloaded_file) + .unzipped_archive + .map { _meta, 
dir -> + // Find the HelixFold3-params-240814 directory + def targetDir = dir.listFiles().find { it -> + it.isDirectory() && it.getName() == 'HelixFold3-params-240814' + } + // Find the .pdparams file in that directory + def pdparamsFile = targetDir.listFiles().find { it -> + it.getName().endsWith('.pdparams') + } + [ pdparamsFile ] + } + } else { + ch_db = ARIA2.out.downloaded_file.map { it -> it[1] } } emit: db = ch_db // channel: [ db ] versions = ARIA2.out.versions // channel: [ versions.yml ] } - diff --git a/subworkflows/local/post_processing.nf b/subworkflows/local/post_processing.nf new file mode 100644 index 000000000..d100361ce --- /dev/null +++ b/subworkflows/local/post_processing.nf @@ -0,0 +1,161 @@ +// +// Post processing analysis for the predicted structures +// + +// +// SUBWORKFLOW: Consisting entirely of nf-core/modules +// +include { paramsSummaryMap } from 'plugin/nf-schema' +include { paramsSummaryMultiqc } from '../nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../nf-core/utils_nfcore_pipeline' +include { methodsDescriptionText } from './utils_nfcore_proteinfold_pipeline' + +include { GENERATE_REPORT } from '../../modules/local/generate_report' +include { COMPARE_STRUCTURES } from '../../modules/local/compare_structures' +include { FOLDSEEK_EASYSEARCH } from '../../modules/nf-core/foldseek/easysearch/main' +include { MULTIQC } from '../../modules/nf-core/multiqc/main' + + +workflow POST_PROCESSING { + + take: + skip_visualisation + requested_modes_size + ch_report_input + ch_report_template + ch_comparison_template + skip_foldseek + foldseek_db + foldseek_db_path + skip_multiqc + outdir + ch_versions + ch_multiqc_rep + ch_multiqc_config + ch_multiqc_custom_config + ch_multiqc_logo + ch_multiqc_methods_description + ch_top_ranked_model + + main: + ch_comparison_report_files = channel.empty() + + if (!skip_visualisation){ + GENERATE_REPORT( + ch_report_input, + ch_report_template + ) + ch_versions = 
ch_versions.mix(GENERATE_REPORT.out.versions) + + if (requested_modes_size > 1){ + ch_dummy_file = channel.fromPath("$projectDir/assets/NO_FILE") + + def esm = ch_top_ranked_model.filter { it ->it[0].model == 'esmfold' } + def not_esm = ch_top_ranked_model.filter { it -> it[0].model != 'esmfold' } + + esm = esm + .map { it -> + [it[0], it[1]] + } + .merge(ch_dummy_file) + + not_esm = not_esm + .map { it -> [it[0], it[1]] } + .join(GENERATE_REPORT.out.sequence_coverage) + + not_esm.mix(esm).set{ch_comparison_report_files} + + ch_comparison_report_files + .map { it -> + [["id": it[0].id], it[0], it[1], it[2]] + } + .groupTuple(by: [0], size: requested_modes_size) + .map { it -> + it[0].models=it[1].join(','); + [it[0], it[2], it[3]] + } + .set { ch_comparison_report_input } + + COMPARE_STRUCTURES( + ch_comparison_report_input + .map { it -> + [it[0], it[1].collect { file -> file.name} ] + }, + ch_comparison_report_input + .map { it -> + [ it[0], it[2].collect { file -> file.name } ] + }, + ch_comparison_report_input + .map { it -> + (it[1] + it[2]).unique() + }, + ch_comparison_template + ) + ch_versions = ch_versions.mix(COMPARE_STRUCTURES.out.versions) + } + } + + if (!skip_foldseek) { + ch_foldseek_db = channel.value([ + [ + id: foldseek_db, + ], + file(foldseek_db_path, checkIfExists: true) + ]) + FOLDSEEK_EASYSEARCH( + ch_top_ranked_model, + ch_foldseek_db + ) + } + + // + // Collate and save software versions + // + softwareVersionsToYAML(ch_versions) + .collectFile( + storeDir: "${outdir}/pipeline_info", + name: 'nf_core_' + 'proteinfold_software_' + 'mqc_' + 'versions.yml', + sort: true, + newLine: true + ).set { ch_collated_versions } + + // + // MODULE: MultiQC + // + ch_multiqc_report = channel.empty() + + if (!skip_multiqc) { + summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") + ch_workflow_summary = channel.value(paramsSummaryMultiqc(summary_params)) + ch_methods_description = 
channel.value(methodsDescriptionText(ch_multiqc_methods_description)) + + ch_multiqc_files = channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) + + MULTIQC ( + ch_multiqc_rep + .combine( + ch_multiqc_files + .collect() + .map { it -> [it] } + ) + .map { it -> [ it[0], it[1] + it[2] ] }, + ch_multiqc_config, + ch_multiqc_custom_config + .collect() + .ifEmpty([]), + ch_multiqc_logo + .collect() + .ifEmpty([]), + [], + [] + ) + ch_multiqc_report = MULTIQC.out.report.toList() + } + + emit: + versions = ch_versions + multiqc_report = ch_multiqc_report +} diff --git a/subworkflows/local/prepare_alphafold2_dbs.nf b/subworkflows/local/prepare_alphafold2_dbs.nf index 4621af6bf..1dc28e715 100644 --- a/subworkflows/local/prepare_alphafold2_dbs.nf +++ b/subworkflows/local/prepare_alphafold2_dbs.nf @@ -2,18 +2,17 @@ // Download all the required AlphaFold 2 databases and parameters // -include { - ARIA2_UNCOMPRESS as ARIA2_ALPHAFOLD2_PARAMS - ARIA2_UNCOMPRESS as ARIA2_BFD - ARIA2_UNCOMPRESS as ARIA2_SMALL_BFD - ARIA2_UNCOMPRESS as ARIA2_MGNIFY - ARIA2_UNCOMPRESS as ARIA2_PDB70 - ARIA2_UNCOMPRESS as ARIA2_UNIREF30 - ARIA2_UNCOMPRESS as ARIA2_UNIREF90 - ARIA2_UNCOMPRESS as ARIA2_UNIPROT_SPROT - ARIA2_UNCOMPRESS as ARIA2_UNIPROT_TREMBL } from './aria2_uncompress' - -include { ARIA2 as ARIA2_PDB_SEQRES } from '../../modules/nf-core/aria2/main' +include { ARIA2_UNCOMPRESS as ARIA2_ALPHAFOLD2_PARAMS } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_BFD } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_SMALL_BFD } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_MGNIFY } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_PDB70 } from './aria2_uncompress' +include { 
ARIA2_UNCOMPRESS as ARIA2_OBSOLETE } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_UNIREF30 } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_UNIREF90 } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_UNIPROT_SPROT } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_UNIPROT_TREMBL } from './aria2_uncompress' +include { ARIA2 as ARIA2_PDB_SEQRES } from '../../modules/nf-core/aria2/main' include { COMBINE_UNIPROT } from '../../modules/local/combine_uniprot' include { DOWNLOAD_PDBMMCIF } from '../../modules/local/download_pdbmmcif' @@ -22,14 +21,15 @@ workflow PREPARE_ALPHAFOLD2_DBS { take: alphafold2_db // directory: path to alphafold2 DBs - full_dbs // boolean: Use full databases (otherwise reduced version) + alphafold2_full_dbs // boolean: Use full databases (otherwise reduced version) bfd_path // directory: /path/to/bfd/ small_bfd_path // directory: /path/to/small_bfd/ alphafold2_params_path // directory: /path/to/alphafold2/params/ mgnify_path // directory: /path/to/mgnify/ pdb70_path // directory: /path/to/pdb70/ - pdb_mmcif_path // directory: /path/to/pdb_mmcif/ - uniref30_alphafold2_path // directory: /path/to/uniref30/alphafold2/ + pdb_mmcif_path // directory: /path/to/pdb_mmcif/mmcif_files/ + pdb_obsolete_path // directory: /path/to/pdb_mmcif/obsolete.dat + alphafold2_uniref30_path // directory: /path/to/uniref30/alphafold2/ uniref90_path // directory: /path/to/uniref90/ pdb_seqres_path // directory: /path/to/pdb_seqres/ uniprot_path // directory: /path/to/uniprot/ @@ -40,45 +40,49 @@ workflow PREPARE_ALPHAFOLD2_DBS { pdb70_link // string: Specifies the link to download pdb70 pdb_mmcif_link // string: Specifies the link to download pdb_mmcif pdb_obsolete_link // string: Specifies the link to download pdb_obsolete - uniref30_alphafold2_link // string: Specifies the link to download uniref30_alphafold2 + alphafold2_uniref30_link // string: Specifies the link to download uniref30_alphafold2 
uniref90_link // string: Specifies the link to download uniref90 pdb_seqres_link // string: Specifies the link to download pdb_seqres uniprot_sprot_link // string: Specifies the link to download uniprot_sprot uniprot_trembl_link // string: Specifies the link to download uniprot_trembl main: - ch_bfd = Channel.empty() - ch_small_bfd = Channel.empty() - ch_versions = Channel.empty() + ch_bfd = channel.value([]) + ch_small_bfd = channel.value([]) + ch_versions = channel.empty() if (alphafold2_db) { - if (full_dbs) { - ch_bfd = Channel.value(file(bfd_path)) - ch_small_bfd = Channel.value(file("${projectDir}/assets/dummy_db")) + if (alphafold2_full_dbs) { + ch_bfd = channel.value(file(bfd_path, checkIfExists: true)) + ch_small_bfd = channel.value(file("${projectDir}/assets/dummy_db")) } else { - ch_bfd = Channel.value(file("${projectDir}/assets/dummy_db")) - ch_small_bfd = Channel.value(file(small_bfd_path)) + ch_bfd = channel.value(file("${projectDir}/assets/dummy_db")) + ch_small_bfd = channel.value(file(small_bfd_path, checkIfExists: true)) } - ch_params = Channel.value(file(alphafold2_params_path)) - ch_mgnify = Channel.value(file(mgnify_path)) - ch_pdb70 = Channel.value(file(pdb70_path, type: 'dir' )) - ch_mmcif_files = file(pdb_mmcif_path, type: 'dir') - ch_mmcif_obsolete = file(pdb_mmcif_path, type: 'file') - ch_mmcif = Channel.value(ch_mmcif_files + ch_mmcif_obsolete) - ch_uniref30 = Channel.value(file(uniref30_alphafold2_path, type: 'any')) - ch_uniref90 = Channel.value(file(uniref90_path)) - ch_pdb_seqres = Channel.value(file(pdb_seqres_path)) - ch_uniprot = Channel.value(file(uniprot_path)) + ch_params = channel.value(file(alphafold2_params_path, checkIfExists: true)) + ch_mgnify = channel.value(file(mgnify_path, checkIfExists: true)) + ch_pdb70 = channel.value(file(pdb70_path, checkIfExists: true)) + ch_mmcif_files = channel.value(file(pdb_mmcif_path, checkIfExists: true)) + ch_obsolete = channel.value(file(pdb_obsolete_path, type: 'file', checkIfExists: 
true)) + ch_uniref30 = channel.value(file(alphafold2_uniref30_path, type: 'any', checkIfExists: true)) + ch_uniref90 = channel.value(file(uniref90_path, checkIfExists: true)) + ch_pdb_seqres = channel.value(file(pdb_seqres_path, checkIfExists: true)) + ch_uniprot = channel.value(file(uniprot_path, checkIfExists: true)) } else { - if (full_dbs) { + if (alphafold2_full_dbs) { ARIA2_BFD( bfd_link ) - ch_bfd = ARIA2_BFD.out.db + ch_bfd = ARIA2_BFD + .out + .db + .map { + dir -> dir.listFiles().findAll { it -> it.isFile() } + } ch_versions = ch_versions.mix(ARIA2_BFD.out.versions) } else { ARIA2_SMALL_BFD( @@ -91,7 +95,13 @@ workflow PREPARE_ALPHAFOLD2_DBS { ARIA2_ALPHAFOLD2_PARAMS( alphafold2_params_link ) - ch_params = ARIA2_ALPHAFOLD2_PARAMS.out.db + ch_params = ARIA2_ALPHAFOLD2_PARAMS + .out + .db + .map { + dir -> dir.listFiles().findAll { it -> it.isFile() } + } + ch_versions = ch_versions.mix(ARIA2_ALPHAFOLD2_PARAMS.out.versions) ARIA2_MGNIFY( @@ -103,21 +113,34 @@ workflow PREPARE_ALPHAFOLD2_DBS { ARIA2_PDB70( pdb70_link ) - ch_pdb70 = ARIA2_PDB70.out.db + ch_pdb70 = ARIA2_PDB70 + .out + .db + .map { + dir -> dir.listFiles().findAll { it -> it.isFile() } + } ch_versions = ch_versions.mix(ARIA2_PDB70.out.versions) DOWNLOAD_PDBMMCIF( pdb_mmcif_link, + ) + ch_mmcif_files = DOWNLOAD_PDBMMCIF.out.ch_db + ch_versions = ch_versions.mix(DOWNLOAD_PDBMMCIF.out.versions) + + ARIA2_OBSOLETE( pdb_obsolete_link ) - ch_mmcif = DOWNLOAD_PDBMMCIF.out.ch_db - ch_versions = ch_versions.mix(DOWNLOAD_PDBMMCIF.out.versions) + ch_obsolete = ARIA2_OBSOLETE.out.db + ch_versions = ch_versions.mix(ARIA2_OBSOLETE.out.versions) ARIA2_UNIREF30( - uniref30_alphafold2_link + alphafold2_uniref30_link ) - ch_uniref30 = ARIA2_UNIREF30.out.db - ch_versions = ch_versions.mix(ARIA2_UNIREF30.out.versions) + ch_uniref30 = ARIA2_UNIREF30 + .out + .db + .map { dir -> dir.listFiles().findAll { it -> it.isFile() } } + ch_versions = ch_versions.mix(ARIA2_UNIREF30.out.versions) ARIA2_UNIREF90( uniref90_link 
@@ -131,7 +154,7 @@ workflow PREPARE_ALPHAFOLD2_DBS { pdb_seqres_link ] ) - ch_pdb_seqres = ARIA2_PDB_SEQRES.out.downloaded_file.map{ it[1] } + ch_pdb_seqres = ARIA2_PDB_SEQRES.out.downloaded_file.map { it -> it[1] } ch_versions = ch_versions.mix(ARIA2_PDB_SEQRES.out.versions) ARIA2_UNIPROT_SPROT( @@ -146,20 +169,21 @@ workflow PREPARE_ALPHAFOLD2_DBS { ARIA2_UNIPROT_SPROT.out.db, ARIA2_UNIPROT_TREMBL.out.db ) - ch_uniprot = COMBINE_UNIPROT.out.ch_db - ch_version = ch_versions.mix(COMBINE_UNIPROT.out.versions) + ch_uniprot = COMBINE_UNIPROT.out.ch_db + ch_versions = ch_versions.mix(COMBINE_UNIPROT.out.versions) } emit: - bfd = ch_bfd - small_bfd = ch_small_bfd - params = ch_params - mgnify = ch_mgnify - pdb70 = ch_pdb70 - pdb_mmcif = ch_mmcif - uniref30 = ch_uniref30 - uniref90 = ch_uniref90 - pdb_seqres = ch_pdb_seqres - uniprot = ch_uniprot - versions = ch_versions + bfd = ch_bfd + small_bfd = ch_small_bfd + params = ch_params + mgnify = ch_mgnify + pdb70 = ch_pdb70 + pdb_mmcif = ch_mmcif_files + pdb_obsolete = ch_obsolete + uniref30 = ch_uniref30 + uniref90 = ch_uniref90 + pdb_seqres = ch_pdb_seqres + uniprot = ch_uniprot + versions = ch_versions } diff --git a/subworkflows/local/prepare_alphafold3_dbs.nf b/subworkflows/local/prepare_alphafold3_dbs.nf new file mode 100644 index 000000000..8c64bebbe --- /dev/null +++ b/subworkflows/local/prepare_alphafold3_dbs.nf @@ -0,0 +1,132 @@ +// +// Download all the required AlphaFold 3 databases and parameters +// + +include { ARIA2_UNCOMPRESS as ARIA2_SMALL_BFD } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_MGNIFY } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_MMCIF } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_UNIREF90 } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_PDB_SEQRES } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_UNIPROT } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_RNACENTRAL_ACTIVE_SEQ } from 
'./aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_NT_RNA_2023_02_23 } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_RFAM } from './aria2_uncompress' + +include { DOWNLOAD_PDBMMCIF_AF3 } from '../../modules/local/download_pdbmmcif_af3' + +workflow PREPARE_ALPHAFOLD3_DBS { + + take: + alphafold3_db // directory: path to alphafold3 DBs + alphafold3_params_path // directory: /path/to/alphafold3/params/ + small_bfd_path // directory: /path/to/small_bfd/ + mgnify_path // directory: /path/to/mgnify/ + pdb_mmcif_path // directory: /path/to/pdb_mmcif/ + uniref90_path // directory: /path/to/uniref90/ + pdb_seqres_path // directory: /path/to/pdb_seqres/ + uniprot_path // directory: /path/to/uniprot/ + rnacentral_active_seq_path // directory: /path/to/rnacentral_active_seq/ + nt_rna_2023_02_23_path // directory: /path/to/nt_rna_2023_02_23/ + rfam_path // directory: /path/to/rfam/ + small_bfd_link // string: Specifies the link to download small_bfd + mgnify_link // string: Specifies the link to download mgnify + pdb_mmcif_link // string: Specifies the link to download mmcif + uniref90_link // string: Specifies the link to download uniref90 + pdb_seqres_link // string: Specifies the link to download pdb_seqres + uniprot_link // string: Specifies the link to download uniprot + rnacentral_active_seq_link // string: Specifies the link to download rnacentral_active_seq + nt_rna_2023_02_23_link // string: Specifies the link to download nt_rna_2023_02_23 + rfam_link // string: Specifies the link to download rfam + + main: + ch_versions = channel.empty() + + if (alphafold3_db) { + ch_params = channel.value(file(alphafold3_params_path, checkIfExists: true)) + ch_small_bfd = channel.value(file(small_bfd_path, checkIfExists: true)) + ch_mgnify = channel.value(file(mgnify_path, checkIfExists: true)) + ch_mmcif = channel.value(file(pdb_mmcif_path, checkIfExists: true)) + ch_uniref90 = channel.value(file(uniref90_path, checkIfExists: true)) + ch_pdb_seqres = 
channel.value(file(pdb_seqres_path, checkIfExists: true)) + ch_uniprot = channel.value(file(uniprot_path, checkIfExists: true)) + ch_rnacentral = channel.value(file(rnacentral_active_seq_path)) + ch_nt_rna = channel.value(file(nt_rna_2023_02_23_path)) + ch_rfam = channel.value(file(rfam_path)) + } else { + + ARIA2_SMALL_BFD ( + small_bfd_link + ) + ch_small_bfd = ARIA2_SMALL_BFD.out.db + ch_versions = ch_versions.mix(ARIA2_SMALL_BFD.out.versions) + + ch_params = channel.value(file(alphafold3_params_path, checkIfExists: true)) + + ARIA2_MGNIFY ( + mgnify_link + ) + ch_mgnify = ARIA2_MGNIFY.out.db + ch_versions = ch_versions.mix(ARIA2_MGNIFY.out.versions) + + ARIA2_MMCIF ( + pdb_mmcif_link + ) + ch_mmcif = ARIA2_MMCIF.out.db + ch_versions = ch_versions.mix(ARIA2_MMCIF.out.versions) + + DOWNLOAD_PDBMMCIF_AF3( + pdb_mmcif_link + ) + ch_mmcif = DOWNLOAD_PDBMMCIF_AF3.out.ch_db + ch_versions = ch_versions.mix(DOWNLOAD_PDBMMCIF_AF3.out.versions) + + ARIA2_UNIREF90 ( + uniref90_link + ) + ch_uniref90 = ARIA2_UNIREF90.out.db + ch_versions = ch_versions.mix(ARIA2_UNIREF90.out.versions) + + ARIA2_PDB_SEQRES ( + pdb_seqres_link + ) + ch_pdb_seqres = ARIA2_PDB_SEQRES.out.db + ch_versions = ch_versions.mix(ARIA2_PDB_SEQRES.out.versions) + + ARIA2_UNIPROT ( + uniprot_link + ) + ch_uniprot = ARIA2_UNIPROT.out.db + ch_versions = ch_versions.mix(ARIA2_UNIPROT.out.versions) + + ARIA2_RNACENTRAL_ACTIVE_SEQ ( + rnacentral_active_seq_link + ) + ch_rnacentral = ARIA2_RNACENTRAL_ACTIVE_SEQ.out.db + ch_versions = ch_versions.mix(ARIA2_RNACENTRAL_ACTIVE_SEQ.out.versions) + + ARIA2_NT_RNA_2023_02_23 ( + nt_rna_2023_02_23_link + ) + ch_nt_rna = ARIA2_NT_RNA_2023_02_23.out.db + ch_versions = ch_versions.mix(ARIA2_NT_RNA_2023_02_23.out.versions) + + ARIA2_RFAM ( + rfam_link + ) + ch_rfam = ARIA2_RFAM.out.db + ch_versions = ch_versions.mix(ARIA2_RFAM.out.versions) + } + + emit: + params = ch_params + small_bfd = ch_small_bfd + mgnify = ch_mgnify + pdb_mmcif = ch_mmcif + uniref90 = ch_uniref90 + 
pdb_seqres = ch_pdb_seqres + uniprot = ch_uniprot + rnacentral = ch_rnacentral + nt_rna = ch_nt_rna + rfam = ch_rfam + versions = ch_versions +} diff --git a/subworkflows/local/prepare_boltz_dbs.nf b/subworkflows/local/prepare_boltz_dbs.nf new file mode 100644 index 000000000..49fa04588 --- /dev/null +++ b/subworkflows/local/prepare_boltz_dbs.nf @@ -0,0 +1,85 @@ +// +// Download the files required for Boltz +// +include { ARIA2 as ARIA2_BOLTZ_CCD } from '../../modules/nf-core/aria2/main' +include { ARIA2 as ARIA2_BOLTZ_MODEL } from '../../modules/nf-core/aria2/main' +include { ARIA2 as ARIA2_BOLTZ2_AFF } from '../../modules/nf-core/aria2/main' +include { ARIA2 as ARIA2_BOLTZ2_CONF } from '../../modules/nf-core/aria2/main' + +include { ARIA2_UNCOMPRESS } from './aria2_uncompress' + +workflow PREPARE_BOLTZ_DBS { + take: + boltz_db + boltz_ccd + boltz_model + boltz2_aff + boltz2_conf + boltz2_mols + boltz_ccd_link + boltz_model_link + boltz2_aff_link + boltz2_conf_link + boltz2_mols_link + + main: + ch_versions = channel.empty() + + if (boltz_db) { + ch_boltz_ccd = channel.value(file(boltz_ccd, checkIfExists: true)) + ch_boltz_model = channel.value(file(boltz_model, checkIfExists: true)) + ch_boltz2_aff = channel.value(file(boltz2_aff, checkIfExists: true)) + ch_boltz2_conf = channel.value(file(boltz2_conf, checkIfExists: true)) + ch_boltz2_mols = channel.value(file(boltz2_mols, checkIfExists: true)) + } else { + ARIA2_BOLTZ_CCD( + [ + [:], + boltz_ccd_link + ] + ) + ch_boltz_ccd = ARIA2_BOLTZ_CCD.out.downloaded_file.map { it -> it[1] } + ch_versions = ch_versions.mix(ARIA2_BOLTZ_CCD.out.versions) + + ARIA2_BOLTZ_MODEL( + [ + [:], + boltz_model_link + ] + ) + ch_boltz_model = ARIA2_BOLTZ_MODEL.out.downloaded_file.map { it -> it[1] } + ch_versions = ch_versions.mix(ARIA2_BOLTZ_MODEL.out.versions) + + ARIA2_BOLTZ2_AFF( + [ + [:], + boltz2_aff_link + ] + ) + ch_boltz2_aff = ARIA2_BOLTZ2_AFF.out.downloaded_file.map { it -> it[1] } + ch_versions = 
ch_versions.mix(ARIA2_BOLTZ2_AFF.out.versions) + + ARIA2_BOLTZ2_CONF( + [ + [:], + boltz2_conf_link + ] + ) + ch_boltz2_conf = ARIA2_BOLTZ2_CONF.out.downloaded_file.map { it -> it[1] } + ch_versions = ch_versions.mix(ARIA2_BOLTZ2_CONF.out.versions) + + ARIA2_UNCOMPRESS( + boltz2_mols_link + ) + ch_boltz2_mols = ARIA2_UNCOMPRESS.out.db + ch_versions = ch_versions.mix(ARIA2_UNCOMPRESS.out.versions) + } + + emit: + boltz_ccd = ch_boltz_ccd + boltz_model = ch_boltz_model + boltz2_aff = ch_boltz2_aff + boltz2_conf = ch_boltz2_conf + boltz2_mols = ch_boltz2_mols + versions = ch_versions +} diff --git a/subworkflows/local/prepare_colabfold_dbs.nf b/subworkflows/local/prepare_colabfold_dbs.nf index bab0b74c7..78e5d750b 100644 --- a/subworkflows/local/prepare_colabfold_dbs.nf +++ b/subworkflows/local/prepare_colabfold_dbs.nf @@ -1,85 +1,109 @@ // // Download all the required databases and params by Colabfold // -include { MMSEQS_CREATEINDEX as MMSEQS_CREATEINDEX_COLABFOLDDB } from '../../modules/nf-core/mmseqs/createindex/main' -include { MMSEQS_CREATEINDEX as MMSEQS_CREATEINDEX_UNIPROT30 } from '../../modules/nf-core/mmseqs/createindex/main' +include { MMSEQS_CREATEINDEX as MMSEQS_CREATEINDEX_COLABFOLDDB } from '../../modules/nf-core/mmseqs/createindex/main' +include { MMSEQS_CREATEINDEX as MMSEQS_CREATEINDEX_UNIPROT30 } from '../../modules/nf-core/mmseqs/createindex/main' -include { ARIA2_UNCOMPRESS as ARIA2_COLABFOLD_PARAMS } from './aria2_uncompress' -include { ARIA2_UNCOMPRESS as ARIA2_COLABFOLD_DB } from './aria2_uncompress' -include { ARIA2_UNCOMPRESS as ARIA2_UNIREF30 } from './aria2_uncompress' -include { MMSEQS_TSV2EXPROFILEDB as MMSEQS_TSV2EXPROFILEDB_COLABFOLDDB } from '../../modules/nf-core/mmseqs/tsv2exprofiledb/main' -include { MMSEQS_TSV2EXPROFILEDB as MMSEQS_TSV2EXPROFILEDB_UNIPROT30 } from '../../modules/nf-core/mmseqs/tsv2exprofiledb/main' +include { ARIA2_UNCOMPRESS as ARIA2_COLABFOLD_PARAMS } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as 
ARIA2_COLABFOLD_DB } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_UNIREF30 } from './aria2_uncompress' workflow PREPARE_COLABFOLD_DBS { take: colabfold_db // directory: path/to/colabfold/DBs and params - colabfold_server // string: Specifies the server to use for colabfold + use_msa_server // bool: Specifies whether to use web msa server colabfold_alphafold2_params_path // directory: /path/to/colabfold/alphafold2/params/ - colabfold_db_path // directory: /path/to/colabfold/db/ - uniref30_colabfold_path // directory: /path/to/uniref30/colabfold/ + colabfold_envdb_path // directory: /path/to/colabfold/db/ + colabfold_uniref30_path // directory: /path/to/uniref30/colabfold/ colabfold_alphafold2_params_link // string: Specifies the link to download colabfold alphafold2 params colabfold_db_link // string: Specifies the link to download colabfold db - uniref30_colabfold_link // string: Specifies the link to download uniref30 - create_colabfold_index // boolean: Create index for colabfold db + colabfold_uniref30_link // string: Specifies the link to download uniref30 + colabfold_create_index // boolean: Create index for colabfold db main: - ch_params = Channel.empty() - ch_colabfold_db = Channel.empty() - ch_uniref30 = Channel.empty() - ch_versions = Channel.empty() + ch_params = channel.empty() + ch_colabfold_db = channel.empty() + ch_uniref30 = channel.empty() + ch_versions = channel.empty() if (colabfold_db) { - ch_params = Channel.value(file( colabfold_alphafold2_params_path, type: 'any' )) - if (colabfold_server == 'local') { - ch_colabfold_db = Channel.value(file( colabfold_db_path, type: 'any' )) - ch_uniref30 = Channel.value(file( uniref30_colabfold_path , type: 'any' )) + ch_params = channel.value(file(colabfold_alphafold2_params_path, type: 'any', checkIfExists: true)) + if (!use_msa_server) { + ch_colabfold_db = channel.value(file(colabfold_envdb_path, type: 'any', checkIfExists: true)) + ch_uniref30 = 
channel.value(file(colabfold_uniref30_path, type: 'any', checkIfExists: true)) } } else { ARIA2_COLABFOLD_PARAMS ( colabfold_alphafold2_params_link ) - ch_params = ARIA2_COLABFOLD_PARAMS.out.db + ch_params = ARIA2_COLABFOLD_PARAMS + .out + .db + .map { dir -> dir.listFiles().findAll { it -> it.isFile() } } + ch_versions = ch_versions.mix(ARIA2_COLABFOLD_PARAMS.out.versions) - if (params.colabfold_server == 'local') { + if (!use_msa_server) { ARIA2_COLABFOLD_DB ( colabfold_db_link ) ch_versions = ch_versions.mix(ARIA2_COLABFOLD_DB.out.versions) - MMSEQS_TSV2EXPROFILEDB_COLABFOLDDB ( - ARIA2_COLABFOLD_DB.out.db - ) - ch_colabfold_db = MMSEQS_TSV2EXPROFILEDB_COLABFOLDDB.out.db_exprofile - ch_versions = ch_versions.mix(MMSEQS_TSV2EXPROFILEDB_COLABFOLDDB.out.versions) + ch_colabfold_db = ARIA2_COLABFOLD_DB.out.db - if (params.create_colabfold_index) { + if (colabfold_create_index) { MMSEQS_CREATEINDEX_COLABFOLDDB ( - MMSEQS_TSV2EXPROFILEDB_COLABFOLDDB.out.db_exprofile + ch_colabfold_db + .map { path_str -> + def db_file = file(path_str) + [ [id: 'colabfolddb'], db_file ] + } ) - ch_colabfold_db = MMSEQS_CREATEINDEX_COLABFOLDDB.out.db_indexed + ch_colabfold_db = MMSEQS_CREATEINDEX_COLABFOLDDB + .out + .db_indexed + .map { _meta, dir -> + file("${dir}/*") + } ch_versions = ch_versions.mix(MMSEQS_CREATEINDEX_COLABFOLDDB.out.versions) + + } else { + ch_colabfold_db = ch_colabfold_db + .map { dir_path -> + file("${dir_path}/*") + } } ARIA2_UNIREF30( - uniref30_colabfold_link + colabfold_uniref30_link ) ch_versions = ch_versions.mix(ARIA2_UNIREF30.out.versions) - MMSEQS_TSV2EXPROFILEDB_UNIPROT30 ( - ARIA2_UNIREF30.out.db - ) - ch_uniref30 = MMSEQS_TSV2EXPROFILEDB_UNIPROT30.out.db_exprofile - ch_versions = ch_versions.mix(MMSEQS_TSV2EXPROFILEDB_UNIPROT30.out.versions) + ch_uniref30 = ARIA2_UNIREF30.out.db - if (create_colabfold_index) { + if (colabfold_create_index) { MMSEQS_CREATEINDEX_UNIPROT30 ( - MMSEQS_TSV2EXPROFILEDB_UNIPROT30.out.db_exprofile + ch_uniref30 + .map { 
path_str -> + def db_file = file(path_str) + [ [id: 'uniprot30'], db_file ] + } ) - ch_uniref30 = MMSEQS_CREATEINDEX_UNIPROT30.out.db_indexed + ch_uniref30 = MMSEQS_CREATEINDEX_UNIPROT30 + .out + .db_indexed + .map { _meta, dir -> + file("${dir}/*") + } ch_versions = ch_versions.mix(MMSEQS_CREATEINDEX_UNIPROT30.out.versions) + + } else { + ch_uniref30 = ch_uniref30 + .map { dir_path -> + file("${dir_path}/*") + } } } } diff --git a/subworkflows/local/prepare_esmfold_dbs.nf b/subworkflows/local/prepare_esmfold_dbs.nf index decd28757..858ccb0bd 100644 --- a/subworkflows/local/prepare_esmfold_dbs.nf +++ b/subworkflows/local/prepare_esmfold_dbs.nf @@ -16,10 +16,10 @@ workflow PREPARE_ESMFOLD_DBS { esm2_t36_3B_UR50D_contact_regression // string: Specifies the link to download esm2 t36 3B UR50D contact regression main: - ch_versions = Channel.empty() + ch_versions = channel.empty() if (esmfold_db) { - ch_params = Channel.value(file( esmfold_params_path, type: 'file' )) + ch_params = channel.value(file(esmfold_params_path, checkIfExists: true )) } else { ARIA2_ESMFOLD_3B_V1 ( @@ -43,16 +43,16 @@ workflow PREPARE_ESMFOLD_DBS { ch_params = ARIA2_ESMFOLD_3B_V1 .out .downloaded_file - .map{ it[1] } + .map { it -> it[1] } .mix( ARIA2_ESM2_T36_3B_UR50D .out .downloaded_file - .map{ it[1] }, + .map { it -> it[1] }, ARIA2_ESM2_T36_3B_UR50D_CONTACT_REGRESSION .out .downloaded_file - .map{ it[1] }) + .map { it -> it[1] }) .collect() ch_versions = ch_versions.mix(ARIA2_ESMFOLD_3B_V1.out.versions) diff --git a/subworkflows/local/prepare_helixfold3_dbs.nf b/subworkflows/local/prepare_helixfold3_dbs.nf new file mode 100644 index 000000000..a40232679 --- /dev/null +++ b/subworkflows/local/prepare_helixfold3_dbs.nf @@ -0,0 +1,161 @@ +// +// Download all the required AlphaFold 2 databases and parameters +// + +include { ARIA2_UNCOMPRESS as ARIA2_UNICLUST30 } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_CCD_PREPROCESSED } from './aria2_uncompress' +include { 
ARIA2_UNCOMPRESS as ARIA2_RFAM } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_BFD } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_SMALL_BFD } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_UNIPROT_SPROT } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_UNIPROT_TREMBL } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_OBSOLETE } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_UNIREF90 } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_MGNIFY } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_INIT_MODELS } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_MAXIT } from './aria2_uncompress' + +include { ARIA2 as ARIA2_PDB_SEQRES } from '../../modules/nf-core/aria2/main' +include { COMBINE_UNIPROT } from '../../modules/local/combine_uniprot' +include { DOWNLOAD_PDBMMCIF } from '../../modules/local/download_pdbmmcif' + +workflow PREPARE_HELIXFOLD3_DBS { + + take: + helixfold3_db + helixfold3_uniclust30_link + helixfold3_ccd_preprocessed_link + helixfold3_rfam_link + helixfold3_init_models_link + helixfold3_bfd_link + helixfold3_small_bfd_link + helixfold3_uniprot_sprot_link + helixfold3_uniprot_trembl_link + helixfold3_pdb_seqres_link + helixfold3_uniref90_link + helixfold3_mgnify_link + helixfold3_pdb_mmcif_link + helixfold3_obsolete_link + helixfold3_maxit_src_link + helixfold3_uniclust30_path + helixfold3_ccd_preprocessed_path + helixfold3_rfam_path + helixfold3_init_models_path + helixfold3_bfd_path + helixfold3_small_bfd_path + helixfold3_uniprot_path + helixfold3_pdb_seqres_path + helixfold3_uniref90_path + helixfold3_mgnify_path + helixfold3_pdb_mmcif_path + helixfold3_obsolete_path + helixfold3_maxit_src_path + + main: + ch_helixfold3_maxit_src = channel.value(file(helixfold3_maxit_src_path, checkIfExists: true)) + ch_versions = channel.empty() + + if (helixfold3_db) { + ch_helixfold3_uniclust30 = 
channel.value(file(helixfold3_uniclust30_path, checkIfExists: true)) + ch_helixfold3_ccd_preprocessed = channel.value(file(helixfold3_ccd_preprocessed_path, checkIfExists: true)) + ch_helixfold3_rfam = channel.value(file(helixfold3_rfam_path, checkIfExists: true)) + ch_helixfold3_bfd = channel.value(file(helixfold3_bfd_path, checkIfExists: true)) + ch_helixfold3_small_bfd = channel.value(file(helixfold3_small_bfd_path, checkIfExists: true)) + ch_helixfold3_uniprot = channel.value(file(helixfold3_uniprot_path, checkIfExists: true)) + ch_helixfold3_pdb_seqres = channel.value(file(helixfold3_pdb_seqres_path, checkIfExists: true)) + ch_helixfold3_uniref90 = channel.value(file(helixfold3_uniref90_path, checkIfExists: true)) + ch_helixfold3_mgnify = channel.value(file(helixfold3_mgnify_path, checkIfExists: true)) + ch_helixfold3_mmcif_files = channel.value(file(helixfold3_pdb_mmcif_path, checkIfExists: true)) + ch_helixfold3_obsolete = channel.value(file(helixfold3_obsolete_path, checkIfExists: true)) + ch_helixfold3_init_models = channel.value(file(helixfold3_init_models_path, checkIfExists: true)) + } + else { + ARIA2_UNICLUST30(helixfold3_uniclust30_link) + ch_helixfold3_uniclust30 = ARIA2_UNICLUST30.out.db + ch_versions = ch_versions.mix(ARIA2_UNICLUST30.out.versions) + + ARIA2_CCD_PREPROCESSED(helixfold3_ccd_preprocessed_link) + ch_helixfold3_ccd_preprocessed = ARIA2_CCD_PREPROCESSED.out.db + ch_versions = ch_versions.mix(ARIA2_CCD_PREPROCESSED.out.versions) + + ARIA2_RFAM(helixfold3_rfam_link) + ch_helixfold3_rfam = ARIA2_RFAM.out.db + ch_versions = ch_versions.mix(ARIA2_RFAM.out.versions) + + ARIA2_BFD(helixfold3_bfd_link) + ch_helixfold3_bfd = ARIA2_BFD.out.db + ch_versions = ch_versions.mix(ARIA2_BFD.out.versions) + + ARIA2_SMALL_BFD(helixfold3_small_bfd_link) + ch_helixfold3_small_bfd = ARIA2_SMALL_BFD.out.db + ch_versions = ch_versions.mix(ARIA2_SMALL_BFD.out.versions) + + ARIA2_UNIREF90(helixfold3_uniref90_link) + ch_helixfold3_uniref90 = 
ARIA2_UNIREF90.out.db + ch_versions = ch_versions.mix(ARIA2_UNIREF90.out.versions) + + ARIA2_MGNIFY(helixfold3_mgnify_link) + ch_helixfold3_mgnify = ARIA2_MGNIFY.out.db + ch_versions = ch_versions.mix(ARIA2_MGNIFY.out.versions) + + DOWNLOAD_PDBMMCIF( + helixfold3_pdb_mmcif_link + ) + ch_helixfold3_mmcif_files = DOWNLOAD_PDBMMCIF.out.ch_db + ch_versions = ch_versions.mix(DOWNLOAD_PDBMMCIF.out.versions) + + ARIA2_OBSOLETE( + helixfold3_obsolete_link + ) + ch_helixfold3_obsolete = ARIA2_OBSOLETE.out.db + ch_versions = ch_versions.mix(ARIA2_OBSOLETE.out.versions) + + ARIA2_INIT_MODELS(helixfold3_init_models_link) + ch_helixfold3_init_models = ARIA2_INIT_MODELS.out.db + ch_versions = ch_versions.mix(ARIA2_INIT_MODELS.out.versions) + + ARIA2_PDB_SEQRES ( + [ + [:], + helixfold3_pdb_seqres_link + ] + ) + ch_helixfold3_pdb_seqres = ARIA2_PDB_SEQRES.out.downloaded_file.map { it -> it[1] } + ch_versions = ch_versions.mix(ARIA2_PDB_SEQRES.out.versions) + + ARIA2_UNIPROT_SPROT( + helixfold3_uniprot_sprot_link + ) + ch_versions = ch_versions.mix(ARIA2_UNIPROT_SPROT.out.versions) + ARIA2_UNIPROT_TREMBL( + helixfold3_uniprot_trembl_link + ) + ch_versions = ch_versions.mix(ARIA2_UNIPROT_TREMBL.out.versions) + COMBINE_UNIPROT ( + ARIA2_UNIPROT_SPROT.out.db, + ARIA2_UNIPROT_TREMBL.out.db + ) + ch_helixfold3_uniprot = COMBINE_UNIPROT.out.ch_db + ch_versions = ch_versions.mix(COMBINE_UNIPROT.out.versions) + + ARIA2_MAXIT(helixfold3_maxit_src_link) + ch_helixfold3_maxit_src = ARIA2_MAXIT.out.db + ch_versions = ch_versions.mix(ARIA2_MAXIT.out.versions) + } + + emit: + helixfold3_uniclust30 = ch_helixfold3_uniclust30 + helixfold3_ccd_preprocessed = ch_helixfold3_ccd_preprocessed + helixfold3_rfam = ch_helixfold3_rfam + helixfold3_bfd = ch_helixfold3_bfd + helixfold3_small_bfd = ch_helixfold3_small_bfd + helixfold3_uniprot = ch_helixfold3_uniprot + helixfold3_pdb_seqres = ch_helixfold3_pdb_seqres + helixfold3_uniref90 = ch_helixfold3_uniref90 + helixfold3_mgnify = ch_helixfold3_mgnify + 
helixfold3_mmcif_files = ch_helixfold3_mmcif_files + helixfold3_obsolete = ch_helixfold3_obsolete + helixfold3_init_models = ch_helixfold3_init_models + helixfold3_maxit_src = ch_helixfold3_maxit_src + versions = ch_versions +} diff --git a/subworkflows/local/prepare_rosettafold2na_dbs.nf b/subworkflows/local/prepare_rosettafold2na_dbs.nf new file mode 100644 index 000000000..5524ff723 --- /dev/null +++ b/subworkflows/local/prepare_rosettafold2na_dbs.nf @@ -0,0 +1,92 @@ +// +// Prepare RoseTTAFold2NA databases +// + +include { ARIA2_UNCOMPRESS as ARIA2_UNIREF30 } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_BFD } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_PDB100 } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_WEIGHTS } from './aria2_uncompress' + +include { ARIA2 as ARIA2_PDB_SEQRES } from '../../modules/nf-core/aria2/main' +include { DOWNLOAD_RNA_DATABASES } from '../../modules/local/download_rna_rf2na' + +workflow PREPARE_ROSETTAFOLD2NA_DBS { + + take: + rosettafold2na_db + rosettafold2na_bfd_path + rosettafold2na_uniref30_path + rosettafold2na_pdb100_path + rosettafold2na_rna_path + rosettafold2na_weights_path + rosettafold2na_bfd_link + rosettafold2na_uniref30_link + rosettafold2na_pdb100_link + rosettafold2na_weights_link + rfam_full_region_link + rfam_cm_link + rnacentral_rfam_annotations_link + rnacentral_id_mapping_link + rnacentral_sequences_link + + main: + ch_versions = channel.empty() + + if (rosettafold2na_db) { + ch_bfd = channel.value(file(rosettafold2na_bfd_path, checkIfExists: true)) + ch_uniref30 = channel.value(file(rosettafold2na_uniref30_path, checkIfExists: true)) + ch_pdb100 = channel.value(file(rosettafold2na_pdb100_path, checkIfExists: true)) + ch_weights = channel.value(file(rosettafold2na_weights_path, checkIfExists: true)) + ch_rna = channel.value(file(rosettafold2na_rna_path, checkIfExists: true)) + } else { + ARIA2_BFD(rosettafold2na_bfd_link) + ch_bfd = ARIA2_BFD + .out + .db + 
.map { dir -> dir.listFiles().findAll { it -> it.isFile() } } + ch_versions = ch_versions.mix(ARIA2_BFD.out.versions) + + ARIA2_UNIREF30(rosettafold2na_uniref30_link) + ch_uniref30 = ARIA2_UNIREF30 + .out + .db + .map { dir -> dir.listFiles().findAll { it -> it.isFile() } } + ch_versions = ch_versions.mix(ARIA2_UNIREF30.out.versions) + + ARIA2_PDB100(rosettafold2na_pdb100_link) + ch_pdb100 = ARIA2_PDB100 + .out + .db + .map { dir -> dir.listFiles().findAll { it -> it.isFile() } } + ch_versions = ch_versions.mix(ARIA2_PDB100.out.versions) + + DOWNLOAD_RNA_DATABASES( + rfam_full_region_link, + rfam_cm_link, + rnacentral_rfam_annotations_link, + rnacentral_id_mapping_link, + rnacentral_sequences_link + ) + ch_rna = DOWNLOAD_RNA_DATABASES + .out + .ch_db + .map { dir -> dir.listFiles().findAll { it -> it.isFile() } } + ch_versions = ch_versions.mix(DOWNLOAD_RNA_DATABASES.out.versions) + + ARIA2_WEIGHTS(rosettafold2na_weights_link) + ch_weights = ARIA2_WEIGHTS + .out + .db + .map { dir -> dir.listFiles().findAll { it -> it.isFile() } } + ch_versions = ch_versions.mix(ARIA2_WEIGHTS.out.versions) + + } + + emit: + bfd = ch_bfd + uniref30 = ch_uniref30 + pdb100 = ch_pdb100 + rna = ch_rna + rosettafold2na_weights = ch_weights + versions = ch_versions +} diff --git a/subworkflows/local/prepare_rosettafold_all_atom_dbs.nf b/subworkflows/local/prepare_rosettafold_all_atom_dbs.nf new file mode 100644 index 000000000..97ecd2b75 --- /dev/null +++ b/subworkflows/local/prepare_rosettafold_all_atom_dbs.nf @@ -0,0 +1,75 @@ +// +// Download all the required Rosettafold-All-Atom databases and parameters +// + +include { ARIA2_UNCOMPRESS as ARIA2_UNIREF30 } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_BFD } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_SMALL_BFD } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_PDB100 } from './aria2_uncompress' +include { ARIA2_UNCOMPRESS as ARIA2_WEIGHTS } from './aria2_uncompress' + +include { ARIA2 
as ARIA2_PDB_SEQRES } from '../../modules/nf-core/aria2/main' + +workflow PREPARE_ROSETTAFOLD_ALL_ATOM_DBS { + + take: + rosettafold_all_atom_db + rosettafold_all_atom_bfd_path // directory: /path/to/bfd/ + rosettafold_all_atom_uniref30_path // directory: /path/to/uniref30/rosettafold_all_atom/ + rosettafold_all_atom_pdb100_path + rosettafold_all_atom_paper_weights_path + rosettafold_all_atom_bfd_link + rosettafold_all_atom_uniref30_link + rosettafold_all_atom_pdb100_link + rosettafold_all_atom_paper_weights_link + + main: + ch_versions = channel.empty() + + if (rosettafold_all_atom_db) { + ch_bfd = channel.value(file(rosettafold_all_atom_bfd_path, checkIfExists: true)) + ch_uniref30 = channel.value(file(rosettafold_all_atom_uniref30_path, checkIfExists: true)) + ch_pdb100 = channel.value(file(rosettafold_all_atom_pdb100_path, checkIfExists: true)) + ch_rfaa_paper_weights = channel.value(file(rosettafold_all_atom_paper_weights_path, checkIfExists: true)) + } + else { + ARIA2_BFD(rosettafold_all_atom_bfd_link) + ch_bfd = ARIA2_BFD + .out + .db + .map { + dir -> dir.listFiles().findAll { it -> it.isFile() } + } + + ch_versions = ch_versions.mix(ARIA2_BFD.out.versions) + + ARIA2_UNIREF30(rosettafold_all_atom_uniref30_link) + ch_uniref30 = ARIA2_UNIREF30 + .out + .db + .map { + dir -> dir.listFiles().findAll { it -> it.isFile() } + } + ch_versions = ch_versions.mix(ARIA2_UNIREF30.out.versions) + + ARIA2_PDB100(rosettafold_all_atom_pdb100_link) + ch_pdb100 = ARIA2_PDB100 + .out + .db + .map { + dir -> dir.listFiles().findAll { it -> it.isFile() } + } + ch_versions = ch_versions.mix(ARIA2_PDB100.out.versions) + + ARIA2_WEIGHTS(rosettafold_all_atom_paper_weights_link) + ch_rfaa_paper_weights = ARIA2_WEIGHTS.out.db + ch_versions = ch_versions.mix(ARIA2_WEIGHTS.out.versions) + } + + emit: + bfd = ch_bfd + uniref30 = ch_uniref30 + pdb100 = ch_pdb100 + rfaa_paper_weights = ch_rfaa_paper_weights + versions = ch_versions +} diff --git 
a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf index 742d460ae..acac78d49 100644 --- a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf @@ -8,34 +8,40 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { UTILS_NFVALIDATION_PLUGIN } from '../../nf-core/utils_nfvalidation_plugin' -include { paramsSummaryMap } from 'plugin/nf-validation' -include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' +include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' +include { paramsSummaryMap } from 'plugin/nf-schema' +include { samplesheetToList } from 'plugin/nf-schema' +include { paramsHelp } from 'plugin/nf-schema' include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' -include { dashedLine } from '../../nf-core/utils_nfcore_pipeline' -include { nfCoreLogo } from '../../nf-core/utils_nfcore_pipeline' include { imNotification } from '../../nf-core/utils_nfcore_pipeline' include { UTILS_NFCORE_PIPELINE } from '../../nf-core/utils_nfcore_pipeline' -include { workflowCitation } from '../../nf-core/utils_nfcore_pipeline' +include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' +include { logColours } from '../../nf-core/utils_nfcore_pipeline' /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUBWORKFLOW TO INITIALISE PIPELINE -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ workflow PIPELINE_INITIALISATION { take: version // boolean: Display version and exit 
- help // boolean: Display help text validate_params // boolean: Boolean whether to validate parameters against the schema at runtime monochrome_logs // boolean: Do not use coloured log outputs nextflow_cli_args // array: List of positional nextflow CLI args outdir // string: The output directory where the results will be saved + input // string: Path to input samplesheet + help // boolean: Display help message and exit + help_full // boolean: Show the full help message + show_hidden // boolean: Show hidden parameters in the help message main: + + ch_versions = channel.empty() + // // Print version and exit if required and dump pipeline parameters to JSON file // @@ -49,16 +55,36 @@ workflow PIPELINE_INITIALISATION { // // Validate parameters and generate parameter summary to stdout // - pre_help_text = nfCoreLogo(monochrome_logs) - post_help_text = '\n' + workflowCitation() + '\n' + dashedLine(monochrome_logs) - def String workflow_command = "nextflow run ${workflow.manifest.name} -profile --input samplesheet.csv --outdir " - UTILS_NFVALIDATION_PLUGIN ( - help, - workflow_command, - pre_help_text, - post_help_text, + def colors = logColours(monochrome_logs) + before_text = """ +-${colors.dim}----------------------------------------------------${colors.reset}- + ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} +${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} +${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} +${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} + ${colors.green}`._,._,\'${colors.reset} +${colors.purple} nf-core/proteinfold ${workflow.manifest.version}${colors.reset} +-${colors.dim}----------------------------------------------------${colors.reset}- +""" + after_text = """${workflow.manifest.doi ? 
"\n* The pipeline\n" : ""}${workflow.manifest.doi.tokenize(",").collect { doi -> " https://doi.org/${doi.trim().replace('https://doi.org/','')}"}.join("\n")}${workflow.manifest.doi ? "\n" : ""} +* The nf-core framework + https://doi.org/10.1038/s41587-020-0439-x + +* Software dependencies + https://github.com/nf-core/proteinfold/blob/master/CITATIONS.md +""" + command = "nextflow run ${workflow.manifest.name} -profile --input samplesheet.csv --outdir " + + UTILS_NFSCHEMA_PLUGIN ( + workflow, validate_params, - "nextflow_schema.json" + null, + help, + help_full, + show_hidden, + before_text, + after_text, + command ) // @@ -67,12 +93,47 @@ workflow PIPELINE_INITIALISATION { UTILS_NFCORE_PIPELINE ( nextflow_cli_args ) + + // + // Create channel from input file provided through input + // + ch_samplesheet = channel.fromList(samplesheetToList(input, "assets/schema_input.json")) + + ch_samplesheet + .map { meta, fasta -> + // This mapping supports legacy samplesheets that use 'sequence' as metadata. + // If meta.id is missing or empty, meta.sequence is used as the identifier. + def identifier = meta.id ? 
meta.id : meta.sequence + return [[id: identifier], fasta] + } + + if (params.split_fasta) { + ch_samplesheet.map { _meta, fasta -> + validateFasta(fasta) + } + + // Split the fasta file into individual files for each sequence + ch_samplesheet + .map { _meta,fasta -> fasta } + .splitFasta( record: [header: true, sequence: true] ) + .collectFile { item -> + [ "${cleanHeader(item["header"])}.fa", ">" + cleanHeader(item["header"]) + '\n' +item["sequence"] ] + } + .map { + file -> [[id: file.baseName], file] + } + .set { ch_samplesheet } + } + + emit: + samplesheet = ch_samplesheet + versions = ch_versions } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUBWORKFLOW FOR PIPELINE COMPLETION -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ workflow PIPELINE_COMPLETION { @@ -87,19 +148,26 @@ workflow PIPELINE_COMPLETION { multiqc_report // string: Path to MultiQC report main: - summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") + def multiqc_reports = multiqc_report.toList() // // Completion email and summary // workflow.onComplete { if (email || email_on_fail) { - completionEmail(summary_params, email, email_on_fail, plaintext_email, outdir, monochrome_logs, multiqc_report.toList()) + completionEmail( + summary_params, + email, + email_on_fail, + plaintext_email, + outdir, + monochrome_logs, + multiqc_reports.getVal(), + ) } completionSummary(monochrome_logs) - if (hook_url) { imNotification(summary_params, hook_url) } @@ -111,11 +179,20 @@ workflow PIPELINE_COMPLETION { } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTIONS 
-======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +// +// Check and validate pipeline parameters +// +def validateInputParameters() { + if (params.mode.toLowerCase().split(",").contains("alphafold3")) { + alphafold3Warn(log) + } +} + // // Get link to Colabfold Alphafold2 parameters // @@ -143,6 +220,14 @@ def getColabfoldAlphafold2ParamsPath() { return path } +def modeChannel(ch, mode) { + return ch.map { meta, value -> + def meta_clone = meta.clone() + meta_clone.model = mode + [ meta_clone, value ] + } +} + // // Generate methods description for MultiQC // @@ -152,7 +237,6 @@ def toolCitationText() { // Uncomment function in methodsDescriptionText to render in MultiQC report def citation_text = [ "Tools used in the workflow included:", - "FastQC (Andrews 2010),", "MultiQC (Ewels et al. 2016)", "." ].join(' ').trim() @@ -165,7 +249,6 @@ def toolBibliographyText() { // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
  • Author (2023) Pub name, Journal, DOI
  • " : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def reference_text = [ - "
  • Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
  • ", "
  • Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354
  • " ].join(' ').trim() @@ -173,7 +256,7 @@ def toolBibliographyText() { } def methodsDescriptionText(mqc_methods_yaml) { - // Convert to a named map so can be used as with familar NXF ${workflow} variable syntax in the MultiQC YML file + // Convert to a named map so can be used as with familiar NXF ${workflow} variable syntax in the MultiQC YML file def meta = [:] meta.workflow = workflow.toMap() meta["manifest_map"] = workflow.manifest.toMap() @@ -184,8 +267,10 @@ def methodsDescriptionText(mqc_methods_yaml) { // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers // Removing ` ` since the manifest.doi is a string and not a proper list def temp_doi_ref = "" - String[] manifest_doi = meta.manifest_map.doi.tokenize(",") - for (String doi_ref: manifest_doi) temp_doi_ref += "(doi: ${doi_ref.replace("https://doi.org/", "").replace(" ", "")}), " + def manifest_doi = meta.manifest_map.doi.tokenize(",") + manifest_doi.each { doi_ref -> + temp_doi_ref += "(doi: ${doi_ref.replace("https://doi.org/", "").replace(" ", "")}), " + } meta["doi_text"] = temp_doi_ref.substring(0, temp_doi_ref.length() - 2) } else meta["doi_text"] = "" meta["nodoi_text"] = meta.manifest_map.doi ? "" : "
  • If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.
  • " @@ -204,3 +289,38 @@ def methodsDescriptionText(mqc_methods_yaml) { return description_html.toString() } + +def cleanHeader(header) { + return header + .replaceAll(" ", "_") + .replaceAll("/","_") + .replaceAll(",", "") + .replaceAll(";","") +} + +def validateFasta(fasta) { + // extract headers + def headers = fasta.findAll { it -> it.startsWith('>') } + // if headers are not unique, throw an error + if (headers.size() != headers.unique().size()) { + throw new Exception("Invalid FASTA file. The headers are not unique.") + } + // check headers that are malformed + headers.each { header -> + if (header =~ /[ \t;,\/]/) { + // warn user that the header contains special characters + log.warn "The header ${header} contains special characters. They have been automatically removed." + } + } +} + +// +// Print a warning when using Alphafold3 +// +def alphafold3Warn(log) { + log.warn "=============================================================================\n" + + " You are using AlphaFold3 mode.\n" + + " Be aware that the predicted structures can not be used for commercial purposes.\n" + + " More information here: \"https://github.com/google-deepmind/alphafold3/blob/main/README.md#alphafold-3-source-code-and-model-parameters.\"\n" + + "===================================================================================" +} diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf index ac31f28f6..d6e593e85 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf @@ -2,18 +2,13 @@ // Subworkflow with functionality that may be useful for any Nextflow pipeline // -import org.yaml.snakeyaml.Yaml -import groovy.json.JsonOutput -import nextflow.extension.FilesEx - /* -======================================================================================== 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUBWORKFLOW DEFINITION -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ workflow UTILS_NEXTFLOW_PIPELINE { - take: print_version // boolean: print version dump_parameters // boolean: dump parameters @@ -26,7 +21,7 @@ workflow UTILS_NEXTFLOW_PIPELINE { // Print workflow version and exit on --version // if (print_version) { - log.info "${workflow.manifest.name} ${getWorkflowVersion()}" + log.info("${workflow.manifest.name} ${getWorkflowVersion()}") System.exit(0) } @@ -49,16 +44,16 @@ workflow UTILS_NEXTFLOW_PIPELINE { } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTIONS -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // // Generate version string // def getWorkflowVersion() { - String version_string = "" + def version_string = "" as String if (workflow.manifest.version) { def prefix_v = workflow.manifest.version[0] != 'v' ? 
'v' : '' version_string += "${prefix_v}${workflow.manifest.version}" @@ -76,13 +71,13 @@ def getWorkflowVersion() { // Dump pipeline parameters to a JSON file // def dumpParametersToJSON(outdir) { - def timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') - def filename = "params_${timestamp}.json" - def temp_pf = new File(workflow.launchDir.toString(), ".${filename}") - def jsonStr = JsonOutput.toJson(params) - temp_pf.text = JsonOutput.prettyPrint(jsonStr) + def timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss') + def filename = "params_${timestamp}.json" + def temp_pf = new File(workflow.launchDir.toString(), ".${filename}") + def jsonStr = groovy.json.JsonOutput.toJson(params) + temp_pf.text = groovy.json.JsonOutput.prettyPrint(jsonStr) - FilesEx.copyTo(temp_pf.toPath(), "${outdir}/pipeline_info/params_${timestamp}.json") + nextflow.extension.FilesEx.copyTo(temp_pf.toPath(), "${outdir}/pipeline_info/params_${timestamp}.json") temp_pf.delete() } @@ -90,37 +85,42 @@ def dumpParametersToJSON(outdir) { // When running with -profile conda, warn if channels have not been set-up appropriately // def checkCondaChannels() { - Yaml parser = new Yaml() + def parser = new org.yaml.snakeyaml.Yaml() def channels = [] try { def config = parser.load("conda config --show channels".execute().text) channels = config.channels - } catch(NullPointerException | IOException e) { - log.warn "Could not verify conda channel configuration." - return + } + catch (NullPointerException e) { + log.debug(e) + log.warn("Could not verify conda channel configuration.") + return null + } + catch (IOException e) { + log.debug(e) + log.warn("Could not verify conda channel configuration.") + return null } // Check that all channels are present // This channel list is ordered by required channel priority. 
- def required_channels_in_order = ['conda-forge', 'bioconda', 'defaults'] + def required_channels_in_order = ['conda-forge', 'bioconda'] def channels_missing = ((required_channels_in_order as Set) - (channels as Set)) as Boolean // Check that they are in the right order - def channel_priority_violation = false - def n = required_channels_in_order.size() - for (int i = 0; i < n - 1; i++) { - channel_priority_violation |= !(channels.indexOf(required_channels_in_order[i]) < channels.indexOf(required_channels_in_order[i+1])) - } + def channel_priority_violation = required_channels_in_order != channels.findAll { ch -> ch in required_channels_in_order } if (channels_missing | channel_priority_violation) { - log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + - " There is a problem with your Conda configuration!\n\n" + - " You will need to set-up the conda-forge and bioconda channels correctly.\n" + - " Please refer to https://bioconda.github.io/\n" + - " The observed channel order is \n" + - " ${channels}\n" + - " but the following channel order is required:\n" + - " ${required_channels_in_order}\n" + - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + log.warn """\ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + There is a problem with your Conda configuration! + You will need to set-up the conda-forge and bioconda channels correctly. 
+ Please refer to https://bioconda.github.io/ + The observed channel order is + ${channels} + but the following channel order is required: + ${required_channels_in_order} + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + """.stripIndent(true) } } diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.function.nf.test.snap b/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.function.nf.test.snap index e3f0baf47..846287c41 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.function.nf.test.snap +++ b/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.function.nf.test.snap @@ -17,4 +17,4 @@ }, "timestamp": "2024-02-28T12:02:12.425833" } -} \ No newline at end of file +} diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test b/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test index ca964ce8e..02dbf094c 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test +++ b/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test @@ -52,10 +52,12 @@ nextflow_workflow { } then { - assertAll( - { assert workflow.success }, - { assert workflow.stdout.contains("nextflow_workflow v9.9.9") } - ) + expect { + with(workflow) { + assert success + assert "nextflow_workflow v9.9.9" in stdout + } + } } } diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/nextflow.config b/subworkflows/nf-core/utils_nextflow_pipeline/tests/nextflow.config index d0a926bf6..a09572e5b 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/nextflow.config +++ b/subworkflows/nf-core/utils_nextflow_pipeline/tests/nextflow.config @@ -3,7 +3,7 @@ manifest { author = """nf-core""" homePage = 'https://127.0.0.1' description = """Dummy pipeline""" - nextflowVersion = '!>=23.04.0' + nextflowVersion = '!>=23.04.0' version = '9.9.9' doi = 'https://doi.org/10.5281/zenodo.5070524' } diff --git 
a/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml b/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml deleted file mode 100644 index f84761125..000000000 --- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nextflow_pipeline: - - subworkflows/nf-core/utils_nextflow_pipeline/** diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf index 14558c392..2f30e9a46 100644 --- a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf @@ -2,17 +2,13 @@ // Subworkflow with utility functions specific to the nf-core pipeline template // -import org.yaml.snakeyaml.Yaml -import nextflow.extension.FilesEx - /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUBWORKFLOW DEFINITION -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ workflow UTILS_NFCORE_PIPELINE { - take: nextflow_cli_args @@ -25,23 +21,20 @@ workflow UTILS_NFCORE_PIPELINE { } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTIONS -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // // Warn if a -profile or Nextflow config has not been provided to run the pipeline // def checkConfigProvided() { - valid_config = true + def valid_config = true as Boolean if (workflow.profile == 'standard' && workflow.configFiles.size() <= 1) { - log.warn "[$workflow.manifest.name] You are attempting to run the 
pipeline without any custom configuration!\n\n" + - "This will be dependent on your local compute environment but can be achieved via one or more of the following:\n" + - " (1) Using an existing pipeline profile e.g. `-profile docker` or `-profile singularity`\n" + - " (2) Using an existing nf-core/configs for your Institution e.g. `-profile crick` or `-profile uppmax`\n" + - " (3) Using your own local custom config e.g. `-c /path/to/your/custom.config`\n\n" + - "Please refer to the quick start section and usage docs for the pipeline.\n " + log.warn( + "[${workflow.manifest.name}] You are attempting to run the pipeline without any custom configuration!\n\n" + "This will be dependent on your local compute environment but can be achieved via one or more of the following:\n" + " (1) Using an existing pipeline profile e.g. `-profile docker` or `-profile singularity`\n" + " (2) Using an existing nf-core/configs for your Institution e.g. `-profile crick` or `-profile uppmax`\n" + " (3) Using your own local custom config e.g. `-c /path/to/your/custom.config`\n\n" + "Please refer to the quick start section and usage docs for the pipeline.\n " + ) valid_config = false } return valid_config @@ -52,39 +45,22 @@ def checkConfigProvided() { // def checkProfileProvided(nextflow_cli_args) { if (workflow.profile.endsWith(',')) { - error "The `-profile` option cannot end with a trailing comma, please remove it and re-run the pipeline!\n" + - "HINT: A common mistake is to provide multiple values separated by spaces e.g. `-profile test, docker`.\n" + error( + "The `-profile` option cannot end with a trailing comma, please remove it and re-run the pipeline!\n" + "HINT: A common mistake is to provide multiple values separated by spaces e.g. `-profile test, docker`.\n" + ) } if (nextflow_cli_args[0]) { - log.warn "nf-core pipelines do not accept positional arguments. 
The positional argument `${nextflow_cli_args[0]}` has been detected.\n" + - "HINT: A common mistake is to provide multiple values separated by spaces e.g. `-profile test, docker`.\n" + log.warn( + "nf-core pipelines do not accept positional arguments. The positional argument `${nextflow_cli_args[0]}` has been detected.\n" + "HINT: A common mistake is to provide multiple values separated by spaces e.g. `-profile test, docker`.\n" + ) } } -// -// Citation string for pipeline -// -def workflowCitation() { - def temp_doi_ref = "" - String[] manifest_doi = workflow.manifest.doi.tokenize(",") - // Using a loop to handle multiple DOIs - // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers - // Removing ` ` since the manifest.doi is a string and not a proper list - for (String doi_ref: manifest_doi) temp_doi_ref += " https://doi.org/${doi_ref.replace('https://doi.org/', '').replace(' ', '')}\n" - return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + - "* The pipeline\n" + - temp_doi_ref + "\n" + - "* The nf-core framework\n" + - " https://doi.org/10.1038/s41587-020-0439-x\n\n" + - "* Software dependencies\n" + - " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" -} - // // Generate workflow version string // def getWorkflowVersion() { - String version_string = "" + def version_string = "" as String if (workflow.manifest.version) { def prefix_v = workflow.manifest.version[0] != 'v' ? 
'v' : '' version_string += "${prefix_v}${workflow.manifest.version}" @@ -102,8 +78,8 @@ def getWorkflowVersion() { // Get software versions for pipeline // def processVersionsFromYAML(yaml_file) { - Yaml yaml = new Yaml() - versions = yaml.load(yaml_file).collectEntries { k, v -> [ k.tokenize(':')[-1], v ] } + def yaml = new org.yaml.snakeyaml.Yaml() + def versions = yaml.load(yaml_file).collectEntries { k, v -> [k.tokenize(':')[-1], v] } return yaml.dumpAsMap(versions).trim() } @@ -113,8 +89,8 @@ def processVersionsFromYAML(yaml_file) { def workflowVersionToYAML() { return """ Workflow: - $workflow.manifest.name: ${getWorkflowVersion()} - Nextflow: $workflow.nextflow.version + ${workflow.manifest.name}: ${getWorkflowVersion()} + Nextflow: ${workflow.nextflow.version} """.stripIndent().trim() } @@ -122,11 +98,7 @@ def workflowVersionToYAML() { // Get channel of software versions used in pipeline in YAML format // def softwareVersionsToYAML(ch_versions) { - return ch_versions - .unique() - .map { processVersionsFromYAML(it) } - .unique() - .mix(Channel.of(workflowVersionToYAML())) + return ch_versions.unique().map { version -> processVersionsFromYAML(version) }.unique().mix(channel.of(workflowVersionToYAML())) } // @@ -134,61 +106,40 @@ def softwareVersionsToYAML(ch_versions) { // def paramsSummaryMultiqc(summary_params) { def summary_section = '' - for (group in summary_params.keySet()) { - def group_params = summary_params.get(group) // This gets the parameters of that particular group - if (group_params) { - summary_section += "

    $group

    \n" - summary_section += "
    \n" - for (param in group_params.keySet()) { - summary_section += "
    $param
    ${group_params.get(param) ?: 'N/A'}
    \n" + summary_params + .keySet() + .each { group -> + def group_params = summary_params.get(group) + // This gets the parameters of that particular group + if (group_params) { + summary_section += "

    ${group}

    \n" + summary_section += "
    \n" + group_params + .keySet() + .sort() + .each { param -> + summary_section += "
    ${param}
    ${group_params.get(param) ?: 'N/A'}
    \n" + } + summary_section += "
    \n" } - summary_section += "
    \n" } - } - String yaml_file_text = "id: '${workflow.manifest.name.replace('/','-')}-summary'\n" - yaml_file_text += "description: ' - this information is collected when the pipeline is started.'\n" - yaml_file_text += "section_name: '${workflow.manifest.name} Workflow Summary'\n" - yaml_file_text += "section_href: 'https://github.com/${workflow.manifest.name}'\n" - yaml_file_text += "plot_type: 'html'\n" - yaml_file_text += "data: |\n" - yaml_file_text += "${summary_section}" + def yaml_file_text = "id: '${workflow.manifest.name.replace('/', '-')}-summary'\n" as String + yaml_file_text += "description: ' - this information is collected when the pipeline is started.'\n" + yaml_file_text += "section_name: '${workflow.manifest.name} Workflow Summary'\n" + yaml_file_text += "section_href: 'https://github.com/${workflow.manifest.name}'\n" + yaml_file_text += "plot_type: 'html'\n" + yaml_file_text += "data: |\n" + yaml_file_text += "${summary_section}" return yaml_file_text } -// -// nf-core logo -// -def nfCoreLogo(monochrome_logs=true) { - Map colors = logColours(monochrome_logs) - String.format( - """\n - ${dashedLine(monochrome_logs)} - ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} - ${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} - ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} - ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} - ${colors.green}`._,._,\'${colors.reset} - ${colors.purple} ${workflow.manifest.name} ${getWorkflowVersion()}${colors.reset} - ${dashedLine(monochrome_logs)} - """.stripIndent() - ) -} - -// -// Return dashed line -// -def dashedLine(monochrome_logs=true) { - Map colors = logColours(monochrome_logs) - return "-${colors.dim}----------------------------------------------------${colors.reset}-" -} - // // ANSII colours used for terminal logging // def logColours(monochrome_logs=true) { - Map colorcodes = [:] + def 
colorcodes = [:] as Map // Reset / Meta colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" @@ -200,79 +151,76 @@ def logColours(monochrome_logs=true) { colorcodes['hidden'] = monochrome_logs ? '' : "\033[8m" // Regular Colors - colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" - colorcodes['red'] = monochrome_logs ? '' : "\033[0;31m" - colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" - colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" - colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" - colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" - colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" - colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" + colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" + colorcodes['red'] = monochrome_logs ? '' : "\033[0;31m" + colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" + colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" + colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" + colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" + colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" + colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" // Bold - colorcodes['bblack'] = monochrome_logs ? '' : "\033[1;30m" - colorcodes['bred'] = monochrome_logs ? '' : "\033[1;31m" - colorcodes['bgreen'] = monochrome_logs ? '' : "\033[1;32m" - colorcodes['byellow'] = monochrome_logs ? '' : "\033[1;33m" - colorcodes['bblue'] = monochrome_logs ? '' : "\033[1;34m" - colorcodes['bpurple'] = monochrome_logs ? '' : "\033[1;35m" - colorcodes['bcyan'] = monochrome_logs ? '' : "\033[1;36m" - colorcodes['bwhite'] = monochrome_logs ? '' : "\033[1;37m" + colorcodes['bblack'] = monochrome_logs ? '' : "\033[1;30m" + colorcodes['bred'] = monochrome_logs ? '' : "\033[1;31m" + colorcodes['bgreen'] = monochrome_logs ? '' : "\033[1;32m" + colorcodes['byellow'] = monochrome_logs ? '' : "\033[1;33m" + colorcodes['bblue'] = monochrome_logs ? 
'' : "\033[1;34m" + colorcodes['bpurple'] = monochrome_logs ? '' : "\033[1;35m" + colorcodes['bcyan'] = monochrome_logs ? '' : "\033[1;36m" + colorcodes['bwhite'] = monochrome_logs ? '' : "\033[1;37m" // Underline - colorcodes['ublack'] = monochrome_logs ? '' : "\033[4;30m" - colorcodes['ured'] = monochrome_logs ? '' : "\033[4;31m" - colorcodes['ugreen'] = monochrome_logs ? '' : "\033[4;32m" - colorcodes['uyellow'] = monochrome_logs ? '' : "\033[4;33m" - colorcodes['ublue'] = monochrome_logs ? '' : "\033[4;34m" - colorcodes['upurple'] = monochrome_logs ? '' : "\033[4;35m" - colorcodes['ucyan'] = monochrome_logs ? '' : "\033[4;36m" - colorcodes['uwhite'] = monochrome_logs ? '' : "\033[4;37m" + colorcodes['ublack'] = monochrome_logs ? '' : "\033[4;30m" + colorcodes['ured'] = monochrome_logs ? '' : "\033[4;31m" + colorcodes['ugreen'] = monochrome_logs ? '' : "\033[4;32m" + colorcodes['uyellow'] = monochrome_logs ? '' : "\033[4;33m" + colorcodes['ublue'] = monochrome_logs ? '' : "\033[4;34m" + colorcodes['upurple'] = monochrome_logs ? '' : "\033[4;35m" + colorcodes['ucyan'] = monochrome_logs ? '' : "\033[4;36m" + colorcodes['uwhite'] = monochrome_logs ? '' : "\033[4;37m" // High Intensity - colorcodes['iblack'] = monochrome_logs ? '' : "\033[0;90m" - colorcodes['ired'] = monochrome_logs ? '' : "\033[0;91m" - colorcodes['igreen'] = monochrome_logs ? '' : "\033[0;92m" - colorcodes['iyellow'] = monochrome_logs ? '' : "\033[0;93m" - colorcodes['iblue'] = monochrome_logs ? '' : "\033[0;94m" - colorcodes['ipurple'] = monochrome_logs ? '' : "\033[0;95m" - colorcodes['icyan'] = monochrome_logs ? '' : "\033[0;96m" - colorcodes['iwhite'] = monochrome_logs ? '' : "\033[0;97m" + colorcodes['iblack'] = monochrome_logs ? '' : "\033[0;90m" + colorcodes['ired'] = monochrome_logs ? '' : "\033[0;91m" + colorcodes['igreen'] = monochrome_logs ? '' : "\033[0;92m" + colorcodes['iyellow'] = monochrome_logs ? '' : "\033[0;93m" + colorcodes['iblue'] = monochrome_logs ? 
'' : "\033[0;94m" + colorcodes['ipurple'] = monochrome_logs ? '' : "\033[0;95m" + colorcodes['icyan'] = monochrome_logs ? '' : "\033[0;96m" + colorcodes['iwhite'] = monochrome_logs ? '' : "\033[0;97m" // Bold High Intensity - colorcodes['biblack'] = monochrome_logs ? '' : "\033[1;90m" - colorcodes['bired'] = monochrome_logs ? '' : "\033[1;91m" - colorcodes['bigreen'] = monochrome_logs ? '' : "\033[1;92m" - colorcodes['biyellow'] = monochrome_logs ? '' : "\033[1;93m" - colorcodes['biblue'] = monochrome_logs ? '' : "\033[1;94m" - colorcodes['bipurple'] = monochrome_logs ? '' : "\033[1;95m" - colorcodes['bicyan'] = monochrome_logs ? '' : "\033[1;96m" - colorcodes['biwhite'] = monochrome_logs ? '' : "\033[1;97m" + colorcodes['biblack'] = monochrome_logs ? '' : "\033[1;90m" + colorcodes['bired'] = monochrome_logs ? '' : "\033[1;91m" + colorcodes['bigreen'] = monochrome_logs ? '' : "\033[1;92m" + colorcodes['biyellow'] = monochrome_logs ? '' : "\033[1;93m" + colorcodes['biblue'] = monochrome_logs ? '' : "\033[1;94m" + colorcodes['bipurple'] = monochrome_logs ? '' : "\033[1;95m" + colorcodes['bicyan'] = monochrome_logs ? '' : "\033[1;96m" + colorcodes['biwhite'] = monochrome_logs ? 
'' : "\033[1;97m" return colorcodes } -// -// Attach the multiqc report to email -// -def attachMultiqcReport(multiqc_report) { - def mqc_report = null - try { - if (workflow.success) { - mqc_report = multiqc_report.getVal() - if (mqc_report.getClass() == ArrayList && mqc_report.size() >= 1) { - if (mqc_report.size() > 1) { - log.warn "[$workflow.manifest.name] Found multiple reports from process 'MULTIQC', will use only one" - } - mqc_report = mqc_report[0] - } - } - } catch (all) { - if (multiqc_report) { - log.warn "[$workflow.manifest.name] Could not attach MultiQC report to summary email" +// Return a single report from an object that may be a Path or List +// +def getSingleReport(multiqc_reports) { + if (multiqc_reports instanceof Path) { + return multiqc_reports + } else if (multiqc_reports instanceof List) { + if (multiqc_reports.size() == 0) { + log.warn("[${workflow.manifest.name}] No reports found from process 'MULTIQC'") + return null + } else if (multiqc_reports.size() == 1) { + return multiqc_reports.first() + } else { + log.warn("[${workflow.manifest.name}] Found multiple reports from process 'MULTIQC', will use only one") + return multiqc_reports.first() } + } else { + return null } - return mqc_report } // @@ -281,26 +229,35 @@ def attachMultiqcReport(multiqc_report) { def completionEmail(summary_params, email, email_on_fail, plaintext_email, outdir, monochrome_logs=true, multiqc_report=null) { // Set up the e-mail variables - def subject = "[$workflow.manifest.name] Successful: $workflow.runName" + def subject = "[${workflow.manifest.name}] Successful: ${workflow.runName}" if (!workflow.success) { - subject = "[$workflow.manifest.name] FAILED: $workflow.runName" + subject = "[${workflow.manifest.name}] FAILED: ${workflow.runName}" } def summary = [:] - for (group in summary_params.keySet()) { - summary << summary_params[group] - } + summary_params + .keySet() + .sort() + .each { group -> + summary << summary_params[group] + } def misc_fields = [:] 
misc_fields['Date Started'] = workflow.start misc_fields['Date Completed'] = workflow.complete misc_fields['Pipeline script file path'] = workflow.scriptFile misc_fields['Pipeline script hash ID'] = workflow.scriptId - if (workflow.repository) misc_fields['Pipeline repository Git URL'] = workflow.repository - if (workflow.commitId) misc_fields['Pipeline repository Git Commit'] = workflow.commitId - if (workflow.revision) misc_fields['Pipeline Git branch/tag'] = workflow.revision - misc_fields['Nextflow Version'] = workflow.nextflow.version - misc_fields['Nextflow Build'] = workflow.nextflow.build + if (workflow.repository) { + misc_fields['Pipeline repository Git URL'] = workflow.repository + } + if (workflow.commitId) { + misc_fields['Pipeline repository Git Commit'] = workflow.commitId + } + if (workflow.revision) { + misc_fields['Pipeline Git branch/tag'] = workflow.revision + } + misc_fields['Nextflow Version'] = workflow.nextflow.version + misc_fields['Nextflow Build'] = workflow.nextflow.build misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp def email_fields = [:] @@ -317,7 +274,7 @@ def completionEmail(summary_params, email, email_on_fail, plaintext_email, outdi email_fields['summary'] = summary << misc_fields // On success try attach the multiqc report - def mqc_report = attachMultiqcReport(multiqc_report) + def mqc_report = getSingleReport(multiqc_report) // Check if we are only sending emails on failure def email_address = email @@ -337,40 +294,45 @@ def completionEmail(summary_params, email, email_on_fail, plaintext_email, outdi def email_html = html_template.toString() // Render the sendmail template - def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? 
params.max_multiqc_email_size : 0) as nextflow.util.MemoryUnit - def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "${workflow.projectDir}", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] + def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? params.max_multiqc_email_size : 0) as MemoryUnit + def smail_fields = [email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "${workflow.projectDir}", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes()] def sf = new File("${workflow.projectDir}/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) def sendmail_html = sendmail_template.toString() // Send the HTML e-mail - Map colors = logColours(monochrome_logs) + def colors = logColours(monochrome_logs) as Map if (email_address) { try { - if (plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } + if (plaintext_email) { + new org.codehaus.groovy.GroovyException('Send plaintext e-mail, not HTML') + } // Try to send HTML e-mail using sendmail def sendmail_tf = new File(workflow.launchDir.toString(), ".sendmail_tmp.html") sendmail_tf.withWriter { w -> w << sendmail_html } - [ 'sendmail', '-t' ].execute() << sendmail_html - log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" - } catch (all) { + ['sendmail', '-t'].execute() << sendmail_html + log.info("-${colors.purple}[${workflow.manifest.name}]${colors.green} Sent summary e-mail to ${email_address} (sendmail)-") + } + catch (Exception msg) { + log.debug(msg.toString()) + log.debug("Trying with mail instead of sendmail") // Catch failures and try with plaintext - def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] + def mail_cmd = ['mail', '-s', subject, '--content-type=text/html', 
email_address] mail_cmd.execute() << email_html - log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (mail)-" + log.info("-${colors.purple}[${workflow.manifest.name}]${colors.green} Sent summary e-mail to ${email_address} (mail)-") } } // Write summary e-mail HTML to a file def output_hf = new File(workflow.launchDir.toString(), ".pipeline_report.html") output_hf.withWriter { w -> w << email_html } - FilesEx.copyTo(output_hf.toPath(), "${outdir}/pipeline_info/pipeline_report.html"); + nextflow.extension.FilesEx.copyTo(output_hf.toPath(), "${outdir}/pipeline_info/pipeline_report.html") output_hf.delete() // Write summary e-mail TXT to a file def output_tf = new File(workflow.launchDir.toString(), ".pipeline_report.txt") output_tf.withWriter { w -> w << email_txt } - FilesEx.copyTo(output_tf.toPath(), "${outdir}/pipeline_info/pipeline_report.txt"); + nextflow.extension.FilesEx.copyTo(output_tf.toPath(), "${outdir}/pipeline_info/pipeline_report.txt") output_tf.delete() } @@ -378,15 +340,17 @@ def completionEmail(summary_params, email, email_on_fail, plaintext_email, outdi // Print pipeline summary on completion // def completionSummary(monochrome_logs=true) { - Map colors = logColours(monochrome_logs) + def colors = logColours(monochrome_logs) as Map if (workflow.success) { if (workflow.stats.ignoredCount == 0) { - log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" - } else { - log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" + log.info("-${colors.purple}[${workflow.manifest.name}]${colors.green} Pipeline completed successfully${colors.reset}-") } - } else { - log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" + else { + 
log.info("-${colors.purple}[${workflow.manifest.name}]${colors.yellow} Pipeline completed successfully, but with errored process(es) ${colors.reset}-") + } + } + else { + log.info("-${colors.purple}[${workflow.manifest.name}]${colors.red} Pipeline completed with errors${colors.reset}-") } } @@ -395,21 +359,30 @@ def completionSummary(monochrome_logs=true) { // def imNotification(summary_params, hook_url) { def summary = [:] - for (group in summary_params.keySet()) { - summary << summary_params[group] - } + summary_params + .keySet() + .sort() + .each { group -> + summary << summary_params[group] + } def misc_fields = [:] - misc_fields['start'] = workflow.start - misc_fields['complete'] = workflow.complete - misc_fields['scriptfile'] = workflow.scriptFile - misc_fields['scriptid'] = workflow.scriptId - if (workflow.repository) misc_fields['repository'] = workflow.repository - if (workflow.commitId) misc_fields['commitid'] = workflow.commitId - if (workflow.revision) misc_fields['revision'] = workflow.revision - misc_fields['nxf_version'] = workflow.nextflow.version - misc_fields['nxf_build'] = workflow.nextflow.build - misc_fields['nxf_timestamp'] = workflow.nextflow.timestamp + misc_fields['start'] = workflow.start + misc_fields['complete'] = workflow.complete + misc_fields['scriptfile'] = workflow.scriptFile + misc_fields['scriptid'] = workflow.scriptId + if (workflow.repository) { + misc_fields['repository'] = workflow.repository + } + if (workflow.commitId) { + misc_fields['commitid'] = workflow.commitId + } + if (workflow.revision) { + misc_fields['revision'] = workflow.revision + } + misc_fields['nxf_version'] = workflow.nextflow.version + misc_fields['nxf_build'] = workflow.nextflow.build + misc_fields['nxf_timestamp'] = workflow.nextflow.timestamp def msg_fields = [:] msg_fields['version'] = getWorkflowVersion() @@ -434,13 +407,13 @@ def imNotification(summary_params, hook_url) { def json_message = json_template.toString() // POST - def post = new 
URL(hook_url).openConnection(); + def post = new URL(hook_url).openConnection() post.setRequestMethod("POST") post.setDoOutput(true) post.setRequestProperty("Content-Type", "application/json") - post.getOutputStream().write(json_message.getBytes("UTF-8")); - def postRC = post.getResponseCode(); - if (! postRC.equals(200)) { - log.warn(post.getErrorStream().getText()); + post.getOutputStream().write(json_message.getBytes("UTF-8")) + def postRC = post.getResponseCode() + if (!postRC.equals(200)) { + log.warn(post.getErrorStream().getText()) } } diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test b/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test index 1dc317f8f..f117040cb 100644 --- a/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test +++ b/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test @@ -41,26 +41,14 @@ nextflow_function { } } - test("Test Function workflowCitation") { - - function "workflowCitation" - - then { - assertAll( - { assert function.success }, - { assert snapshot(function.result).match() } - ) - } - } - - test("Test Function nfCoreLogo") { + test("Test Function without logColours") { - function "nfCoreLogo" + function "logColours" when { function { """ - input[0] = false + input[0] = true """ } } @@ -73,9 +61,8 @@ nextflow_function { } } - test("Test Function dashedLine") { - - function "dashedLine" + test("Test Function with logColours") { + function "logColours" when { function { @@ -93,14 +80,13 @@ nextflow_function { } } - test("Test Function without logColours") { - - function "logColours" + test("Test Function getSingleReport with a single file") { + function "getSingleReport" when { function { """ - input[0] = true + input[0] = file(params.modules_testdata_base_path + '/generic/tsv/test.tsv', checkIfExists: true) """ } } @@ -108,18 +94,22 @@ nextflow_function { then { assertAll( { assert function.success }, - { assert 
snapshot(function.result).match() } + { assert function.result.contains("test.tsv") } ) } } - test("Test Function with logColours") { - function "logColours" + test("Test Function getSingleReport with multiple files") { + function "getSingleReport" when { function { """ - input[0] = false + input[0] = [ + file(params.modules_testdata_base_path + '/generic/tsv/test.tsv', checkIfExists: true), + file(params.modules_testdata_base_path + '/generic/tsv/network.tsv', checkIfExists: true), + file(params.modules_testdata_base_path + '/generic/tsv/expression.tsv', checkIfExists: true) + ] """ } } @@ -127,7 +117,9 @@ nextflow_function { then { assertAll( { assert function.success }, - { assert snapshot(function.result).match() } + { assert function.result.contains("test.tsv") }, + { assert !function.result.contains("network.tsv") }, + { assert !function.result.contains("expression.tsv") } ) } } diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test.snap b/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test.snap index 1037232c9..02c670141 100644 --- a/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test.snap +++ b/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test.snap @@ -17,26 +17,6 @@ }, "timestamp": "2024-02-28T12:02:59.729647" }, - "Test Function nfCoreLogo": { - "content": [ - "\n\n-\u001b[2m----------------------------------------------------\u001b[0m-\n \u001b[0;32m,--.\u001b[0;30m/\u001b[0;32m,-.\u001b[0m\n\u001b[0;34m ___ __ __ __ ___ \u001b[0;32m/,-._.--~'\u001b[0m\n\u001b[0;34m |\\ | |__ __ / ` / \\ |__) |__ \u001b[0;33m} {\u001b[0m\n\u001b[0;34m | \\| | \\__, \\__/ | \\ |___ \u001b[0;32m\\`-._,-`-,\u001b[0m\n \u001b[0;32m`._,._,'\u001b[0m\n\u001b[0;35m nextflow_workflow v9.9.9\u001b[0m\n-\u001b[2m----------------------------------------------------\u001b[0m-\n" - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-02-28T12:03:10.562934" - }, - 
"Test Function workflowCitation": { - "content": [ - "If you use nextflow_workflow for your analysis please cite:\n\n* The pipeline\n https://doi.org/10.5281/zenodo.5070524\n\n* The nf-core framework\n https://doi.org/10.1038/s41587-020-0439-x\n\n* Software dependencies\n https://github.com/nextflow_workflow/blob/master/CITATIONS.md" - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-02-28T12:03:07.019761" - }, "Test Function without logColours": { "content": [ { @@ -95,16 +75,6 @@ }, "timestamp": "2024-02-28T12:03:17.969323" }, - "Test Function dashedLine": { - "content": [ - "-\u001b[2m----------------------------------------------------\u001b[0m-" - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-02-28T12:03:14.366181" - }, "Test Function with logColours": { "content": [ { diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml b/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml deleted file mode 100644 index ac8523c9a..000000000 --- a/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nfcore_pipeline: - - subworkflows/nf-core/utils_nfcore_pipeline/** diff --git a/subworkflows/nf-core/utils_nfschema_plugin/main.nf b/subworkflows/nf-core/utils_nfschema_plugin/main.nf new file mode 100644 index 000000000..1df8b76fb --- /dev/null +++ b/subworkflows/nf-core/utils_nfschema_plugin/main.nf @@ -0,0 +1,73 @@ +// +// Subworkflow that uses the nf-schema plugin to validate parameters and render the parameter summary +// + +include { paramsSummaryLog } from 'plugin/nf-schema' +include { validateParameters } from 'plugin/nf-schema' +include { paramsHelp } from 'plugin/nf-schema' + +workflow UTILS_NFSCHEMA_PLUGIN { + + take: + input_workflow // workflow: the workflow object used by nf-schema to get metadata from the workflow + validate_params // boolean: validate the parameters + parameters_schema // string: path to the 
parameters JSON schema. + // this has to be the same as the schema given to `validation.parametersSchema` + // when this input is empty it will automatically use the configured schema or + // "${projectDir}/nextflow_schema.json" as default. This input should not be empty + // for meta pipelines + help // boolean: show help message + help_full // boolean: show full help message + show_hidden // boolean: show hidden parameters in help message + before_text // string: text to show before the help message and parameters summary + after_text // string: text to show after the help message and parameters summary + command // string: an example command of the pipeline + + main: + + if(help || help_full) { + help_options = [ + beforeText: before_text, + afterText: after_text, + command: command, + showHidden: show_hidden, + fullHelp: help_full, + ] + if(parameters_schema) { + help_options << [parametersSchema: parameters_schema] + } + log.info paramsHelp( + help_options, + (params.help instanceof String && params.help != "true") ? params.help : "", + ) + exit 0 + } + + // + // Print parameter summary to stdout. 
This will display the parameters + // that differ from the default given in the JSON schema + // + + summary_options = [:] + if(parameters_schema) { + summary_options << [parametersSchema: parameters_schema] + } + log.info before_text + log.info paramsSummaryLog(summary_options, input_workflow) + log.info after_text + + // + // Validate the parameters using nextflow_schema.json or the schema + // given via the validation.parametersSchema configuration option + // + if(validate_params) { + validateOptions = [:] + if(parameters_schema) { + validateOptions << [parametersSchema: parameters_schema] + } + validateParameters(validateOptions) + } + + emit: + dummy_emit = true +} diff --git a/subworkflows/nf-core/utils_nfschema_plugin/meta.yml b/subworkflows/nf-core/utils_nfschema_plugin/meta.yml new file mode 100644 index 000000000..f7d9f0288 --- /dev/null +++ b/subworkflows/nf-core/utils_nfschema_plugin/meta.yml @@ -0,0 +1,35 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "utils_nfschema_plugin" +description: Run nf-schema to validate parameters and create a summary of changed parameters +keywords: + - validation + - JSON schema + - plugin + - parameters + - summary +components: [] +input: + - input_workflow: + type: object + description: | + The workflow object of the used pipeline. + This object contains meta data used to create the params summary log + - validate_params: + type: boolean + description: Validate the parameters and error if invalid. + - parameters_schema: + type: string + description: | + Path to the parameters JSON schema. + This has to be the same as the schema given to the `validation.parametersSchema` config + option. When this input is empty it will automatically use the configured schema or + "${projectDir}/nextflow_schema.json" as default. The schema should not be given in this way + for meta pipelines. 
+output: + - dummy_emit: + type: boolean + description: Dummy emit to make nf-core subworkflows lint happy +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test new file mode 100644 index 000000000..c977917aa --- /dev/null +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test @@ -0,0 +1,173 @@ +nextflow_workflow { + + name "Test Subworkflow UTILS_NFSCHEMA_PLUGIN" + script "../main.nf" + workflow "UTILS_NFSCHEMA_PLUGIN" + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/utils_nfschema_plugin" + tag "plugin/nf-schema" + + config "./nextflow.config" + + test("Should run nothing") { + + when { + + params { + test_data = '' + } + + workflow { + """ + validate_params = false + input[0] = workflow + input[1] = validate_params + input[2] = "" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" + """ + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } + + test("Should validate params") { + + when { + + params { + test_data = '' + outdir = null + } + + workflow { + """ + validate_params = true + input[0] = workflow + input[1] = validate_params + input[2] = "" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" + """ + } + } + + then { + assertAll( + { assert workflow.failed }, + { assert workflow.stdout.any { it.contains('ERROR ~ Validation of pipeline parameters failed!') } } + ) + } + } + + test("Should run nothing - custom schema") { + + when { + + params { + test_data = '' + } + + workflow { + """ + validate_params = false + input[0] = workflow + input[1] = validate_params + input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" 
+ """ + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } + + test("Should validate params - custom schema") { + + when { + + params { + test_data = '' + outdir = null + } + + workflow { + """ + validate_params = true + input[0] = workflow + input[1] = validate_params + input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" + """ + } + } + + then { + assertAll( + { assert workflow.failed }, + { assert workflow.stdout.any { it.contains('ERROR ~ Validation of pipeline parameters failed!') } } + ) + } + } + + test("Should create a help message") { + + when { + + params { + test_data = '' + outdir = null + } + + workflow { + """ + validate_params = true + input[0] = workflow + input[1] = validate_params + input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = true + input[4] = false + input[5] = false + input[6] = "Before" + input[7] = "After" + input[8] = "nextflow run test/test" + """ + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } +} diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config new file mode 100644 index 000000000..f6537cc33 --- /dev/null +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config @@ -0,0 +1,8 @@ +plugins { + id "nf-schema@2.6.1" +} + +validation { + parametersSchema = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + monochromeLogs = true +} diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/nextflow_schema.json b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json similarity index 95% rename from subworkflows/nf-core/utils_nfvalidation_plugin/tests/nextflow_schema.json rename to 
subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json index 7626c1c93..331e0d2f4 100644 --- a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/nextflow_schema.json +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json @@ -1,10 +1,10 @@ { - "$schema": "http://json-schema.org/draft-07/schema", + "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", "title": ". pipeline parameters", "description": "", "type": "object", - "definitions": { + "$defs": { "input_output_options": { "title": "Input/output options", "type": "object", @@ -87,10 +87,10 @@ }, "allOf": [ { - "$ref": "#/definitions/input_output_options" + "$ref": "#/$defs/input_output_options" }, { - "$ref": "#/definitions/generic_options" + "$ref": "#/$defs/generic_options" } ] } diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf b/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf deleted file mode 100644 index 2585b65d1..000000000 --- a/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf +++ /dev/null @@ -1,62 +0,0 @@ -// -// Subworkflow that uses the nf-validation plugin to render help text and parameter summary -// - -/* -======================================================================================== - IMPORT NF-VALIDATION PLUGIN -======================================================================================== -*/ - -include { paramsHelp } from 'plugin/nf-validation' -include { paramsSummaryLog } from 'plugin/nf-validation' -include { validateParameters } from 'plugin/nf-validation' - -/* -======================================================================================== - SUBWORKFLOW DEFINITION -======================================================================================== -*/ - -workflow UTILS_NFVALIDATION_PLUGIN { - - take: - print_help // boolean: print help - workflow_command // string: default commmand used to run pipeline - 
pre_help_text // string: string to be printed before help text and summary log - post_help_text // string: string to be printed after help text and summary log - validate_params // boolean: validate parameters - schema_filename // path: JSON schema file, null to use default value - - main: - - log.debug "Using schema file: ${schema_filename}" - - // Default values for strings - pre_help_text = pre_help_text ?: '' - post_help_text = post_help_text ?: '' - workflow_command = workflow_command ?: '' - - // - // Print help message if needed - // - if (print_help) { - log.info pre_help_text + paramsHelp(workflow_command, parameters_schema: schema_filename) + post_help_text - System.exit(0) - } - - // - // Print parameter summary to stdout - // - log.info pre_help_text + paramsSummaryLog(workflow, parameters_schema: schema_filename) + post_help_text - - // - // Validate parameters relative to the parameter JSON schema - // - if (validate_params){ - validateParameters(parameters_schema: schema_filename) - } - - emit: - dummy_emit = true -} diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/meta.yml b/subworkflows/nf-core/utils_nfvalidation_plugin/meta.yml deleted file mode 100644 index 3d4a6b04f..000000000 --- a/subworkflows/nf-core/utils_nfvalidation_plugin/meta.yml +++ /dev/null @@ -1,44 +0,0 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json -name: "UTILS_NFVALIDATION_PLUGIN" -description: Use nf-validation to initiate and validate a pipeline -keywords: - - utility - - pipeline - - initialise - - validation -components: [] -input: - - print_help: - type: boolean - description: | - Print help message and exit - - workflow_command: - type: string - description: | - The command to run the workflow e.g. 
"nextflow run main.nf" - - pre_help_text: - type: string - description: | - Text to print before the help message - - post_help_text: - type: string - description: | - Text to print after the help message - - validate_params: - type: boolean - description: | - Validate the parameters and error if invalid. - - schema_filename: - type: string - description: | - The filename of the schema to validate against. -output: - - dummy_emit: - type: boolean - description: | - Dummy emit to make nf-core subworkflows lint happy -authors: - - "@adamrtalbot" -maintainers: - - "@adamrtalbot" - - "@maxulysse" diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfvalidation_plugin/tests/main.nf.test deleted file mode 100644 index 5784a33f2..000000000 --- a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/main.nf.test +++ /dev/null @@ -1,200 +0,0 @@ -nextflow_workflow { - - name "Test Workflow UTILS_NFVALIDATION_PLUGIN" - script "../main.nf" - workflow "UTILS_NFVALIDATION_PLUGIN" - tag "subworkflows" - tag "subworkflows_nfcore" - tag "plugin/nf-validation" - tag "'plugin/nf-validation'" - tag "utils_nfvalidation_plugin" - tag "subworkflows/utils_nfvalidation_plugin" - - test("Should run nothing") { - - when { - - params { - monochrome_logs = true - test_data = '' - } - - workflow { - """ - help = false - workflow_command = null - pre_help_text = null - post_help_text = null - validate_params = false - schema_filename = "$moduleTestDir/nextflow_schema.json" - - input[0] = help - input[1] = workflow_command - input[2] = pre_help_text - input[3] = post_help_text - input[4] = validate_params - input[5] = schema_filename - """ - } - } - - then { - assertAll( - { assert workflow.success } - ) - } - } - - test("Should run help") { - - - when { - - params { - monochrome_logs = true - test_data = '' - } - workflow { - """ - help = true - workflow_command = null - pre_help_text = null - post_help_text = null - validate_params = false 
- schema_filename = "$moduleTestDir/nextflow_schema.json" - - input[0] = help - input[1] = workflow_command - input[2] = pre_help_text - input[3] = post_help_text - input[4] = validate_params - input[5] = schema_filename - """ - } - } - - then { - assertAll( - { assert workflow.success }, - { assert workflow.exitStatus == 0 }, - { assert workflow.stdout.any { it.contains('Input/output options') } }, - { assert workflow.stdout.any { it.contains('--outdir') } } - ) - } - } - - test("Should run help with command") { - - when { - - params { - monochrome_logs = true - test_data = '' - } - workflow { - """ - help = true - workflow_command = "nextflow run noorg/doesntexist" - pre_help_text = null - post_help_text = null - validate_params = false - schema_filename = "$moduleTestDir/nextflow_schema.json" - - input[0] = help - input[1] = workflow_command - input[2] = pre_help_text - input[3] = post_help_text - input[4] = validate_params - input[5] = schema_filename - """ - } - } - - then { - assertAll( - { assert workflow.success }, - { assert workflow.exitStatus == 0 }, - { assert workflow.stdout.any { it.contains('nextflow run noorg/doesntexist') } }, - { assert workflow.stdout.any { it.contains('Input/output options') } }, - { assert workflow.stdout.any { it.contains('--outdir') } } - ) - } - } - - test("Should run help with extra text") { - - - when { - - params { - monochrome_logs = true - test_data = '' - } - workflow { - """ - help = true - workflow_command = "nextflow run noorg/doesntexist" - pre_help_text = "pre-help-text" - post_help_text = "post-help-text" - validate_params = false - schema_filename = "$moduleTestDir/nextflow_schema.json" - - input[0] = help - input[1] = workflow_command - input[2] = pre_help_text - input[3] = post_help_text - input[4] = validate_params - input[5] = schema_filename - """ - } - } - - then { - assertAll( - { assert workflow.success }, - { assert workflow.exitStatus == 0 }, - { assert workflow.stdout.any { 
it.contains('pre-help-text') } }, - { assert workflow.stdout.any { it.contains('nextflow run noorg/doesntexist') } }, - { assert workflow.stdout.any { it.contains('Input/output options') } }, - { assert workflow.stdout.any { it.contains('--outdir') } }, - { assert workflow.stdout.any { it.contains('post-help-text') } } - ) - } - } - - test("Should validate params") { - - when { - - params { - monochrome_logs = true - test_data = '' - outdir = 1 - } - workflow { - """ - help = false - workflow_command = null - pre_help_text = null - post_help_text = null - validate_params = true - schema_filename = "$moduleTestDir/nextflow_schema.json" - - input[0] = help - input[1] = workflow_command - input[2] = pre_help_text - input[3] = post_help_text - input[4] = validate_params - input[5] = schema_filename - """ - } - } - - then { - assertAll( - { assert workflow.failed }, - { assert workflow.stdout.any { it.contains('ERROR ~ ERROR: Validation of pipeline parameters failed!') } } - ) - } - } -} diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/tags.yml b/subworkflows/nf-core/utils_nfvalidation_plugin/tests/tags.yml deleted file mode 100644 index 60b1cfff4..000000000 --- a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nfvalidation_plugin: - - subworkflows/nf-core/utils_nfvalidation_plugin/** diff --git a/tests/.nftignore b/tests/.nftignore new file mode 100644 index 000000000..83f7a0a50 --- /dev/null +++ b/tests/.nftignore @@ -0,0 +1,10 @@ +.DS_Store +multiqc/multiqc_data/multiqc.parquet +multiqc/multiqc_data/multiqc.log +multiqc/multiqc_data/multiqc_data.json +multiqc/multiqc_data/multiqc_sources.txt +multiqc/multiqc_data/multiqc_software_versions.txt +multiqc/multiqc_data/llms-full.txt +multiqc/multiqc_plots/{svg,pdf,png}/*.{svg,pdf,png} +multiqc/multiqc_report.html +pipeline_info/*.{html,json,txt,yml} diff --git a/tests/alphafold2_download.nf.test b/tests/alphafold2_download.nf.test new file 
mode 100644 index 000000000..8151c847c --- /dev/null +++ b/tests/alphafold2_download.nf.test @@ -0,0 +1,38 @@ +nextflow_pipeline { + + name "Test alphafold2 download mode stub" + script "../main.nf" + tag "pipeline" + tag "test_alphafold2_download" + profile "test_alphafold2_download" + + test("-profile test_alphafold2_download") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_proteinfold_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/alphafold2_download.nf.test.snap b/tests/alphafold2_download.nf.test.snap new file mode 100644 index 000000000..48d464d74 --- /dev/null +++ b/tests/alphafold2_download.nf.test.snap @@ -0,0 +1,128 @@ +{ + "-profile test_alphafold2_download": { + "content": [ + 25, + { + "ARIA2": { + "aria2": null + }, + "ARIA2_PDB_SEQRES": { + "aria2": null + }, + "COMBINE_UNIPROT": { + "sed": 4.7 + }, + "DOWNLOAD_PDBMMCIF": { + "sed": 4.9, + "rsync": "3.3.0" + }, + "GENERATE_REPORT": { + "python": "3.12.7", + "generate_report.py": "Python 3.12.7" + }, + "RUN_ALPHAFOLD2": { + "python": "unknown", + "alphafold2": "unknown", + "jax": "unknown", + "jaxlib": 
"unknown", + "numpy": "unknown", + "biopython": "unknown" + }, + "Workflow": { + "nf-core/proteinfold": "v2.0.0" + } + }, + [ + "DBs", + "DBs/alphafold2", + "DBs/alphafold2/mgnify", + "DBs/alphafold2/mgnify/mgy_clusters.fa", + "DBs/alphafold2/params", + "DBs/alphafold2/params/alphafold_params_2022-12-06", + "DBs/alphafold2/pdb70", + "DBs/alphafold2/pdb_mmcif", + "DBs/alphafold2/pdb_mmcif/mmcif_files", + "DBs/alphafold2/pdb_mmcif/obsolete.dat", + "DBs/alphafold2/pdb_seqres", + "DBs/alphafold2/pdb_seqres/pdb_seqres.txt", + "DBs/alphafold2/small_bfd", + "DBs/alphafold2/small_bfd/bfd-first_non_consensus_sequences.fasta", + "DBs/alphafold2/uniprot", + "DBs/alphafold2/uniprot/uniprot.fasta", + "DBs/alphafold2/uniprot_sprot.fasta", + "DBs/alphafold2/uniprot_trembl.fasta", + "DBs/alphafold2/uniref30", + "DBs/alphafold2/uniref90", + "DBs/alphafold2/uniref90/uniref90.fasta", + "alphafold2", + "alphafold2/standard", + "alphafold2/standard/T1024", + "alphafold2/standard/T1024/T1024_alphafold2_msa.tsv", + "alphafold2/standard/T1024/T1024_iptm.tsv", + "alphafold2/standard/T1024/T1024_plddt.tsv", + "alphafold2/standard/T1024/T1024_ptm.tsv", + "alphafold2/standard/T1024/paes", + "alphafold2/standard/T1024/paes/T1024_0_pae.tsv", + "alphafold2/standard/T1026", + "alphafold2/standard/T1026/T1026_alphafold2_msa.tsv", + "alphafold2/standard/T1026/T1026_iptm.tsv", + "alphafold2/standard/T1026/T1026_plddt.tsv", + "alphafold2/standard/T1026/T1026_ptm.tsv", + "alphafold2/standard/T1026/paes", + "alphafold2/standard/T1026/paes/T1026_0_pae.tsv", + "alphafold2/standard/top_ranked_structures", + "alphafold2/standard/top_ranked_structures/T1024.pdb", + "alphafold2/standard/top_ranked_structures/T1026.pdb", + "multiqc", + "multiqc/alphafold2_multiqc_data", + "multiqc/alphafold2_multiqc_plots", + "multiqc/alphafold2_multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_proteinfold_software_mqc_versions.yml", + "reports", + "reports/test_alphafold2_report.html" + ], + [ + 
"mgy_clusters.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + [ + "file.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + "file.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + + ], + "obsolete.dat:md5,d41d8cd98f00b204e9800998ecf8427e", + "pdb_seqres.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "bfd-first_non_consensus_sequences.fasta:md5,d41d8cd98f00b204e9800998ecf8427e", + "uniprot.fasta:md5,d41d8cd98f00b204e9800998ecf8427e", + "uniprot_sprot.fasta:md5,d41d8cd98f00b204e9800998ecf8427e", + "uniprot_trembl.fasta:md5,d41d8cd98f00b204e9800998ecf8427e", + [ + "file.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "uniref90.fasta:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_alphafold2_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_iptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_alphafold2_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_iptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "alphafold2_multiqc_report.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-12T16:09:42.635737909" + } +} \ No newline at end of file diff --git a/tests/alphafold2_split.nf.test b/tests/alphafold2_split.nf.test new file mode 100644 index 000000000..8c14dc058 --- /dev/null +++ b/tests/alphafold2_split.nf.test @@ -0,0 +1,38 @@ +nextflow_pipeline { + + name "Test alphafold2 split mode stub" + script "../main.nf" + tag "pipeline" + tag "test_alphafold2_split" + 
profile "test_alphafold2_split" + + test("-profile test_alphafold2_split") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_proteinfold_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/alphafold2_split.nf.test.snap b/tests/alphafold2_split.nf.test.snap new file mode 100644 index 000000000..ef128e119 --- /dev/null +++ b/tests/alphafold2_split.nf.test.snap @@ -0,0 +1,78 @@ +{ + "-profile test_alphafold2_split": { + "content": [ + 7, + { + "GENERATE_REPORT": { + "python": "3.12.7", + "generate_report.py": "Python 3.12.7" + }, + "RUN_ALPHAFOLD2_MSA": { + "python": null, + "alphafold2": "unknown", + "numpy": "unknown", + "biopython": "unknown" + }, + "RUN_ALPHAFOLD2_PRED": { + "python": null, + "alphafold2": "unknown", + "jax": "unknown", + "jaxlib": "unknown", + "numpy": "unknown", + "biopython": "unknown" + }, + "Workflow": { + "nf-core/proteinfold": "v2.0.0" + } + }, + [ + "alphafold2", + "alphafold2/split_msa_prediction", + "alphafold2/split_msa_prediction/T1024", + "alphafold2/split_msa_prediction/T1024/T1024_alphafold2_msa.tsv", + 
"alphafold2/split_msa_prediction/T1024/T1024_plddt.tsv", + "alphafold2/split_msa_prediction/T1024/msa", + "alphafold2/split_msa_prediction/T1024/msa/features.pkl", + "alphafold2/split_msa_prediction/T1024/paes", + "alphafold2/split_msa_prediction/T1024/paes/T1024_0_pae.tsv", + "alphafold2/split_msa_prediction/T1026", + "alphafold2/split_msa_prediction/T1026/T1026_alphafold2_msa.tsv", + "alphafold2/split_msa_prediction/T1026/T1026_plddt.tsv", + "alphafold2/split_msa_prediction/T1026/msa", + "alphafold2/split_msa_prediction/T1026/msa/features.pkl", + "alphafold2/split_msa_prediction/T1026/paes", + "alphafold2/split_msa_prediction/T1026/paes/T1026_0_pae.tsv", + "alphafold2/split_msa_prediction/top_ranked_structures", + "alphafold2/split_msa_prediction/top_ranked_structures/T1024.pdb", + "alphafold2/split_msa_prediction/top_ranked_structures/T1026.pdb", + "multiqc", + "multiqc/alphafold2_multiqc_data", + "multiqc/alphafold2_multiqc_plots", + "multiqc/alphafold2_multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_proteinfold_software_mqc_versions.yml", + "reports", + "reports/test_alphafold2_report.html" + ], + [ + "T1024_alphafold2_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "features.pkl:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_alphafold2_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "features.pkl:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "alphafold2_multiqc_report.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-12T16:12:35.47858571" + } +} \ No newline 
at end of file diff --git a/tests/alphafold3.nf.test b/tests/alphafold3.nf.test new file mode 100644 index 000000000..825fb14ff --- /dev/null +++ b/tests/alphafold3.nf.test @@ -0,0 +1,38 @@ +nextflow_pipeline { + + name "Test alphafold3 mode stub" + script "../main.nf" + tag "pipeline" + tag "test_alphafold3_standard" + profile "test_alphafold3_standard" + + test("-profile test_alphafold3_standard") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_proteinfold_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/alphafold3.nf.test.snap b/tests/alphafold3.nf.test.snap new file mode 100644 index 000000000..b24775c3e --- /dev/null +++ b/tests/alphafold3.nf.test.snap @@ -0,0 +1,113 @@ +{ + "-profile test_alphafold3_standard": { + "content": [ + 11, + { + "FASTA_TO_ALPHAFOLD3_JSON": { + "python": "3.13.7" + }, + "GENERATE_REPORT": { + "python": "3.12.7", + "generate_report.py": "Python 3.12.7" + }, + "MMCIF2PDB_MODELS": { + "python": "3.12.7" + }, + "MMCIF2PDB_TOP_RANKED": { + "python": "3.12.7" + }, + "RUN_ALPHAFOLD3": { + "python": "unknown", + "alphafold3": "unknown", + "jax": 
"unknown", + "jaxlib": "unknown", + "numpy": "unknown", + "biopython": "unknown", + "hmmer": null, + "rdkit": "unknown" + }, + "Workflow": { + "nf-core/proteinfold": "v2.0.0" + } + }, + [ + "alphafold3", + "alphafold3/T1024", + "alphafold3/T1024/T1024_alphafold3_msa.tsv", + "alphafold3/T1024/T1024_iptm.tsv", + "alphafold3/T1024/T1024_plddt.tsv", + "alphafold3/T1024/T1024_ptm.tsv", + "alphafold3/T1024/paes", + "alphafold3/T1024/paes/T1024_0_pae.tsv", + "alphafold3/T1026", + "alphafold3/T1026/T1026_alphafold3_msa.tsv", + "alphafold3/T1026/T1026_iptm.tsv", + "alphafold3/T1026/T1026_plddt.tsv", + "alphafold3/T1026/T1026_ptm.tsv", + "alphafold3/T1026/paes", + "alphafold3/T1026/paes/T1026_0_pae.tsv", + "alphafold3/top_ranked_structures", + "alphafold3/top_ranked_structures/T1024.cif", + "alphafold3/top_ranked_structures/T1024.pdb", + "alphafold3/top_ranked_structures/T1026.cif", + "alphafold3/top_ranked_structures/T1026.pdb", + "fasta", + "fasta/T1024.json", + "fasta/T1026.json", + "mmcif2pdb", + "mmcif2pdb/T1024_ranked_1.cif.pdb", + "mmcif2pdb/T1024_ranked_2.cif.pdb", + "mmcif2pdb/T1024_ranked_3.cif.pdb", + "mmcif2pdb/T1024_ranked_4.cif.pdb", + "mmcif2pdb/T1024_ranked_5.cif.pdb", + "mmcif2pdb/T1026_ranked_1.cif.pdb", + "mmcif2pdb/T1026_ranked_2.cif.pdb", + "mmcif2pdb/T1026_ranked_3.cif.pdb", + "mmcif2pdb/T1026_ranked_4.cif.pdb", + "mmcif2pdb/T1026_ranked_5.cif.pdb", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_plots", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_proteinfold_software_mqc_versions.yml", + "reports", + "reports/test_alphafold2_report.html" + ], + [ + "T1024_alphafold3_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_iptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_alphafold3_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + 
"T1026_iptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.cif:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.cif:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.json:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.json:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_ranked_1.cif.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_ranked_2.cif.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_ranked_3.cif.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_ranked_4.cif.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_ranked_5.cif.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_ranked_1.cif.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_ranked_2.cif.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_ranked_3.cif.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_ranked_4.cif.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_ranked_5.cif.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-12T16:14:34.760694724" + } +} \ No newline at end of file diff --git a/tests/boltz.nf.test b/tests/boltz.nf.test new file mode 100644 index 000000000..2fa596048 --- /dev/null +++ b/tests/boltz.nf.test @@ -0,0 +1,38 @@ +nextflow_pipeline { + + name "Test boltz mode stub" + script "../main.nf" + tag "pipeline" + tag "test_boltz" + profile "test_boltz" + + test("-profile test_boltz") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: 
['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_proteinfold_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/boltz.nf.test.snap b/tests/boltz.nf.test.snap new file mode 100644 index 000000000..7e776118e --- /dev/null +++ b/tests/boltz.nf.test.snap @@ -0,0 +1,103 @@ +{ + "-profile test_boltz": { + "content": [ + 11, + { + "GENERATE_REPORT": { + "python": "3.12.7", + "generate_report.py": "Python 3.12.7" + }, + "MMSEQS_COLABFOLDSEARCH": { + "colabfold_search": "unknown", + "mmseqs": null + }, + "RUN_BOLTZ": { + "boltz": "unknown" + }, + "SPLIT_MSA": { + "python": "3.8.3" + }, + "Workflow": { + "nf-core/proteinfold": "v2.0.0" + } + }, + [ + "boltz", + "boltz/A.csv", + "boltz/B.csv", + "boltz/T1024", + "boltz/T1024/T1024_boltz_msa.tsv", + "boltz/T1024/T1024_chainwise_iptm.tsv", + "boltz/T1024/T1024_chainwise_ptm.tsv", + "boltz/T1024/T1024_iptm.tsv", + "boltz/T1024/T1024_plddt.tsv", + "boltz/T1024/T1024_ptm.tsv", + "boltz/T1024/paes", + "boltz/T1024/paes/T1024_0_pae.tsv", + "boltz/T1026", + "boltz/T1026/T1026_boltz_msa.tsv", + "boltz/T1026/T1026_chainwise_iptm.tsv", + "boltz/T1026/T1026_chainwise_ptm.tsv", + "boltz/T1026/T1026_iptm.tsv", + "boltz/T1026/T1026_plddt.tsv", + "boltz/T1026/T1026_ptm.tsv", + "boltz/T1026/paes", + "boltz/T1026/paes/T1026_0_pae.tsv", + "boltz/output_fasta", + 
"boltz/output_fasta/T1024.fasta", + "boltz/output_fasta/T1026.fasta", + "boltz/top_ranked_structures", + "boltz/top_ranked_structures/T1024.pdb", + "boltz/top_ranked_structures/T1026.pdb", + "mmseqs", + "mmseqs/results", + "mmseqs/results/T1024.a3m", + "mmseqs/results/T1026.a3m", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_plots", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_proteinfold_software_mqc_versions.yml", + "reports", + "reports/test_alphafold2_report.html", + "split", + "split/output_msa", + "split/output_msa/A.csv", + "split/output_msa/B.csv" + ], + [ + "A.csv:md5,d41d8cd98f00b204e9800998ecf8427e", + "B.csv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_boltz_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_chainwise_iptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_chainwise_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_iptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_boltz_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_chainwise_iptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_chainwise_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_iptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.fasta:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.fasta:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.a3m:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.a3m:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "A.csv:md5,d41d8cd98f00b204e9800998ecf8427e", + 
"B.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-12T16:15:43.464789542" + } +} \ No newline at end of file diff --git a/tests/colabfold_download.nf.test b/tests/colabfold_download.nf.test new file mode 100644 index 000000000..57ae0d227 --- /dev/null +++ b/tests/colabfold_download.nf.test @@ -0,0 +1,38 @@ +nextflow_pipeline { + + name "Test colabfold_download mode stub" + script "../main.nf" + tag "pipeline" + tag "test_colabfold_download" + profile "test_colabfold_download" + + test("-profile test_colabfold_download") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_proteinfold_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/colabfold_download.nf.test.snap b/tests/colabfold_download.nf.test.snap new file mode 100644 index 000000000..08fcb40c4 --- /dev/null +++ b/tests/colabfold_download.nf.test.snap @@ -0,0 +1,80 @@ +{ + "-profile test_colabfold_download": { + "content": [ + 9, + { + "ARIA2": { + "aria2": null + }, + "COLABFOLD_BATCH": { + "alphafold_colabfold": "unknown", + 
"colabfold_batch": "unknown" + }, + "GENERATE_REPORT": { + "python": "3.12.7", + "generate_report.py": "Python 3.12.7" + }, + "MULTIFASTA_TO_CSV": { + "sed": 4.7 + }, + "Workflow": { + "nf-core/proteinfold": "v2.0.0" + } + }, + [ + "DBs", + "DBs/colabfold", + "DBs/colabfold/params", + "DBs/colabfold/params/alphafold_params_2021-07-14", + "colabfold", + "colabfold/T1024", + "colabfold/T1024/T1024_colabfold_msa.tsv", + "colabfold/T1024/T1024_plddt.tsv", + "colabfold/T1024/T1024_ptm.tsv", + "colabfold/T1024/paes", + "colabfold/T1024/paes/T1024_0_pae.tsv", + "colabfold/T1026", + "colabfold/T1026/T1026_colabfold_msa.tsv", + "colabfold/T1026/T1026_plddt.tsv", + "colabfold/T1026/T1026_ptm.tsv", + "colabfold/T1026/paes", + "colabfold/T1026/paes/T1026_0_pae.tsv", + "colabfold/top_ranked_structures", + "colabfold/top_ranked_structures/T1024.pdb", + "colabfold/top_ranked_structures/T1026.pdb", + "multifasta", + "multifasta/input.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_plots", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_proteinfold_software_mqc_versions.yml", + "reports", + "reports/test_alphafold2_report.html" + ], + [ + [ + "file.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "T1024_colabfold_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_colabfold_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "input.csv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": 
"25.10.4" + }, + "timestamp": "2026-03-12T16:17:44.266981229" + } +} \ No newline at end of file diff --git a/tests/colabfold_local.nf.test b/tests/colabfold_local.nf.test new file mode 100644 index 000000000..8b6d1a511 --- /dev/null +++ b/tests/colabfold_local.nf.test @@ -0,0 +1,38 @@ +nextflow_pipeline { + + name "Test colabfold local mode stub" + script "../main.nf" + tag "pipeline" + tag "test_colabfold_local" + profile "test_colabfold_local" + + test("-profile test_colabfold_local") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_proteinfold_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/colabfold_local.nf.test.snap b/tests/colabfold_local.nf.test.snap new file mode 100644 index 000000000..a806bc9cd --- /dev/null +++ b/tests/colabfold_local.nf.test.snap @@ -0,0 +1,74 @@ +{ + "-profile test_colabfold_local": { + "content": [ + 9, + { + "COLABFOLD_BATCH": { + "alphafold_colabfold": "unknown", + "colabfold_batch": "unknown" + }, + "GENERATE_REPORT": { + "python": "3.12.7", + "generate_report.py": "Python 3.12.7" + }, + "MMSEQS_COLABFOLDSEARCH": { + "colabfold_search": 
"unknown", + "mmseqs": null + }, + "MULTIFASTA_TO_CSV": { + "sed": 4.7 + }, + "Workflow": { + "nf-core/proteinfold": "v2.0.0" + } + }, + [ + "colabfold", + "colabfold/T1024", + "colabfold/T1024/T1024_colabfold_msa.tsv", + "colabfold/T1024/T1024_plddt.tsv", + "colabfold/T1024/T1024_ptm.tsv", + "colabfold/T1024/paes", + "colabfold/T1024/paes/T1024_0_pae.tsv", + "colabfold/T1026", + "colabfold/T1026/T1026_colabfold_msa.tsv", + "colabfold/T1026/T1026_plddt.tsv", + "colabfold/T1026/T1026_ptm.tsv", + "colabfold/T1026/paes", + "colabfold/T1026/paes/T1026_0_pae.tsv", + "colabfold/top_ranked_structures", + "colabfold/top_ranked_structures/T1024.pdb", + "colabfold/top_ranked_structures/T1026.pdb", + "multifasta", + "multifasta/input.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_plots", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_proteinfold_software_mqc_versions.yml", + "reports", + "reports/test_alphafold2_report.html" + ], + [ + "T1024_colabfold_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_colabfold_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "input.csv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-12T16:16:34.340030626" + } +} \ No newline at end of file diff --git a/tests/colabfold_webserver.nf.test b/tests/colabfold_webserver.nf.test new file mode 100644 index 000000000..31f281476 --- /dev/null +++ 
b/tests/colabfold_webserver.nf.test @@ -0,0 +1,38 @@ +nextflow_pipeline { + + name "Test colabfold webserver mode stub" + script "../main.nf" + tag "pipeline" + tag "test_colabfold_webserver" + profile "test_colabfold_webserver" + + test("-profile test_colabfold_webserver") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_proteinfold_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/colabfold_webserver.nf.test.snap b/tests/colabfold_webserver.nf.test.snap new file mode 100644 index 000000000..4354b1bbd --- /dev/null +++ b/tests/colabfold_webserver.nf.test.snap @@ -0,0 +1,70 @@ +{ + "-profile test_colabfold_webserver": { + "content": [ + 7, + { + "COLABFOLD_BATCH": { + "alphafold_colabfold": "unknown", + "colabfold_batch": "unknown" + }, + "GENERATE_REPORT": { + "python": "3.12.7", + "generate_report.py": "Python 3.12.7" + }, + "MULTIFASTA_TO_CSV": { + "sed": 4.7 + }, + "Workflow": { + "nf-core/proteinfold": "v2.0.0" + } + }, + [ + "colabfold", + "colabfold/T1024", + "colabfold/T1024/T1024_colabfold_msa.tsv", + "colabfold/T1024/T1024_plddt.tsv", + 
"colabfold/T1024/T1024_ptm.tsv", + "colabfold/T1024/paes", + "colabfold/T1024/paes/T1024_0_pae.tsv", + "colabfold/T1026", + "colabfold/T1026/T1026_colabfold_msa.tsv", + "colabfold/T1026/T1026_plddt.tsv", + "colabfold/T1026/T1026_ptm.tsv", + "colabfold/T1026/paes", + "colabfold/T1026/paes/T1026_0_pae.tsv", + "colabfold/top_ranked_structures", + "colabfold/top_ranked_structures/T1024.pdb", + "colabfold/top_ranked_structures/T1026.pdb", + "multifasta", + "multifasta/input.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_plots", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_proteinfold_software_mqc_versions.yml", + "reports", + "reports/test_alphafold2_report.html" + ], + [ + "T1024_colabfold_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_colabfold_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "input.csv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-12T16:20:48.487472959" + } +} \ No newline at end of file diff --git a/tests/default.nf.test b/tests/default.nf.test new file mode 100644 index 000000000..0a53c63f9 --- /dev/null +++ b/tests/default.nf.test @@ -0,0 +1,34 @@ +nextflow_pipeline { + + name "Test pipeline" + script "../main.nf" + tag "pipeline" + + test("-profile test") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable 
name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_proteinfold_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap new file mode 100644 index 000000000..2cd8ceba2 --- /dev/null +++ b/tests/default.nf.test.snap @@ -0,0 +1,73 @@ +{ + "-profile test": { + "content": [ + { + "GENERATE_REPORT": { + "python": "3.12.7", + "generate_report.py": "Python 3.12.7" + }, + "RUN_ALPHAFOLD2": { + "python": "unknown", + "alphafold2": "unknown", + "jax": "unknown", + "jaxlib": "unknown", + "numpy": "unknown", + "biopython": "unknown" + }, + "Workflow": { + "nf-core/proteinfold": "v2.0.0" + } + }, + [ + "alphafold2", + "alphafold2/standard", + "alphafold2/standard/T1024", + "alphafold2/standard/T1024/T1024_alphafold2_msa.tsv", + "alphafold2/standard/T1024/T1024_iptm.tsv", + "alphafold2/standard/T1024/T1024_plddt.tsv", + "alphafold2/standard/T1024/T1024_ptm.tsv", + "alphafold2/standard/T1024/paes", + "alphafold2/standard/T1024/paes/T1024_0_pae.tsv", + "alphafold2/standard/T1026", + "alphafold2/standard/T1026/T1026_alphafold2_msa.tsv", + "alphafold2/standard/T1026/T1026_iptm.tsv", + "alphafold2/standard/T1026/T1026_plddt.tsv", + "alphafold2/standard/T1026/T1026_ptm.tsv", + "alphafold2/standard/T1026/paes", + 
"alphafold2/standard/T1026/paes/T1026_0_pae.tsv", + "alphafold2/standard/top_ranked_structures", + "alphafold2/standard/top_ranked_structures/T1024.pdb", + "alphafold2/standard/top_ranked_structures/T1026.pdb", + "multiqc", + "multiqc/alphafold2_multiqc_data", + "multiqc/alphafold2_multiqc_plots", + "multiqc/alphafold2_multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_proteinfold_software_mqc_versions.yml", + "reports", + "reports/test_alphafold2_report.html" + ], + [ + "T1024_alphafold2_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_iptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_alphafold2_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_iptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "alphafold2_multiqc_report.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-12T16:13:26.007551223" + } +} \ No newline at end of file diff --git a/tests/esmfold.nf.test b/tests/esmfold.nf.test new file mode 100644 index 000000000..129e58513 --- /dev/null +++ b/tests/esmfold.nf.test @@ -0,0 +1,38 @@ +nextflow_pipeline { + + name "Test esmfold mode stub" + script "../main.nf" + tag "pipeline" + tag "test_esmfold" + profile "test_esmfold" + + test("-profile test_esmfold") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = 
getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_proteinfold_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/esmfold.nf.test.snap b/tests/esmfold.nf.test.snap new file mode 100644 index 000000000..be4ec57b1 --- /dev/null +++ b/tests/esmfold.nf.test.snap @@ -0,0 +1,54 @@ +{ + "-profile test_esmfold": { + "content": [ + 5, + { + "GENERATE_REPORT": { + "python": "3.12.7", + "generate_report.py": "Python 3.12.7" + }, + "RUN_ESMFOLD": { + "esm-fold": "1.0.3", + "python": "unknown", + "pytorch": "unknown", + "openfold": "unknown", + "numpy": "unknown", + "biopython": "unknown" + }, + "Workflow": { + "nf-core/proteinfold": "v2.0.0" + } + }, + [ + "esmfold", + "esmfold/T1024", + "esmfold/T1024/T1024_plddt.tsv", + "esmfold/T1026", + "esmfold/T1026/T1026_plddt.tsv", + "esmfold/top_ranked_structures", + "esmfold/top_ranked_structures/T1024.pdb", + "esmfold/top_ranked_structures/T1026.pdb", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_plots", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_proteinfold_software_mqc_versions.yml", + "reports", + "reports/test_alphafold2_report.html" + ], + [ + "T1024_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + 
"T1024.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-12T16:21:39.018391209" + } +} \ No newline at end of file diff --git a/tests/helixfold3.nf.test b/tests/helixfold3.nf.test new file mode 100644 index 000000000..e1c3f38d9 --- /dev/null +++ b/tests/helixfold3.nf.test @@ -0,0 +1,38 @@ +nextflow_pipeline { + + name "Test helixfold3 mode stub" + script "../main.nf" + tag "pipeline" + tag "test_helixfold3" + profile "test_helixfold3" + + test("-profile test_helixfold3") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_proteinfold_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/helixfold3.nf.test.snap b/tests/helixfold3.nf.test.snap new file mode 100644 index 000000000..0455c881e --- /dev/null +++ b/tests/helixfold3.nf.test.snap @@ -0,0 +1,92 @@ +{ + "-profile test_helixfold3": { + "content": [ + 7, + { + "GENERATE_REPORT": { + "python": "3.12.7", + 
"generate_report.py": "Python 3.12.7" + }, + "RUN_HELIXFOLD3": { + "python": "unknown" + }, + "Workflow": { + "nf-core/proteinfold": "v2.0.0" + } + }, + [ + "fasta2json", + "fasta2json/T1024.json", + "fasta2json/T1026.json", + "helixfold3", + "helixfold3/T1024", + "helixfold3/T1024/T1024_helixfold3_msa.tsv", + "helixfold3/T1024/T1024_iptm.tsv", + "helixfold3/T1024/T1024_plddt.tsv", + "helixfold3/T1024/T1024_ptm.tsv", + "helixfold3/T1024/paes", + "helixfold3/T1024/paes/T1024_1_pae.tsv", + "helixfold3/T1024/paes/T1024_2_pae.tsv", + "helixfold3/T1024/paes/T1024_3_pae.tsv", + "helixfold3/T1024/paes/T1024_4_pae.tsv", + "helixfold3/T1024/paes/T1024_5_pae.tsv", + "helixfold3/T1026", + "helixfold3/T1026/T1026_helixfold3_msa.tsv", + "helixfold3/T1026/T1026_iptm.tsv", + "helixfold3/T1026/T1026_plddt.tsv", + "helixfold3/T1026/T1026_ptm.tsv", + "helixfold3/T1026/paes", + "helixfold3/T1026/paes/T1026_1_pae.tsv", + "helixfold3/T1026/paes/T1026_2_pae.tsv", + "helixfold3/T1026/paes/T1026_3_pae.tsv", + "helixfold3/T1026/paes/T1026_4_pae.tsv", + "helixfold3/T1026/paes/T1026_5_pae.tsv", + "helixfold3/top_ranked_structures", + "helixfold3/top_ranked_structures/T1024.cif", + "helixfold3/top_ranked_structures/T1024.pdb", + "helixfold3/top_ranked_structures/T1026.cif", + "helixfold3/top_ranked_structures/T1026.pdb", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_plots", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_proteinfold_software_mqc_versions.yml", + "reports", + "reports/test_alphafold2_report.html" + ], + [ + "T1024.json:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.json:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_helixfold3_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_iptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_1_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + 
"T1024_2_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_3_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_4_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_5_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_helixfold3_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_iptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_1_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_2_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_3_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_4_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_5_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.cif:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.cif:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-12T16:22:20.586950258" + } +} \ No newline at end of file diff --git a/tests/nextflow.config b/tests/nextflow.config new file mode 100644 index 000000000..9b4ef8173 --- /dev/null +++ b/tests/nextflow.config @@ -0,0 +1,14 @@ +/* +======================================================================================== + Nextflow config file for running nf-test tests +======================================================================================== +*/ + +// TODO nf-core: Specify any additional parameters here +// Or any resources requirements +params { + modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/proteinfold' +} + +aws.client.anonymous = true // fixes S3 access issues on self-hosted runners diff 
--git a/tests/rosettafold2na.nf.test b/tests/rosettafold2na.nf.test new file mode 100644 index 000000000..888809b87 --- /dev/null +++ b/tests/rosettafold2na.nf.test @@ -0,0 +1,38 @@ +nextflow_pipeline { + + name "Test rosettafold2na mode stub" + script "../main.nf" + tag "pipeline" + tag "test_rosettafold2na" + profile "test_rosettafold2na" + + test("-profile test_rosettafold2na") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_proteinfold_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/rosettafold2na.nf.test.snap b/tests/rosettafold2na.nf.test.snap new file mode 100644 index 000000000..b2202a8a9 --- /dev/null +++ b/tests/rosettafold2na.nf.test.snap @@ -0,0 +1,56 @@ +{ + "-profile test_rosettafold2na": { + "content": [ + 4, + { + "GENERATE_REPORT": { + "python": "3.12.7", + "generate_report.py": "Python 3.12.7" + }, + "ROSETTAFOLD2NA_FASTA": { + "python": "$(python3 --version 2>/dev/null | sed 's/Python //g' || echo \"unknown\")" + }, + "RUN_ROSETTAFOLD2NA": { + "python": "unknown", + "rosettafold2na": "v0.2" + }, + "Workflow": { + "nf-core/proteinfold": "v2.0.0" + } + }, 
+ [ + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_plots", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_proteinfold_software_mqc_versions.yml", + "reports", + "reports/test_alphafold2_report.html", + "rosettafold2na", + "rosettafold2na/rf2na_input", + "rosettafold2na/rf2na_input/chain_map.tsv", + "rosettafold2na/rna_complex", + "rosettafold2na/rna_complex/paes", + "rosettafold2na/rna_complex/paes/rna_complex_0_pae.tsv", + "rosettafold2na/rna_complex/rna_complex_plddt.tsv", + "rosettafold2na/rna_complex/rna_complex_rosettafold2na_msa.tsv", + "rosettafold2na/top_ranked_structures", + "rosettafold2na/top_ranked_structures/rna_complex.pdb" + ], + [ + "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "chain_map.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "rna_complex_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "rna_complex_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "rna_complex_rosettafold2na_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "rna_complex.pdb:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-12T16:23:41.931720663" + } +} \ No newline at end of file diff --git a/tests/rosettafold_all_atom.nf.test b/tests/rosettafold_all_atom.nf.test new file mode 100644 index 000000000..b9687f29f --- /dev/null +++ b/tests/rosettafold_all_atom.nf.test @@ -0,0 +1,38 @@ +nextflow_pipeline { + + name "Test rosettafold_all_atom mode stub" + script "../main.nf" + tag "pipeline" + tag "test_rosettafold_all_atom" + profile "test_rosettafold_all_atom" + + test("-profile test_rosettafold_all_atom") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ 
with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_proteinfold_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/rosettafold_all_atom.nf.test.snap b/tests/rosettafold_all_atom.nf.test.snap new file mode 100644 index 000000000..c15a6ac5e --- /dev/null +++ b/tests/rosettafold_all_atom.nf.test.snap @@ -0,0 +1,70 @@ +{ + "-profile test_rosettafold_all_atom": { + "content": [ + 7, + { + "GENERATE_REPORT": { + "python": "3.12.7", + "generate_report.py": "Python 3.12.7" + }, + "RUN_ROSETTAFOLD_ALL_ATOM": { + "python": "unknown", + "rosettafold-all-atom": "unknown" + }, + "Workflow": { + "nf-core/proteinfold": "v2.0.0" + } + }, + [ + "fasta2yaml", + "fasta2yaml/T1024.yaml", + "fasta2yaml/T1026.yaml", + "fasta2yaml/out_fasta", + "fasta2yaml/out_fasta/A.fasta", + "fasta2yaml/out_fasta/B.fasta", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_plots", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_proteinfold_software_mqc_versions.yml", + "reports", + "reports/test_alphafold2_report.html", + "rosettafold_all_atom", + "rosettafold_all_atom/T1024", + "rosettafold_all_atom/T1024/T1024_plddt.tsv", + "rosettafold_all_atom/T1024/T1024_rosettafold_all_atom_msa.tsv", + "rosettafold_all_atom/T1024/paes", + "rosettafold_all_atom/T1024/paes/T1024_0_pae.tsv", + "rosettafold_all_atom/T1026", + "rosettafold_all_atom/T1026/T1026_plddt.tsv", + 
"rosettafold_all_atom/T1026/T1026_rosettafold_all_atom_msa.tsv", + "rosettafold_all_atom/T1026/paes", + "rosettafold_all_atom/T1026/paes/T1026_0_pae.tsv", + "rosettafold_all_atom/top_ranked_structures", + "rosettafold_all_atom/top_ranked_structures/T1024.pdb", + "rosettafold_all_atom/top_ranked_structures/T1026.pdb" + ], + [ + "T1024.yaml:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.yaml:md5,d41d8cd98f00b204e9800998ecf8427e", + "A.fasta:md5,d41d8cd98f00b204e9800998ecf8427e", + "B.fasta:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_rosettafold_all_atom_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_rosettafold_all_atom_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1024.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "T1026.pdb:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-12T16:23:04.191225337" + } +} \ No newline at end of file diff --git a/tests/split_fasta.nf.test b/tests/split_fasta.nf.test new file mode 100644 index 000000000..440b07911 --- /dev/null +++ b/tests/split_fasta.nf.test @@ -0,0 +1,38 @@ +nextflow_pipeline { + + name "Test split_fasta mode stub" + script "../main.nf" + tag "pipeline" + tag "test_split_fasta" + profile "test_split_fasta" + + test("-profile test_split_fasta") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = 
getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_proteinfold_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/split_fasta.nf.test.snap b/tests/split_fasta.nf.test.snap new file mode 100644 index 000000000..0d7e0336a --- /dev/null +++ b/tests/split_fasta.nf.test.snap @@ -0,0 +1,74 @@ +{ + "-profile test_split_fasta": { + "content": [ + 9, + { + "COLABFOLD_BATCH": { + "alphafold_colabfold": "unknown", + "colabfold_batch": "unknown" + }, + "GENERATE_REPORT": { + "python": "3.12.7", + "generate_report.py": "Python 3.12.7" + }, + "MMSEQS_COLABFOLDSEARCH": { + "colabfold_search": "unknown", + "mmseqs": null + }, + "MULTIFASTA_TO_CSV": { + "sed": 4.7 + }, + "Workflow": { + "nf-core/proteinfold": "v2.0.0" + } + }, + [ + "colabfold", + "colabfold/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues", + "colabfold/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues_colabfold_msa.tsv", + "colabfold/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues_plddt.tsv", + "colabfold/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues_ptm.tsv", + 
"colabfold/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues/paes", + "colabfold/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues/paes/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues_0_pae.tsv", + "colabfold/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues", + "colabfold/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues_colabfold_msa.tsv", + "colabfold/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues_plddt.tsv", + "colabfold/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues_ptm.tsv", + "colabfold/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues/paes", + "colabfold/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues/paes/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues_0_pae.tsv", + "colabfold/top_ranked_structures", + "colabfold/top_ranked_structures/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues.pdb", + "colabfold/top_ranked_structures/H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues.pdb", + "multifasta", + "multifasta/input.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_plots", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_proteinfold_software_mqc_versions.yml", + "reports", + "reports/test_alphafold2_report.html" + ], + [ + "H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues_colabfold_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + 
"H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues_colabfold_msa.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues_plddt.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues_ptm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues_0_pae.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", + "H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_1_127_residues.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "H1065_N4-Cytosine_Methyltransferase_Serratia_marcescens_subunit_2_98_residues.pdb:md5,d41d8cd98f00b204e9800998ecf8427e", + "input.csv:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_alphafold2_report.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-12T16:25:13.497245275" + } +} \ No newline at end of file diff --git a/tower.yml b/tower.yml index 787aedfe9..b479a247c 100644 --- a/tower.yml +++ b/tower.yml @@ -1,5 +1,23 @@ reports: - multiqc_report.html: - display: "MultiQC HTML report" + esmfold_multiqc_report.html: + display: "ESMFOLD - MultiQC HTML report" + alphafold2_multiqc_report.html: + display: "ALPHAFOLD2 - MultiQC HTML report" + colabfold_multiqc_report.html: + display: "COLABFOLD - MultiQC HTML report" samplesheet.csv: display: "Auto-created samplesheet with collated metadata and FASTQ paths" + "*_alphafold2_report.html": + display: "ALPHAFOLD2 
- Predicted structures" + "*_esmfold_report.html": + display: "ESMFOLD - Predicted structures" + "*_colabfold_report.html": + display: "COLABFOLD - Predicted structures" + "*_colabfold_foldseek.html": + display: "COLABFOLD - Foldseek output" + "*_alphafold2_foldseek.html": + display: "ALPHAFOLD2 - Foldseek output" + "*_esmfold_foldseek.html": + display: "ESMFOLD - Foldseek output" + "*_comparison_report.html": + display: "Structure comparison" diff --git a/workflows/alphafold2.nf b/workflows/alphafold2.nf index 9a1aebae3..1c17cea1c 100644 --- a/workflows/alphafold2.nf +++ b/workflows/alphafold2.nf @@ -17,20 +17,6 @@ include { RUN_ALPHAFOLD2_PRED } from '../modules/local/run_alphafold2_pred' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// -// MODULE: Installed directly from nf-core/modules -// -include { MULTIQC } from '../modules/nf-core/multiqc/main' - -// -// SUBWORKFLOW: Consisting entirely of nf-core/modules -// -include { paramsSummaryMap } from 'plugin/nf-validation' -include { fromSamplesheet } from 'plugin/nf-validation' -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_proteinfold_pipeline' - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -40,39 +26,39 @@ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_prot workflow ALPHAFOLD2 { take: + ch_samplesheet // channel: samplesheet read in from --input ch_versions // channel: [ path(versions.yml) ] - full_dbs // boolean: Use full databases (otherwise reduced version) + alphafold2_full_dbs // boolean: Use full databases (otherwise reduced version) alphafold2_mode // string: Mode to run Alphafold2 in alphafold2_model_preset // string: Specifies the model preset to use for 
Alphafold2 + uniref30_prefix // string: Prefix for uniref30 database files ch_alphafold2_params // channel: path(alphafold2_params) ch_bfd // channel: path(bfd) ch_small_bfd // channel: path(small_bfd) ch_mgnify // channel: path(mgnify) ch_pdb70 // channel: path(pdb70) ch_pdb_mmcif // channel: path(pdb_mmcif) + ch_pdb_obsolete // channel: path(pdb_obsolete) ch_uniref30 // channel: path(uniref30) ch_uniref90 // channel: path(uniref90) ch_pdb_seqres // channel: path(pdb_seqres) ch_uniprot // channel: path(uniprot) main: - ch_multiqc_files = Channel.empty() - - // - // Create input channel from input file provided through params.input - // - Channel - .fromSamplesheet("input") - .set { ch_fasta } + ch_pdb = channel.empty() + ch_top_ranked_pdb = channel.empty() + ch_msa = channel.empty() + ch_pae = channel.empty() + ch_multiqc_report = channel.empty() if (alphafold2_model_preset != 'multimer') { - ch_fasta + ch_samplesheet .map { meta, fasta -> [ meta, fasta.splitFasta(file:true) ] } .transpose() - .set { ch_fasta } + .set { ch_samplesheet } } if (alphafold2_mode == 'standard') { @@ -80,47 +66,69 @@ workflow ALPHAFOLD2 { // SUBWORKFLOW: Run Alphafold2 standard mode // RUN_ALPHAFOLD2 ( - ch_fasta, - full_dbs, + ch_samplesheet, + alphafold2_full_dbs, alphafold2_model_preset, + uniref30_prefix, ch_alphafold2_params, ch_bfd, ch_small_bfd, ch_mgnify, ch_pdb70, ch_pdb_mmcif, + ch_pdb_obsolete, ch_uniref30, ch_uniref90, ch_pdb_seqres, ch_uniprot ) - ch_multiqc_rep = RUN_ALPHAFOLD2.out.multiqc.collect() - ch_versions = ch_versions.mix(RUN_ALPHAFOLD2.out.versions) + + RUN_ALPHAFOLD2 + .out + .multiqc + .map { it -> it[1] } + .toSortedList() + .map { it -> + [ [ "model": "alphafold2" ], it.flatten() ] + } + .set { ch_multiqc_report } + + ch_pdb = ch_pdb.mix(RUN_ALPHAFOLD2.out.pdb) + ch_top_ranked_pdb = ch_top_ranked_pdb.mix(RUN_ALPHAFOLD2.out.top_ranked_pdb) + ch_msa = ch_msa.mix(RUN_ALPHAFOLD2.out.msa) + ch_pae = ch_pae.mix(RUN_ALPHAFOLD2.out.pae) + ch_versions = 
ch_versions.mix(RUN_ALPHAFOLD2.out.versions) } else if (alphafold2_mode == 'split_msa_prediction') { // // SUBWORKFLOW: Run Alphafold2 split mode, MSA and predicition // RUN_ALPHAFOLD2_MSA ( - ch_fasta, - full_dbs, + ch_samplesheet, + alphafold2_full_dbs, alphafold2_model_preset, + uniref30_prefix, ch_alphafold2_params, ch_bfd, ch_small_bfd, ch_mgnify, ch_pdb70, ch_pdb_mmcif, + ch_pdb_obsolete, ch_uniref30, ch_uniref90, ch_pdb_seqres, ch_uniprot ) - ch_versions = ch_versions.mix(RUN_ALPHAFOLD2_MSA.out.versions) + ch_versions = ch_versions.mix(RUN_ALPHAFOLD2_MSA.out.versions) + + //synchronize + ch_samplesheet + .join(RUN_ALPHAFOLD2_MSA.out.features) + .set { ch_fasta_features } RUN_ALPHAFOLD2_PRED ( - ch_fasta, - full_dbs, + ch_fasta_features, alphafold2_model_preset, ch_alphafold2_params, ch_bfd, @@ -128,55 +136,69 @@ workflow ALPHAFOLD2 { ch_mgnify, ch_pdb70, ch_pdb_mmcif, + ch_pdb_obsolete, ch_uniref30, ch_uniref90, ch_pdb_seqres, - ch_uniprot, - RUN_ALPHAFOLD2_MSA.out.features + ch_uniprot ) - ch_multiqc_rep = RUN_ALPHAFOLD2_PRED.out.multiqc.collect() - ch_versions = ch_versions.mix(RUN_ALPHAFOLD2_PRED.out.versions) - } - // - // Collate and save software versions - // - softwareVersionsToYAML(ch_versions) - .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_proteinfold_software_mqc_versions.yml', sort: true, newLine: true) - .set { ch_collated_versions } - - // - // MODULE: MultiQC - // - ch_multiqc_report = Channel.empty() - if (!params.skip_multiqc) { - ch_multiqc_report = Channel.empty() - ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) - ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config ) : Channel.empty() - ch_multiqc_logo = params.multiqc_logo ? 
Channel.fromPath( params.multiqc_logo ) : Channel.empty() - summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") - ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) - ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - ch_methods_description = Channel.value(methodsDescriptionText(ch_multiqc_custom_methods_description)) - - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_rep) - - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList() - ) - ch_multiqc_report = MULTIQC.out.report.toList() + RUN_ALPHAFOLD2_PRED + .out + .multiqc + .map { it -> it[1] } + .toSortedList() + .map { it -> + [ [ "model": "alphafold2" ], it.flatten() ] + } + .set { ch_multiqc_report } + + ch_top_ranked_pdb = ch_top_ranked_pdb.mix(RUN_ALPHAFOLD2_PRED.out.top_ranked_pdb) + ch_pdb = ch_pdb.mix(RUN_ALPHAFOLD2_PRED.out.pdb) + ch_msa = ch_msa.mix(RUN_ALPHAFOLD2_PRED.out.msa) + ch_pae = ch_pae.mix(RUN_ALPHAFOLD2_PRED.out.pae) + ch_versions = ch_versions.mix(RUN_ALPHAFOLD2_PRED.out.versions) } + ch_pdb + .map { it -> + def meta = it[0].clone(); + meta.model = "alphafold2"; + def files = (it[1] instanceof List) ? 
it[1] : [ it[1] ] + [ meta, files ] + } + .set { ch_pdb_final } + + ch_msa + .map { it -> + def meta = it[0].clone(); + meta.model = "alphafold2"; + [ meta, it[1] ] + } + .set { ch_msa_final } + + ch_pae + .map { it -> + def meta = it[0].clone(); + meta.model = "alphafold2"; + [ meta, it[1] ] + } + .set { ch_pae_final } + + ch_top_ranked_pdb_final = ch_top_ranked_pdb + .map { it -> + def meta = it[0].clone(); + meta.model = "alphafold2"; + [ meta, it[1] ] + } + emit: - multiqc_report = ch_multiqc_report // channel: /path/to/multiqc_report.html - versions = ch_versions // channel: [ path(versions.yml) ] + top_ranked_pdb = ch_top_ranked_pdb_final // channel: [ meta, /path/to/*.pdb ] + pdb = ch_pdb_final // channel: [ meta, /path/to/*.pdb ] + msa = ch_msa_final // channel: [ meta, /path/to/*.pdb, /path/to/*_coverage.png ] // Would prefer channel: [ meta, /path/to/*_msa.tsv ] + pae = ch_pae_final // channel: [ meta, /path/to/*_0_pae.tsv] + multiqc_report = ch_multiqc_report // channel: /path/to/multiqc_report.html + versions = ch_versions // channel: [ path(versions.yml) ] } /* diff --git a/workflows/alphafold3.nf b/workflows/alphafold3.nf new file mode 100644 index 000000000..84ff42186 --- /dev/null +++ b/workflows/alphafold3.nf @@ -0,0 +1,162 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// MODULE: Loaded from modules/local/ +// +include { FASTA_TO_ALPHAFOLD3_JSON } from '../modules/local/fasta_to_alphafold3_json' +include { RUN_ALPHAFOLD3 } from '../modules/local/run_alphafold3' +include { MMCIF2PDB as MMCIF2PDB_TOP_RANKED } from '../modules/local/mmcif2pdb/main.nf' +include { MMCIF2PDB as MMCIF2PDB_MODELS } from '../modules/local/mmcif2pdb/main.nf' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT NF-CORE MODULES/SUBWORKFLOWS 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow ALPHAFOLD3 { + + take: + ch_samplesheet // channel: samplesheet read in from --input + ch_versions // channel: [ path(versions.yml) ] + ch_alphafold3_params // channel: path(alphafold2_params) + ch_small_bfd // channel: path(small_bfd) + ch_mgnify // channel: path(mgnify) + ch_mmcif_files // channel: path(mmcif_files) + ch_uniref90 // channel: path(uniref90) + ch_pdb_seqres // channel: path(pdb_seqres) + ch_uniprot // channel: path(uniprot) + + main: + ch_pdb_final = channel.empty() + ch_top_ranked_pdb = channel.empty() + ch_msa_final = channel.empty() + ch_multiqc_report = channel.empty() + + FASTA_TO_ALPHAFOLD3_JSON(ch_samplesheet) + ch_versions = ch_versions.mix(FASTA_TO_ALPHAFOLD3_JSON.out.versions) + + // + // SUBWORKFLOW: Run AlphaFold3 + // + RUN_ALPHAFOLD3 ( + FASTA_TO_ALPHAFOLD3_JSON.out.json, + ch_alphafold3_params, + ch_small_bfd, + ch_mgnify, + ch_mmcif_files, + ch_uniref90, + ch_pdb_seqres, + ch_uniprot + ) + ch_versions = ch_versions.mix(RUN_ALPHAFOLD3.out.versions) + + // Convert mmcif to pdbs + RUN_ALPHAFOLD3 + .out + .cif + .groupTuple() + .map { + meta, files -> + [ meta, files.flatten() ] + } + + // Convert models mmcifs to pdbs + MMCIF2PDB_MODELS ( + RUN_ALPHAFOLD3 + .out + .cif + .groupTuple() + .map { + meta, files -> + [ meta, files.flatten() ] + } + ) + ch_versions = ch_versions.mix(MMCIF2PDB_MODELS.out.versions) + + MMCIF2PDB_MODELS + .out + .pdb + .map { it -> + def meta = it[0].clone(); + meta.model = "alphafold3"; + def files = (it[1] instanceof List) ? 
it[1] : [ it[1] ] + [ meta, files ] + } + .set { ch_pdb_final } + + // Convert top ranked mmcif to pdb + MMCIF2PDB_TOP_RANKED ( + RUN_ALPHAFOLD3 + .out + .top_ranked_cif + ) + ch_versions = ch_versions.mix(MMCIF2PDB_TOP_RANKED.out.versions) + + MMCIF2PDB_TOP_RANKED + .out + .pdb + .map { it -> + def meta = it[0].clone(); + meta.model = "alphafold3"; + [ meta, it[1] ] + } + .set { ch_top_ranked_pdb } + + // Prepare msa input + RUN_ALPHAFOLD3 + .out + .msa + .map { it -> + def meta = it[0].clone(); + meta.model = "alphafold3"; + [ meta, it[1] ] + } + .set { ch_msa_final } + + // Prepare report input + RUN_ALPHAFOLD3 + .out + .multiqc + .map { it -> it[1] } + .toSortedList() + .map { it -> + [ [ "model": "alphafold3" ], it.flatten() ] + } + .set { ch_multiqc_report } + + // Prepare dummy pae input + RUN_ALPHAFOLD3 + .out + .pae + .map { it -> + def meta = it[0].clone(); + meta.model = "alphafold3"; + [ meta, it[1] ] + } + .set { ch_pae_final } + + emit: + top_ranked_pdb = ch_top_ranked_pdb // channel: [ id, /path/to/*.pdb ] + pdb = ch_pdb_final // channel: [ meta, /path/to/*.pdb, ...,/path/to/*.pdb ] + msa = ch_msa_final // channel: [ meta, /path/to/*.pdb, /path/to/*_coverage.png ] + pae = ch_pae_final // channel: [ meta, path/to/*_pae.tsv ] + multiqc_report = ch_multiqc_report // channel: /path/to/multiqc_report.html + versions = ch_versions // channel: [ path(versions.yml) ] +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ diff --git a/workflows/boltz.nf b/workflows/boltz.nf new file mode 100644 index 000000000..de82b05e7 --- /dev/null +++ b/workflows/boltz.nf @@ -0,0 +1,197 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// MODULE: 
Loaded from modules/local/ +// + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT NF-CORE MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// MODULE: Installed directly from nf-core/modules +// +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { BOLTZ_FASTA } from '../modules/local/boltz_fasta' +include { SPLIT_MSA } from '../modules/local/split_msa' +include { MMSEQS_COLABFOLDSEARCH } from '../modules/local/mmseqs_colabfoldsearch' +include { MULTIFASTA_TO_CSV } from '../modules/local/multifasta_to_csv' +// +// SUBWORKFLOW: Consisting entirely of nf-core/modules +// +include { paramsSummaryMap } from 'plugin/nf-schema' +include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_proteinfold_pipeline' + +// +// MODULE: Boltz +// +include { RUN_BOLTZ } from '../modules/local/run_boltz' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow BOLTZ { + + take: + ch_samplesheet // channel: samplesheet read from --input + ch_versions // channel: [ path(versions.yml) ] + ch_boltz_ccd // channel: [ path(boltz_ccd) ] + ch_boltz_model // channel: [ path(model) ] + ch_boltz2_aff // channel: [ path(boltz2_aff) ] + ch_boltz2_conf // channel: [ path(boltz2_conf) ] + ch_mols // channel: [ path(mols) ] + ch_colabfold_db // channel: [ path(colabfold_db) ] + ch_uniref30 // channel: [ path(uniref30) ] + msa_server + + main: + ch_samplesheet + .branch { it -> + fasta: it[1].extension == "fasta" || it[1].extension == "fa" + yaml: it[1].extension == "yaml" || it[1].extension == "yml" + } + 
.set { ch_input_by_ext } + + ch_input_by_ext.fasta + .join( + ch_input_by_ext.fasta + .map { meta, file -> + [ + meta, + file.text.findAll { letter -> letter == ">" }.size() + ] + } + ) + .map { it -> + def meta = it[0].clone() + meta.cnt = it[2] + [meta, it[1]] + } + .branch { it -> + multimer: it[0].cnt > 1 + monomer: it[0].cnt == 1 + } + .set{ch_input} + + if (!msa_server){ + MULTIFASTA_TO_CSV( + ch_input.multimer + ) + ch_versions = ch_versions.mix(MULTIFASTA_TO_CSV.out.versions) + + MMSEQS_COLABFOLDSEARCH ( + ch_input.monomer.mix(MULTIFASTA_TO_CSV.out.input_csv), + ch_colabfold_db, + ch_uniref30 + ) + ch_versions = ch_versions.mix(MMSEQS_COLABFOLDSEARCH.out.versions) + + SPLIT_MSA( + MMSEQS_COLABFOLDSEARCH.out.a3m + ) + ch_versions = ch_versions.mix(SPLIT_MSA.out.versions) + ch_input.monomer + .join(SPLIT_MSA.out.msa_csv) + .mix( + ch_input.multimer.join(SPLIT_MSA.out.msa_csv) + ).set{ch_prepare_fasta} + + }else{ + ch_input + .multimer + .mix(ch_input.monomer) + .map { it -> + [it[0], it[1], []] + } + .set{ch_prepare_fasta} + } + + BOLTZ_FASTA( + ch_prepare_fasta + ) + + ch_input_by_ext.yaml + .map { meta, file -> [ meta, file, [] ] } // already in YAML + .mix(BOLTZ_FASTA.out.formatted_fasta) // newly converted from FASTA + .set { ch_boltz_input } + + RUN_BOLTZ( + ch_boltz_input, + ch_boltz_model, + ch_boltz_ccd, + ch_boltz2_aff, + ch_boltz2_conf, + ch_mols + ) + + RUN_BOLTZ + .out + .pdb + .map { it -> + def meta = it[0].clone(); + meta.model = "boltz" + [ meta, it[1] ] + } + .set {ch_pdb} + + RUN_BOLTZ + .out + .top_ranked_pdb + .map { it -> + def meta = it[0].clone(); + meta.model = "boltz" + [ meta, it[1] ] + } + .set { ch_top_ranked_pdb } + + RUN_BOLTZ + .out + .msa_raw + .map { it -> + def meta = it[0].clone(); + meta.model = "boltz" + [ meta, it[1] ] + } + .set { ch_msa } + + RUN_BOLTZ + .out + .pae_raw + .map { it -> + def meta = it[0].clone(); + meta.model = "boltz" + [ meta, it[1] ] + } + .set { ch_pae } + + RUN_BOLTZ + .out + .multiqc + .map { it -> 
it[1] } + .collect(sort: true) + .map { it -> [ [ "model": "boltz"], it.flatten() ] } + .set { ch_multiqc_report } + + ch_versions = ch_versions.mix(RUN_BOLTZ.out.versions) + + emit: + versions = ch_versions + msa = ch_msa + structures = RUN_BOLTZ.out.structures + confidence = RUN_BOLTZ.out.confidence + multiqc_report = ch_multiqc_report + top_ranked_pdb = ch_top_ranked_pdb + pdb = ch_pdb + pae = ch_pae +} diff --git a/workflows/colabfold.nf b/workflows/colabfold.nf index 3d2829f38..312a22b4a 100644 --- a/workflows/colabfold.nf +++ b/workflows/colabfold.nf @@ -11,26 +11,14 @@ include { COLABFOLD_BATCH } from '../modules/local/colabfold_batch' include { MMSEQS_COLABFOLDSEARCH } from '../modules/local/mmseqs_colabfoldsearch' include { MULTIFASTA_TO_CSV } from '../modules/local/multifasta_to_csv' +include { modeChannel } from '../subworkflows/local/utils_nfcore_proteinfold_pipeline' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// -// MODULE: Installed directly from nf-core/modules -// -include { MULTIQC } from '../modules/nf-core/multiqc/main' - -// -// SUBWORKFLOW: Consisting entirely of nf-core/modules -// -include { paramsSummaryMap } from 'plugin/nf-validation' -include { fromSamplesheet } from 'plugin/nf-validation' -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_proteinfold_pipeline' - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -40,78 +28,52 @@ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_prot workflow COLABFOLD { take: + ch_samplesheet // channel: samplesheet read in from --input 
ch_versions // channel: [ path(versions.yml) ] colabfold_model_preset // string: Specifies the model preset to use for colabfold ch_colabfold_params // channel: path(colabfold_params) ch_colabfold_db // channel: path(colabfold_db) ch_uniref30 // channel: path(uniref30) - num_recycles // int: Number of recycles for esmfold + num_recycles // int: Number of recycles for colabfold main: - ch_multiqc_files = Channel.empty() + ch_multiqc_report = channel.empty() - // - // Create input channel from input file provided through params.input - // - Channel - .fromSamplesheet("input") - .set { ch_fasta } - - if (params.colabfold_server == 'webserver') { + if (params.use_msa_server) { // // MODULE: Run colabfold // - if (params.colabfold_model_preset != 'alphafold2_ptm' && params.colabfold_model_preset != 'alphafold2') { - MULTIFASTA_TO_CSV( - ch_fasta - ) - ch_versions = ch_versions.mix(MULTIFASTA_TO_CSV.out.versions) - COLABFOLD_BATCH( - MULTIFASTA_TO_CSV.out.input_csv, - colabfold_model_preset, - ch_colabfold_params, - [], - [], - num_recycles - ) - ch_versions = ch_versions.mix(COLABFOLD_BATCH.out.versions) - } else { - COLABFOLD_BATCH( - ch_fasta, - colabfold_model_preset, - ch_colabfold_params, - [], - [], - num_recycles - ) - ch_versions = ch_versions.mix(COLABFOLD_BATCH.out.versions) - } - } else if (params.colabfold_server == 'local') { + MULTIFASTA_TO_CSV( + ch_samplesheet + ) + ch_versions = ch_versions.mix(MULTIFASTA_TO_CSV.out.versions) + + COLABFOLD_BATCH( + MULTIFASTA_TO_CSV.out.input_csv, + colabfold_model_preset, + ch_colabfold_params, + [], + [], + num_recycles + ) + ch_versions = ch_versions.mix(COLABFOLD_BATCH.out.versions) + + } else { // // MODULE: Run mmseqs // - if (params.colabfold_model_preset != 'AlphaFold2-ptm') { - MULTIFASTA_TO_CSV( - ch_fasta - ) - ch_versions = ch_versions.mix(MULTIFASTA_TO_CSV.out.versions) - MMSEQS_COLABFOLDSEARCH ( - MULTIFASTA_TO_CSV.out.input_csv, - ch_colabfold_params, - ch_colabfold_db, - ch_uniref30 - ) - ch_versions = 
ch_versions.mix(MMSEQS_COLABFOLDSEARCH.out.versions) - } else { - MMSEQS_COLABFOLDSEARCH ( - ch_fasta, - ch_colabfold_params, - ch_colabfold_db, - ch_uniref30 - ) - ch_versions = ch_versions.mix(MMSEQS_COLABFOLDSEARCH.out.versions) - } + //Multimer mode + MULTIFASTA_TO_CSV( + ch_samplesheet + ) + ch_versions = ch_versions.mix(MULTIFASTA_TO_CSV.out.versions) + MMSEQS_COLABFOLDSEARCH ( + MULTIFASTA_TO_CSV.out.input_csv, + ch_colabfold_db, + ch_uniref30 + ) + ch_versions = ch_versions.mix(MMSEQS_COLABFOLDSEARCH.out.versions) // // MODULE: Run colabfold @@ -124,46 +86,48 @@ workflow COLABFOLD { ch_uniref30, num_recycles ) - ch_versions = ch_versions.mix(COLABFOLD_BATCH.out.versions) + ch_versions = ch_versions.mix(COLABFOLD_BATCH.out.versions) } - // - // Collate and save software versions - // - softwareVersionsToYAML(ch_versions) - .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_proteinfold_software_mqc_versions.yml', sort: true, newLine: true) - .set { ch_collated_versions } - - // - // MODULE: MultiQC - // - ch_multiqc_report = Channel.empty() - if (!params.skip_multiqc) { - ch_multiqc_report = Channel.empty() - ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) - ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config ) : Channel.empty() - ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo ) : Channel.empty() - summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") - ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) - ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - ch_methods_description = Channel.value(methodsDescriptionText(ch_multiqc_custom_methods_description)) - - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix(COLABFOLD_BATCH.out.multiqc.collect()) - - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList() - ) - ch_multiqc_report = MULTIQC.out.report.toList() - } + COLABFOLD_BATCH + .out + .top_ranked_pdb + .map { it -> + def meta_clone = it[0].clone(); + meta_clone.model = "colabfold"; + [ meta_clone, it[1] ] + } + .set { ch_top_ranked_pdb } + + COLABFOLD_BATCH + .out + .pdb + .map { it -> + def meta = it[0].clone(); + meta.model = "colabfold"; + def files = (it[1] instanceof List) ? 
it[1] : [ it[1] ] + [ meta, files ] + } + .set { ch_pdb_final } + + modeChannel(COLABFOLD_BATCH.out.msa, "colabfold").set { ch_msa_final } + modeChannel(COLABFOLD_BATCH.out.pae, "colabfold").set { ch_pae_final } + + COLABFOLD_BATCH + .out + .multiqc + .map { it -> it[1] } + .toSortedList() + .map { it -> + [ [ "model":"colabfold"], it.flatten() ] + } + .set { ch_multiqc_report } emit: + top_ranked_pdb = ch_top_ranked_pdb // channel: [ meta, /path/to/*.pdb ] + pdb = ch_pdb_final // channel: [ id, /path/to/*.pdb ] + msa = ch_msa_final // channel: [ meta, /path/to/*.pdb, /path/to/*_coverage.png ] + pae = ch_pae_final // channel: [ id, /path/to/*_pae.tsv ] multiqc_report = ch_multiqc_report // channel: /path/to/multiqc_report.html versions = ch_versions // channel: [ path(versions.yml) ] } diff --git a/workflows/esmfold.nf b/workflows/esmfold.nf index 962c01a19..5a221d986 100644 --- a/workflows/esmfold.nf +++ b/workflows/esmfold.nf @@ -10,26 +10,14 @@ include { RUN_ESMFOLD } from '../modules/local/run_esmfold' include { MULTIFASTA_TO_SINGLEFASTA } from '../modules/local/multifasta_to_singlefasta' +include { modeChannel } from '../subworkflows/local/utils_nfcore_proteinfold_pipeline' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// -// MODULE: Installed directly from nf-core/modules -// -include { MULTIQC } from '../modules/nf-core/multiqc/main' - -// -// SUBWORKFLOW: Consisting entirely of nf-core/modules -// -include { paramsSummaryMap } from 'plugin/nf-validation' -include { fromSamplesheet } from 'plugin/nf-validation' -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_proteinfold_pipeline' - 
/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -39,26 +27,18 @@ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_prot workflow ESMFOLD { take: + ch_samplesheet // channel: samplesheet read in from --input ch_versions // channel: [ path(versions.yml) ] ch_esmfold_params // directory: /path/to/esmfold/params/ ch_num_recycles // int: Number of recycles for esmfold main: - ch_multiqc_files = Channel.empty() - - // - // Create input channel from input file provided through params.input - // - Channel - .fromSamplesheet("input") - .set { ch_fasta } - // // MODULE: Run esmfold // if (params.esmfold_model_preset != 'monomer') { MULTIFASTA_TO_SINGLEFASTA( - ch_fasta + ch_samplesheet ) ch_versions = ch_versions.mix(MULTIFASTA_TO_SINGLEFASTA.out.versions) RUN_ESMFOLD( @@ -69,50 +49,27 @@ workflow ESMFOLD { ch_versions = ch_versions.mix(RUN_ESMFOLD.out.versions) } else { RUN_ESMFOLD( - ch_fasta, + ch_samplesheet, ch_esmfold_params, ch_num_recycles ) ch_versions = ch_versions.mix(RUN_ESMFOLD.out.versions) } - // - // Collate and save software versions - // - softwareVersionsToYAML(ch_versions) - .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_proteinfold_software_mqc_versions.yml', sort: true, newLine: true) - .set { ch_collated_versions } + RUN_ESMFOLD + .out + .multiqc + .map { it -> it[1] } + .toSortedList() + .map { it -> + [ [ "model": "esmfold"], it.flatten() ] + } + .set { ch_multiqc_report } - // - // MODULE: MultiQC - // - ch_multiqc_report = Channel.empty() - if (!params.skip_multiqc) { - ch_multiqc_report = Channel.empty() - ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) - ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config ) : Channel.empty() - ch_multiqc_logo = params.multiqc_logo ? 
Channel.fromPath( params.multiqc_logo ) : Channel.empty() - summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") - ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) - ch_multiqc_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - ch_methods_description = Channel.value(methodsDescriptionText(ch_multiqc_methods_description)) - - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix(RUN_ESMFOLD.out.multiqc.collect()) - - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList() - ) - ch_multiqc_report = MULTIQC.out.report.toList() - } + modeChannel(RUN_ESMFOLD.out.pdb, "esmfold").set { ch_pdb_final } emit: + pdb = ch_pdb_final // channel: [ id, /path/to/*.pdb ] multiqc_report = ch_multiqc_report // channel: /path/to/multiqc_report.html versions = ch_versions // channel: [ path(versions.yml) ] } diff --git a/workflows/helixfold3.nf b/workflows/helixfold3.nf new file mode 100644 index 000000000..9defb5c43 --- /dev/null +++ b/workflows/helixfold3.nf @@ -0,0 +1,130 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// MODULE: Loaded from modules/local/ +// +include { RUN_HELIXFOLD3 } from '../modules/local/run_helixfold3' +include { FASTA2JSON } from '../modules/local/fasta2json' + +include { 
modeChannel } from '../subworkflows/local/utils_nfcore_proteinfold_pipeline' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT NF-CORE MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow HELIXFOLD3 { + + take: + ch_samplesheet + ch_versions // channel: [ path(versions.yml) ] + uniref30_prefix // string: Prefix for uniref30 database files + ch_helixfold3_uniclust30 // channel: path(uniclust30) + ch_helixfold3_ccd_preprocessed // channel: path(ccd_preprocessed) + ch_helixfold3_rfam // channel: path(rfam) + ch_helixfold3_bfd // channel: path(bfd) + ch_helixfold3_small_bfd // channel: path(small_bfd) + ch_helixfold3_uniprot // channel: path(uniprot) + ch_helixfold3_pdb_seqres // channel: path(pdb_seqres) + ch_helixfold3_uniref90 // channel: path(uniref90) + ch_helixfold3_mgnify // channel: path(mgnify) + ch_helixfold3_mmcif_files // channel: path(pdb_mmcif) + ch_helixfold3_obsolete // channel: path(pdb_obsolete) + ch_helixfold3_init_models // channel: path(init_models) + ch_helixfold3_maxit_src // channel: path(maxit_src) + + main: + ch_pdb = channel.empty() + ch_top_ranked_pdb = channel.empty() + ch_multiqc_report = channel.empty() + + // + // SUBWORKFLOW: Run helixfold3 + // + ch_samplesheet.branch { it -> + fasta: it[1].extension == "fasta" || it[1].extension == "fa" + json: it[1].extension == "json" + }.set { ch_input } + + FASTA2JSON(ch_input.fasta) + + RUN_HELIXFOLD3 ( + ch_input.json.mix(FASTA2JSON.out.json), + uniref30_prefix, + ch_helixfold3_uniclust30, + ch_helixfold3_ccd_preprocessed, + ch_helixfold3_rfam, + ch_helixfold3_bfd, + ch_helixfold3_small_bfd, + ch_helixfold3_uniprot, + ch_helixfold3_pdb_seqres, + 
ch_helixfold3_uniref90, + ch_helixfold3_mgnify, + ch_helixfold3_mmcif_files, + ch_helixfold3_obsolete, + ch_helixfold3_init_models, + ch_helixfold3_maxit_src + ) + + RUN_HELIXFOLD3 + .out + .multiqc + .map { it -> it[1] } + .toSortedList() + .map { it -> + [ [ "model": "helixfold3" ], it.flatten() ] + } + .set { ch_multiqc_report } + + ch_pdb = ch_pdb.mix(RUN_HELIXFOLD3.out.pdb) + ch_versions = ch_versions.mix(RUN_HELIXFOLD3.out.versions) + + RUN_HELIXFOLD3 + .out + .top_ranked_pdb + .map { it -> + def meta = it[0].clone(); + meta.model = "helixfold3"; + [ meta, it[1] ] + } + .set { ch_top_ranked_pdb } + + RUN_HELIXFOLD3 + .out + .pdb + .map { it -> + def meta = it[0].clone(); + meta.model = "helixfold3"; + def files = (it[1] instanceof List) ? it[1] : [ it[1] ] + [ meta, files ] + } + .set { ch_pdb_final } + + modeChannel(RUN_HELIXFOLD3.out.msa, "helixfold3").set { ch_msa_final } + modeChannel(RUN_HELIXFOLD3.out.pae, "helixfold3").set { ch_pae_final } + + emit: + top_ranked_pdb = ch_top_ranked_pdb // channel: [ meta, /path/to/*.pdb ] + pdb = ch_pdb_final // channel: [ id, /path/to/*.pdb ] + msa = ch_msa_final // channel: [ id, /path/to/*_msa.tsv ] + pae = ch_pae_final // channel: [ id, /path/to/*_pae.tsv ] + multiqc_report = ch_multiqc_report // channel: /path/to/multiqc_report.html + versions = ch_versions // channel: [ path(versions.yml) ] +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ diff --git a/workflows/rosettafold2na.nf b/workflows/rosettafold2na.nf new file mode 100644 index 000000000..dfcf92861 --- /dev/null +++ b/workflows/rosettafold2na.nf @@ -0,0 +1,100 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// MODULE: 
Loaded from modules/local/ +// +include { ROSETTAFOLD2NA_FASTA } from '../modules/local/rosettafold2na_fasta' +include { RUN_ROSETTAFOLD2NA } from '../modules/local/run_rosettafold2na' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow ROSETTAFOLD2NA { + + take: + ch_samplesheet // channel: samplesheet read in from --input + ch_versions // channel: [ path(versions.yml) ] + ch_bfd // channel: path(bfd) + ch_uniref30 // channel: path(uniref30) + ch_pdb100 // channel: path(pdb100) + ch_rna // channel: path(rna) + ch_rosettafold2na_weights // channel: path(rosettafold2na_weights) + + main: + ch_multiqc_report = channel.empty() + + ROSETTAFOLD2NA_FASTA( + ch_samplesheet + ) + ch_versions = ch_versions.mix(ROSETTAFOLD2NA_FASTA.out.versions) + + RUN_ROSETTAFOLD2NA ( + ROSETTAFOLD2NA_FASTA.out.rf2na_input, + ch_bfd, + ch_uniref30, + ch_pdb100, + ch_rna, + ch_rosettafold2na_weights + ) + ch_versions = ch_versions.mix(RUN_ROSETTAFOLD2NA.out.versions) + + RUN_ROSETTAFOLD2NA + .out + .multiqc + .map { it -> it[1] } + .toSortedList() + .map { it -> + [ [ "model": "rosettafold2na" ], it.flatten() ] + } + .set { ch_multiqc_report } + + RUN_ROSETTAFOLD2NA + .out + .pdb + .map { it -> + def meta = it[0].clone(); + meta.model = "rosettafold2na"; + [ meta, it[1] ] + } + .set { ch_pdb_final } + + RUN_ROSETTAFOLD2NA + .out + .pae + .map { it -> + def meta = it[0].clone(); + meta.model = "rosettafold2na"; + [ meta, it[1] ] + } + .set { ch_pae_final } + + RUN_ROSETTAFOLD2NA + .out + .msa + .map { it -> + def meta = it[0].clone(); + meta.model = "rosettafold2na"; + [ meta, it[1] ] + } + .set { ch_msa_final } + + emit: + pdb = ch_pdb_final // channel: [ id, /path/to/*.pdb ] + pae = ch_pae_final // channel: [ id, /path/to/*_pae.tsv ] + msa = ch_msa_final // channel: [ id, /path/to/*_msa.tsv ] + multiqc_report = 
ch_multiqc_report // channel: /path/to/multiqc_report.html + versions = ch_versions // channel: [ path(versions.yml) ] +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ diff --git a/workflows/rosettafold_all_atom.nf b/workflows/rosettafold_all_atom.nf new file mode 100644 index 000000000..bd576087b --- /dev/null +++ b/workflows/rosettafold_all_atom.nf @@ -0,0 +1,93 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// MODULE: Loaded from modules/local/ +// +include { RUN_ROSETTAFOLD_ALL_ATOM } from '../modules/local/run_rosettafold_all_atom' +include { FASTA2YAML } from '../modules/local/fasta2yaml' + +include { modeChannel } from '../subworkflows/local/utils_nfcore_proteinfold_pipeline' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT NF-CORE MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow ROSETTAFOLD_ALL_ATOM { + + take: + ch_samplesheet // channel: samplesheet read in from --input + ch_versions // channel: [ path(versions.yml) ] + uniref30_prefix // string: Prefix for uniref30 database files + ch_bfd // channel: path(bfd) + ch_uniref30 // channel: path(uniref30) + ch_pdb100 // channel: path(pdb100) + ch_rfaa_paper_weights // channel: path(rfaa_paper_weights) + + main: + ch_multiqc_report = channel.empty() + + ch_samplesheet.branch { it -> + fasta: 
it[1].extension == "fasta" || it[1].extension == "fa" + yaml: it[1].extension == "yaml" + }.set{ch_input} + + FASTA2YAML( + ch_input.fasta + ) + + ch_input.yaml.map { it -> + [it[0], it[1], []] + } + .mix(FASTA2YAML.out.yaml.join(FASTA2YAML.out.fasta)) + .set{ch_rosetta_all_atom_in} + + RUN_ROSETTAFOLD_ALL_ATOM ( + ch_rosetta_all_atom_in.map { it -> [it[0], it[1]] }, + uniref30_prefix, + ch_bfd, + ch_uniref30, + ch_pdb100, + ch_rfaa_paper_weights, + ch_rosetta_all_atom_in.map { it -> it[2] } + ) + ch_versions = ch_versions.mix(RUN_ROSETTAFOLD_ALL_ATOM.out.versions) + + RUN_ROSETTAFOLD_ALL_ATOM + .out + .multiqc + .map { it -> it[1] } + .toSortedList() + .map { it -> + [ [ "model": "rosettafold_all_atom" ], it.flatten() ] + } + .set { ch_multiqc_report } + + modeChannel(RUN_ROSETTAFOLD_ALL_ATOM.out.pdb, "rosettafold_all_atom").set { ch_pdb_final } + modeChannel(RUN_ROSETTAFOLD_ALL_ATOM.out.msa, "rosettafold_all_atom").set { ch_msa_final } + modeChannel(RUN_ROSETTAFOLD_ALL_ATOM.out.pae, "rosettafold_all_atom").set { ch_pae_final } + + emit: + pdb = ch_pdb_final // channel: [ id, /path/to/*.pdb ] + msa = ch_msa_final // channel: [ id, /path/to/*_msa.tsv ] + pae = ch_pae_final // channel: [ id, /path/to/*_pae.tsv ] + multiqc_report = ch_multiqc_report // channel: [ [ model: "rosettafold_all_atom" ], [ sorted, flattened out.multiqc entries ] ] + versions = ch_versions // channel: [ path(versions.yml) ] +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/