diff --git a/scripts/mcmc_Makefile b/scripts/mcmc_Makefile index 42d60c03ad..85fef57331 100644 --- a/scripts/mcmc_Makefile +++ b/scripts/mcmc_Makefile @@ -44,7 +44,7 @@ $(TOIL_OS)/$(BASENAME).snarls: $(TOIL_OS)/$(BASENAME).vg $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg: $(TOIL_OS)/$(BASENAME).vg $(TOIL_OS)/$(BASENAME).gbwt $(TOIL_OS)/$(BASENAME).xg vg paths -d -v $(TOIL_OS)/$(BASENAME).vg > $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg - vg paths --gbwt $(TOIL_OS)/$(BASENAME).gbwt --extract-vg -x $(TOIL_OS)/$(BASENAME).xg -Q _thread_$(SAMP)_x_$(HAPLO_0) >> $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg + vg paths --gbwt $(TOIL_OS)/$(BASENAME).gbwt --retain-paths -x $(TOIL_OS)/$(BASENAME).xg -Q _thread_$(SAMP)_x_$(HAPLO_0) >> $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg $(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_0).vg: $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg vg mod -N $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg > $(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_0).vg @@ -57,7 +57,7 @@ $(TOIL_OS)/$(BASENAME)_$(HAPLO_0).gam: $(TOIL_OS)/$(BASENAME).xg $(TOIL_OS)/$(BA $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg: $(TOIL_OS)/$(BASENAME).vg $(TOIL_OS)/$(BASENAME).gbwt $(TOIL_OS)/$(BASENAME).xg vg paths -d -v $(TOIL_OS)/$(BASENAME).vg > $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg - vg paths --gbwt $(TOIL_OS)/$(BASENAME).gbwt --extract-vg -x $(TOIL_OS)/$(BASENAME).xg -Q _thread_$(SAMP)_x_$(HAPLO_1) >> $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg + vg paths --gbwt $(TOIL_OS)/$(BASENAME).gbwt --retain-paths -x $(TOIL_OS)/$(BASENAME).xg -Q _thread_$(SAMP)_x_$(HAPLO_1) >> $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg $(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_1).vg: $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg vg mod -N $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg > $(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_1).vg diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 36abac24a4..79e7474d9e 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -14,6 +14,7 @@ #include "subcommand.hpp" +#include "../crash.hpp" #include "../vg.hpp" #include "../xg.hpp" #include "../gbwt_helper.hpp" @@ -37,11 +38,10 @@ void help_paths(char** argv) { << "input:" << endl << " -x, --xg FILE use the paths and haplotypes in this graph FILE" << endl << " Supports GBZ haplotypes. (also accepts -v, --vg)" << endl - << " -g, --gbwt FILE use the threads in the GBWT index in FILE" << endl - << " (graph also required for most output options;" << endl - << " -g takes priority over -x)" << endl + << " -g, --gbwt FILE use the threads in the GBWT index in FILE instead " << endl + << " of graph paths (graph also required for most output " << endl + << " options)" << endl << "output graph (.vg format):" << endl - << " -V, --extract-vg output a path-only graph covering the selected paths" << endl << " -d, --drop-paths output a graph with the selected paths removed" << endl << " -r, --retain-paths output a graph with only the selected paths retained" << endl << " -n, --normalize-paths output a graph where equivalent paths in a site are" << endl @@ -76,11 +76,12 @@ void help_paths(char** argv) { << " then adds augref paths)." << endl << " if unspecified, paths get added to target sample." << endl << " --augref-segs FILE write augref segment table to FILE" << endl - << " --verbose print augref progress and coverage summary" << endl << "configuration:" << endl << " -o, --overlay apply a ReferencePathOverlayHelper to the graph" << endl << " -t, --threads N number of threads to use [all available]" << endl - << " applies to -n (snarl finding) and -u (augref)" << endl; + << " applies to -n (snarl finding) and -u (augref)" << endl + << " --progress, --verbose print progress and augref coverage summary" << endl; + } /// Chunk a path and emit it in Graph messages. @@ -124,7 +125,6 @@ int main_paths(int argc, char** argv) { bool extract_as_gam = false; bool extract_as_gaf = false; - bool extract_as_vg = false; bool list_names = false; bool extract_as_fasta = false; bool drop_paths = false; @@ -147,14 +147,14 @@ int main_paths(int argc, char** argv) { const size_t coverage_bins = 10; bool normalize_paths = false; bool overlay = false; + bool progress = false; bool compute_augref = false; int64_t min_augref_length = 50; string augref_sample; - bool augref_verbose = false; string augref_segments_file; string exclude_sample; - constexpr int OPT_VERBOSE = 1001; + constexpr int OPT_PROGRESS = 1001; constexpr int OPT_AUGREF_SEGMENTS = 1002; constexpr int OPT_EXCLUDE_SAMPLE = 1003; @@ -189,6 +189,8 @@ int main_paths(int argc, char** argv) { {"coverage", no_argument, 0, 'c'}, {"overlay", no_argument, 0, 'o'}, {"threads", required_argument, 0, 't'}, + {"progress", no_argument, 0, OPT_PROGRESS}, + {"verbose", no_argument, 0, OPT_PROGRESS}, // Hidden options for backward compatibility. {"threads-old", no_argument, 0, 'T'}, @@ -199,7 +201,6 @@ int main_paths(int argc, char** argv) { {"min-augref-len", required_argument, 0, 'l'}, {"augref-sample", required_argument, 0, 'N'}, {"augref-segs", required_argument, 0, OPT_AUGREF_SEGMENTS}, - {"verbose", no_argument, 0, OPT_VERBOSE}, {"exclude-sample", required_argument, 0, OPT_EXCLUDE_SAMPLE}, {0, 0, 0, 0} @@ -228,8 +229,7 @@ int main_paths(int argc, char** argv) { break; case 'V': - extract_as_vg = true; - output_formats++; + logger.error() << "-V/--extract-vg has been removed" << endl; break; case 'd': @@ -353,8 +353,8 @@ int main_paths(int argc, char** argv) { augref_sample = optarg; break; - case OPT_VERBOSE: - augref_verbose = true; + case OPT_PROGRESS: + progress = true; break; case OPT_AUGREF_SEGMENTS: @@ -399,11 +399,11 @@ int main_paths(int argc, char** argv) { logger.error() << "at least one input format (-x, -g) must be specified" << std::endl; } if (!gbwt_file.empty()) { - bool need_graph = (extract_as_gam || extract_as_gaf || extract_as_vg || drop_paths + bool need_graph = (extract_as_gam || extract_as_gaf || drop_paths || retain_paths || extract_as_fasta || list_lengths); if (need_graph && graph_file.empty()) { logger.error() << "a graph is needed for extracting threads in " - << "-X, -A, -V, -d, -r, -n, -E or -F format" << std::endl; + << "-X, -A, -d, -r, -n, -E or -F format" << std::endl; } if (!need_graph && !graph_file.empty()) { // TODO: This should be an error, but we display a warning instead for backward compatibility. @@ -412,7 +412,7 @@ int main_paths(int argc, char** argv) { } } if (output_formats != 1) { - logger.error() << "one output format (-X, -A, -V, -d, -r, -n, -L, -F, -E, -C or -c) " + logger.error() << "one output format (-X, -A, -d, -r, -n, -L, -F, -E, -C or -c) " << "must be specified" << std::endl; } if (selection_criteria > 1) { @@ -424,9 +424,8 @@ int main_paths(int argc, char** argv) { if (list_metadata && !gbwt_file.empty()) { logger.error() << "listing path metadata is not compatible with a GBWT index" << std::endl; } - if ((drop_paths || retain_paths || normalize_paths) && !gbwt_file.empty()) { - logger.error() << "dropping, retaining or normalizing paths " - << "only works on embedded graph paths, not GBWT threads" << std::endl; + if (normalize_paths && !gbwt_file.empty()) { + logger.error() << "normalizing paths only works on embedded graph paths, not GBWT threads" << std::endl; } if (coverage && !gbwt_file.empty()) { logger.error() << "coverage option -c only works on embedded graph paths, not GBWT threads" << std::endl; @@ -534,7 +533,7 @@ int main_paths(int argc, char** argv) { if (!augref_sample.empty()) { cover.set_augref_sample(augref_sample); } - cover.set_verbose(augref_verbose); + cover.set_verbose(progress); cover.clear(mutable_graph); cover.compute(graph, &snarl_manager, ref_paths, min_augref_length); @@ -567,19 +566,23 @@ int main_paths(int argc, char** argv) { // We may need to emit a stream of Alignments unique_ptr aln_emitter; - // Or we might need to emit a stream of VG Graph objects - unique_ptr> graph_emitter; if (extract_as_gam || extract_as_gaf) { // Open up a GAM/GAF output stream aln_emitter = vg::io::get_non_hts_alignment_emitter("-", extract_as_gaf ? "GAF" : "GAM", {}, get_thread_count(), graph); - } else if (extract_as_vg) { - // Open up a VG Graph chunk output stream - graph_emitter = unique_ptr>(new vg::io::ProtobufEmitter(cout)); } if (gbwt_index) { + // TODO: Use GBWTGraph or a lightweight verison that just uses a + // backing HandleGraph's nodes directly to unify this all with the + // HandleGraph case. The problem there is that GBWTGraph is immutable, + // and we don't have good code for efficiently moving selected stuff + // from immutable graphs to mutable graphs. + // We want to operate on a GBWT instead of the graph. + if (progress) { + logger.info() << "Extracting from GBWT" << std::endl; + } if (!(gbwt_index->hasMetadata() && gbwt_index->metadata.hasPathNames())) { logger.error() << "the GBWT index does not contain path names" << std::endl; @@ -588,12 +591,44 @@ int main_paths(int argc, char** argv) { // Pre-parse some metadata auto gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt_index); + MutablePathMutableHandleGraph* out_graph = nullptr; + std::unique_ptr graph_holder; + if (drop_paths || retain_paths) { + // We need to output a graph. + // Which means we need a graph to start with. + crash_unless(graph); + // Maybe we can use the input graph as the output graph. + out_graph = dynamic_cast(graph); + + if (out_graph) { + // We need to use the GBWT threads *instead* of whatever might + // have been in the graph. + if (progress) { + logger.info() << "Operating on loaded graph" << std::endl; + } + std::vector paths_to_clear; + out_graph->for_each_path_handle([&](const path_handle_t& h) { + paths_to_clear.push_back(h); + }); + out_graph->destroy_paths(paths_to_clear); + } else { + // We need to make a new graph + if (progress) { + logger.info() << "Creating graph copy" << std::endl; + } + graph_holder = std::make_unique(); + out_graph = graph_holder.get(); + // We need to copy the nodes over, but not the paths. + handlegraph::algorithms::copy_handle_graph(graph, out_graph); + } + } + // Select the threads we are interested in. std::vector thread_ids; if (!sample_name.empty()) { thread_ids = threads_for_sample(*gbwt_index, sample_name); } else if(!path_prefix.empty()) { - for (size_t i = 0; i < gbwt_index->metadata.paths(); i++) { + for (gbwt::size_type i = 0; i < gbwt_index->metadata.paths(); i++) { PathSense sense = gbwtgraph::get_path_sense(*gbwt_index, i, gbwt_reference_samples); std::string name = gbwtgraph::compose_path_name(*gbwt_index, i, sense); if (name.length() >= path_prefix.length() && std::equal(path_prefix.begin(), path_prefix.end(), name.begin())) { @@ -633,15 +668,29 @@ int main_paths(int argc, char** argv) { thread_ids = std::move(filtered_ids); } + if (drop_paths) { + // Invert the set of thread IDs + std::unordered_set selected_ids {thread_ids.begin(), thread_ids.end()}; + std::vector inverted_ids; + for (gbwt::size_type i = 0; i < gbwt_index->metadata.paths(); i++) { + if (!selected_ids.count(i)) { + inverted_ids.push_back(i); + } + } + thread_ids = std::move(inverted_ids); + } + if (thread_ids.empty()) { logger.warn() << "no matching paths found in GBWT index" << std::endl; return 0; + } else if (progress) { + logger.info() << "found " << thread_ids.size() << " matching paths in GBWT index" << std::endl; } // Process the threads. for (gbwt::size_type id : thread_ids) { PathSense sense = gbwtgraph::get_path_sense(*gbwt_index, id, gbwt_reference_samples); - std::string name = gbwtgraph::compose_path_name(*gbwt_index, id, sense); + std::string name = gbwtgraph::compose_path_name(*gbwt_index, id, sense); // We are only interested in the name // TODO: do we need to consult list_cyclicity or list_metadata here? @@ -657,9 +706,6 @@ int main_paths(int argc, char** argv) { if (extract_as_gam || extract_as_gaf) { // Write as an Alignment. Must contain the whole path. aln_emitter->emit_singles({alignment_from_path(*graph, path)}); - } else if (extract_as_vg) { - // Write as a Path in a VG - chunk_to_emitter(path, *graph_emitter); } else if (extract_as_fasta) { write_fasta_sequence(name, path_sequence(*graph, path), cout); } @@ -679,6 +725,28 @@ int main_paths(int argc, char** argv) { } cout << path.name() << "\t" << (cyclic ? "cyclic" : "acyclic") << std::endl; } + if (retain_paths || drop_paths) { + // We need to copy this path into the new graph. + path_handle_t new_path = out_graph->create_path( + sense, + gbwtgraph::get_path_sample_name(*gbwt_index, id, sense), + gbwtgraph::get_path_locus_name(*gbwt_index, id, sense), + gbwtgraph::get_path_haplotype(*gbwt_index, id, sense), + gbwtgraph::get_path_phase_block(*gbwt_index, id, sense), + gbwtgraph::get_path_subrange(*gbwt_index, id, sense) + ); + + for (auto& mapping : path.mapping()) { + // Put each step onto the path + out_graph->append_step(new_path, out_graph->get_handle(mapping.position().node_id(), mapping.position().is_reverse())); + } + + } + } + + if (out_graph) { + // Write the graph to standard output. + vg::io::save_handle_graph(out_graph, std::cout); } } else if (graph) { @@ -977,8 +1045,6 @@ int main_paths(int argc, char** argv) { Path path = path_from_path_handle(*graph, path_handle); if (extract_as_gam || extract_as_gaf) { aln_emitter->emit_singles({alignment_from_path(*graph, path)}); - } else if (extract_as_vg) { - chunk_to_emitter(path, *graph_emitter); } else if (extract_as_fasta) { write_fasta_sequence(graph->get_path_name(path_handle), path_sequence(*graph, path), cout); } diff --git a/vgci/vgci.sh b/vgci/vgci.sh index 729c327cac..afd4911481 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -32,7 +32,7 @@ KEEP_INTERMEDIATE_FILES=0 # Should we show stdout and stderr from tests? If so, set to "-s". SHOW_OPT="" # What toil-vg should we install? -TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@a56a860ad31583ce4b1616c9fa2bfb4bfe06d635" +TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@db3dbc9616f60187cdad248f7d7c4d42e83f3549" # What toil should we install? # Could be something like "toil[aws,mesos]==3.20.0" # or "toil[wdl,aws,mesos]@git+https://github.com/DataBiosphere/toil.git@52aa2979f23c24001837b2808552973e7bb263e6" @@ -273,8 +273,8 @@ then # Build the git version file first, so the Docker knows its version make version - docker pull mirror.gcr.io/library/ubuntu:20.04 - docker build --no-cache -t "${DOCKER_TAG}" -f Dockerfile . + docker pull mirror.gcr.io/library/ubuntu:22.04 + docker build -t "${DOCKER_TAG}" -f Dockerfile . if [ "$?" -ne 0 ] then echo "vg docker build fail"