Skip to content
4 changes: 2 additions & 2 deletions scripts/mcmc_Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ $(TOIL_OS)/$(BASENAME).snarls: $(TOIL_OS)/$(BASENAME).vg

$(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg: $(TOIL_OS)/$(BASENAME).vg $(TOIL_OS)/$(BASENAME).gbwt $(TOIL_OS)/$(BASENAME).xg
vg paths -d -v $(TOIL_OS)/$(BASENAME).vg > $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg
vg paths --gbwt $(TOIL_OS)/$(BASENAME).gbwt --extract-vg -x $(TOIL_OS)/$(BASENAME).xg -Q _thread_$(SAMP)_x_$(HAPLO_0) >> $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg
vg paths --gbwt $(TOIL_OS)/$(BASENAME).gbwt --retain-paths -x $(TOIL_OS)/$(BASENAME).xg -Q _thread_$(SAMP)_x_$(HAPLO_0) >> $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg

$(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_0).vg: $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg
vg mod -N $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg > $(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_0).vg
Expand All @@ -57,7 +57,7 @@ $(TOIL_OS)/$(BASENAME)_$(HAPLO_0).gam: $(TOIL_OS)/$(BASENAME).xg $(TOIL_OS)/$(BA

$(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg: $(TOIL_OS)/$(BASENAME).vg $(TOIL_OS)/$(BASENAME).gbwt $(TOIL_OS)/$(BASENAME).xg
vg paths -d -v $(TOIL_OS)/$(BASENAME).vg > $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg
vg paths --gbwt $(TOIL_OS)/$(BASENAME).gbwt --extract-vg -x $(TOIL_OS)/$(BASENAME).xg -Q _thread_$(SAMP)_x_$(HAPLO_1) >> $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg
vg paths --gbwt $(TOIL_OS)/$(BASENAME).gbwt --retain-paths -x $(TOIL_OS)/$(BASENAME).xg -Q _thread_$(SAMP)_x_$(HAPLO_1) >> $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg

$(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_1).vg: $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg
vg mod -N $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg > $(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_1).vg
Expand Down
132 changes: 99 additions & 33 deletions src/subcommand/paths_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include "subcommand.hpp"

#include "../crash.hpp"
#include "../vg.hpp"
#include "../xg.hpp"
#include "../gbwt_helper.hpp"
Expand All @@ -37,11 +38,10 @@ void help_paths(char** argv) {
<< "input:" << endl
<< " -x, --xg FILE use the paths and haplotypes in this graph FILE" << endl
<< " Supports GBZ haplotypes. (also accepts -v, --vg)" << endl
<< " -g, --gbwt FILE use the threads in the GBWT index in FILE" << endl
<< " (graph also required for most output options;" << endl
<< " -g takes priority over -x)" << endl
<< " -g, --gbwt FILE use the threads in the GBWT index in FILE instead " << endl
<< " of graph paths (graph also required for most output " << endl
<< " options)" << endl
<< "output graph (.vg format):" << endl
<< " -V, --extract-vg output a path-only graph covering the selected paths" << endl
<< " -d, --drop-paths output a graph with the selected paths removed" << endl
<< " -r, --retain-paths output a graph with only the selected paths retained" << endl
<< " -n, --normalize-paths output a graph where equivalent paths in a site are" << endl
Expand Down Expand Up @@ -76,11 +76,12 @@ void help_paths(char** argv) {
<< " then adds augref paths)." << endl
<< " if unspecified, paths get added to target sample." << endl
<< " --augref-segs FILE write augref segment table to FILE" << endl
<< " --verbose print augref progress and coverage summary" << endl
<< "configuration:" << endl
<< " -o, --overlay apply a ReferencePathOverlayHelper to the graph" << endl
<< " -t, --threads N number of threads to use [all available]" << endl
<< " applies to -n (snarl finding) and -u (augref)" << endl;
<< " applies to -n (snarl finding) and -u (augref)" << endl
<< " --progress, --verbose print progress and augref coverage summary" << endl;

}

/// Chunk a path and emit it in Graph messages.
Expand Down Expand Up @@ -124,7 +125,6 @@ int main_paths(int argc, char** argv) {

bool extract_as_gam = false;
bool extract_as_gaf = false;
bool extract_as_vg = false;
bool list_names = false;
bool extract_as_fasta = false;
bool drop_paths = false;
Expand All @@ -147,14 +147,14 @@ int main_paths(int argc, char** argv) {
const size_t coverage_bins = 10;
bool normalize_paths = false;
bool overlay = false;
bool progress = false;
bool compute_augref = false;
int64_t min_augref_length = 50;
string augref_sample;
bool augref_verbose = false;
string augref_segments_file;
string exclude_sample;

constexpr int OPT_VERBOSE = 1001;
constexpr int OPT_PROGRESS = 1001;
constexpr int OPT_AUGREF_SEGMENTS = 1002;
constexpr int OPT_EXCLUDE_SAMPLE = 1003;

Expand Down Expand Up @@ -189,6 +189,8 @@ int main_paths(int argc, char** argv) {
{"coverage", no_argument, 0, 'c'},
{"overlay", no_argument, 0, 'o'},
{"threads", required_argument, 0, 't'},
{"progress", no_argument, 0, OPT_PROGRESS},
{"verbose", no_argument, 0, OPT_PROGRESS},

// Hidden options for backward compatibility.
{"threads-old", no_argument, 0, 'T'},
Expand All @@ -199,7 +201,6 @@ int main_paths(int argc, char** argv) {
{"min-augref-len", required_argument, 0, 'l'},
{"augref-sample", required_argument, 0, 'N'},
{"augref-segs", required_argument, 0, OPT_AUGREF_SEGMENTS},
{"verbose", no_argument, 0, OPT_VERBOSE},
{"exclude-sample", required_argument, 0, OPT_EXCLUDE_SAMPLE},

{0, 0, 0, 0}
Expand Down Expand Up @@ -228,8 +229,7 @@ int main_paths(int argc, char** argv) {
break;

case 'V':
extract_as_vg = true;
output_formats++;
logger.error() << "-V/--extract-vg has been removed" << endl;
break;

case 'd':
Expand Down Expand Up @@ -353,8 +353,8 @@ int main_paths(int argc, char** argv) {
augref_sample = optarg;
break;

case OPT_VERBOSE:
augref_verbose = true;
case OPT_PROGRESS:
progress = true;
break;

case OPT_AUGREF_SEGMENTS:
Expand Down Expand Up @@ -399,11 +399,11 @@ int main_paths(int argc, char** argv) {
logger.error() << "at least one input format (-x, -g) must be specified" << std::endl;
}
if (!gbwt_file.empty()) {
bool need_graph = (extract_as_gam || extract_as_gaf || extract_as_vg || drop_paths
bool need_graph = (extract_as_gam || extract_as_gaf || drop_paths
|| retain_paths || extract_as_fasta || list_lengths);
if (need_graph && graph_file.empty()) {
logger.error() << "a graph is needed for extracting threads in "
<< "-X, -A, -V, -d, -r, -n, -E or -F format" << std::endl;
<< "-X, -A, -d, -r, -n, -E or -F format" << std::endl;
}
if (!need_graph && !graph_file.empty()) {
// TODO: This should be an error, but we display a warning instead for backward compatibility.
Expand All @@ -412,7 +412,7 @@ int main_paths(int argc, char** argv) {
}
}
if (output_formats != 1) {
logger.error() << "one output format (-X, -A, -V, -d, -r, -n, -L, -F, -E, -C or -c) "
logger.error() << "one output format (-X, -A, -d, -r, -n, -L, -F, -E, -C or -c) "
<< "must be specified" << std::endl;
}
if (selection_criteria > 1) {
Expand All @@ -424,9 +424,8 @@ int main_paths(int argc, char** argv) {
if (list_metadata && !gbwt_file.empty()) {
logger.error() << "listing path metadata is not compatible with a GBWT index" << std::endl;
}
if ((drop_paths || retain_paths || normalize_paths) && !gbwt_file.empty()) {
logger.error() << "dropping, retaining or normalizing paths "
<< "only works on embedded graph paths, not GBWT threads" << std::endl;
if (normalize_paths && !gbwt_file.empty()) {
logger.error() << "normalizing paths only works on embedded graph paths, not GBWT threads" << std::endl;
}
if (coverage && !gbwt_file.empty()) {
logger.error() << "coverage option -c only works on embedded graph paths, not GBWT threads" << std::endl;
Expand Down Expand Up @@ -534,7 +533,7 @@ int main_paths(int argc, char** argv) {
if (!augref_sample.empty()) {
cover.set_augref_sample(augref_sample);
}
cover.set_verbose(augref_verbose);
cover.set_verbose(progress);
cover.clear(mutable_graph);
cover.compute(graph, &snarl_manager, ref_paths, min_augref_length);

Expand Down Expand Up @@ -567,19 +566,23 @@ int main_paths(int argc, char** argv) {
// We may need to emit a stream of Alignments
unique_ptr<vg::io::AlignmentEmitter> aln_emitter;

// Or we might need to emit a stream of VG Graph objects
unique_ptr<vg::io::ProtobufEmitter<Graph>> graph_emitter;
if (extract_as_gam || extract_as_gaf) {
// Open up a GAM/GAF output stream
aln_emitter = vg::io::get_non_hts_alignment_emitter("-", extract_as_gaf ? "GAF" : "GAM", {}, get_thread_count(),
graph);
} else if (extract_as_vg) {
// Open up a VG Graph chunk output stream
graph_emitter = unique_ptr<vg::io::ProtobufEmitter<Graph>>(new vg::io::ProtobufEmitter<Graph>(cout));
}

if (gbwt_index) {
// TODO: Use GBWTGraph or a lightweight verison that just uses a
// backing HandleGraph's nodes directly to unify this all with the
// HandleGraph case. The problem there is that GBWTGraph is immutable,
// and we don't have good code for efficiently moving selected stuff
// from immutable graphs to mutable graphs.

// We want to operate on a GBWT instead of the graph.
if (progress) {
logger.info() << "Extracting from GBWT" << std::endl;
}

if (!(gbwt_index->hasMetadata() && gbwt_index->metadata.hasPathNames())) {
logger.error() << "the GBWT index does not contain path names" << std::endl;
Expand All @@ -588,12 +591,44 @@ int main_paths(int argc, char** argv) {
// Pre-parse some metadata
auto gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt_index);

MutablePathMutableHandleGraph* out_graph = nullptr;
std::unique_ptr<MutablePathMutableHandleGraph> graph_holder;
if (drop_paths || retain_paths) {
// We need to output a graph.
// Which means we need a graph to start with.
crash_unless(graph);
// Maybe we can use the input graph as the output graph.
out_graph = dynamic_cast<MutablePathMutableHandleGraph*>(graph);

if (out_graph) {
// We need to use the GBWT threads *instead* of whatever might
// have been in the graph.
if (progress) {
logger.info() << "Operating on loaded graph" << std::endl;
}
std::vector<path_handle_t> paths_to_clear;
out_graph->for_each_path_handle([&](const path_handle_t& h) {
paths_to_clear.push_back(h);
});
out_graph->destroy_paths(paths_to_clear);
} else {
// We need to make a new graph
if (progress) {
logger.info() << "Creating graph copy" << std::endl;
}
graph_holder = std::make_unique<bdsg::PackedGraph>();
out_graph = graph_holder.get();
// We need to copy the nodes over, but not the paths.
handlegraph::algorithms::copy_handle_graph(graph, out_graph);
}
}

// Select the threads we are interested in.
std::vector<gbwt::size_type> thread_ids;
if (!sample_name.empty()) {
thread_ids = threads_for_sample(*gbwt_index, sample_name);
} else if(!path_prefix.empty()) {
for (size_t i = 0; i < gbwt_index->metadata.paths(); i++) {
for (gbwt::size_type i = 0; i < gbwt_index->metadata.paths(); i++) {
PathSense sense = gbwtgraph::get_path_sense(*gbwt_index, i, gbwt_reference_samples);
std::string name = gbwtgraph::compose_path_name(*gbwt_index, i, sense);
if (name.length() >= path_prefix.length() && std::equal(path_prefix.begin(), path_prefix.end(), name.begin())) {
Expand Down Expand Up @@ -633,15 +668,29 @@ int main_paths(int argc, char** argv) {
thread_ids = std::move(filtered_ids);
}

if (drop_paths) {
// Invert the set of thread IDs
std::unordered_set<gbwt::size_type> selected_ids {thread_ids.begin(), thread_ids.end()};
std::vector<gbwt::size_type> inverted_ids;
for (gbwt::size_type i = 0; i < gbwt_index->metadata.paths(); i++) {
if (!selected_ids.count(i)) {
inverted_ids.push_back(i);
}
}
thread_ids = std::move(inverted_ids);
}

if (thread_ids.empty()) {
logger.warn() << "no matching paths found in GBWT index" << std::endl;
return 0;
} else if (progress) {
logger.info() << "found " << thread_ids.size() << " matching paths in GBWT index" << std::endl;
}

// Process the threads.
for (gbwt::size_type id : thread_ids) {
PathSense sense = gbwtgraph::get_path_sense(*gbwt_index, id, gbwt_reference_samples);
std::string name = gbwtgraph::compose_path_name(*gbwt_index, id, sense);
std::string name = gbwtgraph::compose_path_name(*gbwt_index, id, sense);

// We are only interested in the name
// TODO: do we need to consult list_cyclicity or list_metadata here?
Expand All @@ -657,9 +706,6 @@ int main_paths(int argc, char** argv) {
if (extract_as_gam || extract_as_gaf) {
// Write as an Alignment. Must contain the whole path.
aln_emitter->emit_singles({alignment_from_path(*graph, path)});
} else if (extract_as_vg) {
// Write as a Path in a VG
chunk_to_emitter(path, *graph_emitter);
} else if (extract_as_fasta) {
write_fasta_sequence(name, path_sequence(*graph, path), cout);
}
Expand All @@ -679,6 +725,28 @@ int main_paths(int argc, char** argv) {
}
cout << path.name() << "\t" << (cyclic ? "cyclic" : "acyclic") << std::endl;
}
if (retain_paths || drop_paths) {
// We need to copy this path into the new graph.
path_handle_t new_path = out_graph->create_path(
sense,
gbwtgraph::get_path_sample_name(*gbwt_index, id, sense),
gbwtgraph::get_path_locus_name(*gbwt_index, id, sense),
gbwtgraph::get_path_haplotype(*gbwt_index, id, sense),
gbwtgraph::get_path_phase_block(*gbwt_index, id, sense),
gbwtgraph::get_path_subrange(*gbwt_index, id, sense)
);

for (auto& mapping : path.mapping()) {
// Put each step onto the path
out_graph->append_step(new_path, out_graph->get_handle(mapping.position().node_id(), mapping.position().is_reverse()));
}

}
}

if (out_graph) {
// Write the graph to standard output.
vg::io::save_handle_graph(out_graph, std::cout);
}
} else if (graph) {

Expand Down Expand Up @@ -977,8 +1045,6 @@ int main_paths(int argc, char** argv) {
Path path = path_from_path_handle(*graph, path_handle);
if (extract_as_gam || extract_as_gaf) {
aln_emitter->emit_singles({alignment_from_path(*graph, path)});
} else if (extract_as_vg) {
chunk_to_emitter(path, *graph_emitter);
} else if (extract_as_fasta) {
write_fasta_sequence(graph->get_path_name(path_handle), path_sequence(*graph, path), cout);
}
Expand Down
6 changes: 3 additions & 3 deletions vgci/vgci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ KEEP_INTERMEDIATE_FILES=0
# Should we show stdout and stderr from tests? If so, set to "-s".
SHOW_OPT=""
# What toil-vg should we install?
TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@a56a860ad31583ce4b1616c9fa2bfb4bfe06d635"
TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@db3dbc9616f60187cdad248f7d7c4d42e83f3549"
# What toil should we install?
# Could be something like "toil[aws,mesos]==3.20.0"
# or "toil[wdl,aws,mesos]@git+https://github.com/DataBiosphere/toil.git@52aa2979f23c24001837b2808552973e7bb263e6"
Expand Down Expand Up @@ -273,8 +273,8 @@ then
# Build the git version file first, so the Docker knows its version
make version

docker pull mirror.gcr.io/library/ubuntu:20.04
docker build --no-cache -t "${DOCKER_TAG}" -f Dockerfile .
docker pull mirror.gcr.io/library/ubuntu:22.04
docker build -t "${DOCKER_TAG}" -f Dockerfile .
if [ "$?" -ne 0 ]
then
echo "vg docker build fail"
Expand Down
Loading