Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 11 additions & 47 deletions src/toil_vg/vg_construct.py
Original file line number Diff line number Diff line change
Expand Up @@ -1558,39 +1558,16 @@ def run_make_haplo_thread_graphs(job, context, vg_id, vg_name, output_name, chro
# We know we have haplotype data on this contig.
# Pull out the graph with just the haplotype thread as the only path to vg_with_thread_as_path_path

# To accomplish this we now need to make sure to use vg combine
# to combine the path-only vg Protobuf and the actual graph. So
# first get them in different files.

base_graph_filename = '{}{}_thread_{}_base.vg'.format(output_name, tag, hap)

# strip paths from our original graph
cmd = ['vg', 'paths', '-d', '-v', os.path.basename(vg_path)]
with open(os.path.join(work_dir, base_graph_filename), 'wb') as out_file:
context.runner.call(job, cmd, work_dir = work_dir, outfile = out_file)

path_graph_filename = '{}{}_thread_{}_path.vg'.format(output_name, tag, hap)

# get haplotype thread paths from the gbwt
cmd = ['vg', 'paths', '--gbwt', os.path.basename(gbwt_path), '--extract-vg', '-x', os.path.basename(xg_path)]
vg_with_thread_as_path_path = os.path.join(work_dir, '{}{}_thread_{}_aspath.vg'.format(output_name, tag, hap))
logger.info('Creating thread-as-path graph {}'.format(vg_with_thread_as_path_path))
cmd = ['vg', 'paths', '--gbwt', os.path.basename(gbwt_path), '--retain-paths', '-x', os.path.basename(xg_path)]
for chrom in chroms:
# TODO: vg as of 1.39 can currently only support one
# selection at a time, and will fail if more are provided.
cmd += get_haplotype_selection_options(sample, chrom, hap)
with open(os.path.join(work_dir, path_graph_filename), 'wb') as out_file:
context.runner.call(job, cmd, work_dir = work_dir, outfile = out_file)

# Now combine the two files, adding the paths to the graph
vg_with_thread_as_path_path = os.path.join(work_dir, '{}{}_thread_{}_merge.vg'.format(output_name, tag, hap))
logger.info('Creating thread graph {}'.format(vg_with_thread_as_path_path))
cmd = ['vg', 'combine', '-c', base_graph_filename, path_graph_filename]
with open(vg_with_thread_as_path_path, 'wb') as out_file:
context.runner.call(job, cmd, work_dir = work_dir, outfile = out_file)

# Now delete the intermediates
os.unlink(os.path.join(work_dir, base_graph_filename))
os.unlink(os.path.join(work_dir, path_graph_filename))


# Now trim the graph vg_with_thread_as_path_path into vg_trimmed_path, dropping anything not covered by a path
vg_trimmed_path = os.path.join(work_dir, '{}{}_thread_{}.vg'.format(output_name, tag, hap))
logger.info('Creating trimmed thread graph {}'.format(vg_trimmed_path))
Expand Down Expand Up @@ -1733,30 +1710,17 @@ def run_make_sample_region_graph(job, context, vg_id, vg_name, output_name, chro
# We have actual thread data for the graph. Go extract the relevant threads.
extract_graph_path = os.path.join(work_dir, '{}_{}_extract.vg'.format(output_name, chrom))
logger.info('Creating sample extraction graph {}'.format(extract_graph_path))

# We need to extract two pieces (the base graph and the paths) and combine them.
# Different versions of vg may compress the parts. We need them both uncompressed.
base_path = extract_graph_path + '.1'
paths_path = extract_graph_path + '.2'

with open(base_path, 'wb') as base_file:
# strip paths from our original graph
cmd = ['vg', 'paths', '-d', '-v', os.path.basename(vg_path)]
context.runner.call(job, cmd, work_dir = work_dir, outfile = base_file)

with open(paths_path, 'wb') as paths_file:

# We need to get a graph containing only these thread paths; the
# step after this is to drop anything not on a path.
with open(extract_graph_path, 'wb') as extract_graph_file:
# If we have a nonzero thread count we must have a GBWT.
# Get haplotype thread paths from the index for all haplotypes of the sample.
cmd = ['vg', 'paths', '--gbwt', os.path.basename(gbwt_path), '--extract-vg']
cmd = ['vg', 'paths', '--gbwt', os.path.basename(gbwt_path), '--retain-paths']
cmd += ['-x', os.path.basename(xg_path)]
cmd += get_haplotype_selection_options(sample)
context.runner.call(job, cmd, work_dir = work_dir, outfile = paths_file)

with open(extract_graph_path, 'wb') as extract_graph_file:
# Combine as Protobuf.
cmd = ['vg', 'combine', '-c', os.path.basename(base_path), os.path.basename(paths_path)]
context.runner.call(job, cmd, work_dir = work_dir, outfile = extract_graph_file)

assert os.path.getsize(extract_graph_path) > 4

sample_graph_path = os.path.join(work_dir, '{}_{}.vg'.format(output_name, chrom))
Expand All @@ -1767,7 +1731,7 @@ def run_make_sample_region_graph(job, context, vg_id, vg_name, output_name, chro
if not leave_thread_paths:
cmd.append(['vg', 'paths', '-v', '-', '-d'])
context.runner.call(job, cmd, work_dir = work_dir, outfile = sample_graph_file)

assert os.path.getsize(sample_graph_path) > 4

if validate:
Expand Down