Skip to content

Commit 0c44ef6

Browse files
committed
VCFHeader and VCFHeaderLine refactoring to enable support for VCF4.3/BCF2.2 and bug fixes.
1 parent bf8d8da commit 0c44ef6

60 files changed

Lines changed: 5905 additions & 2107 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

src/main/java/htsjdk/samtools/Defaults.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,11 @@ public class Defaults {
110110
*/
111111
public static final boolean DISABLE_SNAPPY_COMPRESSOR;
112112

113+
/**
114+
* Strict VCF version validation. Default = true.
115+
*/
116+
public static final boolean STRICT_VCF_VERSION_VALIDATION;
117+
113118

114119
public static final String SAMJDK_PREFIX = "samjdk.";
115120
static {
@@ -134,6 +139,7 @@ public class Defaults {
134139
SAM_FLAG_FIELD_FORMAT = SamFlagField.valueOf(getStringProperty("sam_flag_field_format", SamFlagField.DECIMAL.name()));
135140
SRA_LIBRARIES_DOWNLOAD = getBooleanProperty("sra_libraries_download", false);
136141
DISABLE_SNAPPY_COMPRESSOR = getBooleanProperty(DISABLE_SNAPPY_PROPERTY_NAME, false);
142+
STRICT_VCF_VERSION_VALIDATION = getBooleanProperty("strict_version_validation", true);
137143
}
138144

139145
/**

src/main/java/htsjdk/samtools/SAMSequenceDictionary.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@ public SAMSequenceDictionary(final List<SAMSequenceRecord> list) {
5353
setSequences(list);
5454
}
5555

56+
//TODO: this returns sequences in the internal list order instead of
57+
// honoring each sequence's contigIndex
58+
/**
59+
* Get a list of sequences for this dictionary.
60+
* @return the list of sequences for this dictionary in internal order (the order in which the sequences
61+
* were added to this dictionary)
62+
*/
5663
public List<SAMSequenceRecord> getSequences() {
5764
return Collections.unmodifiableList(mSequences);
5865
}
@@ -75,6 +82,14 @@ public void setSequences(final List<SAMSequenceRecord> list) {
7582
list.forEach(this::addSequence);
7683
}
7784

85+
/**
86+
* Add a sequence to the dictionary.
87+
* @param sequenceRecord the sequence record to add - note that this method mutates the contig
88+
* index of the sequenceRecord to match the newly added record's relative
89+
* order in the list
90+
*/
91+
//TODO: this method ignores (and actually mutates) the sequenceRecord's contig index to make it match
92+
// the record's relative placement in the dictionary's internal list
7893
public void addSequence(final SAMSequenceRecord sequenceRecord) {
7994
if (mSequenceMap.containsKey(sequenceRecord.getSequenceName())) {
8095
throw new IllegalArgumentException("Cannot add sequence that already exists in SAMSequenceDictionary: " +

src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java

Lines changed: 11 additions & 170 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,23 @@
1-
package org.broadinstitute.hellbender.utils;
1+
package htsjdk.samtools;
22

3-
import htsjdk.samtools.SAMSequenceDictionary;
4-
import htsjdk.samtools.SAMSequenceRecord;
5-
import org.broadinstitute.hellbender.exceptions.GATKException;
6-
import org.broadinstitute.hellbender.exceptions.UserException;
3+
import htsjdk.utils.ValidationUtils;
74

85
import java.util.*;
96
import java.util.stream.Collectors;
107

118
/**
129
*
13-
* A series of utility functions that enable the GATK to compare two sequence dictionaries -- from the reference,
10+
* A series of utility functions that enable comparison of two sequence dictionaries -- from the reference,
1411
* from BAMs, or from feature sources -- for consistency. The system supports two basic modes: get an enum state that
1512
* describes at a high level the consistency between two dictionaries, or a validateDictionaries that will
1613
* blow up with a UserException if the dicts are too incompatible.
1714
*
1815
* Dictionaries are tested for contig name overlaps, consistency in ordering in these overlap set, and length,
1916
* if available.
2017
*/
21-
public final class SequenceDictionaryUtils {
18+
public final class SAMSequenceDictionaryUtils {
2219

23-
private SequenceDictionaryUtils(){}
20+
private SAMSequenceDictionaryUtils(){}
2421

2522
/**
2623
* Compares sequence records by their order
@@ -59,166 +56,10 @@ public enum SequenceDictionaryCompatibility {
5956
UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths
6057
NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for example)
6158
OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different
62-
// orders with respect to each other
59+
// orders with respect to each other
6360
DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same
64-
// order with respect to each other, but one or more of them have different
65-
// indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 }
66-
}
67-
68-
/**
69-
* Tests for compatibility between two sequence dictionaries, using standard validation settings appropriate
70-
* for the GATK. If the dictionaries are incompatible, then UserExceptions are thrown with detailed error messages.
71-
*
72-
* The standard validation settings used by this method are:
73-
*
74-
* -Require the dictionaries to share a common subset of equivalent contigs
75-
*
76-
* -Do not require dict1 to be a superset of dict2.
77-
*
78-
* -Do not perform checks related to contig ordering: don't throw if the common contigs are in
79-
* different orders with respect to each other, occur at different absolute indices, or are
80-
* lexicographically sorted human dictionaries. GATK uses contig names rather than contig
81-
* indices, and so should not be sensitive to contig ordering issues.
82-
*
83-
* For comparing a CRAM dictionary against a reference dictionary, call
84-
* {@link #validateCRAMDictionaryAgainstReference(SAMSequenceDictionary, SAMSequenceDictionary)} instead.
85-
*
86-
* @param name1 name associated with dict1
87-
* @param dict1 the sequence dictionary dict1
88-
* @param name2 name associated with dict2
89-
* @param dict2 the sequence dictionary dict2
90-
*/
91-
public static void validateDictionaries( final String name1,
92-
final SAMSequenceDictionary dict1,
93-
final String name2,
94-
final SAMSequenceDictionary dict2) {
95-
final boolean requireSuperset = false;
96-
final boolean checkContigOrdering = false;
97-
98-
validateDictionaries(name1, dict1, name2, dict2, requireSuperset, checkContigOrdering);
99-
}
100-
101-
/**
102-
* Tests for compatibility between a reference dictionary and a CRAM dictionary, using appropriate
103-
* validation settings. If the dictionaries are incompatible, then UserExceptions are thrown with
104-
* detailed error messages.
105-
*
106-
* The standard validation settings used by this method are:
107-
*
108-
* -Require the reference dictionary to be a superset of the cram dictionary
109-
*
110-
* -Do not perform checks related to contig ordering: don't throw if the common contigs are in
111-
* different orders with respect to each other, occur at different absolute indices, or are
112-
* lexicographically sorted human dictionaries. GATK uses contig names rather than contig
113-
* indices, and so should not be sensitive to contig ordering issues.
114-
*
115-
* @param referenceDictionary the sequence dictionary for the reference
116-
* @param cramDictionary sequence dictionary from a CRAM file
117-
*/
118-
public static void validateCRAMDictionaryAgainstReference( final SAMSequenceDictionary referenceDictionary,
119-
final SAMSequenceDictionary cramDictionary ) {
120-
// For CRAM, we require the reference dictionary to be a superset of the reads dictionary
121-
final boolean requireSuperset = true;
122-
final boolean checkContigOrdering = false;
123-
124-
validateDictionaries("reference", referenceDictionary, "reads", cramDictionary, requireSuperset, checkContigOrdering);
125-
}
126-
127-
128-
/**
129-
* Tests for compatibility between two sequence dictionaries. If the dictionaries are incompatible, then
130-
* UserExceptions are thrown with detailed error messages.
131-
*
132-
* Two sequence dictionaries are compatible if they share a common subset of equivalent contigs,
133-
* where equivalent contigs are defined as having the same name and length.
134-
*
135-
* @param name1 name associated with dict1
136-
* @param dict1 the sequence dictionary dict1
137-
* @param name2 name associated with dict2
138-
* @param dict2 the sequence dictionary dict2
139-
* @param requireSuperset if true, require that dict1 be a superset of dict2, rather than dict1 and dict2 sharing a common subset
140-
* @param checkContigOrdering if true, require common contigs to be in the same relative order with respect to each other
141-
* and occur at the same absolute indices, and forbid lexicographically-sorted human dictionaries
142-
*/
143-
public static void validateDictionaries( final String name1,
144-
final SAMSequenceDictionary dict1,
145-
final String name2,
146-
final SAMSequenceDictionary dict2,
147-
final boolean requireSuperset,
148-
final boolean checkContigOrdering ) {
149-
Utils.nonNull(dict1, "Something went wrong with sequence dictionary detection, check that "+name1+" has a valid sequence dictionary");
150-
Utils.nonNull(dict2, "Something went wrong with sequence dictionary detection, check that "+name2+" has a valid sequence dictionary");
151-
152-
final SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2, checkContigOrdering);
153-
154-
switch ( type ) {
155-
case IDENTICAL:
156-
return;
157-
case SUPERSET:
158-
return;
159-
case COMMON_SUBSET:
160-
if ( requireSuperset ) {
161-
final Set<String> contigs1 = dict1.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toSet());
162-
final List<String> missingContigs = dict2.getSequences().stream()
163-
.map(SAMSequenceRecord::getSequenceName)
164-
.filter(contig -> !contigs1.contains(contig))
165-
.collect(Collectors.toList());
166-
throw new UserException.IncompatibleSequenceDictionaries(String.format("Dictionary %s is missing contigs found in dictionary %s. Missing contigs: \n %s \n", name1, name2, String.join(", ", missingContigs)), name1, dict1, name2, dict2);
167-
}
168-
return;
169-
case NO_COMMON_CONTIGS:
170-
throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2);
171-
172-
case UNEQUAL_COMMON_CONTIGS: {
173-
final List<SAMSequenceRecord> x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
174-
final SAMSequenceRecord elt1 = x.get(0);
175-
final SAMSequenceRecord elt2 = x.get(1);
176-
throw new UserException.IncompatibleSequenceDictionaries(
177-
String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d",
178-
name1, elt1.getSequenceName(), elt1.getSequenceLength(),
179-
name2, elt2.getSequenceName(), elt2.getSequenceLength()),
180-
name1, dict1, name2, dict2
181-
);
182-
}
183-
184-
case NON_CANONICAL_HUMAN_ORDER: {
185-
// We only get NON_CANONICAL_HUMAN_ORDER if the caller explicitly requested that we check contig ordering,
186-
// so we should always throw when we see it.
187-
final UserException ex;
188-
if ( nonCanonicalHumanContigOrder(dict1) ) {
189-
ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1);
190-
}
191-
else {
192-
ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2);
193-
}
194-
195-
throw ex;
196-
}
197-
198-
case OUT_OF_ORDER: {
199-
// We only get OUT_OF_ORDER if the caller explicitly requested that we check contig ordering,
200-
// so we should always throw when we see it.
201-
throw new UserException.IncompatibleSequenceDictionaries(
202-
"The relative ordering of the common contigs in " + name1 + " and " + name2 +
203-
" is not the same; to fix this please see: "
204-
+ "(https://www.broadinstitute.org/gatk/guide/article?id=1328), "
205-
+ " which describes reordering contigs in BAM and VCF files.",
206-
name1, dict1, name2, dict2);
207-
}
208-
209-
case DIFFERENT_INDICES: {
210-
// We only get DIFFERENT_INDICES if the caller explicitly requested that we check contig ordering,
211-
// so we should always throw when we see it.
212-
final String msg = "One or more contigs common to both dictionaries have " +
213-
"different indices (ie., absolute positions) in each dictionary. Code " +
214-
"that is sensitive to contig ordering can fail when this is the case. " +
215-
"You should fix the sequence dictionaries so that all shared contigs " +
216-
"occur at the same absolute positions in both dictionaries.";
217-
throw new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2);
218-
}
219-
default:
220-
throw new GATKException("Unexpected SequenceDictionaryComparison type: " + type);
221-
}
61+
// order with respect to each other, but one or more of them have different
62+
// indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 }
22263
}
22364

22465
/**
@@ -465,14 +306,14 @@ public static Set<String> getCommonContigsByName(SAMSequenceDictionary dict1, SA
465306
}
466307

467308
public static Set<String> getContigNames(SAMSequenceDictionary dict) {
468-
Set<String> contigNames = new LinkedHashSet<String>(Utils.optimumHashSize(dict.size()));
309+
Set<String> contigNames = new LinkedHashSet<String>(dict.size());
469310
for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
470311
contigNames.add(dictionaryEntry.getSequenceName());
471312
return contigNames;
472313
}
473314

474315
public static List<String> getContigNamesList(final SAMSequenceDictionary refSeqDict) {
475-
Utils.nonNull(refSeqDict, "provided reference sequence ditionary is null");
316+
ValidationUtils.nonNull(refSeqDict, "provided reference sequence ditionary is null");
476317
return refSeqDict.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList());
477318
}
478319

@@ -486,7 +327,7 @@ public static List<String> getContigNamesList(final SAMSequenceDictionary refSeq
486327
* @return A String containing all of the contig names and lengths from the sequence dictionary it's passed
487328
*/
488329
public static String getDictionaryAsString( final SAMSequenceDictionary dict ) {
489-
Utils.nonNull(dict, "Sequence dictionary must be non-null");
330+
ValidationUtils.nonNull(dict, "Sequence dictionary must be non-null");
490331

491332
StringBuilder s = new StringBuilder("[ ");
492333

src/main/java/htsjdk/tribble/TribbleException.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,12 @@ public static class InternalCodecException extends TribbleException {
8686
public InternalCodecException(String message) { super (message); }
8787
}
8888

89+
public static class VersionValidationFailure extends TribbleException {
90+
public VersionValidationFailure(final String message) {
91+
super(String.format("Version validation failure: %s", message));
92+
}
93+
}
94+
8995
// //////////////////////////////////////////////////////////////////////
9096
// Index exceptions
9197
// //////////////////////////////////////////////////////////////////////

src/main/java/htsjdk/variant/bcf2/BCF2Utils.java

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,11 @@
2727

2828
import htsjdk.samtools.util.FileExtensions;
2929
import htsjdk.tribble.TribbleException;
30-
import htsjdk.variant.vcf.*;
30+
import htsjdk.variant.vcf.VCFConstants;
31+
import htsjdk.variant.vcf.VCFHeader;
32+
import htsjdk.variant.vcf.VCFHeaderLine;
33+
import htsjdk.variant.vcf.VCFIDHeaderLine;
34+
import htsjdk.variant.vcf.VCFSimpleHeaderLine;
3135

3236
import java.io.File;
3337
import java.io.FileNotFoundException;
@@ -93,10 +97,15 @@ public static ArrayList<String> makeDictionary(final VCFHeader header) {
9397
// set up the strings dictionary
9498
for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
9599
if ( line.shouldBeAddedToDictionary() ) {
96-
final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line;
97-
if ( ! seen.contains(idLine.getID())) {
98-
dict.add(idLine.getID());
99-
seen.add(idLine.getID());
100+
if (!line.isIDHeaderLine()) {
101+
//is there a better way to ensure that shouldBeAddedToDictionary==true only when isIDHeaderLine==true
102+
throw new TribbleException(String.format(
103+
"The header line %s cannot be added to the BCF dictionary since its not an ID header line",
104+
line));
105+
}
106+
if ( ! seen.contains(line.getID())) {
107+
dict.add(line.getID());
108+
seen.add(line.getID());
100109
}
101110
}
102111
}
@@ -291,7 +300,7 @@ else if ( o.getClass().isArray() ) {
291300
* Are the elements and their order in the output and input headers consistent so that
292301
* we can write out the raw genotypes block without decoding and recoding it?
293302
*
294-
* If the order of INFO, FILTER, or contrig elements in the output header is different than
303+
* If the order of INFO, FILTER, or contig elements in the output header is different than
295304
* in the input header we must decode the blocks using the input header and then recode them
296305
* based on the new output order.
297306
*
@@ -308,15 +317,15 @@ public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHe
308317
if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) )
309318
return false;
310319

311-
final Iterator<? extends VCFIDHeaderLine> outputLinesIt = outputHeader.getIDHeaderLines().iterator();
312-
final Iterator<? extends VCFIDHeaderLine> inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator();
320+
final Iterator<VCFSimpleHeaderLine> outputLinesIt = outputHeader.getIDHeaderLines().iterator();
321+
final Iterator<VCFSimpleHeaderLine> inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator();
313322

314323
while ( inputLinesIt.hasNext() ) {
315324
if ( ! outputLinesIt.hasNext() ) // missing lines in output
316325
return false;
317326

318-
final VCFIDHeaderLine outputLine = outputLinesIt.next();
319-
final VCFIDHeaderLine inputLine = inputLinesIt.next();
327+
final VCFSimpleHeaderLine outputLine = outputLinesIt.next();
328+
final VCFSimpleHeaderLine inputLine = inputLinesIt.next();
320329

321330
if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! inputLine.getID().equals(outputLine.getID()) )
322331
return false;

0 commit comments

Comments
 (0)