1- package org . broadinstitute . hellbender . utils ;
1+ package htsjdk . samtools ;
22
3- import htsjdk .samtools .SAMSequenceDictionary ;
4- import htsjdk .samtools .SAMSequenceRecord ;
5- import org .broadinstitute .hellbender .exceptions .GATKException ;
6- import org .broadinstitute .hellbender .exceptions .UserException ;
3+ import htsjdk .utils .ValidationUtils ;
74
85import java .util .*;
96import java .util .stream .Collectors ;
107
118/**
129 *
13- * A series of utility functions that enable the GATK to compare two sequence dictionaries -- from the reference,
10+ * A series of utility functions that enable comparison of two sequence dictionaries -- from the reference,
1411 * from BAMs, or from feature sources -- for consistency. The system supports two basic modes: get an enum state that
1512 * describes at a high level the consistency between two dictionaries, or a validateDictionaries that will
1613 * blow up with a UserException if the dicts are too incompatible.
1714 *
1815 * Dictionaries are tested for contig name overlaps, consistency in ordering in these overlap set, and length,
1916 * if available.
2017 */
21- public final class SequenceDictionaryUtils {
18+ public final class SAMSequenceDictionaryUtils {
2219
23- private SequenceDictionaryUtils (){}
20+ private SAMSequenceDictionaryUtils (){}
2421
2522 /**
2623 * Compares sequence records by their order
@@ -59,166 +56,10 @@ public enum SequenceDictionaryCompatibility {
5956 UNEQUAL_COMMON_CONTIGS , // common subset has contigs that have the same name but different lengths
6057 NON_CANONICAL_HUMAN_ORDER , // human reference detected but the order of the contigs is non-standard (lexicographic, for example)
6158 OUT_OF_ORDER , // the two dictionaries overlap but the overlapping contigs occur in different
62- // orders with respect to each other
59+ // orders with respect to each other
6360 DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same
64- // order with respect to each other, but one or more of them have different
65- // indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 }
66- }
67-
68- /**
69- * Tests for compatibility between two sequence dictionaries, using standard validation settings appropriate
70- * for the GATK. If the dictionaries are incompatible, then UserExceptions are thrown with detailed error messages.
71- *
72- * The standard validation settings used by this method are:
73- *
74- * -Require the dictionaries to share a common subset of equivalent contigs
75- *
76- * -Do not require dict1 to be a superset of dict2.
77- *
78- * -Do not perform checks related to contig ordering: don't throw if the common contigs are in
79- * different orders with respect to each other, occur at different absolute indices, or are
80- * lexicographically sorted human dictionaries. GATK uses contig names rather than contig
81- * indices, and so should not be sensitive to contig ordering issues.
82- *
83- * For comparing a CRAM dictionary against a reference dictionary, call
84- * {@link #validateCRAMDictionaryAgainstReference(SAMSequenceDictionary, SAMSequenceDictionary)} instead.
85- *
86- * @param name1 name associated with dict1
87- * @param dict1 the sequence dictionary dict1
88- * @param name2 name associated with dict2
89- * @param dict2 the sequence dictionary dict2
90- */
91- public static void validateDictionaries ( final String name1 ,
92- final SAMSequenceDictionary dict1 ,
93- final String name2 ,
94- final SAMSequenceDictionary dict2 ) {
95- final boolean requireSuperset = false ;
96- final boolean checkContigOrdering = false ;
97-
98- validateDictionaries (name1 , dict1 , name2 , dict2 , requireSuperset , checkContigOrdering );
99- }
100-
101- /**
102- * Tests for compatibility between a reference dictionary and a CRAM dictionary, using appropriate
103- * validation settings. If the dictionaries are incompatible, then UserExceptions are thrown with
104- * detailed error messages.
105- *
106- * The standard validation settings used by this method are:
107- *
108- * -Require the reference dictionary to be a superset of the cram dictionary
109- *
110- * -Do not perform checks related to contig ordering: don't throw if the common contigs are in
111- * different orders with respect to each other, occur at different absolute indices, or are
112- * lexicographically sorted human dictionaries. GATK uses contig names rather than contig
113- * indices, and so should not be sensitive to contig ordering issues.
114- *
115- * @param referenceDictionary the sequence dictionary for the reference
116- * @param cramDictionary sequence dictionary from a CRAM file
117- */
118- public static void validateCRAMDictionaryAgainstReference ( final SAMSequenceDictionary referenceDictionary ,
119- final SAMSequenceDictionary cramDictionary ) {
120- // For CRAM, we require the reference dictionary to be a superset of the reads dictionary
121- final boolean requireSuperset = true ;
122- final boolean checkContigOrdering = false ;
123-
124- validateDictionaries ("reference" , referenceDictionary , "reads" , cramDictionary , requireSuperset , checkContigOrdering );
125- }
126-
127-
128- /**
129- * Tests for compatibility between two sequence dictionaries. If the dictionaries are incompatible, then
130- * UserExceptions are thrown with detailed error messages.
131- *
132- * Two sequence dictionaries are compatible if they share a common subset of equivalent contigs,
133- * where equivalent contigs are defined as having the same name and length.
134- *
135- * @param name1 name associated with dict1
136- * @param dict1 the sequence dictionary dict1
137- * @param name2 name associated with dict2
138- * @param dict2 the sequence dictionary dict2
139- * @param requireSuperset if true, require that dict1 be a superset of dict2, rather than dict1 and dict2 sharing a common subset
140- * @param checkContigOrdering if true, require common contigs to be in the same relative order with respect to each other
141- * and occur at the same absolute indices, and forbid lexicographically-sorted human dictionaries
142- */
143- public static void validateDictionaries ( final String name1 ,
144- final SAMSequenceDictionary dict1 ,
145- final String name2 ,
146- final SAMSequenceDictionary dict2 ,
147- final boolean requireSuperset ,
148- final boolean checkContigOrdering ) {
149- Utils .nonNull (dict1 , "Something went wrong with sequence dictionary detection, check that " +name1 +" has a valid sequence dictionary" );
150- Utils .nonNull (dict2 , "Something went wrong with sequence dictionary detection, check that " +name2 +" has a valid sequence dictionary" );
151-
152- final SequenceDictionaryCompatibility type = compareDictionaries (dict1 , dict2 , checkContigOrdering );
153-
154- switch ( type ) {
155- case IDENTICAL :
156- return ;
157- case SUPERSET :
158- return ;
159- case COMMON_SUBSET :
160- if ( requireSuperset ) {
161- final Set <String > contigs1 = dict1 .getSequences ().stream ().map (SAMSequenceRecord ::getSequenceName ).collect (Collectors .toSet ());
162- final List <String > missingContigs = dict2 .getSequences ().stream ()
163- .map (SAMSequenceRecord ::getSequenceName )
164- .filter (contig -> !contigs1 .contains (contig ))
165- .collect (Collectors .toList ());
166- throw new UserException .IncompatibleSequenceDictionaries (String .format ("Dictionary %s is missing contigs found in dictionary %s. Missing contigs: \n %s \n " , name1 , name2 , String .join (", " , missingContigs )), name1 , dict1 , name2 , dict2 );
167- }
168- return ;
169- case NO_COMMON_CONTIGS :
170- throw new UserException .IncompatibleSequenceDictionaries ("No overlapping contigs found" , name1 , dict1 , name2 , dict2 );
171-
172- case UNEQUAL_COMMON_CONTIGS : {
173- final List <SAMSequenceRecord > x = findDisequalCommonContigs (getCommonContigsByName (dict1 , dict2 ), dict1 , dict2 );
174- final SAMSequenceRecord elt1 = x .get (0 );
175- final SAMSequenceRecord elt2 = x .get (1 );
176- throw new UserException .IncompatibleSequenceDictionaries (
177- String .format ("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d" ,
178- name1 , elt1 .getSequenceName (), elt1 .getSequenceLength (),
179- name2 , elt2 .getSequenceName (), elt2 .getSequenceLength ()),
180- name1 , dict1 , name2 , dict2
181- );
182- }
183-
184- case NON_CANONICAL_HUMAN_ORDER : {
185- // We only get NON_CANONICAL_HUMAN_ORDER if the caller explicitly requested that we check contig ordering,
186- // so we should always throw when we see it.
187- final UserException ex ;
188- if ( nonCanonicalHumanContigOrder (dict1 ) ) {
189- ex = new UserException .LexicographicallySortedSequenceDictionary (name1 , dict1 );
190- }
191- else {
192- ex = new UserException .LexicographicallySortedSequenceDictionary (name2 , dict2 );
193- }
194-
195- throw ex ;
196- }
197-
198- case OUT_OF_ORDER : {
199- // We only get OUT_OF_ORDER if the caller explicitly requested that we check contig ordering,
200- // so we should always throw when we see it.
201- throw new UserException .IncompatibleSequenceDictionaries (
202- "The relative ordering of the common contigs in " + name1 + " and " + name2 +
203- " is not the same; to fix this please see: "
204- + "(https://www.broadinstitute.org/gatk/guide/article?id=1328), "
205- + " which describes reordering contigs in BAM and VCF files." ,
206- name1 , dict1 , name2 , dict2 );
207- }
208-
209- case DIFFERENT_INDICES : {
210- // We only get DIFFERENT_INDICES if the caller explicitly requested that we check contig ordering,
211- // so we should always throw when we see it.
212- final String msg = "One or more contigs common to both dictionaries have " +
213- "different indices (ie., absolute positions) in each dictionary. Code " +
214- "that is sensitive to contig ordering can fail when this is the case. " +
215- "You should fix the sequence dictionaries so that all shared contigs " +
216- "occur at the same absolute positions in both dictionaries." ;
217- throw new UserException .IncompatibleSequenceDictionaries (msg , name1 , dict1 , name2 , dict2 );
218- }
219- default :
220- throw new GATKException ("Unexpected SequenceDictionaryComparison type: " + type );
221- }
61+ // order with respect to each other, but one or more of them have different
62+ // indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 }
22263 }
22364
22465 /**
@@ -465,14 +306,14 @@ public static Set<String> getCommonContigsByName(SAMSequenceDictionary dict1, SA
465306 }
466307
467308 public static Set <String > getContigNames (SAMSequenceDictionary dict ) {
468- Set <String > contigNames = new LinkedHashSet <String >(Utils . optimumHashSize ( dict .size () ));
309+ Set <String > contigNames = new LinkedHashSet <String >(dict .size ());
469310 for (SAMSequenceRecord dictionaryEntry : dict .getSequences ())
470311 contigNames .add (dictionaryEntry .getSequenceName ());
471312 return contigNames ;
472313 }
473314
474315 public static List <String > getContigNamesList (final SAMSequenceDictionary refSeqDict ) {
475- Utils .nonNull (refSeqDict , "provided reference sequence ditionary is null" );
316+ ValidationUtils .nonNull (refSeqDict , "provided reference sequence ditionary is null" );
476317 return refSeqDict .getSequences ().stream ().map (SAMSequenceRecord ::getSequenceName ).collect (Collectors .toList ());
477318 }
478319
@@ -486,7 +327,7 @@ public static List<String> getContigNamesList(final SAMSequenceDictionary refSeq
486327 * @return A String containing all of the contig names and lengths from the sequence dictionary it's passed
487328 */
488329 public static String getDictionaryAsString ( final SAMSequenceDictionary dict ) {
489- Utils .nonNull (dict , "Sequence dictionary must be non-null" );
330+ ValidationUtils .nonNull (dict , "Sequence dictionary must be non-null" );
490331
491332 StringBuilder s = new StringBuilder ("[ " );
492333
0 commit comments