From bf8d8da66213f5f044b4d3e4fd75a0fbbcf0ef19 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 8 Nov 2021 17:11:25 -0500 Subject: [PATCH 01/22] Commit with raw GATK SequenceDictionaryUtils and SequenceDictionaryUtilsTest. --- .../samtools/SAMSequenceDictionaryUtils.java | 505 ++++++++++++++++++ .../SAMSequenceDictionaryUtilsTest.java | 357 +++++++++++++ 2 files changed, 862 insertions(+) create mode 100644 src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java create mode 100644 src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java new file mode 100644 index 0000000000..7f1db9fd94 --- /dev/null +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java @@ -0,0 +1,505 @@ +package org.broadinstitute.hellbender.utils; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.exceptions.UserException; + +import java.util.*; +import java.util.stream.Collectors; + +/** + * + * A series of utility functions that enable the GATK to compare two sequence dictionaries -- from the reference, + * from BAMs, or from feature sources -- for consistency. The system supports two basic modes: compareDictionaries returns an enum state that + * describes at a high level the consistency between two dictionaries, and validateDictionaries + * throws a UserException if the dicts are too incompatible. + * + * Dictionaries are tested for contig name overlaps, consistent ordering within the overlapping set, and length, + * if available. 
+ */ +public final class SequenceDictionaryUtils { + + private SequenceDictionaryUtils(){} + + /** + * Compares sequence records by their order + */ + private static final Comparator SEQUENCE_INDEX_ORDER = Comparator.comparing(SAMSequenceRecord::getSequenceIndex); + + // The following sets of contig records are used to perform the non-canonical human ordering check. + // This check ensures that the order is 1,2,3... instead of 1, 10, 11, 12...2, 20, 21... + + // hg18 + protected static final SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719); + protected static final SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149); + protected static final SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737); + + // hg19 + protected static final SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621); + protected static final SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373); + protected static final SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747); + + // b36 + protected static final SAMSequenceRecord CHR1_B36 = new SAMSequenceRecord("1", 247249719); + protected static final SAMSequenceRecord CHR2_B36 = new SAMSequenceRecord("2", 242951149); + protected static final SAMSequenceRecord CHR10_B36 = new SAMSequenceRecord("10", 135374737); + + // b37 + protected static final SAMSequenceRecord CHR1_B37 = new SAMSequenceRecord("1", 249250621); + protected static final SAMSequenceRecord CHR2_B37 = new SAMSequenceRecord("2", 243199373); + protected static final SAMSequenceRecord CHR10_B37 = new SAMSequenceRecord("10", 135534747); + + + public enum SequenceDictionaryCompatibility { + IDENTICAL, // the dictionaries are identical + COMMON_SUBSET, // there exists a common subset of equivalent contigs + SUPERSET, // the first dict's set of contigs supersets the second dict's set + NO_COMMON_CONTIGS, // no overlap between dictionaries + UNEQUAL_COMMON_CONTIGS, // common 
subset has contigs that have the same name but different lengths + NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for example) + OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different + // orders with respect to each other + DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same + // order with respect to each other, but one or more of them have different + // indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 } + } + + /** + * Tests for compatibility between two sequence dictionaries, using standard validation settings appropriate + * for the GATK. If the dictionaries are incompatible, then UserExceptions are thrown with detailed error messages. + * + * The standard validation settings used by this method are: + * + * -Require the dictionaries to share a common subset of equivalent contigs + * + * -Do not require dict1 to be a superset of dict2. + * + * -Do not perform checks related to contig ordering: don't throw if the common contigs are in + * different orders with respect to each other, occur at different absolute indices, or are + * lexicographically sorted human dictionaries. GATK uses contig names rather than contig + * indices, and so should not be sensitive to contig ordering issues. + * + * For comparing a CRAM dictionary against a reference dictionary, call + * {@link #validateCRAMDictionaryAgainstReference(SAMSequenceDictionary, SAMSequenceDictionary)} instead. 
+ * + * @param name1 name associated with dict1 + * @param dict1 the sequence dictionary dict1 + * @param name2 name associated with dict2 + * @param dict2 the sequence dictionary dict2 + */ + public static void validateDictionaries( final String name1, + final SAMSequenceDictionary dict1, + final String name2, + final SAMSequenceDictionary dict2) { + final boolean requireSuperset = false; + final boolean checkContigOrdering = false; + + validateDictionaries(name1, dict1, name2, dict2, requireSuperset, checkContigOrdering); + } + + /** + * Tests for compatibility between a reference dictionary and a CRAM dictionary, using appropriate + * validation settings. If the dictionaries are incompatible, then UserExceptions are thrown with + * detailed error messages. + * + * The standard validation settings used by this method are: + * + * -Require the reference dictionary to be a superset of the cram dictionary + * + * -Do not perform checks related to contig ordering: don't throw if the common contigs are in + * different orders with respect to each other, occur at different absolute indices, or are + * lexicographically sorted human dictionaries. GATK uses contig names rather than contig + * indices, and so should not be sensitive to contig ordering issues. + * + * @param referenceDictionary the sequence dictionary for the reference + * @param cramDictionary sequence dictionary from a CRAM file + */ + public static void validateCRAMDictionaryAgainstReference( final SAMSequenceDictionary referenceDictionary, + final SAMSequenceDictionary cramDictionary ) { + // For CRAM, we require the reference dictionary to be a superset of the reads dictionary + final boolean requireSuperset = true; + final boolean checkContigOrdering = false; + + validateDictionaries("reference", referenceDictionary, "reads", cramDictionary, requireSuperset, checkContigOrdering); + } + + + /** + * Tests for compatibility between two sequence dictionaries. 
If the dictionaries are incompatible, then + * UserExceptions are thrown with detailed error messages. + * + * Two sequence dictionaries are compatible if they share a common subset of equivalent contigs, + * where equivalent contigs are defined as having the same name and length. + * + * @param name1 name associated with dict1 + * @param dict1 the sequence dictionary dict1 + * @param name2 name associated with dict2 + * @param dict2 the sequence dictionary dict2 + * @param requireSuperset if true, require that dict1 be a superset of dict2, rather than dict1 and dict2 sharing a common subset + * @param checkContigOrdering if true, require common contigs to be in the same relative order with respect to each other + * and occur at the same absolute indices, and forbid lexicographically-sorted human dictionaries + */ + public static void validateDictionaries( final String name1, + final SAMSequenceDictionary dict1, + final String name2, + final SAMSequenceDictionary dict2, + final boolean requireSuperset, + final boolean checkContigOrdering ) { + Utils.nonNull(dict1, "Something went wrong with sequence dictionary detection, check that "+name1+" has a valid sequence dictionary"); + Utils.nonNull(dict2, "Something went wrong with sequence dictionary detection, check that "+name2+" has a valid sequence dictionary"); + + final SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2, checkContigOrdering); + + switch ( type ) { + case IDENTICAL: + return; + case SUPERSET: + return; + case COMMON_SUBSET: + if ( requireSuperset ) { + final Set contigs1 = dict1.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toSet()); + final List missingContigs = dict2.getSequences().stream() + .map(SAMSequenceRecord::getSequenceName) + .filter(contig -> !contigs1.contains(contig)) + .collect(Collectors.toList()); + throw new UserException.IncompatibleSequenceDictionaries(String.format("Dictionary %s is missing contigs found in dictionary %s. 
Missing contigs: \n %s \n", name1, name2, String.join(", ", missingContigs)), name1, dict1, name2, dict2); + } + return; + case NO_COMMON_CONTIGS: + throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2); + + case UNEQUAL_COMMON_CONTIGS: { + final List x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); + final SAMSequenceRecord elt1 = x.get(0); + final SAMSequenceRecord elt2 = x.get(1); + throw new UserException.IncompatibleSequenceDictionaries( + String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d", + name1, elt1.getSequenceName(), elt1.getSequenceLength(), + name2, elt2.getSequenceName(), elt2.getSequenceLength()), + name1, dict1, name2, dict2 + ); + } + + case NON_CANONICAL_HUMAN_ORDER: { + // We only get NON_CANONICAL_HUMAN_ORDER if the caller explicitly requested that we check contig ordering, + // so we should always throw when we see it. + final UserException ex; + if ( nonCanonicalHumanContigOrder(dict1) ) { + ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1); + } + else { + ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2); + } + + throw ex; + } + + case OUT_OF_ORDER: { + // We only get OUT_OF_ORDER if the caller explicitly requested that we check contig ordering, + // so we should always throw when we see it. + throw new UserException.IncompatibleSequenceDictionaries( + "The relative ordering of the common contigs in " + name1 + " and " + name2 + + " is not the same; to fix this please see: " + + "(https://www.broadinstitute.org/gatk/guide/article?id=1328), " + + " which describes reordering contigs in BAM and VCF files.", + name1, dict1, name2, dict2); + } + + case DIFFERENT_INDICES: { + // We only get DIFFERENT_INDICES if the caller explicitly requested that we check contig ordering, + // so we should always throw when we see it. 
+ final String msg = "One or more contigs common to both dictionaries have " + + "different indices (ie., absolute positions) in each dictionary. Code " + + "that is sensitive to contig ordering can fail when this is the case. " + + "You should fix the sequence dictionaries so that all shared contigs " + + "occur at the same absolute positions in both dictionaries."; + throw new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2); + } + default: + throw new GATKException("Unexpected SequenceDictionaryComparison type: " + type); + } + } + + /** + * Workhorse routine that takes two dictionaries and returns their compatibility. + * + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @param checkContigOrdering if true, perform checks related to contig ordering: forbid lexicographically-sorted + * dictionaries, and require common contigs to be in the same relative order and at the + * same absolute indices + * @return A SequenceDictionaryCompatibility enum value describing the compatibility of the two dictionaries + */ + public static SequenceDictionaryCompatibility compareDictionaries( final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2, final boolean checkContigOrdering ) { + if ( checkContigOrdering && (nonCanonicalHumanContigOrder(dict1) || nonCanonicalHumanContigOrder(dict2)) ) { + return SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER; + } + + final Set commonContigs = getCommonContigsByName(dict1, dict2); + + if (commonContigs.isEmpty()) { + return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS; + } + else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) ) { + return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS; + } + + final boolean commonContigsAreInSameRelativeOrder = commonContigsAreInSameRelativeOrder(commonContigs, dict1, dict2); + + if ( checkContigOrdering && ! 
commonContigsAreInSameRelativeOrder ) { + return SequenceDictionaryCompatibility.OUT_OF_ORDER; + } + else if ( commonContigsAreInSameRelativeOrder && commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() ) { + return SequenceDictionaryCompatibility.IDENTICAL; + } + else if ( checkContigOrdering && ! commonContigsAreAtSameIndices(commonContigs, dict1, dict2) ) { + return SequenceDictionaryCompatibility.DIFFERENT_INDICES; + } + else if ( supersets(dict1, dict2) ) { + return SequenceDictionaryCompatibility.SUPERSET; + } + else { + return SequenceDictionaryCompatibility.COMMON_SUBSET; + } + } + + + /** + * Utility function that tests whether dict1's set of contigs is a superset of dict2's + * + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @return true if dict1's set of contigs supersets dict2's + */ + private static boolean supersets( SAMSequenceDictionary dict1, SAMSequenceDictionary dict2 ) { + // Cannot rely on SAMSequenceRecord.equals() as it's too strict (takes extended attributes into account). + for ( final SAMSequenceRecord dict2Record : dict2.getSequences() ) { + final SAMSequenceRecord dict1Record = dict1.getSequence(dict2Record.getSequenceName()); + if ( dict1Record == null || ! sequenceRecordsAreEquivalent(dict2Record, dict1Record) ) { + return false; + } + } + + return true; + } + + + + /** + * Utility function that tests whether the commonContigs in both dicts are equivalent. Equivalence means + * that the seq records have the same length, if both are non-zero. 
+ * + * @param commonContigs + * @param dict1 + * @param dict2 + * @return true if all of the common contigs are equivalent + */ + private static boolean commonContigsHaveSameLengths(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + return findDisequalCommonContigs(commonContigs, dict1, dict2) == null; + } + + /** + * Returns a List(x,y) that contains two disequal sequence records among the common contigs in both dicts. Returns + * null if all common contigs are equivalent + * + * @param commonContigs + * @param dict1 + * @param dict2 + * @return + */ + private static List findDisequalCommonContigs(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + for ( String name : commonContigs ) { + SAMSequenceRecord elt1 = dict1.getSequence(name); + SAMSequenceRecord elt2 = dict2.getSequence(name); + if ( ! sequenceRecordsAreEquivalent(elt1, elt2) ) + return Arrays.asList(elt1,elt2); + } + + return null; + } + + /** + * Helper routine that returns whether two sequence records are equivalent, defined as having the same name and + * lengths. + * + * NOTE: we allow the lengths to differ if one or both are UNKNOWN_SEQUENCE_LENGTH + * + * @param first first sequence record to compare + * @param second second sequence record to compare + * @return true if first and second have the same names and lengths, otherwise false + */ + public static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord first, final SAMSequenceRecord second) { + if ( first == second ) { + return true; + } + if ( first == null || second == null ) { + return false; + } + final int length1 = first.getSequenceLength(); + final int length2 = second.getSequenceLength(); + + if (length1 != length2 && length1 != SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH && length2 != SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH){ + return false; + } + if (! 
+ first.getSequenceName().equals(second.getSequenceName())){ + return false; + } + return true; + } + + /** + * A very simple (and naive) algorithm to determine (1) if the dict is a human reference (hg18, hg19, b36, or b37) and (2) if it's + * lexicographically sorted. Works by matching names and lengths against the static chr1, chr2, and chr10 records, and then if these + * are all matched, requiring that the order be chr1, chr2, chr10. + * + * @param dict the sequence dictionary to check + * @return true if chr1, chr2 and chr10 of a known human reference are all present but not in the order chr1, chr2, chr10 + */ + private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) { + SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null; + for ( SAMSequenceRecord elt : dict.getSequences() ) { + if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19, CHR1_B36, CHR1_B37) ) chr1 = elt; + if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19, CHR2_B36, CHR2_B37) ) chr2 = elt; + if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19, CHR10_B36, CHR10_B37) ) chr10 = elt; + } + if ( chr1 != null && chr2 != null && chr10 != null) { + return ! ( chr1.getSequenceIndex() < chr2.getSequenceIndex() && chr2.getSequenceIndex() < chr10.getSequenceIndex() ); + } + + return false; + } + + /** + * Trivial helper that returns true if elt has the same name and length as any of the given recs + * @param elt record to test + * @param recs the list of records to check for name and length equivalence + * @return true if elt has the same name and length as any of the recs + */ + private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord... recs) { + for (SAMSequenceRecord rec : recs) { + if (elt.getSequenceLength() == rec.getSequenceLength() && elt.getSequenceName().equals(rec.getSequenceName())) { + return true; + } + } + return false; + } + + /** + * Returns true if the common contigs in dict1 and dict2 are in the same relative order, without regard to + * absolute index position. 
This is accomplished by getting the common contigs in both dictionaries, sorting + * these according to their indices, and then walking through the sorted list to ensure that each ordered contig + * is equivalent + * + * @param commonContigs names of the contigs common to both dictionaries + * @param dict1 first SAMSequenceDictionary + * @param dict2 second SAMSequenceDictionary + * @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false + */ + private static boolean commonContigsAreInSameRelativeOrder(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + final List list1 = getSequencesOfName(commonContigs, dict1); + final List list2 = getSequencesOfName(commonContigs, dict2); + list1.sort(SEQUENCE_INDEX_ORDER); + list2.sort(SEQUENCE_INDEX_ORDER); + + for ( int i = 0; i < list1.size(); i++ ) { + SAMSequenceRecord elt1 = list1.get(i); + SAMSequenceRecord elt2 = list2.get(i); + if ( ! elt1.getSequenceName().equals(elt2.getSequenceName()) ) + return false; + } + + return true; + } + + /** + * Gets the subset of SAMSequenceRecords in commonContigs in dict + * + * @param commonContigs + * @param dict + * @return + */ + private static List getSequencesOfName(Set commonContigs, SAMSequenceDictionary dict) { + List l = new ArrayList<>(commonContigs.size()); + for ( String name : commonContigs ) { + l.add(dict.getSequence(name) ); + } + + return l; + } + + /** + * Checks whether the common contigs in the given sequence dictionaries occur at the same indices + * in both dictionaries + * + * @param commonContigs Set of names of the contigs that occur in both dictionaries + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @return true if the contigs common to dict1 and dict2 occur at the same indices in both dictionaries, + * otherwise false + */ + private static boolean commonContigsAreAtSameIndices( final Set commonContigs, final SAMSequenceDictionary 
dict1, final SAMSequenceDictionary dict2 ) { + for ( String commonContig : commonContigs ) { + SAMSequenceRecord dict1Record = dict1.getSequence(commonContig); + SAMSequenceRecord dict2Record = dict2.getSequence(commonContig); + + // Each common contig must have the same index in both dictionaries + if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) { + return false; + } + } + + return true; + } + + /** + * Returns the set of contig names found in both dicts. + * @param dict1 + * @param dict2 + * @return + */ + public static Set getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + Set intersectingSequenceNames = getContigNames(dict1); + intersectingSequenceNames.retainAll(getContigNames(dict2)); + return intersectingSequenceNames; + } + + public static Set getContigNames(SAMSequenceDictionary dict) { + Set contigNames = new LinkedHashSet(Utils.optimumHashSize(dict.size())); + for (SAMSequenceRecord dictionaryEntry : dict.getSequences()) + contigNames.add(dictionaryEntry.getSequenceName()); + return contigNames; + } + + public static List getContigNamesList(final SAMSequenceDictionary refSeqDict) { + Utils.nonNull(refSeqDict, "provided reference sequence ditionary is null"); + return refSeqDict.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList()); + } + + /** + * Returns a compact String representation of the sequence dictionary it's passed + * + * The format of the returned String is: + * [ contig1Name(length: contig1Length) contig2Name(length: contig2Length) ... 
] + * + * @param dict a non-null SAMSequenceDictionary + * @return A String containing all of the contig names and lengths from the sequence dictionary it's passed + */ + public static String getDictionaryAsString( final SAMSequenceDictionary dict ) { + Utils.nonNull(dict, "Sequence dictionary must be non-null"); + + StringBuilder s = new StringBuilder("[ "); + + for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) { + s.append(dictionaryEntry.getSequenceName()); + s.append("(length:"); + s.append(dictionaryEntry.getSequenceLength()); + s.append(") "); + } + + s.append("]"); + + return s.toString(); + } + +} diff --git a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java new file mode 100644 index 0000000000..37842f8a9a --- /dev/null +++ b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java @@ -0,0 +1,357 @@ +package org.broadinstitute.hellbender.utils; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.broadinstitute.hellbender.utils.SequenceDictionaryUtils.*; +import static org.broadinstitute.hellbender.utils.SequenceDictionaryUtils.SequenceDictionaryCompatibility.*; + +public final class SequenceDictionaryUtilsUnitTest extends GATKBaseTest { + + private static Logger logger = LogManager.getLogger(SequenceDictionaryUtilsUnitTest.class); + + @DataProvider( name = "testSequenceRecordsAreEquivalentDataProvider" ) + public Object[][] testSequenceRecordsAreEquivalentDataProvider() { + final SAMSequenceRecord 
CHRM_HG19 = new SAMSequenceRecord("chrM", 16571); + final SAMSequenceRecord CHR_NONSTANDARD1 = new SAMSequenceRecord("NonStandard1", 8675309); + final SAMSequenceRecord CHR1_HG19_WITH_UNKNOWN_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH); + final SAMSequenceRecord CHR1_HG19_WITH_DIFFERENT_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), 123456); + return new Object[][]{ + {CHR1_HG19, CHR1_HG19, true}, + {CHR1_HG19, CHRM_HG19, false}, + {CHR1_HG19, CHR_NONSTANDARD1, false}, + {null, null, true}, + {CHR1_HG19, null, false}, + {null, CHR1_HG19, false}, + {CHR1_HG19, CHR1_HG19_WITH_UNKNOWN_LENGTH, true}, + {CHR1_HG19, CHR1_HG19_WITH_DIFFERENT_LENGTH, false}, + {CHR1_HG19_WITH_UNKNOWN_LENGTH, CHR1_HG19, true}, + {CHR1_HG19_WITH_DIFFERENT_LENGTH, CHR1_HG19, false}, + }; + } + + @Test(dataProvider = "testSequenceRecordsAreEquivalentDataProvider") + public void testSequenceRecordsAreEquivalent(final SAMSequenceRecord one, final SAMSequenceRecord two, final boolean expected){ + final boolean actual = SequenceDictionaryUtils.sequenceRecordsAreEquivalent(one, two); + Assert.assertEquals(actual, expected); + } + + @DataProvider( name = "SequenceDictionaryDataProvider" ) + public Object[][] generateSequenceDictionaryTestData() { + final SAMSequenceRecord CHRM_HG19 = new SAMSequenceRecord("chrM", 16571); + final SAMSequenceRecord CHR_NONSTANDARD1 = new SAMSequenceRecord("NonStandard1", 8675309); + final SAMSequenceRecord CHR_NONSTANDARD2 = new SAMSequenceRecord("NonStandard2", 8675308); + final SAMSequenceRecord CHR1_HG19_WITH_UNKNOWN_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH); + final SAMSequenceRecord CHR1_HG19_WITH_DIFFERENT_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), 123456); + + final SAMSequenceRecord CHR1_HG19_WITH_ATTRIBUTES = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), CHR1_HG19.getSequenceLength()); + 
CHR1_HG19_WITH_ATTRIBUTES.setAttribute("M5", "0dec9660ec1efaaf33281c0d5ea2560f"); + CHR1_HG19_WITH_ATTRIBUTES.setAttribute("UR", "file:/foo/bar"); + + final Class NO_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + final Class UNEQUAL_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + final Class NON_CANONICAL_HUMAN_ORDER_EXCEPTION = UserException.LexicographicallySortedSequenceDictionary.class; + final Class OUT_OF_ORDER_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + final Class DIFFERENT_INDICES_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + + final List hg19AllContigsIntervalSet = Arrays.asList( + new SimpleInterval("chrM", 1, 1), + new SimpleInterval("chr1", 1, 1), + new SimpleInterval("chr2", 1, 1), + new SimpleInterval("chr10", 1, 1)); + final List hg19PartialContigsIntervalSet = Arrays.asList( + new SimpleInterval("chrM", 1, 1), + new SimpleInterval("chr1", 1, 1)); + + return new Object[][] { + // Identical dictionaries: + {Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_B37), Arrays.asList(CHR1_B37), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), 
Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, + + // Dictionaries with a common subset: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, true}, + // If requireSuperset == true, we should get an exception upon COMMON_SUBSET: + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, UserException.IncompatibleSequenceDictionaries.class, true, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHRM_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD1), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, + // If requireSuperset == true, we should get an exception upon COMMON_SUBSET: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, UserException.IncompatibleSequenceDictionaries.class, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), 
Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, + // If checkContigOrdering == false, ordering of the common contigs should not matter: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR10_HG19, CHR1_HG19), COMMON_SUBSET, null, false, false}, + + // Dictionaries with no common contigs: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, true, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, + + // Dictionaries with unequal common contigs: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), Arrays.asList(CHR1_HG19), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { 
Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, true, true}, + { Arrays.asList(CHR1_B36), Arrays.asList(CHR1_B37), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B36, CHR2_B36, CHR10_B36), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18, CHR_NONSTANDARD2), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + + // One or both dictionaries in non-canonical human order: + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, true, true}, + { 
Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, true, true}, + { Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + // If checkContigOrdering == false, we should not get NON_CANONICAL_HUMAN_ORDER: + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + + // Dictionaries with a common subset, but different relative ordering within that subset + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, true, true}, + { Arrays.asList(CHR1_HG19, 
CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, true, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHRM_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHRM_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_B37, CHR2_B37), Arrays.asList(CHR2_B37, CHR1_B37), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + // If checkContigOrdering == false, we should not get OUT_OF_ORDER: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + + // Dictionaries with a common subset in the same relative order, but with different indices. 
+ // This will only throw an exception during validation if checkContigOrdering is true + + // These have checkContigOrdering == true, so we expect DIFFERENT_INDICES and an exception: + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + // Setting requireSuperset == true should make no difference here (we should still get DIFFERENT_INDICES and an exception): + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, 
false, true}, + + // Same test cases as above, but these have checkContigOrdering == false, so we expect SUPERSET or COMMON_SUBSET instead of DIFFERENT_INDICES, and no exception: + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, null, false, false}, + + // tests for SUPERSET + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, 
true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, null, false, false}, + // Extended attributes should be ignored when determining whether a superset exists: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES, CHR10_HG19), SUPERSET, null, false, false} + }; + } + + @Test( dataProvider = "SequenceDictionaryDataProvider" ) + public void testSequenceDictionaryValidation( final List firstDictionaryContigs, + final List secondDictionaryContigs, + final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, //not needed by this test + final Class expectedExceptionUponValidation, + final boolean requireSuperset, + final boolean checkContigOrdering) { + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + final String testDescription = String.format("First dictionary: %s Second dictionary: %s", + SequenceDictionaryUtils.getDictionaryAsString(firstDictionary), + SequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); + Exception exceptionThrown = null; + try { + SequenceDictionaryUtils.validateDictionaries( + "firstDictionary", + firstDictionary, + "secondDictionary", + secondDictionary, + requireSuperset, + checkContigOrdering); + } + catch ( Exception e ) { 
+ exceptionThrown = e; + } + if ( expectedExceptionUponValidation != null ) { + Assert.assertTrue(exceptionThrown != null && expectedExceptionUponValidation.isInstance(exceptionThrown), + String.format("Expected exception %s but saw %s instead. %s", + expectedExceptionUponValidation.getSimpleName(), + exceptionThrown == null ? "no exception" : exceptionThrown.getClass().getSimpleName(), + testDescription)); + } + else { + Assert.assertTrue(exceptionThrown == null, + String.format("Expected no exception but saw exception %s instead. %s", + exceptionThrown != null ? exceptionThrown.getClass().getSimpleName() : "none", + testDescription)); + } + } + + @Test( dataProvider = "SequenceDictionaryDataProvider" ) + public void testSequenceDictionaryComparison( final List firstDictionaryContigs, + final List secondDictionaryContigs, + final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, + final Class expectedExceptionUponValidation, + final boolean requireSuperset, + final boolean checkContigOrdering) { + + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + final String testDescription = String.format("First dictionary: %s Second dictionary: %s", + SequenceDictionaryUtils.getDictionaryAsString(firstDictionary), + SequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); + + final SequenceDictionaryUtils.SequenceDictionaryCompatibility reportedCompatibility = + SequenceDictionaryUtils.compareDictionaries(firstDictionary, secondDictionary, checkContigOrdering); + + Assert.assertTrue(reportedCompatibility == dictionaryCompatibility, + String.format("Dictionary comparison should have returned %s but instead returned %s. 
%s", + dictionaryCompatibility, reportedCompatibility, testDescription)); + } + + @DataProvider(name = "StandardValidationIgnoresContigOrderData") + public Object[][] getStandardValidationIgnoresContigOrderData() { + return new Object[][] { + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19) }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19) }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19) }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR10_HG19, CHR1_HG19) }, + + }; + } + + @Test(dataProvider = "StandardValidationIgnoresContigOrderData") + public void testStandardValidationIgnoresContigOrder( final List firstDictionaryContigs, final List secondDictionaryContigs ) { + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + + // Standard validation (the overload of validateDictionaries() that doesn't take any boolean args) + // should ignore differences in ordering of common contigs, so we shouldn't get an exception here + SequenceDictionaryUtils.validateDictionaries("first", firstDictionary, "second", secondDictionary); + } + + @DataProvider(name = "NonSupersetData") + public Object[][] getNonSupersetData() { + return new Object[][] { + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19) }, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19) } + }; + } + + @Test(dataProvider = "NonSupersetData") + public void testStandardValidationDoesNotRequireSuperset( final List firstDictionaryContigs, final List secondDictionaryContigs ) { + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + + // Standard 
validation (the overload of validateDictionaries() that doesn't take any boolean args) + // should not require a superset relationship, so we shouldn't get an exception here + SequenceDictionaryUtils.validateDictionaries("first", firstDictionary, "second", secondDictionary); + } + + @Test(dataProvider = "NonSupersetData", expectedExceptions = UserException.IncompatibleSequenceDictionaries.class) + public void testCRAMValidationDoesRequireSuperset( final List refDictionaryContigs, final List cramDictionaryContigs ) { + final SAMSequenceDictionary refDictionary = createSequenceDictionary(refDictionaryContigs); + final SAMSequenceDictionary cramDictionary = createSequenceDictionary(cramDictionaryContigs); + + // CRAM validation against the reference SHOULD require a superset relationship, so we should + // get an exception here + SequenceDictionaryUtils.validateCRAMDictionaryAgainstReference(refDictionary, cramDictionary); + } + + @DataProvider(name = "SupersetData") + public Object[][] getSupersetData() { + return new Object[][] { + { Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19)}, //exactly same + { Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19) }, + { Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19) } + }; + } + + @Test(dataProvider = "SupersetData") + public void testCRAMValidationDoesAcceptSuperset( final List refDictionaryContigs, final List cramDictionaryContigs ) { + final SAMSequenceDictionary refDictionary = createSequenceDictionary(refDictionaryContigs); + final SAMSequenceDictionary cramDictionary = createSequenceDictionary(cramDictionaryContigs); + + //In these inputs , cram contigs are subsets of ref contigs and so it should be accepted + SequenceDictionaryUtils.validateCRAMDictionaryAgainstReference(refDictionary, cramDictionary); + } + + private SAMSequenceDictionary createSequenceDictionary( final List contigs ) { + final List 
clonedContigs = new ArrayList(contigs.size()); + + // Clone the individual SAMSequenceRecords to avoid contig-index issues with shared objects + // across multiple dictionaries in tests + for ( SAMSequenceRecord contig : contigs ) { + clonedContigs.add(contig.clone()); + } + + return new SAMSequenceDictionary(clonedContigs); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testGetContigNamesListExpectingException() { + getContigNamesList(null); + } + + @Test + public void testGetContigNamesList() { + + final SAMSequenceDictionary samSequenceDictionary = new SAMSequenceDictionary(Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37)); + + Assert.assertEquals(getContigNamesList(samSequenceDictionary), Arrays.asList("1", "2", "10")); + } +} \ No newline at end of file From f66252a46001952589d7ec1b69ac31613bdc030f Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 8 Nov 2021 17:09:17 -0500 Subject: [PATCH 02/22] VCFHeader and VCFHeaderLine refactoring to enable support for VCF4.3/BCF2.2 and bug fixes. 
--- src/main/java/htsjdk/samtools/Defaults.java | 6 + .../samtools/SAMSequenceDictionary.java | 15 + .../samtools/SAMSequenceDictionaryUtils.java | 181 +--- .../java/htsjdk/tribble/TribbleException.java | 6 + .../java/htsjdk/variant/bcf2/BCF2Utils.java | 29 +- .../variantcontext/writer/VCFWriter.java | 54 +- .../htsjdk/variant/vcf/AbstractVCFCodec.java | 548 +++++++---- .../java/htsjdk/variant/vcf/VCF3Codec.java | 69 +- .../htsjdk/variant/vcf/VCFAltHeaderLine.java | 40 +- .../java/htsjdk/variant/vcf/VCFCodec.java | 127 +-- .../variant/vcf/VCFCompoundHeaderLine.java | 580 ++++++------ .../java/htsjdk/variant/vcf/VCFConstants.java | 26 +- .../variant/vcf/VCFContigHeaderLine.java | 165 +++- .../variant/vcf/VCFFilterHeaderLine.java | 48 +- .../variant/vcf/VCFFormatHeaderLine.java | 61 +- .../java/htsjdk/variant/vcf/VCFHeader.java | 643 +++++++------ .../htsjdk/variant/vcf/VCFHeaderLine.java | 125 ++- .../variant/vcf/VCFHeaderLineCount.java | 69 ++ .../variant/vcf/VCFHeaderLineTranslator.java | 127 +-- .../htsjdk/variant/vcf/VCFHeaderLineType.java | 30 +- .../htsjdk/variant/vcf/VCFHeaderMerger.java | 286 ++++++ .../htsjdk/variant/vcf/VCFHeaderVersion.java | 43 +- .../htsjdk/variant/vcf/VCFInfoHeaderLine.java | 72 +- .../htsjdk/variant/vcf/VCFMetaDataLines.java | 525 +++++++++++ .../htsjdk/variant/vcf/VCFMetaHeaderLine.java | 32 +- .../variant/vcf/VCFPedigreeHeaderLine.java | 42 +- .../htsjdk/variant/vcf/VCFRecordCodec.java | 3 +- .../variant/vcf/VCFSampleHeaderLine.java | 33 +- .../variant/vcf/VCFSimpleHeaderLine.java | 216 +++-- .../variant/vcf/VCFStandardHeaderLines.java | 50 +- .../java/htsjdk/variant/vcf/VCFUtils.java | 150 +-- .../variant/vcf/VCFValidationFailure.java | 63 ++ .../SAMSequenceDictionaryUtilsTest.java | 345 +++---- .../variant/bcf2/BCF2UtilsUnitTest.java | 26 +- .../variant/bcf2/BCF2WriterUnitTest.java | 1 + .../VariantContextTestProvider.java | 1 + .../AsyncVariantContextWriterUnitTest.java | 3 +- .../writer/VCFWriterUnitTest.java | 11 +- 
.../variant/vcf/AbstractVCFCodecTest.java | 69 +- .../variant/vcf/VCFAltHeaderLineUnitTest.java | 43 + .../variant/vcf/VCFCodec43FeaturesTest.java | 34 +- .../vcf/VCFCompoundHeaderLineUnitTest.java | 237 ++++- .../vcf/VCFContigHeaderLineUnitTest.java | 184 ++++ .../htsjdk/variant/vcf/VCFEncoderTest.java | 1 + .../vcf/VCFFormatHeaderLineUnitTest.java | 19 + .../vcf/VCFHeaderLineTranslatorUnitTest.java | 25 +- .../variant/vcf/VCFHeaderLineUnitTest.java | 123 ++- .../variant/vcf/VCFHeaderMergerUnitTest.java | 554 +++++++++++ .../htsjdk/variant/vcf/VCFHeaderUnitTest.java | 875 +++++++++++------- .../variant/vcf/VCFHeaderUnitTestData.java | 203 ++++ .../vcf/VCFInfoHeaderLineUnitTest.java | 86 ++ .../variant/vcf/VCFMetaDataLinesUnitTest.java | 354 +++++++ .../vcf/VCFMetaHeaderLineUnitTest.java | 44 + .../vcf/VCFPedigreeHeaderLineUnitTest.java | 50 + .../vcf/VCFSampleHeaderLineUnitTest.java | 43 + .../vcf/VCFSimpleHeaderLineUnitTest.java | 151 +++ .../vcf/VCFStandardHeaderLinesUnitTest.java | 9 +- .../java/htsjdk/variant/vcf/VCFUtilsTest.java | 55 +- .../resources/htsjdk/variant/HiSeq.10000.vcf | 1 - .../htsjdk/variant/VCF4HeaderTest.vcf | 1 - 60 files changed, 5905 insertions(+), 2107 deletions(-) create mode 100644 src/main/java/htsjdk/variant/vcf/VCFHeaderMerger.java create mode 100644 src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java create mode 100644 src/main/java/htsjdk/variant/vcf/VCFValidationFailure.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFAltHeaderLineUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFFormatHeaderLineUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java create mode 100644 
src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFMetaHeaderLineUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFPedigreeHeaderLineUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFSampleHeaderLineUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFSimpleHeaderLineUnitTest.java diff --git a/src/main/java/htsjdk/samtools/Defaults.java b/src/main/java/htsjdk/samtools/Defaults.java index e2ecf3d1f7..b3db211e20 100644 --- a/src/main/java/htsjdk/samtools/Defaults.java +++ b/src/main/java/htsjdk/samtools/Defaults.java @@ -110,6 +110,11 @@ public class Defaults { */ public static final boolean DISABLE_SNAPPY_COMPRESSOR; + /** + * Strict VCF version validation. Default = true. + */ + public static final boolean STRICT_VCF_VERSION_VALIDATION; + public static final String SAMJDK_PREFIX = "samjdk."; static { @@ -134,6 +139,7 @@ public class Defaults { SAM_FLAG_FIELD_FORMAT = SamFlagField.valueOf(getStringProperty("sam_flag_field_format", SamFlagField.DECIMAL.name())); SRA_LIBRARIES_DOWNLOAD = getBooleanProperty("sra_libraries_download", false); DISABLE_SNAPPY_COMPRESSOR = getBooleanProperty(DISABLE_SNAPPY_PROPERTY_NAME, false); + STRICT_VCF_VERSION_VALIDATION = getBooleanProperty("strict_version_validation", true); } /** diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java index cf40fe6532..1e6cb764e0 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java @@ -53,6 +53,13 @@ public SAMSequenceDictionary(final List list) { setSequences(list); } + //TODO: this returns sequences in the internal list order instead of + // honoring each sequence's contigIndex + /** + * Get a list of sequences for this dictionary. 
+ * @return the list of sequences for this dictionary in internal order (the order in which the sequences + * were added to this dictionary) + */ public List getSequences() { return Collections.unmodifiableList(mSequences); } @@ -75,6 +82,14 @@ public void setSequences(final List list) { list.forEach(this::addSequence); } + /** + * Add a sequence to the dictionary. + * @param sequenceRecord the sequence record to add - note that this method mutates the contig + * index of the sequenceRecord to match the newly added record's relative + * order in the list + */ + //TODO: this method ignores (and actually mutates) the sequenceRecord's contig index to make it match + // the record's relative placement in the dictionary's internal list public void addSequence(final SAMSequenceRecord sequenceRecord) { if (mSequenceMap.containsKey(sequenceRecord.getSequenceName())) { throw new IllegalArgumentException("Cannot add sequence that already exists in SAMSequenceDictionary: " + diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java index 7f1db9fd94..0d5073a0ba 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java @@ -1,16 +1,13 @@ -package org.broadinstitute.hellbender.utils; +package htsjdk.samtools; -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; -import org.broadinstitute.hellbender.exceptions.GATKException; -import org.broadinstitute.hellbender.exceptions.UserException; +import htsjdk.utils.ValidationUtils; import java.util.*; import java.util.stream.Collectors; /** * - * A series of utility functions that enable the GATK to compare two sequence dictionaries -- from the reference, + * A series of utility functions that enable comparison of two sequence dictionaries -- from the reference, * from BAMs, or from feature sources -- for consistency. 
The system supports two basic modes: get an enum state that * describes at a high level the consistency between two dictionaries, or a validateDictionaries that will * blow up with a UserException if the dicts are too incompatible. @@ -18,9 +15,9 @@ * Dictionaries are tested for contig name overlaps, consistency in ordering in these overlap set, and length, * if available. */ -public final class SequenceDictionaryUtils { +public final class SAMSequenceDictionaryUtils { - private SequenceDictionaryUtils(){} + private SAMSequenceDictionaryUtils(){} /** * Compares sequence records by their order @@ -59,166 +56,10 @@ public enum SequenceDictionaryCompatibility { UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for example) OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different - // orders with respect to each other + // orders with respect to each other DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same - // order with respect to each other, but one or more of them have different - // indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 } - } - - /** - * Tests for compatibility between two sequence dictionaries, using standard validation settings appropriate - * for the GATK. If the dictionaries are incompatible, then UserExceptions are thrown with detailed error messages. - * - * The standard validation settings used by this method are: - * - * -Require the dictionaries to share a common subset of equivalent contigs - * - * -Do not require dict1 to be a superset of dict2. 
- * - * -Do not perform checks related to contig ordering: don't throw if the common contigs are in - * different orders with respect to each other, occur at different absolute indices, or are - * lexicographically sorted human dictionaries. GATK uses contig names rather than contig - * indices, and so should not be sensitive to contig ordering issues. - * - * For comparing a CRAM dictionary against a reference dictionary, call - * {@link #validateCRAMDictionaryAgainstReference(SAMSequenceDictionary, SAMSequenceDictionary)} instead. - * - * @param name1 name associated with dict1 - * @param dict1 the sequence dictionary dict1 - * @param name2 name associated with dict2 - * @param dict2 the sequence dictionary dict2 - */ - public static void validateDictionaries( final String name1, - final SAMSequenceDictionary dict1, - final String name2, - final SAMSequenceDictionary dict2) { - final boolean requireSuperset = false; - final boolean checkContigOrdering = false; - - validateDictionaries(name1, dict1, name2, dict2, requireSuperset, checkContigOrdering); - } - - /** - * Tests for compatibility between a reference dictionary and a CRAM dictionary, using appropriate - * validation settings. If the dictionaries are incompatible, then UserExceptions are thrown with - * detailed error messages. - * - * The standard validation settings used by this method are: - * - * -Require the reference dictionary to be a superset of the cram dictionary - * - * -Do not perform checks related to contig ordering: don't throw if the common contigs are in - * different orders with respect to each other, occur at different absolute indices, or are - * lexicographically sorted human dictionaries. GATK uses contig names rather than contig - * indices, and so should not be sensitive to contig ordering issues. 
- * - * @param referenceDictionary the sequence dictionary for the reference - * @param cramDictionary sequence dictionary from a CRAM file - */ - public static void validateCRAMDictionaryAgainstReference( final SAMSequenceDictionary referenceDictionary, - final SAMSequenceDictionary cramDictionary ) { - // For CRAM, we require the reference dictionary to be a superset of the reads dictionary - final boolean requireSuperset = true; - final boolean checkContigOrdering = false; - - validateDictionaries("reference", referenceDictionary, "reads", cramDictionary, requireSuperset, checkContigOrdering); - } - - - /** - * Tests for compatibility between two sequence dictionaries. If the dictionaries are incompatible, then - * UserExceptions are thrown with detailed error messages. - * - * Two sequence dictionaries are compatible if they share a common subset of equivalent contigs, - * where equivalent contigs are defined as having the same name and length. - * - * @param name1 name associated with dict1 - * @param dict1 the sequence dictionary dict1 - * @param name2 name associated with dict2 - * @param dict2 the sequence dictionary dict2 - * @param requireSuperset if true, require that dict1 be a superset of dict2, rather than dict1 and dict2 sharing a common subset - * @param checkContigOrdering if true, require common contigs to be in the same relative order with respect to each other - * and occur at the same absolute indices, and forbid lexicographically-sorted human dictionaries - */ - public static void validateDictionaries( final String name1, - final SAMSequenceDictionary dict1, - final String name2, - final SAMSequenceDictionary dict2, - final boolean requireSuperset, - final boolean checkContigOrdering ) { - Utils.nonNull(dict1, "Something went wrong with sequence dictionary detection, check that "+name1+" has a valid sequence dictionary"); - Utils.nonNull(dict2, "Something went wrong with sequence dictionary detection, check that "+name2+" has a valid sequence 
dictionary"); - - final SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2, checkContigOrdering); - - switch ( type ) { - case IDENTICAL: - return; - case SUPERSET: - return; - case COMMON_SUBSET: - if ( requireSuperset ) { - final Set contigs1 = dict1.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toSet()); - final List missingContigs = dict2.getSequences().stream() - .map(SAMSequenceRecord::getSequenceName) - .filter(contig -> !contigs1.contains(contig)) - .collect(Collectors.toList()); - throw new UserException.IncompatibleSequenceDictionaries(String.format("Dictionary %s is missing contigs found in dictionary %s. Missing contigs: \n %s \n", name1, name2, String.join(", ", missingContigs)), name1, dict1, name2, dict2); - } - return; - case NO_COMMON_CONTIGS: - throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2); - - case UNEQUAL_COMMON_CONTIGS: { - final List x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); - final SAMSequenceRecord elt1 = x.get(0); - final SAMSequenceRecord elt2 = x.get(1); - throw new UserException.IncompatibleSequenceDictionaries( - String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d", - name1, elt1.getSequenceName(), elt1.getSequenceLength(), - name2, elt2.getSequenceName(), elt2.getSequenceLength()), - name1, dict1, name2, dict2 - ); - } - - case NON_CANONICAL_HUMAN_ORDER: { - // We only get NON_CANONICAL_HUMAN_ORDER if the caller explicitly requested that we check contig ordering, - // so we should always throw when we see it. 
- final UserException ex; - if ( nonCanonicalHumanContigOrder(dict1) ) { - ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1); - } - else { - ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2); - } - - throw ex; - } - - case OUT_OF_ORDER: { - // We only get OUT_OF_ORDER if the caller explicitly requested that we check contig ordering, - // so we should always throw when we see it. - throw new UserException.IncompatibleSequenceDictionaries( - "The relative ordering of the common contigs in " + name1 + " and " + name2 + - " is not the same; to fix this please see: " - + "(https://www.broadinstitute.org/gatk/guide/article?id=1328), " - + " which describes reordering contigs in BAM and VCF files.", - name1, dict1, name2, dict2); - } - - case DIFFERENT_INDICES: { - // We only get DIFFERENT_INDICES if the caller explicitly requested that we check contig ordering, - // so we should always throw when we see it. - final String msg = "One or more contigs common to both dictionaries have " + - "different indices (ie., absolute positions) in each dictionary. Code " + - "that is sensitive to contig ordering can fail when this is the case. " + - "You should fix the sequence dictionaries so that all shared contigs " + - "occur at the same absolute positions in both dictionaries."; - throw new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2); - } - default: - throw new GATKException("Unexpected SequenceDictionaryComparison type: " + type); - } + // order with respect to each other, but one or more of them have different + // indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. 
{ chr1, chr2 } } /** @@ -465,14 +306,14 @@ public static Set getCommonContigsByName(SAMSequenceDictionary dict1, SA } public static Set getContigNames(SAMSequenceDictionary dict) { - Set contigNames = new LinkedHashSet(Utils.optimumHashSize(dict.size())); + Set contigNames = new LinkedHashSet(dict.size()); for (SAMSequenceRecord dictionaryEntry : dict.getSequences()) contigNames.add(dictionaryEntry.getSequenceName()); return contigNames; } public static List getContigNamesList(final SAMSequenceDictionary refSeqDict) { - Utils.nonNull(refSeqDict, "provided reference sequence ditionary is null"); + ValidationUtils.nonNull(refSeqDict, "provided reference sequence ditionary is null"); return refSeqDict.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList()); } @@ -486,7 +327,7 @@ public static List getContigNamesList(final SAMSequenceDictionary refSeq * @return A String containing all of the contig names and lengths from the sequence dictionary it's passed */ public static String getDictionaryAsString( final SAMSequenceDictionary dict ) { - Utils.nonNull(dict, "Sequence dictionary must be non-null"); + ValidationUtils.nonNull(dict, "Sequence dictionary must be non-null"); StringBuilder s = new StringBuilder("[ "); diff --git a/src/main/java/htsjdk/tribble/TribbleException.java b/src/main/java/htsjdk/tribble/TribbleException.java index abcbc25ca0..4e2651640b 100644 --- a/src/main/java/htsjdk/tribble/TribbleException.java +++ b/src/main/java/htsjdk/tribble/TribbleException.java @@ -86,6 +86,12 @@ public static class InternalCodecException extends TribbleException { public InternalCodecException(String message) { super (message); } } + public static class VersionValidationFailure extends TribbleException { + public VersionValidationFailure(final String message) { + super(String.format("Version validation failure: %s", message)); + } + } + // ////////////////////////////////////////////////////////////////////// // Index exceptions // 
////////////////////////////////////////////////////////////////////// diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java b/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java index 39478bf069..545ede7497 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java @@ -27,7 +27,11 @@ import htsjdk.samtools.util.FileExtensions; import htsjdk.tribble.TribbleException; -import htsjdk.variant.vcf.*; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFIDHeaderLine; +import htsjdk.variant.vcf.VCFSimpleHeaderLine; import java.io.File; import java.io.FileNotFoundException; @@ -93,10 +97,15 @@ public static ArrayList makeDictionary(final VCFHeader header) { // set up the strings dictionary for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) { if ( line.shouldBeAddedToDictionary() ) { - final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line; - if ( ! seen.contains(idLine.getID())) { - dict.add(idLine.getID()); - seen.add(idLine.getID()); + if (!line.isIDHeaderLine()) { + //is there a better way to ensure that shouldBeAddedToDictionary==true only when isIDHeaderLine==true + throw new TribbleException(String.format( + "The header line %s cannot be added to the BCF dictionary since its not an ID header line", + line)); + } + if ( ! seen.contains(line.getID())) { + dict.add(line.getID()); + seen.add(line.getID()); } } } @@ -291,7 +300,7 @@ else if ( o.getClass().isArray() ) { * Are the elements and their order in the output and input headers consistent so that * we can write out the raw genotypes block without decoding and recoding it? 
* - * If the order of INFO, FILTER, or contrig elements in the output header is different than + * If the order of INFO, FILTER, or contig elements in the output header is different than * in the input header we must decode the blocks using the input header and then recode them * based on the new output order. * @@ -308,15 +317,15 @@ public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHe if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) ) return false; - final Iterator outputLinesIt = outputHeader.getIDHeaderLines().iterator(); - final Iterator inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator(); + final Iterator outputLinesIt = outputHeader.getIDHeaderLines().iterator(); + final Iterator inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator(); while ( inputLinesIt.hasNext() ) { if ( ! outputLinesIt.hasNext() ) // missing lines in output return false; - final VCFIDHeaderLine outputLine = outputLinesIt.next(); - final VCFIDHeaderLine inputLine = inputLinesIt.next(); + final VCFSimpleHeaderLine outputLine = outputLinesIt.next(); + final VCFSimpleHeaderLine inputLine = inputLinesIt.next(); if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! 
inputLine.getID().equals(outputLine.getID()) ) return false; diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java b/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java index 21f1453fbb..1b6edae1d8 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java @@ -27,8 +27,11 @@ import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.Log; import htsjdk.samtools.util.RuntimeIOException; +import htsjdk.tribble.TribbleException; import htsjdk.tribble.index.IndexCreator; +import htsjdk.utils.ValidationUtils; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.vcf.VCFConstants; @@ -36,6 +39,7 @@ import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderVersion; +import htsjdk.variant.vcf.VCFUtils; import java.io.BufferedWriter; import java.io.ByteArrayOutputStream; @@ -45,14 +49,15 @@ import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.file.Path; +import java.util.stream.Collectors; /** * this class writes VCF files */ class VCFWriter extends IndexingVariantContextWriter { + protected final static Log logger = Log.getInstance(VCFWriter.class); - private static final String VERSION_LINE = - VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_2.getFormatString() + "=" + VCFHeaderVersion.VCF4_2.getVersionString(); + private static final String DEFAULT_VERSION_LINE = VCFHeader.DEFAULT_VCF_VERSION.toHeaderVersionLine(); // Initialized when the header is written to the output stream private VCFEncoder vcfEncoder = null; @@ -164,7 +169,7 @@ public void writeHeader(final VCFHeader header) { } public static String getVersionLine() { - return VERSION_LINE; + return DEFAULT_VERSION_LINE; } public static VCFHeader writeHeader(VCFHeader header, 
@@ -175,12 +180,18 @@ public static VCFHeader writeHeader(VCFHeader header, try { rejectVCFV43Headers(header); - // the file format field needs to be written first + // Validate that the file version we're writing is version-compatible with this header's version. + validateHeaderVersion(header, versionLine); + + // The file format field needs to be written first; below any file format lines + // embedded in the header will be removed writer.write(versionLine + "\n"); for (final VCFHeaderLine line : header.getMetaDataInSortedOrder() ) { - if ( VCFHeaderVersion.isFormatString(line.getKey()) ) + // Remove the fileformat header lines + if ( VCFHeaderVersion.isFormatString(line.getKey()) ) { continue; + } writer.write(VCFHeader.METADATA_INDICATOR); writer.write(line.toString()); @@ -189,14 +200,9 @@ public static VCFHeader writeHeader(VCFHeader header, // write out the column line writer.write(VCFHeader.HEADER_INDICATOR); - boolean isFirst = true; - for (final VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) { - if ( isFirst ) - isFirst = false; // don't write out a field separator - else - writer.write(VCFConstants.FIELD_SEPARATOR); - writer.write(field.toString()); - } + writer.write(header.getHeaderFields().stream() + .map(f -> f.name()) + .collect(Collectors.joining(VCFConstants.FIELD_SEPARATOR)).toString()); if ( header.hasGenotypingData() ) { writer.write(VCFConstants.FIELD_SEPARATOR); @@ -274,6 +280,28 @@ private static void rejectVCFV43Headers(final VCFHeader targetHeader) { if (targetHeader.getVCFHeaderVersion() != null && targetHeader.getVCFHeaderVersion().isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { throw new IllegalArgumentException(String.format("Writing VCF version %s is not implemented", targetHeader.getVCFHeaderVersion())); } + } + // Given a header and a requested target output version, see if the header's version is compatible with the + // requested version (where compatible means it's ok to just declare that the header has the requested + //
version). + private static void validateHeaderVersion(final VCFHeader header, final String requestedVersionLine) { + ValidationUtils.nonNull(header); + ValidationUtils.nonNull(requestedVersionLine); + + final VCFHeaderVersion vcfCurrentVersion = header.getVCFHeaderVersion(); + final VCFHeaderVersion vcfRequestedVersion = VCFHeaderVersion.fromHeaderVersionLine(requestedVersionLine); + if (!vcfCurrentVersion.equals(vcfRequestedVersion)) { + if (!VCFHeaderVersion.versionsAreCompatible(vcfRequestedVersion, vcfCurrentVersion)) { + final String message = String.format("Attempting to write a %s VCF header to a %s VCFWriter", + vcfCurrentVersion.getVersionString(), + vcfRequestedVersion.getVersionString()); + if (VCFUtils.isStrictVCFVersionValidation()) { + throw new TribbleException(message); + } + logger.warn(message); + } + } } + } diff --git a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java index bfa718453e..1a1267e5c8 100644 --- a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java @@ -26,12 +26,14 @@ package htsjdk.variant.vcf; import htsjdk.samtools.util.BlockCompressedInputStream; +import htsjdk.samtools.util.Log; import htsjdk.samtools.util.IOUtil; import htsjdk.tribble.AsciiFeatureCodec; import htsjdk.tribble.Feature; import htsjdk.tribble.NameAwareCodec; import htsjdk.tribble.TribbleException; import htsjdk.tribble.index.tabix.TabixFormat; +import htsjdk.tribble.readers.LineIterator; import htsjdk.tribble.util.ParsingUtils; import htsjdk.utils.ValidationUtils; import htsjdk.variant.utils.GeneralUtils; @@ -46,6 +48,8 @@ import java.util.zip.GZIPInputStream; public abstract class AbstractVCFCodec extends AsciiFeatureCodec implements NameAwareCodec { + protected final static Log logger = Log.getInstance(AbstractVCFCodec.class); + public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20); protected
final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th @@ -60,26 +64,22 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec private VCFTextTransformer vcfTextTransformer = passThruTextTransformer; // a mapping of the allele - protected Map> alleleMap = new HashMap>(3); - - // for performance testing purposes - public static boolean validate = true; + protected final Map> alleleMap = new HashMap<>(3); // a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over // todo: make this thread safe? protected String[] parts = null; protected String[] genotypeParts = null; - protected final String[] locParts = new String[6]; // for performance we cache the hashmap of filter encodings for quick lookup - protected HashMap> filterHash = new HashMap>(); + protected final HashMap> filterHash = new HashMap<>(); // we store a name to give to each of the variant contexts we emit protected String name = "Unknown"; protected int lineNo = 0; - protected Map stringCache = new HashMap(); + protected final Map stringCache = new HashMap<>(); protected boolean warnedAboutNoEqualsForNonFlag = false; @@ -117,17 +117,72 @@ class LazyVCFGenotypesParser implements LazyGenotypesContext.LazyParser { @Override public LazyGenotypesContext.LazyData parse(final Object data) { - //System.out.printf("Loading genotypes... 
%s:%d%n", contig, start); return createGenotypeMap((String) data, alleles, contig, start); } } /** - * parse the filter string, first checking to see if we already have parsed it in a previous attempt - * @param filterString the string to parse - * @return a set of the filters applied + * Return true if this codec can decode files with the target version + * @param targetVersion the target version to consider + * @return true if this codec can handle targetVersion + */ + public abstract boolean canDecodeVersion(final VCFHeaderVersion targetVersion); + + /** + * Reads all of the header from the provided iterator, but reads no further. + * @param lineIterator the line reader to take header lines from + * @return The parsed header */ - protected abstract List parseFilters(String filterString); + @Override + public Object readActualHeader(final LineIterator lineIterator) { + final List headerStrings = new ArrayList<>(); + + // Extract one line and retrieve the file format and version, which must be the first line, + // and then add it back into the headerLines. 
+ final VCFHeaderVersion fileFormatVersion = readFormatVersionLine(lineIterator); + headerStrings.add(fileFormatVersion.toHeaderVersionLine()); + + // collect metadata lines until we hit the required header line; stop at the first line that is not a + // header line at all ("##" lines also start with "#") and throw below since there was no header line + while (lineIterator.hasNext() && lineIterator.peek().startsWith(VCFHeader.HEADER_INDICATOR)) { + final String line = lineIterator.peek(); + if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { + lineNo++; + headerStrings.add(lineIterator.next()); + } else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { + lineNo++; + headerStrings.add(lineIterator.next()); + this.header = parseHeaderFromLines(headerStrings, fileFormatVersion); + return this.header; + } + } + throw new TribbleException.InvalidHeader( + "The required header line (starting with one #) is missing in the input VCF file"); + } + + /** + * Read ahead one line to obtain and return the vcf header version for this file + * + * @param headerLineIterator + * @return VCFHeaderVersion for this file + * @throws TribbleException if no file format header line is found in the first line or, the version can't + * be handled by this codec + */ + protected VCFHeaderVersion readFormatVersionLine(final LineIterator headerLineIterator) { + if (headerLineIterator.hasNext()) { + final String headerVersionLine = headerLineIterator.next(); + if (headerVersionLine.startsWith(VCFHeader.METADATA_INDICATOR)) { + final VCFHeaderVersion vcfFileVersion = VCFHeaderVersion.fromHeaderVersionLine(headerVersionLine); + if (!canDecodeVersion(vcfFileVersion)) { + throw new TribbleException.InvalidHeader( + String.format("The \"(%s)\" codec does not support VCF version: %s", getName(), vcfFileVersion)); + } else { + return vcfFileVersion; + } + } + } + throw new TribbleException.InvalidHeader("The VCF version header line is missing"); + } /** * create a VCF header from a set of header record lines @@ -135,180 +190,306 @@ public LazyGenotypesContext.LazyData parse(final Object data) { * @param headerStrings a
list of strings that represent all the ## and # entries * @return a VCFHeader object */ - protected VCFHeader parseHeaderFromLines( final List headerStrings, final VCFHeaderVersion version ) { - this.version = version; + protected VCFHeader parseHeaderFromLines( final List headerStrings, final VCFHeaderVersion sourceVersion ) { + this.version = sourceVersion; - Set metaData = new LinkedHashSet(); - Set sampleNames = new LinkedHashSet(); + final Set metaData = new LinkedHashSet<>(); + Set sampleNames = new LinkedHashSet<>(); int contigCounter = 0; - // iterate over all the passed in strings - for ( String str : headerStrings ) { - if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) { - String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR); - if ( strings.length < VCFHeader.HEADER_FIELDS.values().length ) - throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str); - - int arrayIndex = 0; - for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { - try { - if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex])) - throw new TribbleException.InvalidHeader("we were expecting column name '" + field + "' but we saw '" + strings[arrayIndex] + "'"); - } catch (IllegalArgumentException e) { - throw new TribbleException.InvalidHeader("unknown column name '" + strings[arrayIndex] + "'; it does not match a legal column header name."); - } - arrayIndex++; - } - - boolean sawFormatTag = false; - if ( arrayIndex < strings.length ) { - if ( !strings[arrayIndex].equals("FORMAT") ) - throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'"); - sawFormatTag = true; - arrayIndex++; - } - - while ( arrayIndex < strings.length ) - sampleNames.add(strings[arrayIndex++]); - - if ( sawFormatTag && sampleNames.isEmpty()) - throw new TribbleException.InvalidHeader("The FORMAT field was provided but there is no genotype/sample data"); 
- - // If we're performing sample name remapping and there is exactly one sample specified in the header, replace - // it with the remappedSampleName. Throw an error if there are 0 or multiple samples and remapping was requested - // for this file. - if ( remappedSampleName != null ) { - // We currently only support on-the-fly sample name remapping for single-sample VCFs - if ( sampleNames.isEmpty() || sampleNames.size() > 1 ) { - throw new TribbleException(String.format("Cannot remap sample name to %s because %s samples are specified in the VCF header, and on-the-fly sample name remapping is only supported for single-sample VCFs", - remappedSampleName, sampleNames.isEmpty() ? "no" : "multiple")); - } - - sampleNames.clear(); - sampleNames.add(remappedSampleName); - } + for ( String headerLine : headerStrings ) { + if ( !headerLine.startsWith(VCFHeader.METADATA_INDICATOR) ) { + sampleNames = parsePrimaryHeaderLine(headerLine); } else { - if ( str.startsWith(VCFConstants.INFO_HEADER_START) ) { - final VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7), version); - metaData.add(info); - } else if ( str.startsWith(VCFConstants.FILTER_HEADER_START) ) { - final VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9), version); - metaData.add(filter); - } else if ( str.startsWith(VCFConstants.FORMAT_HEADER_START) ) { - final VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9), version); - metaData.add(format); - } else if ( str.startsWith(VCFConstants.CONTIG_HEADER_START) ) { - final VCFContigHeaderLine contig = new VCFContigHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), contigCounter++); - metaData.add(contig); - } else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) { - metaData.add(getAltHeaderLine(str.substring(VCFConstants.ALT_HEADER_OFFSET), version)); - } else if ( str.startsWith(VCFConstants.PEDIGREE_HEADER_START) && version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - // 
only model pedigree header lines as structured header lines starting with v4.3 - metaData.add(getPedigreeHeaderLine(str.substring(VCFConstants.PEDIGREE_HEADER_OFFSET), version)); - } else if ( str.startsWith(VCFConstants.META_HEADER_START) ) { - metaData.add(getMetaHeaderLine(str.substring(VCFConstants.META_HEADER_OFFSET), version)); - } else if ( str.startsWith(VCFConstants.SAMPLE_HEADER_START) ) { - metaData.add(getSampleHeaderLine(str.substring(VCFConstants.SAMPLE_HEADER_OFFSET), version)); + if ( headerLine.startsWith(VCFConstants.INFO_HEADER_START) ) { + metaData.add(getInfoHeaderLine(headerLine.substring(VCFConstants.INFO_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.FILTER_HEADER_START) ) { + metaData.add(getFilterHeaderLine(headerLine.substring(VCFConstants.FILTER_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.FORMAT_HEADER_START) ) { + metaData.add(getFormatHeaderLine(headerLine.substring(VCFConstants.FORMAT_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.CONTIG_HEADER_START) ) { + metaData.add(getContigHeaderLine(headerLine.substring(VCFConstants.CONTIG_HEADER_OFFSET), sourceVersion, contigCounter++)); + } else if ( headerLine.startsWith(VCFConstants.ALT_HEADER_START) ) { + metaData.add(getAltHeaderLine(headerLine.substring(VCFConstants.ALT_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.PEDIGREE_HEADER_START) ) { + metaData.add(getPedigreeHeaderLine(headerLine.substring(VCFConstants.PEDIGREE_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.META_HEADER_START) ) { + metaData.add(getMetaHeaderLine(headerLine.substring(VCFConstants.META_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.SAMPLE_HEADER_START) ) { + metaData.add(getSampleHeaderLine(headerLine.substring(VCFConstants.SAMPLE_HEADER_OFFSET), sourceVersion)); } else { - int equals = str.indexOf('='); 
- if ( equals != -1 ) - metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1))); + final VCFHeaderLine otherHeaderLine = getOtherHeaderLine( + headerLine.substring(VCFHeader.METADATA_INDICATOR.length()), + sourceVersion); + if (otherHeaderLine != null) + metaData.add(otherHeaderLine); } } } - - setVCFHeader(new VCFHeader(version, metaData, sampleNames), version); - return this.header; + // return the header that is returned by setVCFHeader, since it may be different than the + // one we create here since setVCFHeader calls + // {@link VCFStandardHeaderLines#repairStandardHeaderLines(VCFHeader)}, which can create an + // entirely new "repaired" header. + final VCFHeader vcfHeader = new VCFHeader(metaData, sampleNames); + return setVCFHeader(vcfHeader); } /** - * @return the header that was either explicitly set on this codec, or read from the file. May be null. - * The returned value should not be modified. + * Create and return a VCFInfoHeader object from a header line string that conforms to the {@code sourceVersion} + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header + * line object should be validate for this header version. + * @return a VCFInfoHeaderLine object */ - public VCFHeader getHeader() { - return header; + protected VCFInfoHeaderLine getInfoHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + return new VCFInfoHeaderLine(headerLineString, sourceVersion); } /** - * @return the version number that was either explicitly set on this codec, or read from the file. May be null. 
+ * Create and return a VCFFormatHeader object from a header line string that conforms to the {@code sourceVersion} + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header + * line object should be validate for this header version. + * @return a VCFFormatHeaderLine object */ - public VCFHeaderVersion getVersion() { - return version; + protected VCFFormatHeaderLine getFormatHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + return new VCFFormatHeaderLine(headerLineString, sourceVersion); } /** - * Explicitly set the VCFHeader on this codec. This will overwrite the header read from the file - * and the version state stored in this instance; conversely, reading the header from a file will - * overwrite whatever is set here. - * - * @param newHeader - * @param newVersion - * @return the actual header for this codec. The returned header may not be identical to the header - * argument since the header lines may be "repaired" (i.e., rewritten) if doOnTheFlyModifications is set. - * @throws TribbleException if the requested header version is not compatible with the existing version + * Create and return a VCFFilterHeaderLine object from a header line string that conforms to the {@code sourceVersion} + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header + * line object should be validate for this header version. 
+ * @return a VCFFilterHeaderLine object */ - public VCFHeader setVCFHeader(final VCFHeader newHeader, final VCFHeaderVersion newVersion) { - validateHeaderVersionTransition(newHeader, newVersion); - if (this.doOnTheFlyModifications) { - final VCFHeader repairedHeader = VCFStandardHeaderLines.repairStandardHeaderLines(newHeader); - // validate the new header after repair to ensure the resulting header version is - // still compatible with the current version - validateHeaderVersionTransition(repairedHeader, newVersion); - this.header = repairedHeader; - } else { - this.header = newHeader; - } - - this.version = newVersion; - this.vcfTextTransformer = getTextTransformerForVCFVersion(newVersion); + protected VCFFilterHeaderLine getFilterHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + return new VCFFilterHeaderLine(headerLineString, sourceVersion); + } - return this.header; + /** + * Create and return a VCFContigHeaderLine object from a header line string that conforms to the {@code sourceVersion} + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header + * line object should be valid for this header version. + * @return a VCFContigHeaderLine object + */ + protected VCFContigHeaderLine getContigHeaderLine( + final String headerLineString, + final VCFHeaderVersion sourceVersion, + final int contigIndex) { + return new VCFContigHeaderLine(headerLineString, sourceVersion, contigIndex); } /** * Create and return a VCFAltHeaderLine object from a header line string that conforms to the {@code sourceVersion} - * @param headerLineString VCF header line being parsed without the leading "##ALT=" + * @param headerLineString VCF header line being parsed without the leading "##" * @param sourceVersion the VCF header version derived from which the source was retrieved. 
The resulting header * line object should be validate for this header version. * @return a VCFAltHeaderLine object */ - public VCFAltHeaderLine getAltHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + protected VCFAltHeaderLine getAltHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { return new VCFAltHeaderLine(headerLineString, sourceVersion); } /** * Create and return a VCFPedigreeHeaderLine object from a header line string that conforms to the {@code sourceVersion} - * @param headerLineString VCF header line being parsed without the leading "##PEDIGREE=" + * @param headerLineString VCF header line being parsed without the leading "##" * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header * line object should be validate for this header version. * @return a VCFPedigreeHeaderLine object + * + * NOTE:this can't return a VCFPedigreeHeaderLine since for pre-v4.3 PEDIGREE lines must be modeled as + * VCFHeaderLine due to the lack of a requirement for an ID field */ - public VCFPedigreeHeaderLine getPedigreeHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { - return new VCFPedigreeHeaderLine(headerLineString, sourceVersion); + protected VCFHeaderLine getPedigreeHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + if (sourceVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + return new VCFPedigreeHeaderLine(headerLineString, sourceVersion); + } else { + return new VCFHeaderLine(VCFConstants.PEDIGREE_HEADER_KEY, headerLineString); + } } /** * Create and return a VCFMetaHeaderLine object from a header line string that conforms to the {@code sourceVersion} - * @param headerLineString VCF header line being parsed without the leading "##META=" + * @param headerLineString VCF header line being parsed without the leading "##" * @param sourceVersion the VCF header version derived from which 
the source was retrieved. The resulting header * line object should be validate for this header version. * @return a VCFMetaHeaderLine object */ - public VCFMetaHeaderLine getMetaHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + protected VCFMetaHeaderLine getMetaHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { return new VCFMetaHeaderLine(headerLineString, sourceVersion); } /** * Create and return a VCFSampleHeaderLine object from a header line string that conforms to the {@code sourceVersion} - * @param headerLineString VCF header line being parsed without the leading "##SAMPLE=" + * @param headerLineString VCF header line being parsed without the leading "##" * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header * line object should be validate for this header version. * @return a VCFSampleHeaderLine object */ - public VCFSampleHeaderLine getSampleHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + protected VCFSampleHeaderLine getSampleHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { return new VCFSampleHeaderLine(headerLineString, sourceVersion); } + /** + * Create and return a header line that is not modeled by a specific VCFHeaderLine subclass, ie., its not + * a info/format/contig/alt/pedigree/meta/sample VCFHeaderLine. This may return either a VCFSimpleHeaderLine + * or a VCFHeaderLine. 
+ * + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion VCFHeaderVersion being parsed + * @return a VCFHeaderLine + */ + protected VCFHeaderLine getOtherHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + final int indexOfEquals = headerLineString.indexOf('='); + if (indexOfEquals < 1) { // must at least have "?=" + if (VCFUtils.isStrictVCFVersionValidation()) { + throw new TribbleException.InvalidHeader("Unrecognized metadata line type: " + headerLineString); + } + logger.warn("Dropping unrecognized VCFHeader metadata line type: " + headerLineString); + return null; + } + final String headerLineValue = headerLineString.substring(indexOfEquals + 1).trim(); + if (headerLineValue.startsWith("<") && headerLineValue.endsWith(">")) { + if (sourceVersion.isAtLeastAsRecentAs((VCFHeaderVersion.VCF4_3)) || headerLineString.contains(""), + // but which do not contain an ID attribute, i.e., GATK Funcotator uses v4.1 ClinVar test + // files with lines like that look like this: + // + // "ID=" + // + // where the key is "ID", and no ID attribute is present + return new VCFHeaderLine(headerLineString.substring(0, indexOfEquals), headerLineString.substring(indexOfEquals + 1)); + } + } else { + return new VCFHeaderLine(headerLineString.substring(0, indexOfEquals), headerLineString.substring(indexOfEquals + 1)); + } + } + + // Parse the primary header line of the form: + // + // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ... + // + // The string passed in is the first non-metadata line we've seen, so it should conform. 
+ // + private Set parsePrimaryHeaderLine(final String headerLine) { + final Set sampleNames = new LinkedHashSet<>(); + + final String[] columns = headerLine.substring(1).split(VCFConstants.FIELD_SEPARATOR); + if ( columns.length < VCFHeader.HEADER_FIELDS.values().length ) { + throw new TribbleException.InvalidHeader("not enough columns present in header line: " + headerLine); + } + + int col = 0; + for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { + try { + if (field != VCFHeader.HEADER_FIELDS.valueOf(columns[col])) { + throw new TribbleException.InvalidHeader("expected column headerLineID '" + field + "' but saw '" + columns[col] + "'"); + } + } catch (IllegalArgumentException e) { + throw new TribbleException.InvalidHeader("column headerLineID '" + columns[col] + "' is not a legal column header headerLineID."); + } + col++; + } + + boolean sawFormatTag = false; + if ( col < columns.length ) { + if ( !columns[col].equals("FORMAT") ) + throw new TribbleException.InvalidHeader("expected column headerLineID 'FORMAT' but saw '" + columns[col] + "'"); + sawFormatTag = true; + col++; + } + + while ( col < columns.length ) { + sampleNames.add(columns[col++]); + } + + if ( sawFormatTag && sampleNames.isEmpty()) + throw new TribbleException.InvalidHeader("The FORMAT field was provided but there is no genotype/sample data"); + + // If we're performing sample name remapping and there is exactly one sample specified in the header, replace + // it with the remappedSampleName. Throw an error if there are 0 or multiple samples and remapping was requested + // for this file. 
+ if ( remappedSampleName != null ) { + // We currently only support on-the-fly sample name remapping for single-sample VCFs + if ( sampleNames.isEmpty() || sampleNames.size() > 1 ) { + throw new TribbleException( + String.format("Cannot remap sample headerLineID to %s because %s samples are specified in the VCF header, " + + "and on-the-fly sample headerLineID remapping is only supported for single-sample VCFs", + remappedSampleName, sampleNames.isEmpty() ? "no" : "multiple")); + } + + sampleNames.clear(); + sampleNames.add(remappedSampleName); + } + + return sampleNames; + } + + /** + * @return the header that was either explicitly set on this codec, or read from the file. May be null. + * The returned value should not be modified. + */ + public VCFHeader getHeader() { + return header; + } + + /** + * @return the version number that was either explicitly set on this codec, or read from the file. May be null. + */ + public VCFHeaderVersion getVersion() { + return version; + } + + @Deprecated // starting after version 2.24.1 + //Note: this is currently used by Disq + public VCFHeader setVCFHeader(final VCFHeader newHeader, final VCFHeaderVersion newVersion) { + ValidationUtils.nonNull(newHeader); + ValidationUtils.nonNull(newVersion); + ValidationUtils.validateArg( + newHeader.getVCFHeaderVersion().equals(newVersion), + "new version must equal the newHeader's version"); + return setVCFHeader(newHeader); + } + + /** + * Set the VCFHeader for this codec. The final header may be a complete replacement for the + * provided input header, since header lines may be "repaired" (upgraded to vcf v4.2) if + * doOnTheFlyModifications is set. See + * {@link VCFStandardHeaderLines#repairStandardHeaderLines(VCFHeader)}. + * + * @param newHeader the new header to be used by this codec + * @return the actual header that is established for this codec. See {@link + * VCFStandardHeaderLines#repairStandardHeaderLines(VCFHeader)}. 
+ */ + public VCFHeader setVCFHeader(final VCFHeader newHeader) { + ValidationUtils.nonNull(newHeader); + + if (this.doOnTheFlyModifications) { + // calling this with a header that has any pre-v4.3 version will always result in a header + // with version vcfV4.2, no matter what the header version originally was, since the "repair" + // operation is essentially a transform of the header so that it conforms with header line rules + // as of 4.2 + this.header = VCFStandardHeaderLines.repairStandardHeaderLines(newHeader); + } else { + this.header = newHeader; + } + this.version = this.header.getVCFHeaderVersion(); + // Obtain a text transformer (technically, this should be based on the ORIGINAL header version, not + // the updated version after repairStandardHeaderLines is called), but it doesn't matter in practice + // since the transformer only differs starting with 4.3. + this.vcfTextTransformer = getTextTransformerForVCFVersion(this.version); + + return this.header; + } + /** * the fast decode function * @param line the line of text for the record @@ -328,28 +509,6 @@ public VariantContext decode(String line) { return decodeLine(line, true); } - /** - * Throw if new a version/header are not compatible with the existing version/header. Generally, any version - * before v4.2 can be up-converted to v4.2, but not to v4.3. Once a header is established as v4.3, it cannot - * can not be up or down converted, and it must remain at v4.3. 
- * @param newHeader - * @param newVersion - * @throws TribbleException if the header conversion is not valid - */ - private void validateHeaderVersionTransition(final VCFHeader newHeader, final VCFHeaderVersion newVersion) { - ValidationUtils.nonNull(newHeader); - ValidationUtils.nonNull(newVersion); - - VCFHeader.validateVersionTransition(version, newVersion); - - // If this codec currently has no header (this happens when the header is being established for - // the first time during file parsing), establish an initial header and version, and bypass - // validation. - if (header != null && newHeader.getVCFHeaderVersion() != null) { - VCFHeader.validateVersionTransition(header.getVCFHeaderVersion(), newHeader.getVCFHeaderVersion()); - } - } - /** * For v4.3 up, attribute values can contain embedded percent-encoded characters which must be decoded * on read. Return a version-aware text transformer that can decode encoded text. @@ -421,7 +580,7 @@ else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) final String alts = parts[4]; builder.log10PError(parseQual(parts[5])); - final List filters = parseFilters(getCachedString(parts[6])); + final Set filters = parseFilters(getCachedString(parts[6])); if ( filters != null ) { builder.filters(new HashSet<>(filters)); } @@ -432,7 +591,7 @@ else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) // update stop with the end key if provided try { builder.stop(Integer.parseInt(attrs.get(VCFConstants.END_KEY).toString())); - } catch (Exception e) { + } catch (NumberFormatException e) { generateException("the END value in the INFO field is not valid"); } } else { @@ -499,20 +658,64 @@ protected String getCachedString(String str) { return internedString; } + /** + * parse the filter string, first checking to see if we already have parsed it in a previous attempt + * @param filterString the string to parse + * @return a set of the filters applied + */ + protected Set parseFilters(final String filterString) { + // null for 
unfiltered + if ( filterString.equals(VCFConstants.UNFILTERED) ) + return null; + + if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) + return Collections.emptySet(); + if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) + generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter headerLineID in vcf4", lineNo); + if (filterString.isEmpty()) + generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); + + // do we have the filter string cached? + if ( filterHash.containsKey(filterString) ) + return filterHash.get(filterString); + + // empty set for passes filters + final Set fFields = new HashSet<>(); + // otherwise we have to parse and cache the value + if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) + fFields.add(filterString); + else { + // Variant context uses a Set to store these, so duplicates have historically been + // dropped in previous versions. Delegate handling of warning for these to the + // specific codec subclass. + String[] filters = filterString.split(VCFConstants.FILTER_CODE_SEPARATOR); + for (int i = 0; i < filters.length; i++) { + if (!fFields.add(filters[i])) { + reportDuplicateFilterIDs(filters[i], lineNo); + } + } + } + + filterHash.put(filterString, Collections.unmodifiableSet(fFields)); + + return fFields; + } + /** * parse out the info fields * @param infoField the fields * @return a mapping of keys to objects */ - private Map parseInfo(String infoField) { - Map attributes = new HashMap(); + protected Map parseInfo(String infoField) { + Map attributes = new HashMap<>(); if ( infoField.isEmpty() ) generateException("The VCF specification requires a valid (non-zero length) info field"); if ( !infoField.equals(VCFConstants.EMPTY_INFO_FIELD) ) { - if ( infoField.indexOf('\t') != -1 || infoField.indexOf(' ') != -1 ) - generateException("The VCF specification does not allow for whitespace in the INFO field. 
Offending field value was \"" + infoField + "\""); + if ( infoField.indexOf('\t') != -1 ) { + generateException("The VCF specification does not allow for tab characters in the INFO field. Offending field value was \"" + infoField + "\""); + } List infoFields = ParsingUtils.split(infoField, VCFConstants.INFO_FIELD_SEPARATOR_CHAR); for (int i = 0; i < infoFields.size(); i++) { @@ -540,8 +743,8 @@ private Map parseInfo(String infoField) { key = infoFields.get(i); final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key); if ( headerLine != null && headerLine.getType() != VCFHeaderLineType.Flag ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED && ! warnedAboutNoEqualsForNonFlag ) { - System.err.println("Found info key " + key + " without a = value, but the header says the field is of type " + if ( !warnedAboutNoEqualsForNonFlag ) { + logger.warn("Found info key " + key + " without a = value, but the header says the field is of type " + headerLine.getType() + " but this construct is only value for FLAG type fields"); warnedAboutNoEqualsForNonFlag = true; } @@ -555,6 +758,10 @@ private Map parseInfo(String infoField) { // this line ensures that key/value pairs that look like key=; are parsed correctly as MISSING if ( "".equals(value) ) value = VCFConstants.MISSING_VALUE_v4; + if (attributes.containsKey(key)) { + reportDuplicateInfoKeyValue(key, infoField, lineNo); + } + attributes.put(key, value); } } @@ -562,6 +769,23 @@ private Map parseInfo(String infoField) { return attributes; } + /** + * Handle reporting of duplicate filter IDs + * + * @param duplicateFilterString the duplicate filter string + * @param lineNo line number of the offending line + */ + protected void reportDuplicateFilterIDs(final String duplicateFilterString, final int lineNo) {} + + /** + * Handle reporting of duplicate info line field values + * + * @param duplicateKey the key name of the field that is duplicated + * @param infoField the entire info field line + * @param lineNo line number of the
offending line + */ + protected void reportDuplicateInfoKeyValue(final String duplicateKey, final String infoField, final int lineNo) { } + /** * create a an allele from an index and an array of alleles * @param index the index @@ -796,8 +1020,8 @@ public LazyGenotypesContext.LazyData createGenotypeMap(final String str, } else if ( missing ) { // if its truly missing (there no provided value) skip adding it to the attributes } else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) { - final List filters = parseFilters(getCachedString(genotypeValues.get(i))); - if ( filters != null ) gb.filters(filters); + final Set filters = parseFilters(getCachedString(genotypeValues.get(i))); + if ( filters != null ) gb.filters(new ArrayList<>(filters)); } else if ( genotypeValues.get(i).equals(VCFConstants.MISSING_VALUE_v4) ) { // don't add missing values to the map } else { @@ -880,11 +1104,11 @@ public void setRemappedSampleName( final String remappedSampleName ) { } protected void generateException(String message) { - throw new TribbleException(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); + throw new TribbleException(String.format("Failure parsing VCF file at (approximately) line number %d: %s", lineNo, message)); } protected static void generateException(String message, int lineNo) { - throw new TribbleException(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); + throw new TribbleException(String.format("Failure parsing VCF file at (approximately) line number %d: %s", lineNo, message)); } @Override diff --git a/src/main/java/htsjdk/variant/vcf/VCF3Codec.java b/src/main/java/htsjdk/variant/vcf/VCF3Codec.java index e9ca3abdf7..3c19a7f051 100644 --- a/src/main/java/htsjdk/variant/vcf/VCF3Codec.java +++ b/src/main/java/htsjdk/variant/vcf/VCF3Codec.java @@ -25,12 +25,9 @@ package htsjdk.variant.vcf; -import htsjdk.tribble.TribbleException; -import 
htsjdk.tribble.readers.LineIterator; - -import java.util.ArrayList; import java.util.Arrays; -import java.util.List; +import java.util.HashSet; +import java.util.Set; /** @@ -53,45 +50,19 @@ public class VCF3Codec extends AbstractVCFCodec { public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3"; /** - * @param reader the line reader to take header lines from - * @return the number of header lines + * Return true if this codec can handle the target version + * @param targetHeaderVersion + * @return true if this codec can handle this version */ @Override - public Object readActualHeader(final LineIterator reader) { - final List headerStrings = new ArrayList(); - - VCFHeaderVersion version = null; - boolean foundHeaderVersion = false; - while (reader.hasNext()) { - lineNo++; - final String line = reader.peek(); - if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { - final String[] lineFields = line.substring(2).split("="); - if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { - if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) - throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); - foundHeaderVersion = true; - version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); - if ( version != VCFHeaderVersion.VCF3_3 && version != VCFHeaderVersion.VCF3_2 ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv3 and does not support " + lineFields[1]); - } - headerStrings.add(reader.next()); - } - else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { - if (!foundHeaderVersion) { - throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); - } - headerStrings.add(reader.next()); - return super.parseHeaderFromLines(headerStrings, version); - } - else { - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - } - throw new 
TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); + public boolean canDecodeVersion(final VCFHeaderVersion targetHeaderVersion) { + return targetHeaderVersion == VCFHeaderVersion.VCF3_3 || targetHeaderVersion == VCFHeaderVersion.VCF3_2; } + @Override + public boolean canDecode(final String potentialInputFile) { + return canDecodeFile(potentialInputFile, VCF3_MAGIC_HEADER); + } /** * parse the filter string, first checking to see if we already have parsed it in a previous attempt @@ -99,24 +70,24 @@ else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { * @return a set of the filters applied */ @Override - protected List parseFilters(String filterString) { + protected Set parseFilters(String filterString) { // null for unfiltered if ( filterString.equals(VCFConstants.UNFILTERED) ) return null; // empty set for passes filters - List fFields = new ArrayList(); + HashSet fFields = new HashSet<>(); if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - return new ArrayList(fFields); + return new HashSet<>(fFields); if (filterString.isEmpty()) generateException("The VCF specification requires a valid filter status"); // do we have the filter string cached? 
if ( filterHash.containsKey(filterString) ) - return new ArrayList(filterHash.get(filterString)); + return new HashSet<>(filterHash.get(filterString)); // otherwise we have to parse and cache the value if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) @@ -130,7 +101,13 @@ protected List parseFilters(String filterString) { } @Override - public boolean canDecode(final String potentialInput) { - return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER); + protected void reportDuplicateFilterIDs(final String duplicateFilterString, final int lineNo) { + // no-op since this codec historically doesn't report duplicates } + + @Override + protected void reportDuplicateInfoKeyValue(final String duplicateKey, final String infoField, final int lineNo) { + // no-op since this codec historically doesn't report duplicates + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java index 71c4850f07..37ac9874e9 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java @@ -1,5 +1,7 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; + import java.util.*; /** @@ -7,16 +9,46 @@ */ public class VCFAltHeaderLine extends VCFSimpleHeaderLine { private static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFAltHeaderLine.class); private static List expectedTags = Collections.unmodifiableList( new ArrayList(2) {{ - add(ID_ATTRIBUTE); - add(DESCRIPTION_ATTRIBUTE); - }} + add(ID_ATTRIBUTE); + add(DESCRIPTION_ATTRIBUTE); + }} ); public VCFAltHeaderLine(final String line, final VCFHeaderVersion version) { - super(VCFConstants.ALT_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTags)); + // Honor the requested version to choose the parser, and let validateForVersion figure out + // whether that version is valid for this line (for example, if this is called with a pre-4.0 version) +
super(VCFConstants.ALT_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, expectedTags)); + validateForVersion(version); } + public VCFAltHeaderLine(final String id, final String description) { + super(VCFConstants.ALT_HEADER_KEY, + new LinkedHashMap() {{ + put(ID_ATTRIBUTE, id); + put(DESCRIPTION_ATTRIBUTE, description); + }} + ); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + //TODO: Should we validate/constrain these to match the 4.3 spec constraints ? + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0)) { + final VCFValidationFailure validationFailure = new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("%s header lines are not allowed in VCF version %s headers", getKey(), vcfTargetVersion)); + if (VCFUtils.isStrictVCFVersionValidation()) { + return Optional.of(validationFailure); + } else { + logger.warn(validationFailure.getFailureMessage()); + } + } + + return super.getValidationFailure(vcfTargetVersion); + } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFCodec.java b/src/main/java/htsjdk/variant/vcf/VCFCodec.java index 42f07150d1..3ebf47c02a 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCodec.java @@ -1,6 +1,6 @@ /* * Copyright (c) 2012 The Broad Institute -* +* * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without @@ -25,17 +25,10 @@ package htsjdk.variant.vcf; -import htsjdk.tribble.TribbleException; -import htsjdk.tribble.readers.LineIterator; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.LinkedList; -import java.util.List; +import java.util.*; /** - * A feature codec for the VCF 4 specification + * A feature codec for the VCF 4.0, 4.1, 4.2, and 4.3 specification versions * *

* VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a @@ -45,7 +38,7 @@ * of related samples. Recently the format for storing next-generation read alignments has been * standardised by the SAM/BAM file format specification. This has significantly improved the * interoperability of next-generation tools for alignment, visualisation, and variant calling. - * We propose the Variant Call Format (VCF) as a standarised format for storing the most prevalent + * We propose the Variant Call Format (VCF) as a standardised format for storing the most prevalent * types of sequence variation, including SNPs, indels and larger structural variants, together * with rich annotations. VCF is usually stored in a compressed manner and can be indexed for * fast data retrieval of variants from a range of positions on the reference genome. @@ -72,91 +65,55 @@ * @since 2010 */ public class VCFCodec extends AbstractVCFCodec { - // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying + // on VariantContext to do the validation of any contradictory (or malformed) record parameters. public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; /** - * Reads all of the header from the provided iterator, but no reads no further. 
- * @param lineIterator the line reader to take header lines from - * @return The parsed header + * Return true if this codec can handle the target version + * @param targetHeaderVersion + * @return true if this codec can handle this version */ @Override - public Object readActualHeader(final LineIterator lineIterator) { - final List headerStrings = new ArrayList(); - - String line; - boolean foundHeaderVersion = false; - while (lineIterator.hasNext()) { - line = lineIterator.peek(); - lineNo++; - if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { - final String[] lineFields = line.substring(2).split("="); - if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { - if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) - throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); - foundHeaderVersion = true; - version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); - if ( ! version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]); - if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 && version != VCFHeaderVersion.VCF4_3) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]); - } - headerStrings.add(lineIterator.next()); - } - else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { - if (!foundHeaderVersion) { - throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); - } - headerStrings.add(lineIterator.next()); - super.parseHeaderFromLines(headerStrings, version); - return this.header; - } - else { - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - } - throw new TribbleException.InvalidHeader("We never saw the required CHROM 
header line (starting with one #) for the input VCF file"); + public boolean canDecodeVersion(final VCFHeaderVersion targetHeaderVersion) { + return targetHeaderVersion == VCFHeaderVersion.VCF4_0 || + targetHeaderVersion == VCFHeaderVersion.VCF4_1 || + targetHeaderVersion == VCFHeaderVersion.VCF4_2 || + targetHeaderVersion == VCFHeaderVersion.VCF4_3; } - /** - * parse the filter string, first checking to see if we already have parsed it in a previous attempt - * - * @param filterString the string to parse - * @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF) - */ @Override - protected List parseFilters(final String filterString) { - // null for unfiltered - if ( filterString.equals(VCFConstants.UNFILTERED) ) - return null; - - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) - return Collections.emptyList(); - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo); - if (filterString.isEmpty()) - generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); - - // do we have the filter string cached? 
- if ( filterHash.containsKey(filterString) ) - return filterHash.get(filterString); - - // empty set for passes filters - final List fFields = new LinkedList(); - // otherwise we have to parse and cache the value - if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) - fFields.add(filterString); - else - fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); - - filterHash.put(filterString, Collections.unmodifiableList(fFields)); + public boolean canDecode(final String potentialInput) { + return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER); + } - return fFields; + @Override + protected void reportDuplicateFilterIDs(final String duplicateFilterString, final int lineNo) { + // older versions of htsjdk have been silently dropping these for a while, but we can at least warn + logger.warn(String.format("Duplicate filter %s found on line %d", duplicateFilterString, lineNo)); } @Override - public boolean canDecode(final String potentialInput) { - return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER); + protected void reportDuplicateInfoKeyValue(final String duplicateKey, final String infoField, final int lineNo) { + logger.warn(String.format("Duplicate key %s found in %s on line %d", duplicateKey, infoField, lineNo)); } + + /** + * parse out the info fields + * @param infoField the fields + * @return a mapping of keys to objects + */ + protected Map parseInfo(String infoField) { + if (infoField.indexOf(' ') != -1) { + generateException( + String.format("Whitespace is not allowed in the INFO field in VCF version %s: %s", + version == null ? 
+ "unknown" : + version.getVersionString(), + infoField) + ); + } + return super.parseInfo(infoField); + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java index f955a434e1..60eb4fc90f 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java @@ -25,60 +25,185 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; import htsjdk.tribble.TribbleException; -import htsjdk.variant.utils.GeneralUtils; +import htsjdk.utils.ValidationUtils; import htsjdk.variant.variantcontext.GenotypeLikelihoods; import htsjdk.variant.variantcontext.VariantContext; +import java.util.Optional; +import java.util.function.BiFunction; +import java.util.regex.Pattern; + import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; /** - * a base class for compound header lines, which include info lines and format lines (so far) + * Abstract base class for compound header lines, which include INFO lines and FORMAT lines. + * + * Compound header lines are distinguished only in that are required to have TYPE and NUMBER attributes + * (VCFHeaderLineCount, a VCFHeaderLineType, and a count). 
*/ -public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine { +public abstract class VCFCompoundHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; + protected static final Log logger = Log.getInstance(VCFCompoundHeaderLine.class); + + // regex pattern corresponding to legal info/format field keys + protected static final Pattern VALID_HEADER_ID_PATTERN = Pattern.compile("^[A-Za-z_][0-9A-Za-z_.]*$"); + protected static final String UNBOUND_DESCRIPTION = "Not provided in original VCF header"; + + protected static final String NUMBER_ATTRIBUTE = "Number"; + protected static final String TYPE_ATTRIBUTE = "Type"; + + // List of expected tags that have a predefined order (used by the parser to verify order only). The + // header line class itself should verify that all required tags are present. + protected static final List expectedTagOrder = Collections.unmodifiableList( + new ArrayList(4) {{ + add(ID_ATTRIBUTE); + add(NUMBER_ATTRIBUTE); + add(TYPE_ATTRIBUTE); + add(DESCRIPTION_ATTRIBUTE); + }} + ); + + // immutable, cached binary representations of compound header line attributes + private final VCFHeaderLineType type; + private final VCFHeaderLineCount countType; + private final int count; - public enum SupportedHeaderLineType { - INFO(true), FORMAT(false); + /** + * create a VCF compound header line with count type = VCFHeaderLineCount.INTEGER + * + * @param key the key (header line type) for this header line + * @param headerLineID the id for this header line + * @param count the count for this header line, sets countType type as VCFHeaderLineCount.INTEGER + * @param type the type for this header line + * @param description the description for this header line + */ + protected VCFCompoundHeaderLine( + final String key, + final String headerLineID, + final int count, + final VCFHeaderLineType type, + final String description) + { + this(key, createAttributeMap(headerLineID,
VCFHeaderLineCount.INTEGER, count, type, description), VCFHeader.DEFAULT_VCF_VERSION); + } - public final boolean allowFlagValues; - SupportedHeaderLineType(boolean flagValues) { - allowFlagValues = flagValues; - } + /** + * create a VCF compound header line + * + * @param key the key (header line type) for this header line + * @param headerLineID the id for this header line + * @param countType the count type for this header line + * @param type the type for this header line + * @param description the description for this header line + */ + protected VCFCompoundHeaderLine( + final String key, + final String headerLineID, + final VCFHeaderLineCount countType, + final VCFHeaderLineType type, + final String description) { + this(key, createAttributeMap(headerLineID, countType, VCFHeaderLineCount.VARIABLE_COUNT, type, description), VCFHeader.DEFAULT_VCF_VERSION); } - // the field types - private String name; - private int count = -1; - private VCFHeaderLineCount countType; - private String description; - private VCFHeaderLineType type; - private String source; - private String version; + /** + * create a VCF compound header line from an attribute map + * + * @param key the key (header line type) for this header line + * @param mapping the header line attribute map + * @param vcfVersion the VCF header version. Must not be null. + */ + protected VCFCompoundHeaderLine(final String key, final Map mapping, final VCFHeaderVersion vcfVersion) { + super(key, mapping); + ValidationUtils.nonNull(vcfVersion); + + this.type = decodeLineType(getGenericFieldValue(TYPE_ATTRIBUTE)); + final String countString = getGenericFieldValue(NUMBER_ATTRIBUTE); + this.countType = decodeCountType(countString, vcfVersion); + this.count = decodeCount(countString, this.countType); + validateForVersion(vcfVersion); + } + + /** + * Return the description for this header line.
+ * @return the header line's description + */ + public String getDescription() { + final String description = getGenericFieldValue(DESCRIPTION_ATTRIBUTE); + return description == null ? + UNBOUND_DESCRIPTION : + description; + } - // access methods - @Override - public String getID() { return name; } - public String getDescription() { return description; } public VCFHeaderLineType getType() { return type; } + public VCFHeaderLineCount getCountType() { return countType; } - public boolean isFixedCount() { return countType == VCFHeaderLineCount.INTEGER; } + + /** + * @return true if this header line has a fixed integer count type ({@link #getCountType()} + * equals {@link VCFHeaderLineCount#INTEGER}) + */ + public boolean isFixedCount() { return countType.isFixedCount(); } + + /** + * @return the integer count for this header line if the header has a fixed integer + * count type ({@link #isFixedCount()} is true). A TribbleException is thrown if the + * header line does not have a fixed integer count type ({@link #getCountType()} equals + * {@link VCFHeaderLineCount#INTEGER}). + * + * @throws TribbleException if the {@link VCFHeaderLineCount} is not a fixed integer + */ public int getCount() { - if (!isFixedCount()) - throw new TribbleException("Asking for header line count when type is not an integer"); + if (!isFixedCount()) { + throw new TribbleException("Header line count request when count type is not an integer"); + } return count; } public String getSource() { - return source; + return getGenericFieldValue(SOURCE_ATTRIBUTE); } public String getVersion() { - return version; + return getGenericFieldValue(VERSION_ATTRIBUTE); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + // The VCF 4.3 spec does not phrase this restriction as one on the form of the ID value of + // INFO/FORMAT lines but instead on the INFO/FORMAT fixed field key values (c.f. section 1.6.1). 
+ // However, the key values correspond to INFO/FORMAT header lines defining the attribute and its type, + // so we do the validation here + if (vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + if (!validHeaderID(getID())) { + final VCFValidationFailure validationFailure = new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("ID tag \"%s\" does not conform to tag restrictions", getID())); + if (VCFUtils.isStrictVCFVersionValidation()) { + return Optional.of(validationFailure); + } else { + // warn for older versions - this line can't be used as a v4.3 line + logger.warn(validationFailure.getFailureMessage()); + } + } + } + + return super.getValidationFailure(vcfTargetVersion); + } + + /** + * @param id the candidate ID + * @return true if ID conforms to header line id requirements, otherwise false + */ + //TODO: the existing VCFHeaderLine.validateKeyOrID method should be refactored so it can be used instead of this + protected boolean validHeaderID(final String id) { + return VALID_HEADER_ID_PATTERN.matcher(id).matches(); } /** @@ -113,278 +238,209 @@ public int getCount(final VariantContext vc) { } } - public void setNumberToUnbounded() { - countType = VCFHeaderLineCount.UNBOUNDED; - count = -1; - } - - // our type of line, i.e. format, info, etc - private final SupportedHeaderLineType lineType; - /** - * create a VCF format header line + * Specify annotation source + *

+ * This value is optional starting with VCFv4.2. * - * @param name the name for this header line - * @param count the count for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type + * @param source annotation source (case-insensitive, e.g. "dbsnp") */ - protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { - this(name, count, type, description, lineType, null, null); + @Deprecated // after 2.24.1 + public void setSource(final String source) { + updateGenericField(SOURCE_ATTRIBUTE, source); } /** - * create a VCF format header line + * Specify annotation version + *

+ * This value is optional starting with VCFv4.2. * - * @param name the name for this header line - * @param count the count type for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type + * @param version exact version (e.g. "138") */ - protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { - this(name, count, type, description, lineType, null, null); + @Deprecated // after version 2.24.1 + public void setVersion(final String version) { + updateGenericField(VERSION_ATTRIBUTE, version); } - /** - * create a VCF format header line - * - * @param name the name for this header line - * @param count the count for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type - * @param source annotation source (case-insensitive, e.g. "dbsnp") - * @param version exact version (e.g. 
"138") - */ - protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, String source, String version) { - super(lineType.toString(), ""); - this.name = name; - this.countType = VCFHeaderLineCount.INTEGER; - this.count = count; - this.type = type; - this.description = description; - this.lineType = lineType; - this.source = source; - this.version = version; - validate(); - } + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (!(o instanceof VCFCompoundHeaderLine)) return false; + if (!super.equals(o)) return false; - /** - * create a VCF format header line - * - * @param name the name for this header line - * @param count the count type for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type - * @param source annotation source (case-insensitive, e.g. "dbsnp") - * @param version exact version (e.g. 
"138") - */ - protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, String source, String version) { - super(lineType.toString(), ""); - this.name = name; - this.countType = count; - this.type = type; - this.description = description; - this.lineType = lineType; - this.source = source; - this.version = version; - validate(); + final VCFCompoundHeaderLine that = (VCFCompoundHeaderLine) o; + + if (count != that.count) return false; + if (type != that.type) return false; + return countType == that.countType; } - /** - * create a VCF format header line - * - * @param line the header line - * @param version the VCF header version - * @param lineType the header line type - * - */ - protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); + @Override + public int hashCode() { + int result = super.hashCode(); + result = 31 * result + type.hashCode(); + result = 31 * result + countType.hashCode(); + result = 31 * result + count; + return result; + } - final ArrayList expectedTags = new ArrayList(Arrays.asList("ID", "Number", "Type", "Description")); - final List recommendedTags; - if (version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) { - recommendedTags = Arrays.asList("Source", "Version"); - } else { - recommendedTags = Collections.emptyList(); - } - final Map mapping = VCFHeaderLineTranslator.parseLine(version, line, expectedTags, recommendedTags); - name = mapping.get("ID"); - count = -1; - final String numberStr = mapping.get("Number"); - if (numberStr.equals(VCFConstants.PER_ALTERNATE_COUNT)) { - countType = VCFHeaderLineCount.A; - } else if (numberStr.equals(VCFConstants.PER_ALLELE_COUNT)) { - countType = VCFHeaderLineCount.R; - } else if (numberStr.equals(VCFConstants.PER_GENOTYPE_COUNT)) { - countType = VCFHeaderLineCount.G; - } else if 
((version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) || - (!version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3))) { - countType = VCFHeaderLineCount.UNBOUNDED; + private VCFHeaderLineType decodeLineType(final String lineTypeString) { + if (lineTypeString == null) { + throw new TribbleException(String.format("A line type attribute is required for %s header lines", getKey())); } else { - countType = VCFHeaderLineCount.INTEGER; - count = Integer.parseInt(numberStr); - - } - - if (count < 0 && countType == VCFHeaderLineCount.INTEGER) - throw new TribbleException.InvalidHeader("Count < 0 for fixed size VCF header field " + name); - - try { - type = VCFHeaderLineType.valueOf(mapping.get("Type")); - } catch (Exception e) { - throw new TribbleException(mapping.get("Type") + " is not a valid type in the VCF specification (note that types are case-sensitive)"); + try { + return VCFHeaderLineType.valueOf(lineTypeString); + } catch (IllegalArgumentException e) { + throw new TribbleException(String.format( + "\"%s\" is not a valid type for %s header lines (note that types are case-sensitive)", + lineTypeString, + getKey())); + } } - if (type == VCFHeaderLineType.Flag && !allowFlagValues()) - throw new IllegalArgumentException("Flag is an unsupported type for this kind of field at line - " + line); - - description = mapping.get("Description"); - if (description == null && ALLOW_UNBOUND_DESCRIPTIONS) // handle the case where there's no description provided - description = UNBOUND_DESCRIPTION; - - this.lineType = lineType; + } - if (version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) { - this.source = mapping.get("Source"); - this.version = mapping.get("Version"); + private VCFHeaderLineCount decodeCountType(final String countString, final VCFHeaderVersion vcfVersion) { + if (countString == null) { + throw new TribbleException.InvalidHeader( + String.format("A 
count type/value must be provided for %s header lines.", getID())); } - - validate(); + return VCFHeaderLineCount.decode(vcfVersion, countString); } - private void validate() { - if (type != VCFHeaderLineType.Flag && countType == VCFHeaderLineCount.INTEGER && count <= 0) - throw new IllegalArgumentException(String.format("Invalid count number, with fixed count the number should be 1 or higher: key=%s name=%s type=%s desc=%s lineType=%s count=%s", - getKey(), name, type, description, lineType, count)); - if (name == null || type == null || description == null || lineType == null) - throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s", - getKey(), name, type, description, lineType)); - if (name.contains("<") || name.contains(">")) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain angle brackets"); - if (name.contains("=")) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain an equals sign"); - - if (type == VCFHeaderLineType.Flag && count != 0) { - count = 0; - if (GeneralUtils.DEBUG_MODE_ENABLED) { - System.err.println("FLAG fields must have a count value of 0, but saw " + count + " for header line " + getID() + ". 
Changing it to 0 inside the code"); + private int decodeCount(final String countString, final VCFHeaderLineCount requestedCountType) { + int lineCount = VCFHeaderLineCount.VARIABLE_COUNT; + if (requestedCountType.isFixedCount()) { + if (countString == null) { + throw new TribbleException.InvalidHeader(String.format("Missing count value in VCF header field %s", getID())); + } + try { + lineCount = Integer.parseInt(countString); + } catch (NumberFormatException e) { + throw new TribbleException.InvalidHeader(String.format("Invalid count value %s in VCF header field %s", lineCount, getID())); + } + if (getType() == VCFHeaderLineType.Flag) { + if (lineCount != 0) { + // This check is here on behalf of INFO lines (which are the only header line type allowed to have Flag + // type). A Flag type with a count value other than 0 violates the spec (at least v4.2 and v4.3), but + // to retain backward compatibility with previous implementations, we accept (and repair) and the line here. + updateGenericField(NUMBER_ATTRIBUTE, "0"); + lineCount = 0; + logger.warn(String.format("FLAG fields must have a count value of 0, but saw count %d for header line %s. A value of 0 will be used", + lineCount, + getID())); + } + } else if (lineCount <= 0) { + throw new TribbleException.InvalidHeader( + String.format("Invalid count number %d for fixed count in header line with ID %s. 
For fixed count, the count number must be 1 or higher.", + lineCount, + getID())); } } + return lineCount; } - /** - * make a string representation of this header line - * @return a string representation - */ - @Override - protected String toStringEncoding() { - Map map = new LinkedHashMap(); - map.put("ID", name); - Object number; - switch (countType) { - case A: - number = VCFConstants.PER_ALTERNATE_COUNT; - break; - case R: - number = VCFConstants.PER_ALLELE_COUNT; - break; - case G: - number = VCFConstants.PER_GENOTYPE_COUNT; - break; - case UNBOUNDED: - number = VCFConstants.UNBOUNDED_ENCODING_v4; - break; - case INTEGER: - default: - number = count; - } - map.put("Number", number); - map.put("Type", type); - map.put("Description", description); - if (source != null) { - map.put("Source", source); - } - if (version != null) { - map.put("Version", version); - } - return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map); + // Create a backing attribute map out of VCFCompoundHeaderLine elements + private static LinkedHashMap createAttributeMap( + final String headerLineID, + final VCFHeaderLineCount countType, + final int count, + final VCFHeaderLineType type, + final String description) { + return new LinkedHashMap() { + { put(ID_ATTRIBUTE, headerLineID); } + { put(NUMBER_ATTRIBUTE, countType.encode(count)); } + { put(TYPE_ATTRIBUTE, type.encode()); } + { + // Handle the case where there's no description provided, ALLOW_UNBOUND_DESCRIPTIONS is the default + // note: if no description was provided, don't cache it, which means we don't round trip it + if (description != null) { + put(DESCRIPTION_ATTRIBUTE, description); + } + } + }; } /** - * returns true if we're equal to another compound header line - * @param o a compound header line - * @return true if equal + * Compare two VCFCompoundHeaderLine (FORMAT or INFO) lines to determine if they have compatible number types, + * and return a VCFCompoundHeaderLine that can be used to represent the result 
of merging these lines. In the + * case where the merged line requires "promoting" one of the types to the other, a new line of the appropriate + * type is created by calling the {@code compoundHeaderLineResolver} to produce new line of the correct + * subclass (INFO or FORMAT). + * + * @param line1 first line to merge + * @param line2 second line to merge + * @param conflictWarner conflict warning manager + * @param compoundHeaderLineResolver function that accepts two compound header lines of the same type (info or + * format, and returns a new header line representing the combination of the + * two input header lines + * @param type of VCFCompoundHeaderLine to merge (subclass of VCFCompoundHeaderLine) + * @return the merged line if one can be created */ - @Override - public boolean equals(final Object o) { - if ( this == o ) { - return true; + static T getMergedCompoundHeaderLine( + final T line1, + final T line2, + final VCFHeaderMerger.HeaderMergeConflictWarnings conflictWarner, + BiFunction compoundHeaderLineResolver) + { + ValidationUtils.nonNull(line1); + ValidationUtils.nonNull(line2); + ValidationUtils.validateArg(line1.getKey().equals(line2.getKey()) && line1.getID().equals(line2.getID()), + "header lines must have the same type to merge"); + T mergedLine = line1; + + if (!line1.equalsExcludingExtraAttributes(line2)) { + if (getCompoundLineDifferenceScore(line1, line2) > 1) { + // merge lines if they have zero or one mergeable differences, but if there are multiple + // differences, call the headers incompatible and bail, since we need to choose one line + // or the other as the merge line (we can't do generic field-level resolution) + throw new TribbleException( + String.format("Incompatible header merge, can't merge lines with multiple attribute differences %s/%s.", + line1, line2)); + } + if (line1.getType().equals(line2.getType())) { + // The lines have a common type. 
+ // The Number entry is an Integer that describes the number of values that can be + // included with the INFO field. For example, if the INFO field contains a single + // number, then this value should be 1. However, if the INFO field describes a pair + // of numbers, then this value should be 2 and so on. If the number of possible + // values varies, is unknown, or is unbounded, then this value should be '.'. + conflictWarner.warn("Promoting header field Number to . due to number differences in header lines: " + line1 + " " + line2); + mergedLine = compoundHeaderLineResolver.apply(line1, line2); + } else if (line1.getType() == VCFHeaderLineType.Integer && line2.getType() == VCFHeaderLineType.Float) { + // promote key to Float + conflictWarner.warn("Promoting Integer to Float in header: " + line2); + mergedLine = line2; + } else if (line1.getType() == VCFHeaderLineType.Float && line2.getType() == VCFHeaderLineType.Integer) { + // promote key to Float + conflictWarner.warn("Promoting Integer to Float in header: " + line2); + } else { + throw new IllegalStateException("Attempt to merge incompatible headers, can't merge these lines: " + line1 + " " + line2); + } } - if ( o == null || getClass() != o.getClass() || ! super.equals(o) ) { - return false; + if (!line1.getDescription().equals(line2.getDescription())) { + conflictWarner.warn("Allowing unequal description fields through: keeping " + line2 + " excluding " + line1); } - final VCFCompoundHeaderLine that = (VCFCompoundHeaderLine) o; - return equalsExcludingDescription(that) && - description.equals(that.description); - } - - @Override - public int hashCode() { - int result = super.hashCode(); - result = 31 * result + name.hashCode(); - result = 31 * result + count; - result = 31 * result + (countType != null ? 
countType.hashCode() : 0); // only nullable field according to validate() - result = 31 * result + description.hashCode(); - result = 31 * result + type.hashCode(); - result = 31 * result + lineType.hashCode(); - result = 31 * result + (source != null ? source.hashCode() : 0); - result = 31 * result + (version != null ? version.hashCode() : 0); - return result; + return mergedLine; } - public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) { + boolean equalsExcludingExtraAttributes(final VCFCompoundHeaderLine other) { return count == other.count && countType == other.countType && type == other.type && - lineType == other.lineType && - name.equals(other.name); - } - - public boolean sameLineTypeAndName(VCFCompoundHeaderLine other) { - return lineType == other.lineType && - name.equals(other.name); + getKey().equals(other.getKey()) && + getID().equals(other.getID()); } - /** - * do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true) - * @return true if we do, false otherwise - */ - abstract boolean allowFlagValues(); - - /** - * Specify annotation source - *

- * This value is optional starting with VCFv4.2. - * - * @param source annotation source (case-insensitive, e.g. "dbsnp") - */ - public void setSource(final String source) { - this.source = source; - } - - /** - * Specify annotation version - *

- * This value is optional starting with VCFv4.2. - * - * @param version exact version (e.g. "138") - */ - public void setVersion(final String version) { - this.version = version; + private static int getCompoundLineDifferenceScore(final T line1, final T line2) { + final int dataTypeDiffers = line1.getType().equals(line2.getType()) ? 0 : 1; // data type + final int countTypeDiffers = line1.getCountType().equals(line2.getCountType()) ? 0 : 1; // count type + // getCount is only valid if the getCountType==Integer + final int countDiffers = + (countTypeDiffers == 0 && + line1.getCountType().equals(VCFHeaderLineCount.INTEGER) && + line2.getCountType().equals(VCFHeaderLineCount.INTEGER) && + line1.getCount() != line2.getCount()) ? 1 : 0; + return dataTypeDiffers + countTypeDiffers + countDiffers; } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFConstants.java b/src/main/java/htsjdk/variant/vcf/VCFConstants.java index 64fdf2bc8e..11f12cf07c 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFConstants.java +++ b/src/main/java/htsjdk/variant/vcf/VCFConstants.java @@ -45,7 +45,7 @@ public final class VCFConstants { public static final String GENOTYPE_KEY = "GT"; public static final String GENOTYPE_POSTERIORS_KEY = "GP"; public static final String GENOTYPE_QUALITY_KEY = "GQ"; - public static final String GENOTYPE_ALLELE_DEPTHS = "AD"; //AD isn't reserved, but is specifically handled by VariantContext + public static final String GENOTYPE_ALLELE_DEPTHS = "AD"; //AD is now reserved public static final String GENOTYPE_PL_KEY = "PL"; // phred-scaled genotype likelihoods public static final String EXPECTED_ALLELE_COUNT_KEY = "EC"; @Deprecated public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; // log10 scaled genotype likelihoods @@ -86,14 +86,20 @@ public final class VCFConstants { public static final String PHASING_TOKENS = "/|\\"; // header lines - public static final String FILTER_HEADER_START = "##FILTER"; - public static final String FORMAT_HEADER_START = 
"##FORMAT"; - public static final String INFO_HEADER_START = "##INFO"; - public static final String ALT_HEADER_KEY = "ALT"; - public static final String ALT_HEADER_START = VCFHeader.METADATA_INDICATOR + ALT_HEADER_KEY ; - public static final String CONTIG_HEADER_KEY = "contig"; - public static final String CONTIG_HEADER_START = "##" + CONTIG_HEADER_KEY; + public static final String FILTER_HEADER_KEY = "FILTER"; + public static final String FILTER_HEADER_START = VCFHeader.METADATA_INDICATOR + FILTER_HEADER_KEY; + public static final int FILTER_HEADER_OFFSET = FILTER_HEADER_START.length() + 1; + + public static final String FORMAT_HEADER_KEY = "FORMAT"; + public static final String FORMAT_HEADER_START = VCFHeader.METADATA_INDICATOR + FORMAT_HEADER_KEY; + public static final int FORMAT_HEADER_OFFSET = FORMAT_HEADER_START.length() + 1; + + public static final String INFO_HEADER_KEY = "INFO"; + public static final String INFO_HEADER_START = VCFHeader.METADATA_INDICATOR + INFO_HEADER_KEY; + public static final int INFO_HEADER_OFFSET = INFO_HEADER_START.length() + 1; + public static final String ALT_HEADER_KEY = "ALT"; + public static final String ALT_HEADER_START = VCFHeader.METADATA_INDICATOR + ALT_HEADER_KEY; public static final int ALT_HEADER_OFFSET = ALT_HEADER_START.length() + 1; public static final String PEDIGREE_HEADER_KEY = "PEDIGREE"; @@ -108,6 +114,10 @@ public final class VCFConstants { public static final String META_HEADER_START = VCFHeader.METADATA_INDICATOR + META_HEADER_KEY; public static final int META_HEADER_OFFSET = META_HEADER_START.length() + 1; + public static final String CONTIG_HEADER_KEY = "contig"; + public static final String CONTIG_HEADER_START = VCFHeader.METADATA_INDICATOR + CONTIG_HEADER_KEY; + public static final int CONTIG_HEADER_OFFSET = CONTIG_HEADER_START.length() + 1; + // old indel alleles public static final char DELETION_ALLELE_v3 = 'D'; public static final char INSERTION_ALLELE_v3 = 'I'; diff --git 
a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java index 9ec50681b4..d8a19e2fa5 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java @@ -26,11 +26,14 @@ package htsjdk.variant.vcf; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.Log; import htsjdk.tribble.TribbleException; import java.util.Collections; import java.util.LinkedHashMap; import java.util.Map; +import java.util.Optional; +import java.util.regex.Pattern; /** * A special class representing a contig VCF header line. Knows the true contig order and sorts on that @@ -40,42 +43,111 @@ * @author mdepristo */ public class VCFContigHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFContigHeaderLine.class); + + final static Pattern VALID_CONTIG_ID_PATTERN = Pattern.compile("[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*"); final Integer contigIndex; + public static final String LENGTH_ATTRIBUTE = "length"; + public static final String ASSEMBLY_ATTRIBUTE = "assembly"; + public static final String MD5_ATTRIBUTE = "md5"; + public static final String URL_ATTRIBUTE = "URL"; + public static final String SPECIES_ATTRIBUTE = "species"; + /** * create a VCF contig header line * + * NOTE: This is retained for backward compatibility, but is deprecated and should not be used. 
+ * * @param line the header line * @param version the vcf header version * @param key the key for this header line + * @param contigIndex the contig index for this contig */ + @Deprecated // starting after version 2.24.1 public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, final String key, final int contigIndex) { - super(line, version, key, null, Collections.emptyList()); - if (contigIndex < 0) throw new TribbleException("The contig index is less than zero."); - this.contigIndex = contigIndex; + // deprecated because this constructor has a parameter to specify the key (??), but for + // contig lines the key has to be "contig" + this(line, version, contigIndex); + if (!VCFHeader.CONTIG_KEY.equals(key)) { + logger.warn(String.format( + "Found key \"%s\". The key for contig header lines must be %s.", + key, + VCFHeader.CONTIG_KEY)); + } + } + + /** + * create a VCF contig header line + * + * @param line the header line + * @param version the vcf header version + * @param contigIndex the contig index for this contig + */ + public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, final int contigIndex) { + this(VCFHeaderLineTranslator.parseLine( + version, line, Collections.singletonList(VCFSimpleHeaderLine.ID_ATTRIBUTE)), contigIndex); + if (!VCFHeader.CONTIG_KEY.equals(getKey())) { + logger.warn(String.format( + "Found key \"%s\". 
The key for contig header lines must be %s.", + getKey(), + VCFHeader.CONTIG_KEY)); + } + if (contigIndex < 0) { + throw new TribbleException(String.format("The contig index (%d) is less than zero.", contigIndex)); + } + validateForVersion(version); } public VCFContigHeaderLine(final Map mapping, final int contigIndex) { super(VCFHeader.CONTIG_KEY, mapping); - if (contigIndex < 0) throw new TribbleException("The contig index is less than zero."); + if (contigIndex < 0) { + throw new TribbleException(String.format("The contig index (%d) is less than zero.", contigIndex)); + } this.contigIndex = contigIndex; } - VCFContigHeaderLine(final SAMSequenceRecord sequenceRecord, final String assembly) { - // Using LinkedHashMap to preserve order of keys in contig line (ID, length, assembly) - super(VCFHeader.CONTIG_KEY, new LinkedHashMap() {{ - // Now inside an init block in an anon HashMap subclass - this.put("ID", sequenceRecord.getSequenceName()); - this.put("length", Integer.toString(sequenceRecord.getSequenceLength())); - if ( assembly != null ) this.put("assembly", assembly); - }}); - this.contigIndex = sequenceRecord.getSequenceIndex(); + /** + * Return a VCFContigHeaderLine representing a SAMSequenceRecord. + * + * NOTE: round-tripping between VCFContigHeaderLines and SAMSequenceRecords can be lossy since they + * don't necessarily have equivalent attributes, i.e., SAMSequenceRecord can have a species attribute + * that isn't defined by the VCF spec. 
+ * + * @return VCFContigHeaderLine for the SAMSequenceRecord + */ + public VCFContigHeaderLine(final SAMSequenceRecord sequenceRecord, final String assembly) { + // preserve order of keys in contig line (ID, length, assembly) + this(new LinkedHashMap() {{ + this.put(ID_ATTRIBUTE, sequenceRecord.getSequenceName()); + if (sequenceRecord.getSequenceLength() != 0) { + this.put(LENGTH_ATTRIBUTE, Integer.toString(sequenceRecord.getSequenceLength())); + } + if (assembly != null) { + if (!assembly.equals(sequenceRecord.getAssembly())) { + logger.warn(String.format( + "Inconsistent \"assembly\" attribute values found while creating VCFContigLine " + + "(with assembly \"%s\") from SAMSequenceRecord (with assembly \"%s\")", + assembly, + sequenceRecord.getAssembly())); + } + this.put(ASSEMBLY_ATTRIBUTE, assembly); + } + if (sequenceRecord.getMd5() != null) { + this.put(MD5_ATTRIBUTE, sequenceRecord.getMd5()); + } + if (sequenceRecord.getAttribute(SAMSequenceRecord.URI_TAG) != null) { + this.put(URL_ATTRIBUTE, sequenceRecord.getAttribute(SAMSequenceRecord.URI_TAG)); + } + if (sequenceRecord.getAttribute(SAMSequenceRecord.SPECIES_TAG) != null) { + this.put(SPECIES_ATTRIBUTE, sequenceRecord.getAttribute(SAMSequenceRecord.SPECIES_TAG)); + } + }}, + sequenceRecord.getSequenceIndex() + ); } - public Integer getContigIndex() { - return contigIndex; - } - /** * Get the SAMSequenceRecord that corresponds to this VCF header line. * If the VCF header line does not have a length tag, the SAMSequenceRecord returned will be set to have a length of @@ -85,20 +157,56 @@ public Integer getContigIndex() { * contig header line does not have a length. 
*/ public SAMSequenceRecord getSAMSequenceRecord() { - final String lengthString = this.getGenericFieldValue("length"); - final int length; - if (lengthString == null) { - length = SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH; + final String lengthString = this.getGenericFieldValue(LENGTH_ATTRIBUTE); + final int length; + if (lengthString == null) { + length = SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH; } else { - length = Integer.parseInt(lengthString); + length = Integer.parseInt(lengthString); + } + final SAMSequenceRecord record = new SAMSequenceRecord(this.getID(), length); + final String assemblyString = this.getGenericFieldValue(ASSEMBLY_ATTRIBUTE); + if (assemblyString != null) { + record.setAssembly(assemblyString); } - final SAMSequenceRecord record = new SAMSequenceRecord(this.getID(), length); - record.setAssembly(this.getGenericFieldValue("assembly")); - record.setSequenceIndex(this.contigIndex); - return record; + record.setSequenceIndex(this.contigIndex); + final String md5 = getGenericFieldValue(MD5_ATTRIBUTE); + if (md5 != null) { + record.setMd5(md5); + } + final String url = getGenericFieldValue(URL_ATTRIBUTE); + if (url != null) { + record.setAttribute(SAMSequenceRecord.URI_TAG, url); + } + final String species = getGenericFieldValue(SPECIES_ATTRIBUTE); + if (species != null) { + record.setSpecies(species); + } + return record; } @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + if (vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + if (!VALID_CONTIG_ID_PATTERN.matcher(getID()).matches()) { + return Optional.of(new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("Contig headerLine ID \"%s\" doesn't conform to contig ID restrictions", getID()))); + } + } + + return super.getValidationFailure(vcfTargetVersion); + } + + public Integer getContigIndex() { + return contigIndex; + } + + /** + * Note: this class has a natural ordering that is inconsistent with equals() + */ + 
@Override public boolean equals(final Object o) { if ( this == o ) { return true; @@ -120,6 +228,11 @@ public int hashCode() { /** * IT IS CRITICAL THAT THIS BE OVERRIDDEN SO WE SORT THE CONTIGS IN THE CORRECT ORDER + * + * NOTE: this class has a natural ordering that is inconsistent with equals(). This results + * in inconsistent behavior when these lines are used in the sets that are created/accepted + * by VCFHeader (ie., getMetaDataInSortedOrder will filter out VCFContigHeaderLines that are + * returned by getMetaDataInInputOrder or getContigheaderLines). */ @Override public int compareTo(final Object other) { diff --git a/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java index 6ca8f3f532..1b890db1b1 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java @@ -25,26 +25,40 @@ package htsjdk.variant.vcf; -import java.util.Arrays; -import java.util.Collections; +import htsjdk.tribble.TribbleException; + +import java.util.*; /** * @author ebanks * - * A class representing a key=value entry for FILTER fields in the VCF header + * A class representing FILTER fields in the VCF header */ -public class VCFFilterHeaderLine extends VCFSimpleHeaderLine { - +public class VCFFilterHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; + private static List requiredTagOrder = Collections.unmodifiableList( + new ArrayList(2) {{ + add(ID_ATTRIBUTE); + add(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE); + }} + ); + /** * create a VCF filter header line * - * @param name the name for this header line + * @param id the headerLineID for this header line * @param description the description for this header line */ - public VCFFilterHeaderLine(final String name, final String description) { - super("FILTER", name, description); + public VCFFilterHeaderLine(final String id, final String description) { + 
super(VCFConstants.FILTER_HEADER_KEY, + new LinkedHashMap(2) {{ + put(ID_ATTRIBUTE, id); + put(DESCRIPTION_ATTRIBUTE, description); + }} + ); + validate(); } /** @@ -52,29 +66,37 @@ public VCFFilterHeaderLine(final String name, final String description) { * @param name */ public VCFFilterHeaderLine(final String name) { - super("FILTER", name, name); + this(name, name); } /** - * create a VCF info header line + * create a VCF filter header line * * @param line the header line * @param version the vcf header version */ public VCFFilterHeaderLine(final String line, final VCFHeaderVersion version) { - super(line, version, "FILTER", Arrays.asList("ID", "Description"), Collections.emptyList()); + super(VCFConstants.FILTER_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, requiredTagOrder)); + validate(); + validateForVersion(version); + } + + private void validate() { + if (getDescription() == null) { + throw new TribbleException.InvalidHeader("Missing Description attribute in filter header line"); + } } @Override public boolean shouldBeAddedToDictionary() { return true; } - + /** * get the "Description" field * @return the "Description" field */ public String getDescription() { - return getGenericFieldValue("Description"); + return getGenericFieldValue(DESCRIPTION_ATTRIBUTE); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java index 74f4d5e5e3..fc75ee5291 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java @@ -26,34 +26,75 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; + /** * @author ebanks *

* Class VCFFormatHeaderLine *

*

- * A class representing a key=value entry for genotype FORMAT fields in the VCF header

+ * A class representing genotype FORMAT fields in the VCF header

*/ public class VCFFormatHeaderLine extends VCFCompoundHeaderLine { + private static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFFormatHeaderLine.class); public VCFFormatHeaderLine(String name, int count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.FORMAT); - if (type == VCFHeaderLineType.Flag) - throw new IllegalArgumentException("Flag is an unsupported type for format fields"); + super(VCFConstants.FORMAT_HEADER_KEY, name, count, type, description); + validate(); } public VCFFormatHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.FORMAT); + super(VCFConstants.FORMAT_HEADER_KEY, name, count, type, description); + validate(); } public VCFFormatHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.FORMAT); + super(VCFConstants.FORMAT_HEADER_KEY, + VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrder), + version); + validate(); + validateForVersion(version); } - // format fields do not allow flag values (that wouldn't make much sense, how would you encode this in the genotype). - @Override - boolean allowFlagValues() { - return false; + /** + * Compare two VCFFormatHeaderLine objects to determine if they have compatible number types, and return a + * VCFFormatHeaderLine that represents the result of merging these two lines. + * + * @param formatLine1 first format line to merge + * @param formatLine2 second format line to merge + * @param conflictWarner conflict warning emitter + * @return a merged VCFFormatHeaderLine + */ + public static VCFFormatHeaderLine getMergedFormatHeaderLine( + final VCFFormatHeaderLine formatLine1, + final VCFFormatHeaderLine formatLine2, + final VCFHeaderMerger.HeaderMergeConflictWarnings conflictWarner) + { + ValidationUtils. 
nonNull(formatLine1); + ValidationUtils. nonNull(formatLine2); + ValidationUtils. nonNull(conflictWarner); + + // delegate to the generic VCFCompoundHeaderLine merger, passing a resolver lambda + return VCFCompoundHeaderLine.getMergedCompoundHeaderLine( + formatLine1, + formatLine2, + conflictWarner, + (l1, l2) -> new VCFFormatHeaderLine( + l1.getID(), + VCFHeaderLineCount.UNBOUNDED, + l1.getType(), + l1.getDescription()) + ); + } + + private void validate() { + if (this.getType() == VCFHeaderLineType.Flag) { + throw new TribbleException(String.format("Flag is an unsupported type for format fields: %s", this.toStringEncoding())); + } } @Override diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeader.java b/src/main/java/htsjdk/variant/vcf/VCFHeader.java index c39bef5684..637c04c4fc 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeader.java @@ -27,60 +27,52 @@ import htsjdk.beta.plugin.HtsHeader; import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.Log; import htsjdk.tribble.TribbleException; import htsjdk.tribble.util.ParsingUtils; import htsjdk.utils.ValidationUtils; -import htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.VariantContextComparator; import java.io.Serializable; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - +import java.util.*; +import java.util.stream.Collectors; /** - * A class to represent a VCF header + * A class to represent a VCF header. + * + * A VCFHeader has a "current" VCFHeaderVersion that is established when the header is constructed.
If + * metadata lines are provided to the constructor, a ##fileformat line must be included, and all lines + * that are provided must be valid for the specified version. If no metadata lines are initially + * provided, the default version {@link VCFHeader#DEFAULT_VCF_VERSION} will be used. + * + * Each line in the list is always guaranteed to be valid for the current version, and any line added must + * conform to the current version (as defined by the VCF specification). If a new line is added that fails to + * validate against the current version, or a new line that changes the current version, and an existing line + * in the list fails to validate against the new version, an exception will be thrown. * - * @author aaron - * NOTE: This class stores header lines in lots of places. The original author noted that this should - * be cleaned up at some point in the future (jgentry - 5/2013) + * Once a header version is established, it can be changed by adding a new file format/version line (see + * {@link VCFHeader#makeHeaderVersionLine}) (the new version line will replace any existing line), but only + * if the new version is newer than the previous version. Attempts to move the version to an older version + * will result in an exception.
*/ public class VCFHeader implements HtsHeader, Serializable { public static final long serialVersionUID = 1L; + protected static final Log logger = Log.getInstance(VCFHeader.class); + public static final VCFHeaderVersion DEFAULT_VCF_VERSION = VCFHeaderVersion.VCF4_2; // the mandatory header fields public enum HEADER_FIELDS { CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO } - /** - * The VCF version for this header; once a header version is established, it can only be - * changed subject to version transition rules defined by - * {@link #validateVersionTransition(VCFHeaderVersion, VCFHeaderVersion)} - */ + // the VCF version for this header private VCFHeaderVersion vcfHeaderVersion; - // the associated meta data - private final Set mMetaData = new LinkedHashSet(); - private final Map mInfoMetaData = new LinkedHashMap(); - private final Map mFormatMetaData = new LinkedHashMap(); - private final Map mFilterMetaData = new LinkedHashMap(); - private final Map mOtherMetaData = new LinkedHashMap(); - private final Map contigMetaData = new LinkedHashMap<>(); + // header meta data + private final VCFMetaDataLines mMetaData = new VCFMetaDataLines(); - // the list of auxillary tags - private final List mGenotypeSampleNames = new ArrayList(); + // the list of auxiliary tags + private final List mGenotypeSampleNames = new ArrayList<>(); // the character string that indicates meta data public static final String METADATA_INDICATOR = "##"; @@ -108,59 +100,74 @@ public enum HEADER_FIELDS { private boolean writeCommandLine = true; /** - * Create an empty VCF header with no header lines and no samples + * Create an empty VCF header with no header lines and no samples. Defaults to + * VCF version {@link VCFHeader#DEFAULT_VCF_VERSION}. 
*/ public VCFHeader() { - this(Collections.emptySet(), Collections.emptySet()); + this(makeHeaderVersionLineSet(DEFAULT_VCF_VERSION), Collections.emptySet()); } /** - * create a VCF header, given a list of meta data and auxiliary tags + * Create a VCF header, given a list of meta data and auxiliary tags. The provided metadata + * header line list MUST contain a version (fileformat) line in order to establish the version + * for the header, and each metadata line must be valid for that version. * - * @param metaData the meta data associated with this header + * @param metaData the meta data associated with this header + * @throws TribbleException if the provided header line metadata does not include a header line that + * establishes the VCF version for the lines, or if any line does not conform to the established + * version */ public VCFHeader(final Set metaData) { - mMetaData.addAll(metaData); - removeVCFVersionLines(mMetaData); - createLookupEntriesForAllHeaderLines(); - checkForDeprecatedGenotypeLikelihoodsKey(); + this(metaData, Collections.emptySet()); } /** - * Creates a deep copy of the given VCFHeader, duplicating all its metadata and + * Creates a copy of the given VCFHeader, duplicating all it's metadata and * sample names. */ public VCFHeader(final VCFHeader toCopy) { - this(toCopy.mMetaData, toCopy.mGenotypeSampleNames); + this(toCopy.getMetaDataInInputOrder(), toCopy.mGenotypeSampleNames); } /** - * create a VCF header, given a list of meta data and auxiliary tags + * Create a VCF header, given a set of meta data and auxiliary tags. The provided metadata + * list MUST contain a version (fileformat) line in order to establish the version + * for this header, and each metadata line must be valid for that version. 
* - * @param metaData the meta data associated with this header + * @param metaData set of meta data associated with this header * @param genotypeSampleNames the sample names + * @throws TribbleException if the provided header line metadata does not include a header line that + * establishes the VCF version for the lines, or if any line does not conform to the established + * version */ public VCFHeader(final Set metaData, final Set genotypeSampleNames) { - this(metaData, new ArrayList(genotypeSampleNames)); + this(metaData, new ArrayList<>(genotypeSampleNames)); } /** - * create a VCF header, given a target version, a list of meta data and auxiliary tags + * Create a versioned VCF header. * - * @param vcfHeaderVersion the vcf header version for this header, can not be null - * @param metaData the meta data associated with this header - * @param genotypeSampleNames the sample names + * @param metaData The metadata lines for this header. The provided metadata + * header line list MUST contain a version (fileformat) line in order to establish the version + * for this header, and each metadata line must be valid for that version. + * @param genotypeSampleNames Sample names for this header.
+ * @throws TribbleException if the provided header line metadata does not include a header line that + * establishes the VCF version for the lines, or if any line does not conform to the established + * version */ - public VCFHeader(final VCFHeaderVersion vcfHeaderVersion, final Set metaData, final Set genotypeSampleNames) { - this(metaData, new ArrayList(genotypeSampleNames)); - ValidationUtils.nonNull(vcfHeaderVersion); - setVCFHeaderVersion(vcfHeaderVersion); - } - public VCFHeader(final Set metaData, final List genotypeSampleNames) { - this(metaData); + ValidationUtils.nonNull(metaData); + ValidationUtils.nonNull(genotypeSampleNames); - if ( genotypeSampleNames.size() != new HashSet(genotypeSampleNames).size() ) + // propagate the lines and establish the version for this header; note that if multiple version + // lines are presented in the set, a warning will be issued, only the last one will be retained, + // and the header version will be established using the last version line encountered + mMetaData.addMetaDataLines(metaData); + vcfHeaderVersion = initializeHeaderVersion(); + mMetaData.validateMetaDataLines(vcfHeaderVersion); + + checkForDeprecatedGenotypeLikelihoodsKey(); + if ( genotypeSampleNames.size() != new HashSet<>(genotypeSampleNames).size() ) throw new TribbleException.InvalidHeader("BUG: VCF header has duplicate sample names"); mGenotypeSampleNames.addAll(genotypeSampleNames); @@ -168,50 +175,34 @@ public VCFHeader(final Set metaData, final List genotypeS buildVCFReaderMaps(genotypeSampleNames); } - /** - * Establish the header version for this header. If the header version has already been established - * for this header, the new version will be subject to version transition validation. 
- * @param vcfHeaderVersion - * @throws TribbleException if the requested header version is not compatible with the existing version - */ - public void setVCFHeaderVersion(final VCFHeaderVersion vcfHeaderVersion) { - validateVersionTransition(this.vcfHeaderVersion, vcfHeaderVersion); - this.vcfHeaderVersion = vcfHeaderVersion; + /** + * Get the header version for this header. + * @return the VCFHeaderVersion for this header. will not be null + */ + public VCFHeaderVersion getVCFHeaderVersion() { + return vcfHeaderVersion; } /** - * Throw if {@code fromVersion} is not compatible with a {@code toVersion}. Generally, any version before - * version 4.2 can be up-converted to version 4.2, but not to version 4.3. Once a header is established as - * version 4.3, it cannot be up or down converted, and it must remain at version 4.3. - * @param fromVersion current version. May be null, in which case {@code toVersion} can be any version - * @param toVersion new version. Cannot be null. - * @throws TribbleException if {@code fromVersion} is not compatible with {@code toVersion} + * Adds a new line to the VCFHeader. If a duplicate line already exists (same key/ID pair for + * structured lines, or duplicate content for unstructured lines with identical keys), the new + * line will replace the existing line.
+ * + * @param headerLine header line to attempt to add */ - public static void validateVersionTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { - ValidationUtils.nonNull(toVersion); - - final String errorMessageFormatString = "VCF cannot be automatically promoted from %s to %s"; - - // fromVersion can be null, in which case anything goes (any transition from null is legal) - if (fromVersion != null) { - if (toVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - if (!fromVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - // we're trying to go from pre-v4.3 to v4.3+ - throw new TribbleException(String.format(errorMessageFormatString, fromVersion, toVersion)); - } + public void addMetaDataLine(final VCFHeaderLine headerLine) { + // propagate the new line to the metadata lines object + mMetaData.addMetaDataLine(headerLine); - } else if (fromVersion.equals(VCFHeaderVersion.VCF4_3)) { - // we're trying to go from v4.3 to pre-v4.3 - throw new TribbleException(String.format(errorMessageFormatString, fromVersion, toVersion)); - } + // update the current version in case this line triggered a version change + final VCFHeaderVersion newHeaderVersion = mMetaData.getVCFVersion(); + if (!newHeaderVersion.equals(vcfHeaderVersion)) { + validateVersionTransition(vcfHeaderVersion, newHeaderVersion); } - } + vcfHeaderVersion = newHeaderVersion; + headerLine.validateForVersion(vcfHeaderVersion); - /** - * @return the VCFHeaderVersion for this header. Can be null. 
- */ - public VCFHeaderVersion getVCFHeaderVersion() { - return vcfHeaderVersion; + checkForDeprecatedGenotypeLikelihoodsKey(); } /** @@ -220,81 +211,58 @@ public VCFHeaderVersion getVCFHeaderVersion() { * using this header (i.e., read by the VCFCodec) will have genotypes * occurring in the same order * - * @param genotypeSampleNamesInAppearenceOrder genotype sample names, must iterator in order of appearance + * @param genotypeSampleNamesInAppearanceOrder genotype sample names, must iterator in order of appearance */ - private void buildVCFReaderMaps(final Collection genotypeSampleNamesInAppearenceOrder) { - sampleNamesInOrder = new ArrayList(genotypeSampleNamesInAppearenceOrder.size()); - sampleNameToOffset = new HashMap(genotypeSampleNamesInAppearenceOrder.size()); + private void buildVCFReaderMaps(final Collection genotypeSampleNamesInAppearanceOrder) { + sampleNamesInOrder = new ArrayList<>(genotypeSampleNamesInAppearanceOrder.size()); + sampleNameToOffset = new HashMap<>(genotypeSampleNamesInAppearanceOrder.size()); int i = 0; - for (final String name : genotypeSampleNamesInAppearenceOrder) { + for (final String name : genotypeSampleNamesInAppearanceOrder) { sampleNamesInOrder.add(name); sampleNameToOffset.put(name, i++); } Collections.sort(sampleNamesInOrder); } - /** - * Adds a new line to the VCFHeader. If there is an existing header line of the - * same type with the same key, the new line is not added and the existing line - * is preserved. + * Return all contig line in SORTED order, where the sort order is determined by contig index. + * Note that this behavior differs from other VCFHeader methods that return lines in input order. * - * @param headerLine header line to attempt to add - */ - public void addMetaDataLine(final VCFHeaderLine headerLine) { - // Try to create a lookup entry for the new line. If this succeeds (because there was - // no line of this type with the same key), add the line to our master list of header - // lines in mMetaData. 
- if ( addMetadataLineLookupEntry(headerLine) ) { - mMetaData.add(headerLine); - checkForDeprecatedGenotypeLikelihoodsKey(); - } - } - - /** - * @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present + * @return all of the VCF header lines of the ##contig form in SORTED order, or an empty list if none were present */ public List getContigLines() { - // this must preserve input order - return Collections.unmodifiableList(new ArrayList<>(contigMetaData.values())); - } + // this must return lines in SORTED order + return mMetaData.getContigLines(); + } /** - * Returns the contigs in this VCF file as a SAMSequenceDictionary. Returns null if contigs lines are - * not present in the header. If contig lines are missing length tags, they will be created with - * length set to SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH. Records with unknown length will match any record with - * the same name when evaluated by SAMSequenceRecord.isSameSequence. + * Returns the contigs in this VCF Header as a SAMSequenceDictionary. + * + * @return Returns null if contig lines are not present in the header. + * @throws TribbleException if one or more contig lines do not have length + * information. */ public SAMSequenceDictionary getSequenceDictionary() { + // this must ensure that the lines used to create the dictionary are sorted by contig index final List contigHeaderLines = this.getContigLines(); - if (contigHeaderLines.isEmpty()) return null; - - final List sequenceRecords = new ArrayList(contigHeaderLines.size()); - for (final VCFContigHeaderLine contigHeaderLine : contigHeaderLines) { - final SAMSequenceRecord samSequenceRecord = contigHeaderLine.getSAMSequenceRecord(); - sequenceRecords.add(samSequenceRecord); - } - - return new SAMSequenceDictionary(sequenceRecords); + return contigHeaderLines.isEmpty() ? 
null : + new SAMSequenceDictionary( + contigHeaderLines.stream() + .map(contigLine -> contigLine.getSAMSequenceRecord()) + .collect(Collectors.toCollection(ArrayList::new)) + ); } /** - * Completely replaces the contig records in this header with those in the given SAMSequenceDictionary. + * Completely replaces all contig header lines in this header with ones derived from the given SAMSequenceDictionary. + * + * @param dictionary SAMSequenceDictionary to use to create VCFContigHeaderLines for this header */ public void setSequenceDictionary(final SAMSequenceDictionary dictionary) { - this.contigMetaData.clear(); - - // Also need to remove contig record lines from mMetaData - final List toRemove = new ArrayList(); - for (final VCFHeaderLine line : mMetaData) { - if (line instanceof VCFContigHeaderLine) { - toRemove.add(line); - } - } - mMetaData.removeAll(toRemove); - for (final SAMSequenceRecord record : dictionary.getSequences()) { - addMetaDataLine(new VCFContigHeaderLine(record, record.getAssembly())); + getContigLines().forEach(hl -> mMetaData.removeMetaDataLine(hl)); + if (dictionary != null) { + dictionary.getSequences().forEach(r -> addMetaDataLine(new VCFContigHeaderLine(r, r.getAssembly()))); } } @@ -305,128 +273,12 @@ public VariantContextComparator getVCFRecordComparator() { /** * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present */ - public List getFilterLines() { - final List filters = new ArrayList(); - for (final VCFHeaderLine line : mMetaData) { - if ( line instanceof VCFFilterHeaderLine ) { - filters.add((VCFFilterHeaderLine)line); - } - } - return filters; - } - - /** - * @return all of the VCF ID-based header lines in their original file order, or an empty list if none were present - */ - public List getIDHeaderLines() { - final List lines = new ArrayList(); - for (final VCFHeaderLine line : mMetaData) { - if (line instanceof VCFIDHeaderLine) { - lines.add((VCFIDHeaderLine)line); - } - } - 
return lines; - } - - /** - * Remove all lines with a VCF version tag from the provided set of header lines - */ - private void removeVCFVersionLines( final Set headerLines ) { - final List toRemove = new ArrayList(); - for (final VCFHeaderLine line : headerLines) { - if (VCFHeaderVersion.isFormatString(line.getKey())) { - toRemove.add(line); - } - } - headerLines.removeAll(toRemove); - } + public List getFilterLines() { return mMetaData.getFilterLines(); } /** - * Creates lookup table entries for all header lines in mMetaData. + * @return all of the VCFSimpleHeaderLine (ID) lines in their original file order, or an empty list if none are present */ - private void createLookupEntriesForAllHeaderLines() { - for (final VCFHeaderLine line : mMetaData) { - addMetadataLineLookupEntry(line); - } - } - - /** - * Add a single header line to the appropriate type-specific lookup table (but NOT to the master - * list of lines in mMetaData -- this must be done separately if desired). - * - * If a header line is present that has the same key as an existing line, it will not be added. A warning - * will be shown if this occurs when GeneralUtils.DEBUG_MODE_ENABLED is true, otherwise this will occur - * silently. 
- * - * @param line header line to attempt to add to its type-specific lookup table - * @return true if the line was added to the appropriate lookup table, false if there was an existing - * line with the same key and the new line was not added - */ - private boolean addMetadataLineLookupEntry(final VCFHeaderLine line) { - if ( line instanceof VCFInfoHeaderLine ) { - final VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line; - return addMetaDataLineMapLookupEntry(mInfoMetaData, infoLine.getID(), infoLine); - } else if ( line instanceof VCFFormatHeaderLine ) { - final VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; - return addMetaDataLineMapLookupEntry(mFormatMetaData, formatLine.getID(), formatLine); - } else if ( line instanceof VCFFilterHeaderLine ) { - final VCFFilterHeaderLine filterLine = (VCFFilterHeaderLine)line; - return addMetaDataLineMapLookupEntry(mFilterMetaData, filterLine.getID(), filterLine); - } else if ( line instanceof VCFContigHeaderLine ) { - return addContigMetaDataLineLookupEntry((VCFContigHeaderLine) line); - } else { - return addMetaDataLineMapLookupEntry(mOtherMetaData, line.getKey(), line); - } - } - - /** - * Add a contig header line to the lookup list for contig lines (contigMetaData). If there's - * already a contig line with the same ID, does not add the line. - * - * Note: does not add the contig line to the master list of header lines in mMetaData -- - * this must be done separately if desired. 
- * - * @param line contig header line to add - * @return true if line was added to the list of contig lines, otherwise false - */ - private boolean addContigMetaDataLineLookupEntry(final VCFContigHeaderLine line) { - // if we are trying to add a contig for the same ID - if (contigMetaData.containsKey(line.getID())) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Found duplicate VCF contig header lines for " + line.getID() + "; keeping the first only" ); - } - // do not add this contig if it exists - return false; - } - contigMetaData.put(line.getID(), line); - return true; - } - - /** - * Add a header line to the provided map at a given key. If the key already exists, it will not be replaced. - * If it does already exist and GeneralUtils.DEBUG_MODE_ENABLED is true, it will issue warnings about duplicates, - * otherwise it will silently leave the existing key/line pair as is. - * - * Note: does not add the header line to the master list of header lines in mMetaData -- - * this must be done separately if desired. - * - * @param map a map from each key to the associated VCFHeaderLine - * @param key the key to insert this line at - * @param line the line to insert at this key - * @param a type of vcf header line that extends VCFHeaderLine - * @return true if the line was added to the map, false if it was not added because there's already a line with that key - */ - private boolean addMetaDataLineMapLookupEntry(final Map map, final String key, final T line) { - if ( map.containsKey(key) ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Found duplicate VCF header lines for " + key + "; keeping the first only" ); - } - return false; - } - - map.put(key, line); - return true; - } + public List getIDHeaderLines() { return mMetaData.getIDHeaderLines(); } /** * Check for the presence of a format line with the deprecated key {@link VCFConstants#GENOTYPE_LIKELIHOODS_KEY}. 
@@ -435,12 +287,14 @@ private boolean addMetaDataLineMapLookupEntry(final Ma */ private void checkForDeprecatedGenotypeLikelihoodsKey() { if ( hasFormatLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && ! hasFormatLine(VCFConstants.GENOTYPE_PL_KEY) ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no " - + VCFConstants.GENOTYPE_PL_KEY + " field. We now only manage PL fields internally" - + " automatically adding a corresponding PL field to your VCF header"); - } - addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); + logger.warn("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no " + + VCFConstants.GENOTYPE_PL_KEY + " field. We now only manage PL fields internally" + + " automatically adding a corresponding PL field to your VCF header"); + addMetaDataLine(new VCFFormatHeaderLine( + VCFConstants.GENOTYPE_PL_KEY, + VCFHeaderLineCount.G, + VCFHeaderLineType.Integer, + "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); } } @@ -451,48 +305,44 @@ private void checkForDeprecatedGenotypeLikelihoodsKey() { * @return a set of the header fields, in order */ public Set getHeaderFields() { - return new LinkedHashSet(Arrays.asList(HEADER_FIELDS.values())); + return new LinkedHashSet<>(Arrays.asList(HEADER_FIELDS.values())); } /** - * get the meta data, associated with this header, in sorted order + * get the meta data, associated with this header, in input order * * @return a set of the meta data */ - public Set getMetaDataInInputOrder() { - return makeGetMetaDataSet(mMetaData); - } - - public Set getMetaDataInSortedOrder() { - return makeGetMetaDataSet(new TreeSet(mMetaData)); - } + public Set getMetaDataInInputOrder() { return mMetaData.getMetaDataInInputOrder(); } - private Set 
makeGetMetaDataSet(final Set headerLinesInSomeOrder) { - final Set lines = new LinkedHashSet(); - if (vcfHeaderVersion != null && vcfHeaderVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - // always propagate version 4.3+ to prevent these header lines from magically being back-versioned to < 4.3 - lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_3.getFormatString(), VCFHeaderVersion.VCF4_3.getVersionString())); - } else { - lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString())); - } - lines.addAll(headerLinesInSomeOrder); - return Collections.unmodifiableSet(lines); - } + /** + * Get the metadata associated with this header in sorted order. + * + * @return Metadata lines in sorted order (based on lexicographical sort of string encodings). + */ + public Set getMetaDataInSortedOrder() { return mMetaData.getMetaDataInSortedOrder(); } /** * Get the VCFHeaderLine whose key equals key. Returns null if no such line exists - * @param key - * @return + * + * Deprecated. Use {@link #getMetaDataLines(String)}. see https://github.com/samtools/hts-specs/issues/602 + * + * @param key the key to use to find header lines to return + * @return the header line with key "key", or null if none is present */ + @Deprecated // starting after version 2.24.1 public VCFHeaderLine getMetaDataLine(final String key) { - for (final VCFHeaderLine line: mMetaData) { - if ( line.getKey().equals(key) ) - return line; - } - - return null; + return mMetaData.getMetaDataLines(key).stream().findFirst().orElse(null); } + /** + * Get the VCFHeaderLines whose key equals key. Returns an empty list if no such lines exist. 
+ * + * @param key the key to use to find header lines to return + * @return the header lines with key "key" + */ + public Collection getMetaDataLines(final String key) { return mMetaData.getMetaDataLines(key); } + /** * get the genotyping sample names * @@ -532,40 +382,32 @@ public int getColumnCount() { /** * Returns the INFO HeaderLines in their original ordering */ - public Collection getInfoHeaderLines() { - return mInfoMetaData.values(); - } + public Collection getInfoHeaderLines() { return mMetaData.getInfoHeaderLines(); } /** * Returns the FORMAT HeaderLines in their original ordering */ - public Collection getFormatHeaderLines() { - return mFormatMetaData.values(); - } + public Collection getFormatHeaderLines() { return mMetaData.getFormatHeaderLines(); } /** - * @param id the header key name + * @param id the id of the requested header line * @return the meta data line, or null if there is none */ public VCFInfoHeaderLine getInfoHeaderLine(final String id) { - return mInfoMetaData.get(id); + return mMetaData.getInfoHeaderLine(id); } /** - * @param id the header key name + * @param id the id of the requested header line * @return the meta data line, or null if there is none */ - public VCFFormatHeaderLine getFormatHeaderLine(final String id) { - return mFormatMetaData.get(id); - } + public VCFFormatHeaderLine getFormatHeaderLine(final String id) { return mMetaData.getFormatHeaderLine(id); } /** - * @param id the header key name + * @param id the id of the requested header line * @return the meta data line, or null if there is none */ - public VCFFilterHeaderLine getFilterHeaderLine(final String id) { - return mFilterMetaData.get(id); - } + public VCFFilterHeaderLine getFilterHeaderLine(final String id) { return mMetaData.getFilterHeaderLine(id); } public boolean hasInfoLine(final String id) { return getInfoHeaderLine(id) != null; @@ -580,24 +422,82 @@ public boolean hasFilterLine(final String id) { } /** - * @param key the header key name + * Deprecated. 
Use {@link #getOtherHeaderLines(String)}. see https://github.com/samtools/hts-specs/issues/602 + * + * @param key the key of the requested header line * @return the meta data line, or null if there is none */ + @Deprecated // starting after version 2.24.1 (this selects one from what can be many) public VCFHeaderLine getOtherHeaderLine(final String key) { - return mOtherMetaData.get(key); + final Collection otherLines = mMetaData.getOtherHeaderLines(); + for (final VCFHeaderLine next: otherLines) { + if (next.getKey().equals(key)) { + // note that this returns the first match it finds, which is why this method is deprecated + return next; + } + } + return null; } /** - * Returns the other HeaderLines in their original ordering + * Returns all "other" VCFHeaderLines, in their original (input) order, where "other" means any + * VCFHeaderLine that is not a contig, info, format or filter header line. + */ + public Collection getOtherHeaderLines() { return mMetaData.getOtherHeaderLines(); } + + /** + * Returns "other" HeaderLines that have the key "key", in their original ordering, where "other" + * means any VCFHeaderLine that is not a contig, info, format or filter header line. + */ + public List getOtherHeaderLines(final String key) { + return mMetaData.getOtherHeaderLines().stream().filter(hl -> hl.getKey().equals(key)).collect(Collectors.toList()); + } + + /** + * Adds a single "other" VCFHeaderLine that has key "key". Any lines with that key that already exist + * in the header will be removed. This method can only be used to set unique non-structured (non-ID) + * header lines. + * + * @param uniqueLine the unique line to add + * @throws TribbleException if the line to be added is an ID line.
*/ - public Collection getOtherHeaderLines() { - return mOtherMetaData.values(); + public void addOtherHeaderLineUnique(final VCFHeaderLine uniqueLine) { + if (uniqueLine.isIDHeaderLine()) { + throw new TribbleException(String.format("Only non-ID header lines can be added using this method: %s", uniqueLine)); + } + getOtherHeaderLines(uniqueLine.getKey()).forEach(hl -> mMetaData.removeMetaDataLine(hl)); + addMetaDataLine(uniqueLine); + } + + /** + * Returns a single "other" VCFHeaderLine that has the key "key", where "other" + * means any VCFHeaderLine that is not a contig, info, format or filter header line. If more than + * one such line is available, throws a TribbleException. + * + * @param key the key to match + * @return a single VCFHeaderLine, or null if none + * @throws TribbleException if more than one other line matches the key + */ + public VCFHeaderLine getOtherHeaderLineUnique(final String key) { + final List lineList = getOtherHeaderLines(key); + if (lineList.isEmpty()) { + return null; + } else if (lineList.size() > 1) { + throw new TribbleException( + String.format( + "More than one \"other\" header line matches the key \"%s\". Use getOtherHeaderLines() to retrieve multiple lines: %s", + key, + lineList.stream().map(VCFHeaderLine::toString).collect(Collectors.joining(",")))); + } else { + return lineList.get(0); + } } /** * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output. * @return true if additional engine headers will be written to the VCF */ + @Deprecated // starting after version 2.24.1 public boolean isWriteEngineHeaders() { return writeEngineHeaders; } @@ -606,6 +506,7 @@ public boolean isWriteEngineHeaders() { * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output.
* @param writeEngineHeaders true if additional engine headers will be written to the VCF */ + @Deprecated // starting after version 2.24.1 public void setWriteEngineHeaders(final boolean writeEngineHeaders) { this.writeEngineHeaders = writeEngineHeaders; } @@ -614,6 +515,7 @@ public void setWriteEngineHeaders(final boolean writeEngineHeaders) { * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF. * @return true if the command line will be written to the VCF */ + @Deprecated // starting after version 2.24.1 public boolean isWriteCommandLine() { return writeCommandLine; } @@ -622,6 +524,7 @@ public boolean isWriteCommandLine() { * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF. * @param writeCommandLine true if the command line will be written to the VCF */ + @Deprecated // starting after version 2.24.1 public void setWriteCommandLine(final boolean writeCommandLine) { this.writeCommandLine = writeCommandLine; } @@ -640,10 +543,98 @@ public HashMap getSampleNameToOffset() { @Override public String toString() { - final StringBuilder b = new StringBuilder(); - b.append("[VCFHeader:"); - for ( final VCFHeaderLine line : mMetaData ) - b.append("\n\t").append(line); - return b.append("\n]").toString(); + return mMetaData.toString(); } + + /** + * Obtain a valid fileformat/version line for the requestedVersion + * @param requestedVersion the version for which a version line should be obtained + * @return the version line + */ + public static VCFHeaderLine makeHeaderVersionLine(final VCFHeaderVersion requestedVersion) { + return new VCFHeaderLine(requestedVersion.getFormatString(), requestedVersion.getVersionString()); + } + + /** + * Obtain a VCFHeaderLine set containing only a fileformat/version line for the requestedVersion + * @param requestedVersion the version for which a version line should be obtained + * @return a VCFHeaderLine set containing only fileformat/version line 
for the requestedVersion + */ + public static Set makeHeaderVersionLineSet(final VCFHeaderVersion requestedVersion) { + return new LinkedHashSet() {{ add(VCFHeader.makeHeaderVersionLine(requestedVersion)); }}; + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final VCFHeader vcfHeader = (VCFHeader) o; + + if (samplesWereAlreadySorted != vcfHeader.samplesWereAlreadySorted) return false; + if (writeEngineHeaders != vcfHeader.writeEngineHeaders) return false; + if (writeCommandLine != vcfHeader.writeCommandLine) return false; + if (vcfHeaderVersion != vcfHeader.vcfHeaderVersion) return false; + if (!mMetaData.equals(vcfHeader.mMetaData)) return false; + if (mGenotypeSampleNames != null ? !mGenotypeSampleNames.equals(vcfHeader.mGenotypeSampleNames) : + vcfHeader.mGenotypeSampleNames != null) + return false; + if (sampleNamesInOrder != null ? !sampleNamesInOrder.equals(vcfHeader.sampleNamesInOrder) : + vcfHeader.sampleNamesInOrder != null) + return false; + return sampleNameToOffset != null ? sampleNameToOffset.equals(vcfHeader.sampleNameToOffset) : + vcfHeader.sampleNameToOffset == null; + } + + @Override + public int hashCode() { + int result = vcfHeaderVersion.hashCode(); + result = 31 * result + mMetaData.hashCode(); + result = 31 * result + (mGenotypeSampleNames != null ? mGenotypeSampleNames.hashCode() : 0); + result = 31 * result + (samplesWereAlreadySorted ? 1 : 0); + result = 31 * result + (sampleNamesInOrder != null ? sampleNamesInOrder.hashCode() : 0); + result = 31 * result + (sampleNameToOffset != null ? sampleNameToOffset.hashCode() : 0); + result = 31 * result + (writeEngineHeaders ? 1 : 0); + result = 31 * result + (writeCommandLine ? 1 : 0); + return result; + } + + /** + * Establish the version for this header using the (required) ##fileformat metadata line in the metadata list. 
+ * @throws TribbleException if no ##fileformat line is included in the metadata lines + */ + private VCFHeaderVersion initializeHeaderVersion() { + final VCFHeaderVersion metaDataVersion = mMetaData.getVCFVersion(); + if (metaDataVersion == null) { + //we dont relax this even if VCFUtils.getStrictVCFVersionValidation() == false, since that + //would confound subsequent header version management + throw new TribbleException("The VCFHeader metadata must include a ##fileformat (version) header line"); + } + return metaDataVersion; + } + + private void validateVersionTransition( + final VCFHeaderVersion previousVersion, + final VCFHeaderVersion newVersion) { + final int compareTo = newVersion.compareTo(previousVersion); + if (compareTo < 0) { + // We only allow going forward to a newer version, not backwards to an older one, since there + // is really no way to validate old header lines (pre vcfV4.2). The only way to create a header with + // an old version is to create it that way from the start. + // to be created with the old version from the start. 
+ throw new TribbleException(String.format( + "When changing a header version, the new header version %s must be > the previous version %s", + newVersion, + previousVersion)); + } else if (compareTo > 0) { + logger.debug(() -> String.format("Updating VCFHeader version from %s to %s", + previousVersion.getVersionString(), + newVersion.getVersionString())); + + // the version moved forward, so validate ALL of the existing lines in the list to ensure + // that the transition is valid + mMetaData.validateMetaDataLines(newVersion); + } + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java index 0d07a83078..94a3a0849e 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java @@ -26,28 +26,23 @@ package htsjdk.variant.vcf; import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; import java.io.Serializable; import java.util.Map; - +import java.util.Optional; /** - * @author ebanks - *

- * Class VCFHeaderLine - *

- *

- * A class representing a key=value entry in the VCF header - *

+ *

A class representing a key=value entry in the VCF header, and the base class for structured header lines. + * Header lines are immutable, and derived classes should maintain immutability. + *

*/ public class VCFHeaderLine implements Comparable, Serializable { public static final long serialVersionUID = 1L; - protected static final boolean ALLOW_UNBOUND_DESCRIPTIONS = true; - protected static final String UNBOUND_DESCRIPTION = "Not provided in original VCF header"; - - private String mKey = null; - private String mValue = null; + // immutable - we don't want to let the hash value change + private final String mKey; + private final String mValue; /** * create a VCF header line @@ -56,14 +51,9 @@ public class VCFHeaderLine implements Comparable, Serializable { * @param value the value for this header line */ public VCFHeaderLine(String key, String value) { - if ( key == null ) - throw new IllegalArgumentException("VCFHeaderLine: key cannot be null"); - if ( key.contains("<") || key.contains(">") ) - throw new IllegalArgumentException("VCFHeaderLine: key cannot contain angle brackets"); - if ( key.contains("=") ) - throw new IllegalArgumentException("VCFHeaderLine: key cannot contain an equals sign"); mKey = key; mValue = value; + validate(); } /** @@ -76,16 +66,97 @@ public String getKey() { } /** - * Get the value + * Get the value. May be null. * - * @return the value + * @return the value. may be null (for subclass implementations that use structured values) */ public String getValue() { return mValue; } /** - * By default the header lines won't be added to the dictionary, unless this method will be override (for example in FORMAT, INFO or FILTER header lines) + * @return true if this is a structured header line (has a unique ID, and key/value pairs), otherwise false + */ + public boolean isIDHeaderLine() { return false; } + + /** + * Return the unique ID for this line. Returns null iff {@link #isIDHeaderLine()} is false. + * @return the line's ID, or null if isIDHeaderLine() is false + */ + public String getID() { return null; } + + /** + * Validates this header line against {@code vcfTargetVersion}. 
+ * Subclasses can override this to provide line type-specific version validation, and the + * overrides should also call super.getValidationFailure to allow each class in the class hierarchy + * to do class-level validation. + * + * @return Optional containing a {@link VCFValidationFailure} describing validation failure if this + * line fails validation, otherwise Optional.empty(). + */ + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + // If this header line is itself a fileformat/version line, + // make sure it doesn't clash with the requested vcfTargetVersion. + if (VCFHeaderVersion.isFormatString(getKey())) { + if (!vcfTargetVersion.getFormatString().equals(getKey()) || + !vcfTargetVersion.getVersionString().equals(getValue()) + ) { + return Optional.of(new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("The target version (%s) is incompatible with the header line's content.", + vcfTargetVersion))); + } + } else if (getKey().equals(VCFConstants.PEDIGREE_HEADER_KEY)) { + // previous to vcf4.3, PEDIGREE header lines are not modeled as VCFPedigreeHeaderLine because they + // were not structured header lines (had no ID), so we need to check HERE to see if an attempt is + // being made to use one of those old-style pedigree lines in a newer-versioned header, and reject + // it if so + if (vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3) && ! (this instanceof VCFPedigreeHeaderLine)) { + return Optional.of(new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("A pedigree line with no ID cannot be merged with version %s", vcfTargetVersion))); + } + } + + return Optional.empty(); + } + + /** + * Validate that the header line conforms to {@code vcfTargetVersion. 
+ * @param vcfTargetVersion + * @throws {@link TribbleException.VersionValidationFailure} if this header line fails to conform + */ + public void validateForVersion(final VCFHeaderVersion vcfTargetVersion) { + final Optional> error = getValidationFailure(vcfTargetVersion); + if (error.isPresent()) { + throw new TribbleException.VersionValidationFailure(error.get().getSourceMessage()); + } + } + + /** + * Validate a string that is to be used as a unique id or key field. + */ + protected static void validateKeyOrID(final String keyString, final String sourceName) { + ValidationUtils.nonNull(sourceName); + if (keyString == null) { + throw new TribbleException( + String.format("VCFHeaderLine: %s cannot be null or empty", sourceName)); + } + if ( keyString.contains("<") || keyString.contains(">") ) { + throw new TribbleException( + String.format("VCFHeaderLine: %s cannot contain angle brackets", sourceName)); + } + if ( keyString.contains("=") ) { + throw new TribbleException( + String.format("VCFHeaderLine: %s cannot contain an equals sign", sourceName)); + } + } + + /** + * By default the header lines won't be added to the BCF dictionary, unless this method is overriden + * (for example in FORMAT, INFO or FILTER header lines). * * @return false */ @@ -141,10 +212,11 @@ public static boolean isHeaderLine(String line) { } /** - * create a string of a mapping pair for the target VCF version + * create a string of a mapping pair * @param keyValues a mapping of the key->value pairs to output * @return a string, correctly formatted */ + @Deprecated // starting after version 2.24.1 public static String toStringEncoding(Map keyValues) { StringBuilder builder = new StringBuilder(); builder.append('<'); @@ -167,6 +239,13 @@ public static String toStringEncoding(Map keyValues) { return builder.toString(); } + /** + * Validate the state of this header line. Require the key be valid as an "id". 
+ */ + private void validate() { + validateKeyOrID(mKey, "key"); + } + private static String escapeQuotes(final String value) { // java escaping in a string literal makes this harder to read than it should be // without string literal escaping and quoting the regex would be: replaceAll( ([^\])" , $1\" ) diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java index 080153a990..24195c73d3 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java @@ -25,9 +25,78 @@ package htsjdk.variant.vcf; +import htsjdk.utils.ValidationUtils; + /** * the count encodings we use for fields in VCF header lines */ public enum VCFHeaderLineCount { INTEGER, A, R, G, UNBOUNDED; + + // A default int value used to represent an integral count value (not a count *type*) when the + // actual count is derived and not a fixed integer (i.e., when isFixedCount()==false) + public static final int VARIABLE_COUNT = -1; + + /** Return true if this line uses a fixed (integer) count. **/ + public boolean isFixedCount() { return this.equals(INTEGER); } + + /** + * Decode a header line count string and return the corresponding VCFHeaderLineCount enum value. + * If the value is not recognized as a valid constant, assume the string represents a fixed, numeric + * value, and return Integer. The caller should convert and validate the actual value. 
+ * + * @param vcfVersion + * @param countTypeString + * @return + */ + protected static VCFHeaderLineCount decode(final VCFHeaderVersion vcfVersion, final String countTypeString) { + ValidationUtils.nonNull(vcfVersion); + ValidationUtils.nonNull(countTypeString); + + if (countTypeString.equals(VCFConstants.PER_ALTERNATE_COUNT)) { + return A; + } else if (countTypeString.equals(VCFConstants.PER_ALLELE_COUNT)) { + return R; + } else if (countTypeString.equals(VCFConstants.PER_GENOTYPE_COUNT)) { + return G; + } else if ( + (vcfVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && countTypeString.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) || + (!vcfVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && countTypeString.equals(VCFConstants.UNBOUNDED_ENCODING_v3))) { + return VCFHeaderLineCount.UNBOUNDED; + } else { + return VCFHeaderLineCount.INTEGER; // assume integer + } + } + + /** + * Encode a count type as a string suitable for serialization to a VCF header. Note this is + * not version aware and defaults to VCFv4 format. + * + * @param actualCount Must be the special value {@code VARIABLE_COUNT} unless this object is {@code VCFHeaderLineCount.INTEGER}. + * @return String encoding of this enum, or the {@code actualCount} if the type of this count + * is VCFHeaderLineCount.INTEGER. + * + * @throws IllegalArgumentException if {@code actualCount} is not the special value {@code VARIABLE_COUNT} and this + * is not the {@code VCFHeaderLineCount.INTEGER} enum object. 
+ */ + public String encode(final int actualCount) { + if (this != INTEGER && actualCount != VARIABLE_COUNT) { + // Should only supply an actualCount if the count type == INTEGER + throw new IllegalArgumentException("Inconsistent header line number encoding request"); + } + switch (this) { + case A: + return VCFConstants.PER_ALTERNATE_COUNT; + case R: + return VCFConstants.PER_ALLELE_COUNT; + case G: + return VCFConstants.PER_GENOTYPE_COUNT; + case UNBOUNDED: + return VCFConstants.UNBOUNDED_ENCODING_v4; + case INTEGER: + return Integer.toString(actualCount); + } + throw new IllegalStateException("Unexpected VCFHeaderLineCount enum value"); + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java index 6c83574fee..a22ecd2102 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java @@ -34,7 +34,7 @@ import java.util.Map; /** - * A class for translating between vcf header versions + * A class for translating between vcf header versions and corresponding header line parsers. */ public class VCFHeaderLineTranslator { private static final Map mapping; @@ -50,57 +50,57 @@ public class VCFHeaderLineTranslator { mapping = Collections.unmodifiableMap(map); } + /** + * Parse a VCFHeaderLine for the given version. + * + * @param version VCFHeaderVersion of the header line + * @param valueLine the header line string + * @param expectedTagOrder List of expected tags (interpreted differently by the VCF3 and VCF4 parsers). + * @return a mapping of the tags parsed out. Note that the order of attributes is significant (ID must be + * first) and this should return a LinkedHashMap in order to preserve attribute order. 
+ */ public static Map parseLine(VCFHeaderVersion version, String valueLine, List expectedTagOrder) { - return parseLine(version, valueLine, expectedTagOrder, Collections.emptyList()); - } - - public static Map parseLine(VCFHeaderVersion version, String valueLine, List expectedTagOrder, List recommendedTags) { - return mapping.get(version).parseLine(valueLine, expectedTagOrder, recommendedTags); + return mapping.get(version).parseLine(valueLine, expectedTagOrder); } } - +/** + * Parse a VCFHeaderLine. + */ interface VCFLineParser { /** * parse a VCF line - * - * @see #parseLine(String, List, List) VCFv4.2+ recommended tags support - * - * @param valueLine the line - * @param expectedTagOrder List of expected tags - * @return a mapping of the tags parsed out - */ - default Map parseLine(String valueLine, List expectedTagOrder) { - return parseLine(valueLine, expectedTagOrder, Collections.emptyList()); - } - - /** - * parse a VCF line - * - * The recommended tags were introduced in VCFv4.2. - * Older implementations may throw an exception when the recommendedTags field is not empty. - * - * We use a list to represent tags as we assume there will be a very small amount of them, - * so using a {@code Set} is overhead. - * + * + * @see #parseLine(String, List) VCFv4.2+ recommended tags support + * * @param valueLine the line * @param expectedTagOrder List of expected tags - * @param recommendedTags List of tags that may or may not be present. Use an empty list instead of NULL for none. * @return a mapping of the tags parsed out */ - Map parseLine(String valueLine, List expectedTagOrder, List recommendedTags); + Map parseLine(String valueLine, List expectedTagOrder); } - /** * a class that handles the to and from disk for VCF 4 lines */ class VCF4Parser implements VCFLineParser { - + + /** + * Parse a VCFHeaderLine. The expectedTagOrder list prescribes the order in which tags should appear, but + * all tags are treated as optional. 
Additional tags are allowed after the expected tags, and may appear in + * any order. It is the caller's responsibility to validate that all required tags are present and that + * any additional "optional" tags are valid. + * + * @param valueLine the header line string + * @param expectedTagOrder List of tags that are required to appear in the order they're expected. Additional + * "extra" tags are allowed after the tags in this list, and must be validated by + * the caller. + * @return a mapping of all tags parsed out + */ @Override - public Map parseLine(String valueLine, List expectedTagOrder, List recommendedTags) { + public Map parseLine(String valueLine, List expectedTagOrder) { // our return map - Map ret = new LinkedHashMap(); + Map ret = new LinkedHashMap<>(); // a builder to store up characters as we go StringBuilder builder = new StringBuilder(); @@ -159,28 +159,23 @@ public Map parseLine(String valueLine, List expectedTagO throw new TribbleException.InvalidHeader("Unclosed quote in header line value " + valueLine); } - // validate the tags against the expected list - index = 0; + // Validate the order of all discovered tags against requiredTagOrder. All tags are treated as + // "optional". Succeeding does not mean that all expected tags in the list were seen. Also, all + // structured header lines can have "extra" tags, with no order specified, so additional tags + // are tolerated. 
if ( expectedTagOrder != null ) { - if (ret.keySet().isEmpty() && !expectedTagOrder.isEmpty()) { - throw new TribbleException.InvalidHeader("Header with no tags is not supported when there are expected tags in line " + valueLine); - } - for ( String str : ret.keySet() ) { - if (index < expectedTagOrder.size()) { - if (!expectedTagOrder.get(index).equals(str)) { - if (expectedTagOrder.contains(str)) { - throw new TribbleException.InvalidHeader("Tag " + str + " in wrong order (was #" + (index+1) + ", expected #" + (expectedTagOrder.indexOf(str)+1) + ") in line " + valueLine); - } else if (recommendedTags.contains(str)) { - throw new TribbleException.InvalidHeader("Recommended tag " + str + " must be listed after all expected tags in line " + valueLine); - } - else { - throw new TribbleException.InvalidHeader("Unexpected tag " + str + " in line " + valueLine); - } - } + index = 0; + for (String str : ret.keySet()) { + if (index >= expectedTagOrder.size()) { + break; // done - end of requiredTagOrder list + } else if (!expectedTagOrder.get(index).equals(str)) { + throw new TribbleException.InvalidHeader( + String.format("Unexpected tag or tag order for tag \"%s\" in line %s", str, valueLine)); } index++; } } + return ret; } } @@ -188,13 +183,9 @@ public Map parseLine(String valueLine, List expectedTagO class VCF3Parser implements VCFLineParser { @Override - public Map parseLine(String valueLine, List expectedTagOrder, List recommendedTags) { - if (!recommendedTags.isEmpty()) { - throw new TribbleException.InternalCodecException("Recommended tags are not allowed in VCFv3.x"); - } - + public Map parseLine(String valueLine, List expectedTagOrder) { // our return map - Map ret = new LinkedHashMap(); + Map ret = new LinkedHashMap<>(); // a builder to store up characters as we go StringBuilder builder = new StringBuilder(); @@ -211,20 +202,34 @@ public Map parseLine(String valueLine, List expectedTagO for (char c: valueLine.toCharArray()) { switch (c) { case ('\"') : inQuote 
= !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it - case (',') : if (!inQuote) { ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); builder = new StringBuilder(); break; } // drop the current key value to the return map + case (',') : + if (!inQuote) { + ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); + builder = new StringBuilder(); + break; + } // drop the current key value to the return map default: builder.append(c); // otherwise simply append to the current string } index++; } ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); - // validate the tags against the expected list + // Validate that: + // we have no more tags than are expected + // the ones we have are in the expected list + // they appear in the same order as in the expected list + // This does no checking for missing tags; all tags are treated as optional + // index = 0; - if (tagIndex != expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size()); + if (tagIndex != expectedTagOrder.size()) { + throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size()); + } for (String str : ret.keySet()){ - if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); + if (!expectedTagOrder.get(index).equals(str)) { + throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); + } index++; } return ret; } -} +} \ No newline at end of file diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java index 785449de89..88432f0b18 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java @@ -25,9 +25,37 @@ package htsjdk.variant.vcf; +import htsjdk.utils.ValidationUtils; + /** 
* the type encodings we use for fields in VCF header lines */ public enum VCFHeaderLineType { - Integer, Float, String, Character, Flag; + Integer, + Float, + String, + Character, + Flag; + + /** + * Decode a header line count string and return the corresponding VCFHeaderLineCount enum value. + * If the value is not recognized as a valid constant, we assume the string represents a numeric + * value and return Integer. The caller should convert and validate the value. + * + * @param lineTypeString + * @return VCFHeaderLineType for {@code lineTypeString} + */ + protected static VCFHeaderLineType decode(final String lineTypeString) { + ValidationUtils.nonNull(lineTypeString); + return VCFHeaderLineType.valueOf(lineTypeString); + } + + /** + * Encode this line type as a string suitable for serialization to a VCF header. Note this is + * not version specific and defaults to VCFv42. + * + * The serialized encoding is the simple name of the enum constant + * @return string encoding of this line type + */ + String encode() { return this.toString(); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderMerger.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderMerger.java new file mode 100644 index 0000000000..becbf64eb1 --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderMerger.java @@ -0,0 +1,286 @@ +package htsjdk.variant.vcf; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceDictionaryUtils; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Class used to produce a set of header lines resulting from the merger of one or more input VCFHeaders. + *

+ * The resulting lines have a version line matching the highest version of any of the input headers. + *

+ * The headers to be merged must conform to certain requirements: + * Some header sets cannot be merged, and will result in an exception being thrown: + *

    + *
  • Headers must have a version that is at least VCF v4.2. Headers from older versions may not be merged (note + * that older headers that are read from input files are automatically "converted" to VCF v4.2 by VCFCodec. See + * {@link AbstractVCFCodec#setVCFHeader(VCFHeader)}.
  • + *
  • any header that contains a header line that doesn't conform to the resulting (highest) version of any + * header in the merge list
  • + *
  • any header that has a sequence dictionary that is incompatible with any other merged header's + * sequence dictionary. All headers must either share a common sequence dictionary, or have a sequence dictionary + * that is a subset of the common sequence dictionary that is taken from the remaining headers.
  • + *
+ */ +public class VCFHeaderMerger { + + /** + * Merge all header lines in a set of headers into a single set of header lines. The resulting set includes + * all unique lines that appeared in any header; duplicates of lines are excluded from the result set. Equivalent + * header lines are reduced to a single representative header line. The resulting set contains a ##fileformat + * version line for the newest version seen in any of the headers provided in the input header collection, + * and all lines in the merged set are compatible with that version. + * + * @param headers the headers to merge + * @param emitWarnings true if warnings should be emitted + * @return a set of merged VCFHeaderLines + * @throws TribbleException if any header has a version < VCFv4.2, or if any header line in any + * input header is not compatible the newest version selected from amongst all headers provided, or if any + * header has a sequence dictionary that is incompatible with any other header's sequence dictionary + */ + public static Set getMergedHeaderLines(final Collection headers, final boolean emitWarnings) { + ValidationUtils.nonNull(headers, "headers"); + ValidationUtils.validateArg(!headers.isEmpty(), "headers collection must be non empty"); + + // use a VCFMetaDataLines object to accumulate header lines + final VCFMetaDataLines mergedMetaData = new VCFMetaDataLines(); + final HeaderMergeConflictWarnings conflictWarner = new HeaderMergeConflictWarnings(emitWarnings); + + final VCFHeaderVersion newestVersion = getNewestHeaderVersion(headers); + final SAMSequenceDictionary commonSequenceDictionary = getCommonSequenceDictionaryOrThrow(headers, conflictWarner); + + for (final VCFHeader sourceHeader : headers) { + for (final VCFHeaderLine line : sourceHeader.getMetaDataInSortedOrder()) { + final String key = line.getKey(); + if (VCFHeaderVersion.isFormatString(key) || key.equals(VCFHeader.CONTIG_KEY)) { + // drop all version and contig lines, and at the end we'll set the version 
and + // commonSequenceDictionary + continue; + } + + // Structured header lines are only considered equal if they have identical key, id, and + // attribute/value pairs, but for merging we need to reduce lines that have the same key/id pairs + // but different attributes to a single line. So use the more permissive "findEquivalentHeaderLine" + // to detect equivalent lines, and delegate to the individual header line implementations to do the + // smart reconciliation. + final VCFHeaderLine other = mergedMetaData.findEquivalentHeaderLine(line); + if (other != null && !line.equals(other)) { + if (key.equals(VCFConstants.FORMAT_HEADER_KEY)) { + // Delegate to the FORMAT line resolver + mergedMetaData.addMetaDataLine( + VCFFormatHeaderLine.getMergedFormatHeaderLine( + (VCFFormatHeaderLine) line, + (VCFFormatHeaderLine) other, + conflictWarner) + ); + } else if (key.equals(VCFConstants.INFO_HEADER_KEY)) { + // Delegate to the INFO line resolver + mergedMetaData.addMetaDataLine( + VCFInfoHeaderLine.getMergedInfoHeaderLine( + (VCFInfoHeaderLine) line, + (VCFInfoHeaderLine) other, + conflictWarner) + ); + } else if (line.isIDHeaderLine()) { + // equivalent ID header line, but not a compound(format/info) line, and also not strictly equal + // to the existing line: preserve the existing line (this *may* drop attributes/values if the + // dropped line has additional attributes) + conflictWarner.warn( + String.format("Dropping duplicate header line %s during header merge, retaining equivalent line %s", + line, + other)); + } else { + // a non-structured line with a duplicate key of an existing line, but a different value, + // retain the new line in addition to the old one + mergedMetaData.addMetaDataLine(line); + } + } else { + mergedMetaData.addMetaDataLine(line); + } + } + } + return makeMergedMetaDataSet(mergedMetaData, newestVersion, commonSequenceDictionary, conflictWarner); + } + + // Create the final set of all of our merged header lines. 
Start with the version line for the new + // version, add in the lines from the merged set, use the resulting list to create a header, add the common + // sequence dictionary to that, and then extract and return the resulting set of lines in sorted order + private static Set makeMergedMetaDataSet( + final VCFMetaDataLines mergedMetaData, + final VCFHeaderVersion newestVersion, + final SAMSequenceDictionary commonSequenceDictionary, + final HeaderMergeConflictWarnings conflictWarner) { + + if (conflictWarner.emitWarnings) { + mergedMetaData.getValidationErrors(newestVersion) + .forEach(validationError -> conflictWarner.warn(validationError.getFailureMessage())); + } + + final Set mergedLines = VCFHeader.makeHeaderVersionLineSet(newestVersion); + mergedLines.addAll(mergedMetaData.getMetaDataInInputOrder()); + final VCFHeader mergedHeader = new VCFHeader(mergedLines, Collections.emptySet()); + if (commonSequenceDictionary != null) { + mergedHeader.setSequenceDictionary(commonSequenceDictionary); + } else { + conflictWarner.warn( + "The header lines resulting from a header merge contain no contig lines because none " + + "of the input headers contains a sequence dictionary."); + } + + return new LinkedHashSet<>(mergedHeader.getMetaDataInSortedOrder()); + } + + // Find the newest version af any header in the input set, and return that to use as the target + // version for the merged lines. 
+ private static VCFHeaderVersion getNewestHeaderVersion(final Collection vcfHeaders) { + VCFHeaderVersion newestVersion = null; + for (final VCFHeader header : vcfHeaders) { + final VCFHeaderVersion vcfVersion = header.getVCFHeaderVersion(); + if (!vcfVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) { + throw new TribbleException(String.format( + "Cannot merge a VCFHeader with version (%s) that is older than version %s", + header.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_2)); + } + if (newestVersion == null || (vcfVersion.ordinal() > newestVersion.ordinal())) { + newestVersion = vcfVersion; + } + } + return newestVersion; + } + + // Create a common sequence dictionary from the set of dictionaries in VCFHeaders. The headers must + // either have identical dictionaries, or contain a common superset dictionary where individual dictionaries + // contain a dictionary that is subset of that common superset. Otherwise throw. + private static SAMSequenceDictionary getCommonSequenceDictionaryOrThrow( + final Collection headers, + final HeaderMergeConflictWarnings conflictWarner) { + SAMSequenceDictionary candidateDictionary = null; + + // Because we're doing pairwise comparisons and always selecting the best dictionary as + // our running candidate, we need to visit the headers in order of dictionary size + // (largest first). This prevents a premature failure where an individual pairwise + // comparison erroneously fails because the source is pairwise incompatible with the + // running candidate, and the common superset exists but we just haven't seen it yet. 
+ final List headersByDictionarySize = new ArrayList<>(headers); + headersByDictionarySize.sort(((Comparator) + (hdr1, hdr2) -> Integer.compare(getDictionarySize(hdr1), getDictionarySize(hdr2))).reversed()); + + for ( final VCFHeader sourceHeader : headersByDictionarySize ) { + final SAMSequenceDictionary sourceDictionary = sourceHeader.getSequenceDictionary(); + if (sourceDictionary != null) { + if (candidateDictionary == null) { + candidateDictionary = sourceDictionary; + } else { + // first, compare with checkContigOrdering on + final SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility compatibility = + SAMSequenceDictionaryUtils.compareDictionaries( + candidateDictionary, + sourceDictionary, + true); + switch (compatibility) { + case IDENTICAL: // existing candidateDictionary is identical to sourceDictionary, so keep it + case SUPERSET: // existing candidateDictionary is a superset of sourceDictionary, so keep it + break; + + case COMMON_SUBSET: // fall through + case DIFFERENT_INDICES: + // There exists a common subset of contigs, but for merging purposes we have a slightly + // stricter requirement, that one dictionary is a superset of the other. So try the + // comparison again with checkContigOrdering off, in both directions. If one is a + // superset of the other, retain the superset. 
+ if (SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility.SUPERSET == + SAMSequenceDictionaryUtils.compareDictionaries( + candidateDictionary, + sourceDictionary, + false)) { + break; // keep our candidate + } else if (SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility.SUPERSET == + SAMSequenceDictionaryUtils.compareDictionaries( + sourceDictionary, + candidateDictionary, + false)) { + candidateDictionary = sourceDictionary; // take the sourceDictionary as the new candidate + } else { + // dictionaries are disjoint, and we have no basis to choose a merge order for the + // non-common contigs, so give up + throw new TribbleException( + createHeaderDictionaryFailureMessage( + candidateDictionary, sourceHeader, sourceDictionary, compatibility)); + } + break; + + case NO_COMMON_CONTIGS: // no overlap between dictionaries + case UNEQUAL_COMMON_CONTIGS: // common subset has contigs that have the same name but different lengths + case NON_CANONICAL_HUMAN_ORDER: // human reference detected but the order of the contigs is non-standard (lexicographic, for example) + case OUT_OF_ORDER: // the two dictionaries overlap but the overlapping contigs occur in different + default: + throw new TribbleException( + createHeaderDictionaryFailureMessage( + candidateDictionary, sourceHeader, sourceDictionary, compatibility)); + } + } + } else { + conflictWarner.warn( + String.format( + "Merging header with no sequence dictionary: %s", + getHeaderFragmentForDisplay(sourceHeader))); + } + } + return candidateDictionary; + } + + private static Integer getDictionarySize(final VCFHeader hdr) { + final SAMSequenceDictionary dictionary = hdr.getSequenceDictionary(); + return dictionary == null ? 
0 : dictionary.size(); + } + + private static String createHeaderDictionaryFailureMessage( + final SAMSequenceDictionary commonSequenceDictionary, + final VCFHeader sourceHeader, + final SAMSequenceDictionary sourceSequenceDictionary, + final SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility failureReason) { + // return a nice long message that includes as much of the offending context as is reasonable, + // without printing the entire context, since the headers and sequence dictionaries can have + // thousands of entries + return String.format( + "Can't merge VCF headers with incompatible sequence dictionaries, merge failed due to %s:" + + "\n\nHeader dictionary:\n\n%1.2000s\n\nis incompatible with the common dictionary:\n\n%1.2000s\n\n merging VCF header:\n\n%1.2000s\n", + failureReason, + sourceSequenceDictionary.getSequences().stream().map(SAMSequenceRecord::toString).collect(Collectors.joining("\n")), + commonSequenceDictionary.getSequences().stream().map(SAMSequenceRecord::toString).collect(Collectors.joining("\n")), + getHeaderFragmentForDisplay(sourceHeader)); + } + + private static String getHeaderFragmentForDisplay(final VCFHeader sourceHeader) { + return sourceHeader.getContigLines().stream().map(VCFContigHeaderLine::toString).collect(Collectors.joining("\n")); + } + + /** Only displays a warning if warnings are enabled and an identical warning hasn't been already issued */ + static final class HeaderMergeConflictWarnings { + boolean emitWarnings; + final Set alreadyIssued = new HashSet<>(); + + protected HeaderMergeConflictWarnings(final boolean emitWarnings ) { + this.emitWarnings = emitWarnings; + } + + public void warn(final String msg) { + if ( emitWarnings && ! 
alreadyIssued.contains(msg) ) { + alreadyIssued.add(msg); + VCFHeader.logger.warn(msg); + } + } + } +} diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java index 43f43c65c3..ce5ed1920a 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java @@ -26,6 +26,7 @@ package htsjdk.variant.vcf; import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; /** * information that identifies each header version @@ -47,7 +48,7 @@ public enum VCFHeaderVersion { * @param vString the version string * @param fString the format string */ - VCFHeaderVersion(String vString, String fString) { + VCFHeaderVersion(String vString, String fString) { this.versionString = vString; this.formatString = fString; } @@ -67,7 +68,8 @@ public static VCFHeaderVersion toHeaderVersion(String version) { /** * are we a valid version string of some type - * @param version the version string + * @param version the version string (the part of the header line that specifies the version, + * i.e., "VCFv4.3" if the line is "##fileformat=VCFv4.3") * @return true if we're valid of some type, false otherwise */ public static boolean isVersionString(String version){ @@ -75,7 +77,8 @@ public static boolean isVersionString(String version){ } /** - * are we a valid format string for some type + * are we a valid format string for some type (the key part of the header line that specifies a version, + * i.e., "fileformat" if the line is "##fileformat=VCFv4.3") * @param format the format string * @return true if we're valid of some type, false otherwise */ @@ -87,8 +90,16 @@ public static boolean isFormatString(String format){ return false; } - public static VCFHeaderVersion getHeaderVersion(String versionLine) { - String[] lineFields = versionLine.split("="); + /** + * + * @param versionLine a VCF header version line, including the leading meta data indicator, + 
* for example "##fileformat=VCFv4.2" + * @return the VCFHeaderVersion for this string + * @throws TribbleException.InvalidHeader if the string is not a version string for a recognized supported version + */ + public static VCFHeaderVersion fromHeaderVersionLine(final String versionLine) { + ValidationUtils.nonNull(versionLine, "version line"); + final String[] lineFields = versionLine.split("="); if ( lineFields.length != 2 || !isFormatString(lineFields[0].substring(2)) ) throw new TribbleException.InvalidHeader(versionLine + " is not a valid VCF version line"); @@ -98,6 +109,13 @@ public static VCFHeaderVersion getHeaderVersion(String versionLine) { return toHeaderVersion(lineFields[1]); } + /** + * @return A VCF "##fileformat=version" metadata string for the supplied version. + */ + public String toHeaderVersionLine() { + return String.format("%s%s=%s", VCFHeader.METADATA_INDICATOR, getFormatString(), getVersionString()); + } + /** * Utility function to clean up a VCF header string * @@ -118,6 +136,20 @@ public boolean isAtLeastAsRecentAs(final VCFHeaderVersion target) { return this.ordinal() >= target.ordinal(); } + /** + * Determine if two header versions are compatible (header lines from these versions are interchangeable). + * For now, the only incompatibility is between V4.3 and any other version. All other version combinations + * are compatible. 
+ * @param v1 first version to compare + * @param v2 scond version to compare + * @return true if the versions are compatible + */ + //TODO: this method can be removed once this is rebased on the vcf4.3 writing branch + public static boolean versionsAreCompatible(final VCFHeaderVersion v1, final VCFHeaderVersion v2) { + return v1.equals(v2) || + (!v1.isAtLeastAsRecentAs(VCF4_3) && !v2.isAtLeastAsRecentAs(VCF4_3)); + } + public String getVersionString() { return versionString; } @@ -125,4 +157,5 @@ public String getVersionString() { public String getFormatString() { return formatString; } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java index 13df34bc87..12a29a1f6c 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java @@ -26,44 +26,90 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; +import htsjdk.utils.ValidationUtils; + /** - * @author ebanks *

* Class VCFInfoHeaderLine *

*

- * A class representing a key=value entry for INFO fields in the VCF header + * A class representing an INFO field in the VCF header *

*/ public class VCFInfoHeaderLine extends VCFCompoundHeaderLine { - public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.INFO); - } + private static final long serialVersionUID = 1L; + + protected final static Log logger = Log.getInstance(VCFFormatHeaderLine.class); public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.INFO); + super(VCFConstants.INFO_HEADER_KEY, name, count, type, description); + } + + public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) { + super(VCFConstants.INFO_HEADER_KEY, name, count, type, description); } public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description, String source, String version) { - super(name, count, type, description, SupportedHeaderLineType.INFO, source, version); + super(VCFConstants.INFO_HEADER_KEY, name, count, type, description); + this.updateGenericField(SOURCE_ATTRIBUTE, source); + this.updateGenericField(VERSION_ATTRIBUTE, version); } public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, String source, String version) { - super(name, count, type, description, SupportedHeaderLineType.INFO, source, version); + super(VCFConstants.INFO_HEADER_KEY, name, count, type, description); + this.updateGenericField(SOURCE_ATTRIBUTE, source); + this.updateGenericField(VERSION_ATTRIBUTE, version); } public VCFInfoHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.INFO); + super(VCFConstants.INFO_HEADER_KEY, + VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrder), + version + ); + validateForVersion(version); } - // info fields allow flag values - @Override - boolean allowFlagValues() { - return true; + /** + * Compare two 
VCFInfoHeaderLine objects to determine if they have compatible number types, and return a + * VCFInfoHeaderLine that represents the result of merging these two lines. + * + * @param infoLine1 first info line to merge + * @param infoLine2 second info line to merge + * @param conflictWarner conflict warning emitter + * @return a merged VCFInfoHeaderLine + */ + public static VCFInfoHeaderLine getMergedInfoHeaderLine( + final VCFInfoHeaderLine infoLine1, + final VCFInfoHeaderLine infoLine2, + final VCFHeaderMerger.HeaderMergeConflictWarnings conflictWarner) + { + ValidationUtils. nonNull(infoLine1); + ValidationUtils. nonNull(infoLine2); + ValidationUtils. nonNull(conflictWarner); + + // delegate to the generic VCFCompoundHeaderLine merger, passing a resolver lambda + return VCFCompoundHeaderLine.getMergedCompoundHeaderLine( + infoLine1, + infoLine2, + conflictWarner, + (l1, l2) -> new VCFInfoHeaderLine( + l1.getID(), + VCFHeaderLineCount.UNBOUNDED, + l1.getType(), + l1.getDescription()) + ); } @Override public boolean shouldBeAddedToDictionary() { return true; } + + @Override + //TODO: integrate this with the existing validateKeyOrID method + protected boolean validHeaderID(final String id) { + return super.validHeaderID(id) || id.equals(VCFConstants.THOUSAND_GENOMES_KEY); + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java new file mode 100644 index 0000000000..843fdf98cc --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java @@ -0,0 +1,525 @@ +package htsjdk.variant.vcf; + +import htsjdk.annotations.InternalAPI; +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; + +/** + * Class for managing the set of VCFHeaderLines maintained by a VCFHeader. 
*
 * Since this class is used to incrementally build up a set of header lines for use with a VCFHeader,
 * it does not require that the list always contain a fileformat line (it's VCFHeader's job to enforce
 * that condition).
 *
 * This class maintains several invariants:
 *
 * - The list keeps track of the "current version" by tracking whether a version line (a line that
 *   establishes the VCFHeaderVersion, such as format/fileformat line) is contained in the list. If
 *   no version line has been added, the list will have a null current version, and contain 0 version
 *   lines. If a version line has been added, it will have a non-null version, and contain 1 version line.
 *   If the version line is manually removed, the "current version" is reset to null.
 *
 * - Each contig line that is retained is guaranteed to have a unique contig index. This does
 *   NOT guarantee that the contig indices are contiguous, or ordered, only that they are unique.
 *
 * - Each structured (ID) line for a given key will have a unique ID. Any new line that has the same
 *   key/ID pair as an existing line will replace the previous line. (Previous htsjdk implementations
 *   preserve such lines in a master line list, but would silently drop them from the typed
 *   lookup lists, so such duplicates would never be returned in queries for typed lines such as
 *   getInfoHeaderLines(), but would still be serialized on write.)
 *
 * This class does NOT validate that the lines contained are valid for the current version (that is
 * the caller's responsibility).
 */
//Visible to allow disq Kryo registration for serialization
@InternalAPI
final class VCFMetaDataLines implements Serializable {
    public static final long serialVersionUID = 1L;
    protected final static Log logger = Log.getInstance(VCFMetaDataLines.class);

    // Master map of all header lines (including file format version lines and contig header lines)
    private final Map<HeaderLineMapKey, VCFHeaderLine> mMetaData = new LinkedHashMap<>();

    // Map of contig index to contig header line. Must be kept in sync with the mMetaData map
    private final Map<Integer, VCFContigHeaderLine> contigIndexMap = new LinkedHashMap<>();

    // Current version for lines included in the list. May be null. Must be kept in sync with the
    // contents of the mMetaData map.
    private VCFHeaderVersion vcfVersion;

    /**
     * Add all metadata lines from Set. If a duplicate line is encountered (duplicate content for
     * unstructured lines with identical keys, or duplicate key/ID pair for structured lines), only
     * the new line will be retained.
     *
     * @param newMetaData Set of lines to be added to the list.
     * @throws TribbleException if a contig line has the same contig index as an existing contig line
     * that has a different ID
     */
    public void addMetaDataLines(final Set<VCFHeaderLine> newMetaData) {
        newMetaData.forEach(this::addMetaDataLine);
    }

    /**
     * Add a metadata line to the list. If a duplicate line is encountered (duplicate content for
     * unstructured lines with identical keys, or duplicate key/ID pair for structured lines), only
     * the newest line will be retained.
     *
     * @param newMetaDataLine header line to attempt to add
     * @return an existing (equivalent) header line that was replaced by newMetaDataLine, if any,
     * otherwise null. For a version (fileformat) line, the version line that was present before
     * the call, if any.
     * @throws TribbleException if a contig line has the same contig index as an existing contig line
     * that has a different ID
     */
    public VCFHeaderLine addMetaDataLine(final VCFHeaderLine newMetaDataLine) {
        ValidationUtils.nonNull(newMetaDataLine, "metadata line");

        if (VCFHeaderVersion.isFormatString(newMetaDataLine.getKey())) {
            // for format lines, we need to remove any existing format line (which may have a different key
            // than the new line, since old VCF versions use a different format key than modern versions)
            return updateVersion(newMetaDataLine);
        } else {
            // otherwise, see if there is an equivalent line that the new line will replace
            final HeaderLineMapKey newMapKey = makeKeyForLine(newMetaDataLine);
            final VCFHeaderLine equivalentMetaDataLine = mMetaData.get(newMapKey);
            if (equivalentMetaDataLine == null) {
                createNewMapEntry(newMapKey, newMetaDataLine);
            } else {
                replaceExistingMapEntry(newMapKey, equivalentMetaDataLine, newMetaDataLine);
            }
            return equivalentMetaDataLine;
        }
    }

    /**
     * Remove a metadata line from the list. This is the inverse of addMetaDataLine - it removes a
     * line that has an identical key and value as lineToRemove if lineToRemove is an unstructured (non-ID)
     * line, but if lineToRemove is a structured line, it will remove the line that has the same key/ID
     * pair as lineToRemove, regardless of other content.
     *
     * The removed value is returned, and can be used by the caller to determine if the removed line has a
     * different value than the line presented.
     *
     * @param lineToRemove the header line to remove
     * @return The actual headerline removed, or null if no equivalent headerline was found to remove
     */
    public VCFHeaderLine removeMetaDataLine(final VCFHeaderLine lineToRemove) {
        final VCFHeaderLine removedLine = mMetaData.remove(makeKeyForLine(lineToRemove));
        if (removedLine != null) {
            // Only synchronize the dependent version and contig map variables if a line was ACTUALLY
            // removed, and always synchronize on the line that was removed rather than on the query
            // line: for a structured line the two are only guaranteed to share a key/ID pair, so the
            // query line may carry a different contig index than the entry that has to be dropped.
            if (VCFHeaderVersion.isFormatString(removedLine.getKey())) {
                vcfVersion = null;
            } else if (removedLine.isIDHeaderLine() && removedLine.getKey().equals(VCFHeader.CONTIG_KEY)) {
                removeFromContigIndexMap((VCFContigHeaderLine) removedLine);
            }
        }
        return removedLine;
    }

    /**
     * @return the version for any contained version line. may be null if no file format version
     * line is in the list
     */
    public VCFHeaderVersion getVCFVersion() {
        return vcfVersion;
    }

    /**
     * Return the existing line from the list that is "equivalent" to the query line, where
     * equivalent is defined as having the same key and value for unstructured header lines, or the
     * same key and ID, but not necessarily the same value (for structured header lines). The
     * "equivalent" line returned by this method is not guaranteed to be equal to the queryLine,
     * in the case where the queryLine is an ID line.
     *
     * The method is a way to ask "if the queryLine were added to this object via addMetaDataLine, what
     * line, if any, would it replace".
     *
     * @param queryLine the source line to use to check for equivalents
     * @return The existing equivalent header line, otherwise null.
     */
    public VCFHeaderLine findEquivalentHeaderLine(final VCFHeaderLine queryLine) {
        return mMetaData.get(makeKeyForLine(queryLine));
    }

    /**
     * Validate all metadata lines except the file format line against a target version.
     * Throws {@link TribbleException.VersionValidationFailure} if any line is incompatible with the given version.
     * @param targetVersion the target version to validate against
     * @throws TribbleException if any existing line fails to validate against {@code targetVersion}
     */
    //TODO: we need to tell users how to resolve the case where this fails due to version validation
    //i.e, use a custom upgrade tool
    public void validateMetaDataLines(final VCFHeaderVersion targetVersion) {
        mMetaData.values().forEach(headerLine -> {
            if (!VCFHeaderVersion.isFormatString(headerLine.getKey())) {
                headerLine.validateForVersion(targetVersion);
            }
        });
    }

    /**
     * Get a list of validation failures for all metadata lines (except the file format line) against
     * a target version.
     *
     * @param targetVersion the target version to validate against
     * @return a Collection describing the lines that are incompatible with targetVersion. The
     * collection is empty if validation succeeded for all lines.
     */
    // NOTE(review): the element type of the returned collection is not visible in this view of the
    // patch (type arguments appear to have been stripped), so the declaration is reproduced as shown;
    // confirm against the real source.
    public Collection getValidationErrors(final VCFHeaderVersion targetVersion) {
        return mMetaData.values().stream()
                .filter(line -> !VCFHeaderVersion.isFormatString(line.getKey()))
                .map(l -> l.getValidationFailure(targetVersion))
                .filter(o -> o.isPresent())
                .map(o -> o.get())
                .collect(Collectors.toList());
    }

    /**
     * get the meta data, associated with this header, in input order
     *
     * @return an unmodifiable set of the meta data
     */
    public Set<VCFHeaderLine> getMetaDataInInputOrder() {
        return Collections.unmodifiableSet(new LinkedHashSet<>(mMetaData.values()));
    }

    /**
     * get the meta data, associated with this header, in SORTED order
     *
     * @return an unmodifiable set of the meta data
     */
    public Set<VCFHeaderLine> getMetaDataInSortedOrder() {
        // Use an intermediate TreeSet to get the correct sort order (via the header line
        // comparators), but return an (unmodifiable) LinkedHashSet because TreeSet has a
        // `contains` implementation based on comparator equality that can lead to inconsistent
        // results for header line types like VCFContigHeaderLine that have a compareTo
        // implementation that is inconsistent with equals.
        return Collections.unmodifiableSet(new LinkedHashSet<>(new TreeSet<>(mMetaData.values())));
    }

    /**
     * @return all of the structured (ID) lines in their original file order, or an empty list if none were present
     */
    public List<VCFSimpleHeaderLine> getIDHeaderLines() {
        return mMetaData.values().stream()
                .filter(VCFHeaderLine::isIDHeaderLine)
                .map(hl -> (VCFSimpleHeaderLine) hl)
                .collect(Collectors.toCollection(ArrayList::new));
    }

    /**
     * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present
     */
    public List<VCFFilterHeaderLine> getFilterLines() {
        return mMetaData.values().stream()
                .filter(hl -> hl.getKey().equals(VCFConstants.FILTER_HEADER_KEY))
                .map(hl -> (VCFFilterHeaderLine) hl)
                .collect(Collectors.toCollection(ArrayList::new));
    }

    /**
     * @return all of the VCF header lines of the ##contig form in SORTED order, or an empty list if none were present
     */
    public List<VCFContigHeaderLine> getContigLines() {
        return Collections.unmodifiableList(new ArrayList<>(new TreeSet<>(contigIndexMap.values())));
    }

    /**
     * Get the VCFHeaderLine(s) whose key equals key.
     *
     * @param key the VCFHeaderLine key to use to locate the headerline
     * @return collection of VCFHeaderLine, in their original ordering. The collection is empty
     * (never null) if no such line exists.
     */
    public Collection<VCFHeaderLine> getMetaDataLines(final String key) {
        return mMetaData.values().stream()
                .filter(hl -> hl.getKey().equals(key)).collect(Collectors.toList());
    }

    /**
     * Returns the INFO VCFHeaderLine in their original ordering
     */
    public Collection<VCFInfoHeaderLine> getInfoHeaderLines() {
        return mMetaData.values().stream()
                .filter(hl -> hl.getKey().equals(VCFConstants.INFO_HEADER_KEY))
                .map(hl -> (VCFInfoHeaderLine) hl)
                .collect(Collectors.toCollection(ArrayList::new));
    }

    /**
     * Returns the FORMAT VCFHeaderLine in their original ordering
     */
    public Collection<VCFFormatHeaderLine> getFormatHeaderLines() {
        return mMetaData.values().stream()
                .filter(hl -> hl.getKey().equals(VCFConstants.FORMAT_HEADER_KEY))
                .map(hl -> (VCFFormatHeaderLine) hl)
                .collect(Collectors.toCollection(ArrayList::new));
    }

    /**
     * @param id the id of the requested header line
     * @return the VCFHeaderLine info line, or null if there is none
     */
    public VCFInfoHeaderLine getInfoHeaderLine(final String id) {
        return (VCFInfoHeaderLine) mMetaData.get(makeKey(VCFConstants.INFO_HEADER_KEY, id));
    }

    /**
     * @param id the id of the requested header format line
     * @return the meta data line, or null if there is none
     */
    public VCFFormatHeaderLine getFormatHeaderLine(final String id) {
        return (VCFFormatHeaderLine) mMetaData.get(makeKey(VCFConstants.FORMAT_HEADER_KEY, id));
    }

    /**
     * @param id the id of the requested header line
     * @return the meta data line, or null if there is none
     */
    public VCFFilterHeaderLine getFilterHeaderLine(final String id) {
        return (VCFFilterHeaderLine) mMetaData.get(makeKey(VCFConstants.FILTER_HEADER_KEY, id));
    }

    /**
     * Returns the other VCFHeaderLines in their original ordering, where "other" means any
     * VCFHeaderLine that is not a contig, info, format or filter header line.
     */
    public Collection<VCFHeaderLine> getOtherHeaderLines() {
        return mMetaData.values().stream().filter(
                hl ->
                        !hl.getKey().equals(VCFConstants.CONTIG_HEADER_KEY) &&
                        !hl.getKey().equals(VCFConstants.INFO_HEADER_KEY) &&
                        !hl.getKey().equals(VCFConstants.FILTER_HEADER_KEY) &&
                        !hl.getKey().equals(VCFConstants.FORMAT_HEADER_KEY)
                )
                .collect(Collectors.toCollection(ArrayList::new));
    }

    /**
     * The version/fileformat header line if one exists, otherwise null.
     * @return The version/fileformat header line if one exists, otherwise null.
     * @throws IllegalStateException if more than one version line is present
     */
    public VCFHeaderLine getFileFormatLine() {
        // find any existing version line(s). since there are multiple possible keys that
        // represent version lines (old V3 specs used "format" instead of "fileformat")
        final List<VCFHeaderLine> existingVersionLines = mMetaData.values()
                .stream()
                .filter(line -> VCFHeaderVersion.isFormatString(line.getKey()))
                .collect(Collectors.toList());

        // This class doesn't mandate that the list it maintains always contains a fileformat line
        // (it's VCFHeader's job to maintain that condition for the header).
        if (!existingVersionLines.isEmpty()) {
            if (existingVersionLines.size() > 1) {
                throw new IllegalStateException(
                        String.format("The metadata lines class contains more than one version line (%s)",
                                existingVersionLines.stream()
                                        .map(VCFHeaderLine::toString)
                                        .collect(Collectors.joining(","))));
            }
            return existingVersionLines.get(0);
        } else {
            return null;
        }
    }

    @Override
    public String toString() {
        final StringBuilder b = new StringBuilder();
        b.append("[VCFMetaDataLines:");
        for ( final VCFHeaderLine line : mMetaData.values() )
            b.append("\n\t").append(line);
        return b.append("\n]").toString();
    }

    @Override
    public boolean equals(final Object o) {
        if (this == o) return true;
        if (!(o instanceof VCFMetaDataLines)) return false;

        final VCFMetaDataLines that = (VCFMetaDataLines) o;

        return mMetaData.equals(that.mMetaData);
    }

    @Override
    public int hashCode() {
        return mMetaData.hashCode();
    }

    /**
     * Generate the map key for a VCFHeaderLine. If the header line is a structured (ID) line, the map
     * key is the pair of the VCFHeaderLine's key (i.e., the type of the VCFHeaderLine) and the ID for
     * that VCFHeaderLine. Otherwise, the map key is the pair of the VCFHeaderLine's key and the string
     * form of the line's hash code, so that unstructured lines that share a key can coexist as long as
     * they are different lines.
     *
     * @param headerLine the {@link VCFHeaderLine} for which a key should be returned
     * @return the generated HeaderLineMapKey
     */
    private HeaderLineMapKey makeKeyForLine(final VCFHeaderLine headerLine) {
        if (headerLine.isIDHeaderLine()) {
            // these are required to have a unique ID, so use the line key as the key, and the id as the constraint
            return makeKey(headerLine.getKey(), headerLine.getID());
        } else {
            // Allow duplicate unstructured "other" keys, as long as they have different values. Use
            // the line key as the key, and the line hashcode as the constraint.
            //
            // The previous implementation dropped duplicate keys for unstructured lines, but the spec doesn't
            // require these to be unique (only to have unique values). This implementation is more permissive in
            // that it allows lines with duplicate keys to accumulate as long as they have different values, but
            // retains only one with a unique value.
            //
            // NOTE(review): two different lines that share a key and happen to have colliding hash codes
            // map to the same entry here, so the later one would replace the earlier one.
            return makeKey(headerLine.getKey(), Integer.toString(headerLine.hashCode()));
        }
    }

    // Create a VCFHeaderLine hashmap key given a key and an id
    private HeaderLineMapKey makeKey(final String nameSpace, final String id) { return new HeaderLineMapKey(nameSpace, id); }

    private void createNewMapEntry(final HeaderLineMapKey newMapKey, final VCFHeaderLine newMetaDataLine) {
        // for creation of a new entry, call updateMapEntry, but validate that it ALWAYS returns null
        // (that is, that there was no previous entry for this key)
        final VCFHeaderLine existingLine = updateMapEntry(newMapKey, newMetaDataLine);
        if (existingLine != null ) {
            throw new TribbleException(String.format(
                    "Internal header synchronization error - found unexpected previous value %s while adding %s",
                    existingLine,
                    newMetaDataLine));
        }
    }

    // put the line in the master map and, for contig lines, in the contig index map; returns the line
    // previously stored under the same key, if any
    private VCFHeaderLine updateMapEntry(final HeaderLineMapKey newMapKey, final VCFHeaderLine newMetaDataLine) {
        final VCFHeaderLine existingLine = mMetaData.put(newMapKey, newMetaDataLine);
        if (newMetaDataLine.isIDHeaderLine() && newMetaDataLine.getKey().equals(VCFHeader.CONTIG_KEY)) {
            addToContigIndexMap((VCFContigHeaderLine) newMetaDataLine);
        }
        return existingLine;
    }

    // We can't just blindly replace a line in the map based on the key using map.put, because the contig
    // map will get out of sync if the line being replaced is a contig line that has a different contig
    // index than the new line. So replace the line in two steps; first remove the old line and its
    // corresponding contig index entry, then add the new contig line and its corresponding contig
    // index entry.
    private VCFHeaderLine replaceExistingMapEntry(
            final HeaderLineMapKey newMapKey,
            final VCFHeaderLine existingMetaDataLine,
            final VCFHeaderLine newMetaDataLine) {
        removeFromMapOrThrow(existingMetaDataLine);
        logger.debug(() ->
                "Replacing existing header metadata line: " +
                        existingMetaDataLine.toStringEncoding() +
                        " with header metadata line: " +
                        newMetaDataLine.toStringEncoding() +
                        ".");
        createNewMapEntry(newMapKey, newMetaDataLine);
        return existingMetaDataLine;
    }

    // remove a line that is expected to be currently in the list, and throw if the line
    // isn't found, or if the removed line is different (not equal to) the line to remove
    private void removeFromMapOrThrow(final VCFHeaderLine lineToRemove) {
        final VCFHeaderLine removedLine = removeMetaDataLine(lineToRemove);
        if (removedLine == null || !removedLine.equals(lineToRemove)) {
            // sanity check since in this case there should ALWAYS be a non-null line that was removed
            // that is an exact duplicate of the "existingLine"
            throw new TribbleException(String.format("Internal header synchronization error %s/%s",
                    lineToRemove,
                    removedLine == null ? "null line" : removedLine));
        }
    }

    //add the new line to our contig index map
    private void addToContigIndexMap(final VCFContigHeaderLine newContigLine) {
        final VCFContigHeaderLine collidingContigLine = contigIndexMap.get(newContigLine.getContigIndex());
        if (collidingContigLine != null && !collidingContigLine.equals(newContigLine)) {
            if (collidingContigLine.getID().equals(newContigLine.getID())) {
                // the new line has the same contig ID and index as an existing line, but differs in
                // some other attribute, so accept it but log a warning
                logger.warn(String.format(
                        "Replacing an existing contig header line (%s) with a new, similar line that has different attributes (%s)",
                        collidingContigLine,
                        newContigLine));
            } else {
                // the new contig line collides with an existing contig index, but specifies a different
                // contig name, so reject it
                throw new TribbleException(String.format(
                        "Attempt to replace a contig header line (%s) that has the same contig index as an existing line (%s)",
                        newContigLine,
                        collidingContigLine));
            }
        }
        contigIndexMap.put(newContigLine.getContigIndex(), newContigLine);
    }

    // remove the contig header line from the contig index map
    private void removeFromContigIndexMap(final VCFContigHeaderLine existingContigLine) {
        // this remove overload only removes the specified object if its actually in the map
        contigIndexMap.remove(existingContigLine.getContigIndex(), existingContigLine);
    }

    // First, check for existing header lines that establish a header version. Whenever a new one is
    // added with a different version, we need to remove the previous version line, then add the new
    // version line, and update our version state. We have to explicitly call isFormatString, and
    // manually update the lines, since there is more than one header line key that can change the
    // version. In some cases this will result in removing a fileformat/version line with one key and
    // replacing it with a line that has a different key.
    //
    // Returns the version line that was present before the call, or null if there was none.
    private VCFHeaderLine updateVersion(final VCFHeaderLine newMetaDataLine) {
        ValidationUtils.validateArg(
                VCFHeaderVersion.isFormatString(newMetaDataLine.getKey()),
                "a file format line is required");

        final VCFHeaderLine currentVersionLine = getFileFormatLine();
        final VCFHeaderVersion newVCFVersion = VCFHeaderVersion.toHeaderVersion(newMetaDataLine.getValue());

        if (vcfVersion == null) {
            logger.debug("Establishing header metadata version ", newVCFVersion);
        } else if (!newVCFVersion.equals(vcfVersion)) {
            logger.debug(() ->
                    "Updating header metadata version from " +
                            vcfVersion +
                            " to " +
                            newVCFVersion);
            removeFromMapOrThrow(currentVersionLine);
        }

        mMetaData.put(makeKeyForLine(newMetaDataLine), newMetaDataLine);
        vcfVersion = newVCFVersion;
        return currentVersionLine;
    }

    // composite keys used by the metadata lines map
    private static class HeaderLineMapKey implements Serializable {
        public static final long serialVersionUID = 1L;

        // the header line key (the line type, such as INFO or contig)
        final String key;
        // the ID for structured lines, or the stringified line hash code for unstructured lines
        final String constraint;

        public HeaderLineMapKey(final String key, final String constraint) {
            this.key = key;
            this.constraint = constraint;
        }

        public final String getKey() { return key; }
        public final String getConstraint() { return constraint; }

        @Override
        public boolean equals(final Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;

            final HeaderLineMapKey that = (HeaderLineMapKey) o;

            if (!key.equals(that.key)) return false;
            return constraint.equals(that.constraint);
        }

        @Override
        public int hashCode() {
            int result = key.hashCode();
            result = 31 * result + constraint.hashCode();
            return result;
        }
    }

}

diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java
index 991faa806f..d8cd83b8bb 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java @@ -1,13 +1,41 @@ package htsjdk.variant.vcf; +import java.util.Map; +import java.util.Optional; + /** - * A class representing META fields in the VCF header + * A class representing META fields in the VCF header. */ public class VCFMetaHeaderLine extends VCFSimpleHeaderLine { private static final long serialVersionUID = 1L; public VCFMetaHeaderLine(final String line, final VCFHeaderVersion version) { - super(VCFConstants.META_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, null)); + // We need to use the V4 parser directly, since the V3 parser requires ALL permissible/expected + // tags to be supplied, which is inconsistent with modern structured header lines that allow + // other tags. So let validateForVersion detect any version incompatibility, ie., if this is ever + // called with a V3 version. + super(VCFConstants.META_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTagOrder)); + validateForVersion(version); + } + + public VCFMetaHeaderLine(final Map mapping) { + super(VCFConstants.META_HEADER_KEY, mapping); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + return Optional.of( + new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("%s header lines are not allowed in VCF version %s headers", + getKey(), + vcfTargetVersion + ))); + } + + return super.getValidationFailure(vcfTargetVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java index 33f163e8dc..f5bd71c474 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java @@ -1,13 +1,51 @@ package htsjdk.variant.vcf; +import 
java.util.Map; +import java.util.Optional; + /** - * A class representing PEDIGREE fields in the VCF header + * A class representing PEDIGREE fields in the VCF header. Applicable starting with version VCFv4.3. + * + * ##PEDIGREE= + * ##PEDIGREE= + * ##PEDIGREE= + * ##PEDIGREE= */ public class VCFPedigreeHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; public VCFPedigreeHeaderLine(String line, VCFHeaderVersion version) { - super(VCFConstants.PEDIGREE_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, null)); + // We need to use the V4 parser directly, since the V3 parser requires ALL permissible/expected + // tags to be supplied, which is inconsistent with modern structured header lines that allow + // other tags. So let validateForVersion detect any version incompatibility, ie., if this is ever + // called with a V3 version. + super(VCFConstants.PEDIGREE_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTagOrder)); + validateForVersion(version); + } + + public VCFPedigreeHeaderLine(final Map mapping) { + super(VCFConstants.PEDIGREE_HEADER_KEY, mapping); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + // previous to VCFv4.3, the PEDIGREE line did not have an ID. Such lines are not modeled by this + // class (since it is derived from VCFSimpleHeaderLine). Therefore instances of this class always + // represent VCFv4.3 or higher. So throw if the requested version is less than 4.3. 
+ final String message = String.format("%s header lines are not allowed in VCF version %s headers", + getKey(), + vcfTargetVersion + ); + if (VCFUtils.isStrictVCFVersionValidation()) { + return Optional.of(new VCFValidationFailure<>(vcfTargetVersion, this, message)); + } else { + logger.warn(message); + } + } + + return super.getValidationFailure(vcfTargetVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java b/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java index 8fe9b67d6d..cbefb13237 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java +++ b/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java @@ -27,8 +27,7 @@ public VCFRecordCodec(final VCFHeader header) { public VCFRecordCodec(final VCFHeader header, final boolean allowMissingFieldsInHeader) { this.vcfEncoder = new VCFEncoder(header, allowMissingFieldsInHeader, false); - // Explicitly set the version because it's not available in the header itself. - this.vcfDecoder.setVCFHeader(header, VCFHeaderVersion.VCF4_2); + this.vcfDecoder.setVCFHeader(header); } @Override diff --git a/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java index 973a976baa..7c45e9a1b2 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java @@ -1,13 +1,42 @@ package htsjdk.variant.vcf; +import java.util.Map; +import java.util.Optional; + /** - * A class representing SAMPLE fields in the VCF header */ public class VCFSampleHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; public VCFSampleHeaderLine(String line, VCFHeaderVersion version) { - super(VCFConstants.SAMPLE_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, null)); + // We need to use the V4 parser directly, since the V3 parser requires ALL permissible/expected + // tags to be supplied, which is inconsistent with modern structured header lines 
that allow + // other tags. So let validateForVersion detect any version incompatibility, ie., if this is ever + // called with a V3 version. + super(VCFConstants.SAMPLE_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTagOrder)); + validateForVersion(version); + } + + public VCFSampleHeaderLine(final Map mapping) { + super(VCFConstants.SAMPLE_HEADER_KEY, mapping); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0)) { + final String message = String.format("%s header lines are not allowed in VCF version %s headers", + getKey(), + vcfTargetVersion + ); + if (VCFUtils.isStrictVCFVersionValidation()) { + return Optional.of(new VCFValidationFailure<>(vcfTargetVersion, this, message)); + } else { + logger.warn(message); + } + } + + return super.getValidationFailure(vcfTargetVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java index 12b45e5bc9..c0a3abce5c 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java @@ -1,5 +1,5 @@ /* -* Copyright (c) 2012 The Broad Institute +* Copyright (c) 2017 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -25,98 +25,120 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; + +import java.util.ArrayList; import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; - +import java.util.stream.Collectors; /** - * @author ebanks - * - * A class representing a key=value entry for simple VCF header types + * An abstract class representing a VCF metadata line with a key and attribute=value pairs, one of + * which 
represents an ID. The key determines the "type" of the structured header line (i.e., contig, FILTER, + * INFO, ALT, PEDIGREE, META). + * + * The attribute/value pairs are ordered. The first entry in the map must be an ID attribute (used by the + * VCFHeader to ensure that no two structured header lines that share the same key in a given header have the + * same ID). */ public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine { - - private String name; - private Map genericFields = new LinkedHashMap(); + private static final long serialVersionUID = 1L; + protected static final Log logger = Log.getInstance(VCFSimpleHeaderLine.class); public static final String ID_ATTRIBUTE = "ID"; public static final String DESCRIPTION_ATTRIBUTE = "Description"; + public static final String SOURCE_ATTRIBUTE = "Source"; + public static final String VERSION_ATTRIBUTE = "Version"; + + // List of expected tags (for this base class, its ID only; subclasses with more required tags + // should use a custom tag order if more required tags are expected + protected static final List expectedTagOrder = Collections.unmodifiableList( + new ArrayList(1) {{ add(ID_ATTRIBUTE); }}); + + // Map used to retain the attribute/value pairs, in original order. The first entry in the map must be + // an ID field. The entire map must be immutable to prevent hash values from changing, since these are + // often stored in Sets. Its not ACTUALLY immutable in orderto allow for special cases where subclasses + // have to be able to "repair" header lines (via a call to updateGenericField) during constructor validation. + // + // Otherwise the values here should never change during the lifetime of the header line. 
+ private final Map genericFields = new LinkedHashMap(); /** - * create a VCF filter header line - * - * @param key the key for this header line - * @param name the name for this header line - * @param description description for this header line + * Constructor that accepts a key and string that represents the rest of the line (after the "##KEY="). + * @param key the key to use for this line + * @param line the value part of the line + * @param version the target version to validate the line against */ - public VCFSimpleHeaderLine(String key, String name, String description) { - super(key, ""); - Map map = new LinkedHashMap(1); - map.put(DESCRIPTION_ATTRIBUTE, description); - initialize(name, map); + public VCFSimpleHeaderLine(final String key, final String line, final VCFHeaderVersion version) { + this(key, VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrder)); + validate(); + validateForVersion(version); } /** - * create a VCF info header line - * - * @see #VCFSimpleHeaderLine(String, VCFHeaderVersion, String, List, List) VCFv4.2+ recommended tags support + * Key cannot be null or empty. * - * @param line the header line - * @param version the vcf header version - * @param key the key for this header line - * @param expectedTagOrdering the tag ordering expected for this header line + * @param key key to use for this header line. can not be null.
+ * @param id id name to use for this line + * @param description string that will be added as a "Description" tag to this line */ - public VCFSimpleHeaderLine(final String line, final VCFHeaderVersion version, final String key, final List expectedTagOrdering) { - this(line, version, key, expectedTagOrdering, Collections.emptyList()); + public VCFSimpleHeaderLine(final String key, final String id, final String description) { + super(key, ""); + genericFields.put(ID_ATTRIBUTE, id); + genericFields.put(DESCRIPTION_ATTRIBUTE, description); + validate(); } /** - * create a VCF info header line + * Key cannot be null or empty. + * + * Note that for attributes where the order is significant, use a LinkedHashMap + * to ensure that attribute order is honored. * - * @param line the header line - * @param version the vcf header version - * @param key the key for this header line - * @param expectedTagOrdering the tag ordering expected for this header line - * @param recommendedTags tags that are optional for this header line + * @param key key to use for this header line. can not be null. + * @param attributeMapping field mappings to use. may not be null. 
must contain an "ID" field to use as + * a unique id for this line */ - public VCFSimpleHeaderLine(final String line, final VCFHeaderVersion version, final String key, final List expectedTagOrdering, final List recommendedTags) { - this(key, VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering, recommendedTags)); + public VCFSimpleHeaderLine(final String key, final Map attributeMapping) { + super(key, ""); + ValidationUtils.nonNull(attributeMapping, "An attribute map is required for structured header lines"); + genericFields.putAll(attributeMapping); + validate(); } - public VCFSimpleHeaderLine(final String key, final Map mapping) { - super(key, ""); - name = mapping.get(ID_ATTRIBUTE); - initialize(name, mapping); + /** + * @return true if this is a structured header line (has a unique ID and multiple key/value pairs), + * otherwise false + */ + @Override + public boolean isIDHeaderLine() { return true; } + + /** + * Return the unique ID for this line. Returns null iff isIDHeaderLine is false. + * @return + */ + @Override + public String getID() { + return getGenericFieldValue(ID_ATTRIBUTE); } - /** - * Returns the String value associated with the given key. Returns null if there is no value. Key - * must not be null. - */ - String getGenericFieldValue(final String key) { - return this.genericFields.get(key); - } - - protected void initialize(String name, Map genericFields) { - if ( name == null || genericFields == null || genericFields.isEmpty() ) - throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s", super.getKey(), name)); - if ( name.contains("<") || name.contains(">") ) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain angle brackets"); - if ( name.contains("=") ) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain an equals sign"); - - this.name = name; - this.genericFields.putAll(genericFields); + /** + * Returns the String value associated with the given key. 
Returns null if there is no value. Key + * must not be null. + */ + public String getGenericFieldValue(final String key) { + return this.genericFields.get(key); } - @Override - protected String toStringEncoding() { - Map map = new LinkedHashMap(); - map.put(ID_ATTRIBUTE, name); - map.putAll(genericFields); - return getKey() + "=" + VCFHeaderLine.toStringEncoding(map); + /** + * Returns an unmodifiable map of all attributes for this header line. + */ + public Map getGenericFields() { + return Collections.unmodifiableMap(this.genericFields); } @Override @@ -129,28 +151,78 @@ public boolean equals( final Object o ) { } final VCFSimpleHeaderLine that = (VCFSimpleHeaderLine) o; - return name.equals(that.name) && - genericFields.equals(that.genericFields); + return genericFields.equals(that.genericFields); } @Override public int hashCode() { int result = super.hashCode(); - result = 31 * result + name.hashCode(); result = 31 * result + genericFields.hashCode(); return result; } + /** + * create a string of a mapping pair for the target VCF version + * @return a string, correctly formatted + */ @Override - public String getID() { - return name; + protected String toStringEncoding() { + //NOTE: this preserves/round-trips "extra" attributes such as SOURCE, VERSION, etc. + final StringBuilder builder = new StringBuilder(); + builder.append(getKey()); + builder.append("=<"); + builder.append(genericFields.entrySet().stream() + .map(e -> e.getKey() + "=" + quoteAttributeValueForSerialization(e.getKey(), e.getValue())) + .collect(Collectors.joining(","))); + builder.append('>'); + return builder.toString(); } + // Called by VCFInfoHeaderLine to allow repairing of VCFInfoLines that have a Flag type and a non-zero count + // (the combination of which is forbidden by the spec, but which we tolerate for backward compatibility with + // previous versions of htsjdk, which silently repaired these).
+ // + // Updates (or adds) the value for the given attribute in place in the generic fields map. + protected void updateGenericField(final String attributeName, final String value) { + genericFields.put(attributeName, value); + } /** - * @return a map of all pairs of fields and values in this header line + * Return true if the attribute name requires quotes. + * @param attributeName name of the attribute being serialized + * @return boolean indicating whether the value should be embedded in quotes during serialization */ - public Map getGenericFields() { - return Collections.unmodifiableMap(genericFields); + protected boolean getIsQuotableAttribute(final String attributeName) { + // the (VCFv4.3) spec says that the DESCRIPTION, SOURCE, and VERSION attributes should be quoted + // for INFO/FORMAT lines, but htsjdk seems to have historically quoted these for all structured + // header lines + return attributeName.equals(DESCRIPTION_ATTRIBUTE) || + attributeName.equals(SOURCE_ATTRIBUTE) || + attributeName.equals(VERSION_ATTRIBUTE); } - } + + private void validate() { + if ( genericFields.isEmpty() || !genericFields.keySet().stream().findFirst().get().equals(ID_ATTRIBUTE)) { + throw new TribbleException( + String.format("The required ID tag is missing or not the first attribute: key=%s", super.getKey())); + } + validateKeyOrID(getGenericFieldValue(ID_ATTRIBUTE), "ID"); + } + + // Add quotes around any attribute value that contains a space or comma, or is supposed to be quoted by + // definition per the spec (i.e., Description, Source, Version for INFO lines). + private String quoteAttributeValueForSerialization(final String attribute, final String originalValue) { + return originalValue.contains(",") || originalValue.contains(" ") || getIsQuotableAttribute(attribute) ?
+ "\""+ escapeQuotes(originalValue) + "\"" : + originalValue; + } + + private static String escapeQuotes(final String value) { + // java escaping in a string literal makes this harder to read than it should be + // without string literal escaping and quoting the regex would be: replaceAll( ([^\])" , $1\" ) + // ie replace: something that's not a backslash ([^\]) followed by a double quote + // with: the thing that wasn't a backslash ($1), followed by a backslash, followed by a double quote + return value.replaceAll("([^\\\\])\"", "$1\\\\\""); + } + +} \ No newline at end of file diff --git a/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java b/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java index 6e9e713a20..0d61cf35e4 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java +++ b/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java @@ -51,15 +51,21 @@ public class VCFStandardHeaderLines { /** * Enabling this causes us to repair header lines even if only their descriptions differ. */ - private final static boolean REPAIR_BAD_DESCRIPTIONS = false; - private static Standards formatStandards = new Standards(); - private static Standards infoStandards = new Standards(); + private static Standards formatStandards = new Standards<>(); + private static Standards infoStandards = new Standards<>(); /** * Walks over the VCF header and repairs the standard VCF header lines in it, returning a freshly * allocated {@link VCFHeader} with standard VCF header lines repaired as necessary. 
*/ public static VCFHeader repairStandardHeaderLines(final VCFHeader oldHeader) { + if (oldHeader.getVCFHeaderVersion().isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + // the "repair" operation effectively upgrades old header lines to v4.2 format, + // but we don't "back-version" headers that are already newer than v4.2, so skip + // repair for newer headers + return oldHeader; + } + final Set newLines = new LinkedHashSet(oldHeader.getMetaDataInInputOrder().size()); for ( VCFHeaderLine line : oldHeader.getMetaDataInInputOrder() ) { if ( line instanceof VCFFormatHeaderLine ) { @@ -67,17 +73,17 @@ public static VCFHeader repairStandardHeaderLines(final VCFHeader oldHeader) { } else if ( line instanceof VCFInfoHeaderLine) { line = infoStandards.repair((VCFInfoHeaderLine) line); } - newLines.add(line); } + //NOTE that its possible for this to fail in the (probably rare) case that the repaired + //lines (which are "version-less") fail validation against the header version final VCFHeader repairedHeader = new VCFHeader(newLines, oldHeader.getGenotypeSamples()); - final VCFHeaderVersion oldHeaderVersion = oldHeader.getVCFHeaderVersion(); - if (oldHeaderVersion != null && oldHeaderVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - // this needs to maintain version 4.3 (and not back-version to v4.2), so propagate - // the old version only for v4.3 - repairedHeader.setVCFHeaderVersion(oldHeaderVersion); - } + + // the "repair" operation effectively upgrades old header lines to v4.2 format, so the new header should + // reflect that since it may no longer conform to it's original version + // new header reflects that + repairedHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); return repairedHeader; } @@ -159,9 +165,9 @@ private static void registerStandard(final VCFFormatHeaderLine line) { // static { // FORMAT lines - registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); - 
registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality")); - registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter")); @@ -169,16 +175,16 @@ private static void registerStandard(final VCFFormatHeaderLine line) { registerStandard(new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); // INFO lines - registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 
1, VCFHeaderLineType.Float, "Strand Bias")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 
0, VCFHeaderLineType.Flag, "Somatic event")); } private static class Standards { @@ -191,7 +197,7 @@ public T repair(final T line) { final boolean badCount = line.isFixedCount() && ! badCountType && line.getCount() != standard.getCount(); final boolean badType = line.getType() != standard.getType(); final boolean badDesc = ! line.getDescription().equals(standard.getDescription()); - final boolean needsRepair = badCountType || badCount || badType || (REPAIR_BAD_DESCRIPTIONS && badDesc); + final boolean needsRepair = badCountType || badCount || badType; if ( needsRepair ) { if ( GeneralUtils.DEBUG_MODE_ENABLED ) { diff --git a/src/main/java/htsjdk/variant/vcf/VCFUtils.java b/src/main/java/htsjdk/variant/vcf/VCFUtils.java index 6d0e2d7b68..3599da7edc 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFUtils.java +++ b/src/main/java/htsjdk/variant/vcf/VCFUtils.java @@ -25,110 +25,59 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.Defaults; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.FileExtensions; -import htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import java.io.File; +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; import java.io.IOException; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Collectors; public class VCFUtils { private static final Pattern INF_OR_NAN_PATTERN = Pattern.compile("^(?[-+]?)((?(INF|INFINITY))|(?NAN))$", Pattern.CASE_INSENSITIVE); + private static final boolean DEFAULT_VCF_STRICT_VERSION_VALIDATION = true; - public static Set smartMergeHeaders(final Collection headers, final boolean 
emitWarnings) throws IllegalStateException { - // We need to maintain the order of the VCFHeaderLines, otherwise they will be scrambled in the returned Set. - // This will cause problems for VCFHeader.getSequenceDictionary and anything else that implicitly relies on the line ordering. - final LinkedHashMap map = new LinkedHashMap<>(); // from KEY.NAME -> line - final HeaderConflictWarner conflictWarner = new HeaderConflictWarner(emitWarnings); - final Set headerVersions = new HashSet<>(2); + // a global mutable static - is there an alternative ? + // there isn't any other reasonable place to keep this state + private static boolean vcfStrictVersionValidation = true; - // todo -- needs to remove all version headers from sources and add its own VCF version line - for (final VCFHeader source : headers) { - for (final VCFHeaderLine line : source.getMetaDataInSortedOrder()) { - - enforceHeaderVersionMergePolicy(headerVersions, source.getVCFHeaderVersion()); - String key = line.getKey(); - if (line instanceof VCFIDHeaderLine) - key = key + "-" + ((VCFIDHeaderLine) line).getID(); - - if (map.containsKey(key)) { - final VCFHeaderLine other = map.get(key); - if (line.equals(other)) { - // continue; - } else if (!line.getClass().equals(other.getClass())) { - throw new IllegalStateException("Incompatible header types: " + line + " " + other); - } else if (line instanceof VCFFilterHeaderLine) { - final String lineName = ((VCFFilterHeaderLine) line).getID(); - final String otherName = ((VCFFilterHeaderLine) other).getID(); - if (!lineName.equals(otherName)) - throw new IllegalStateException("Incompatible header types: " + line + " " + other); - } else if (line instanceof VCFCompoundHeaderLine) { - final VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine) line; - final VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine) other; - - // if the names are the same, but the values are different, we need to quit - if (!(compLine).equalsExcludingDescription(compOther)) { - if 
(compLine.getType().equals(compOther.getType())) { - // The Number entry is an Integer that describes the number of values that can be - // included with the INFO field. For example, if the INFO field contains a single - // number, then this value should be 1. However, if the INFO field describes a pair - // of numbers, then this value should be 2 and so on. If the number of possible - // values varies, is unknown, or is unbounded, then this value should be '.'. - conflictWarner.warn(line, "Promoting header field Number to . due to number differences in header lines: " + line + " " + other); - compOther.setNumberToUnbounded(); - } else if (compLine.getType() == VCFHeaderLineType.Integer && compOther.getType() == VCFHeaderLineType.Float) { - // promote key to Float - conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); - map.put(key, compOther); - } else if (compLine.getType() == VCFHeaderLineType.Float && compOther.getType() == VCFHeaderLineType.Integer) { - // promote key to Float - conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); - } else { - throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other); - } - } - if (!compLine.getDescription().equals(compOther.getDescription())) - conflictWarner.warn(line, "Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine); - } else { - // we are not equal, but we're not anything special either - conflictWarner.warn(line, "Ignoring header line already in map: this header line = " + line + " already present header = " + other); - } - } else { - map.put(key, line); - } - } - } - - // returning a LinkedHashSet so that ordering will be preserved. Ensures the contig lines do not get scrambled. - return new LinkedHashSet<>(map.values()); - } + /** + * Determine if strict VCF version validation is enabled. Defaults to true. 
Strict version validation + * ensures that all VCF contents (header and variant contexts) conforms to the established header version. + * This should only be disabled when absolutely necessary. + * + * @return true if strict version validation is enabled + */ + public static boolean isStrictVCFVersionValidation() { return Defaults.STRICT_VCF_VERSION_VALIDATION; } - // Reject attempts to merge a VCFv4.3 header with any other version - private static void enforceHeaderVersionMergePolicy( - final Set headerVersions, - final VCFHeaderVersion candidateVersion) { - if (candidateVersion != null) { - headerVersions.add(candidateVersion); - if (headerVersions.size() > 1 && headerVersions.contains(VCFHeaderVersion.VCF4_3)) { - throw new IllegalArgumentException( - String.format("Attempt to merge version %s header with incompatible header version %s", - VCFHeaderVersion.VCF4_3.getVersionString(), - headerVersions.stream() - .filter(hv -> !hv.equals(VCFHeaderVersion.VCF4_3)) - .map(VCFHeaderVersion::getVersionString) - .collect(Collectors.joining(" ")))); - } - } + /** + * The headers passed in must be version >= 4.2 (older headers that are read in via AbstractVCFCodecs + * are "repaired" and stamped as VCF4.2 when they're read in). 
+ * + * @param headers the set of headers to merge + * @param emitWarnings true if warning should be emitted by the merge + * @return + * @throws {@link htsjdk.tribble.TribbleException} if any header has a version < vcfV4.2 + * @throws {@link htsjdk.tribble.TribbleException} if any header cannot be upgraded to the newest version amongst + * all headers provided + */ + public static Set smartMergeHeaders( + final Collection headers, + final boolean emitWarnings) { + return VCFHeaderMerger.getMergedHeaderLines(headers, emitWarnings); } /** @@ -149,8 +98,8 @@ public static Set withUpdatedContigsAsLines(final Set withUpdatedContigsAsLines(final Set oldLines, final File referenceFile, final SAMSequenceDictionary refDict, final boolean referenceNameOnly) { final Set lines = new LinkedHashSet<>(oldLines.size()); - for (final VCFHeaderLine line : oldLines) { - if (line instanceof VCFContigHeaderLine) + for ( final VCFHeaderLine line : oldLines ) { + if ( line.isIDHeaderLine() && line.getKey().equals(VCFConstants.CONTIG_HEADER_KEY) ) continue; // skip old contig lines if (line.getKey().equals(VCFHeader.REFERENCE_KEY)) continue; // skip the old reference key @@ -184,17 +133,14 @@ public static List makeContigHeaderLines(final SAMSequenceD final File referenceFile) { final List lines = new ArrayList<>(); final String assembly = referenceFile != null ? 
getReferenceAssembly(referenceFile.getName()) : null; - for (final SAMSequenceRecord contig : refDict.getSequences()) - lines.add(makeContigHeaderLine(contig, assembly)); + for ( final SAMSequenceRecord contig : refDict.getSequences() ) + lines.add(new VCFContigHeaderLine(contig, assembly)); return lines; } + @Deprecated private static VCFContigHeaderLine makeContigHeaderLine(final SAMSequenceRecord contig, final String assembly) { - final Map map = new LinkedHashMap<>(3); - map.put("ID", contig.getSequenceName()); - map.put("length", String.valueOf(contig.getSequenceLength())); - if (assembly != null) map.put("assembly", assembly); - return new VCFContigHeaderLine(map, contig.getSequenceIndex()); + return new VCFContigHeaderLine(contig, assembly); } /** @@ -295,22 +241,4 @@ else if (refPath.contains("hg38")) return assembly; } - /** - * Only displays a warning if warnings are enabled and an identical warning hasn't been already issued - */ - private static final class HeaderConflictWarner { - boolean emitWarnings; - Set alreadyIssued = new HashSet<>(); - - private HeaderConflictWarner(final boolean emitWarnings) { - this.emitWarnings = emitWarnings; - } - - public void warn(final VCFHeaderLine line, final String msg) { - if (GeneralUtils.DEBUG_MODE_ENABLED && emitWarnings && !alreadyIssued.contains(line.getKey())) { - alreadyIssued.add(line.getKey()); - System.err.println(msg); - } - } - } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFValidationFailure.java b/src/main/java/htsjdk/variant/vcf/VCFValidationFailure.java new file mode 100644 index 0000000000..c6f0ad8708 --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFValidationFailure.java @@ -0,0 +1,63 @@ +package htsjdk.variant.vcf; + +import htsjdk.utils.ValidationUtils; + +/** + * A class representing a VCF validation failure. 
+ * @param a type representing the object that is being validated + */ +class VCFValidationFailure { + private final VCFHeaderVersion targetVersion; + private final T source; + private final String sourceMessage; + + /** + * A VCF validation failure. + * + * @param targetVersion the version for which validation failed. + * @param source the source object being validated + * @param sourceMessage the validation failure reason + */ + public VCFValidationFailure(final VCFHeaderVersion targetVersion, final T source, final String sourceMessage) { + ValidationUtils.nonNull(targetVersion); + ValidationUtils.nonNull(source); + ValidationUtils.nonNull(sourceMessage); + + this.targetVersion = targetVersion; + this.source = source; + this.sourceMessage = sourceMessage; + } + + /** + * @return the source object being validated + */ + public T getSource() { + return source; + } + + /** + * @return The validation failure reason. + */ + public String getSourceMessage() { + return sourceMessage; + } + + /** + * @return A formatted message describing the validation failure reason and target version. + */ + public String getFailureMessage() { + return String.format( + "Failure validating %s for reason %s, target version %s", + source.toString(), + sourceMessage, + targetVersion); + } + + /** + * @return The version for which validation failed. May be null. 
+ */ + public VCFHeaderVersion getTargetVersion() { + return targetVersion; + } + +} diff --git a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java index 37842f8a9a..7167fa8f12 100644 --- a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java +++ b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java @@ -1,11 +1,7 @@ -package org.broadinstitute.hellbender.utils; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.GATKBaseTest; +package htsjdk.samtools; + +import htsjdk.HtsjdkTest; +import htsjdk.samtools.util.Interval; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -14,12 +10,10 @@ import java.util.Arrays; import java.util.List; -import static org.broadinstitute.hellbender.utils.SequenceDictionaryUtils.*; -import static org.broadinstitute.hellbender.utils.SequenceDictionaryUtils.SequenceDictionaryCompatibility.*; - -public final class SequenceDictionaryUtilsUnitTest extends GATKBaseTest { +import static htsjdk.samtools.SAMSequenceDictionaryUtils.*; +import static htsjdk.samtools.SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility.*; - private static Logger logger = LogManager.getLogger(SequenceDictionaryUtilsUnitTest.class); +public final class SAMSequenceDictionaryUtilsTest extends HtsjdkTest { @DataProvider( name = "testSequenceRecordsAreEquivalentDataProvider" ) public Object[][] testSequenceRecordsAreEquivalentDataProvider() { @@ -43,7 +37,7 @@ public Object[][] testSequenceRecordsAreEquivalentDataProvider() { @Test(dataProvider = "testSequenceRecordsAreEquivalentDataProvider") public void testSequenceRecordsAreEquivalent(final SAMSequenceRecord one, final 
SAMSequenceRecord two, final boolean expected){ - final boolean actual = SequenceDictionaryUtils.sequenceRecordsAreEquivalent(one, two); + final boolean actual = SAMSequenceDictionaryUtils.sequenceRecordsAreEquivalent(one, two); Assert.assertEquals(actual, expected); } @@ -59,204 +53,157 @@ public Object[][] generateSequenceDictionaryTestData() { CHR1_HG19_WITH_ATTRIBUTES.setAttribute("M5", "0dec9660ec1efaaf33281c0d5ea2560f"); CHR1_HG19_WITH_ATTRIBUTES.setAttribute("UR", "file:/foo/bar"); - final Class NO_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; - final Class UNEQUAL_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; - final Class NON_CANONICAL_HUMAN_ORDER_EXCEPTION = UserException.LexicographicallySortedSequenceDictionary.class; - final Class OUT_OF_ORDER_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; - final Class DIFFERENT_INDICES_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; - - final List hg19AllContigsIntervalSet = Arrays.asList( - new SimpleInterval("chrM", 1, 1), - new SimpleInterval("chr1", 1, 1), - new SimpleInterval("chr2", 1, 1), - new SimpleInterval("chr10", 1, 1)); - final List hg19PartialContigsIntervalSet = Arrays.asList( - new SimpleInterval("chrM", 1, 1), - new SimpleInterval("chr1", 1, 1)); + final List hg19AllContigsIntervalSet = Arrays.asList( + new Interval("chrM", 1, 1), + new Interval("chr1", 1, 1), + new Interval("chr2", 1, 1), + new Interval("chr10", 1, 1)); + final List hg19PartialContigsIntervalSet = Arrays.asList( + new Interval("chrM", 1, 1), + new Interval("chr1", 1, 1)); return new Object[][] { // Identical dictionaries: - {Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, true, false}, - { Arrays.asList(CHR1_HG19), 
Arrays.asList(CHR1_HG19), IDENTICAL, null, false, true}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, true, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), IDENTICAL, null, false, false}, - { Arrays.asList(CHR1_B37), Arrays.asList(CHR1_B37), IDENTICAL, null, false, false}, - { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), IDENTICAL, null, false, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), IDENTICAL, null, false, false}, - { Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, + {Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), IDENTICAL, false, false}, + { Arrays.asList(CHR1_B37), Arrays.asList(CHR1_B37), IDENTICAL, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), Arrays.asList(CHR1_HG19), IDENTICAL, false, false}, // Dictionaries with a common subset: - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, 
false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, true}, // If requireSuperset == true, we should get an exception upon COMMON_SUBSET: - { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, UserException.IncompatibleSequenceDictionaries.class, true, false}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHRM_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD1), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, true, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHRM_HG19), COMMON_SUBSET, false, false}, + { 
Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD1), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD2), COMMON_SUBSET, false, false}, // If requireSuperset == true, we should get an exception upon COMMON_SUBSET: - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, UserException.IncompatibleSequenceDictionaries.class, true, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, false, false}, // If checkContigOrdering == false, ordering of the common contigs should not matter: - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, 
CHR10_HG19, CHR1_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR10_HG19, CHR1_HG19), COMMON_SUBSET, false, false}, // Dictionaries with no common contigs: - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, true, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, true, true}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, true, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_B37), NO_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), 
Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, false, false}, // Dictionaries with unequal common contigs: - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), Arrays.asList(CHR1_HG19), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, true, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, true, true}, - { Arrays.asList(CHR1_B36), Arrays.asList(CHR1_B37), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B36, CHR2_B36, CHR10_B36), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18, CHR_NONSTANDARD2), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { 
Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), Arrays.asList(CHR1_HG19), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, true, true}, + { Arrays.asList(CHR1_B36), Arrays.asList(CHR1_B37), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B36, CHR2_B36, CHR10_B36), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18, CHR_NONSTANDARD2), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, false, false}, // One or both dictionaries in non-canonical human order: - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, true, true}, - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), 
NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, true, true}, - { Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, true, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, true, true}, + { Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), NON_CANONICAL_HUMAN_ORDER, false, 
true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), NON_CANONICAL_HUMAN_ORDER, false, true}, // If checkContigOrdering == false, we should not get NON_CANONICAL_HUMAN_ORDER: - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), IDENTICAL, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, false}, // Dictionaries with a common subset, but different relative ordering within that subset - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, true, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, true, true}, - { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHRM_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, - { 
Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHRM_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_B37, CHR2_B37), Arrays.asList(CHR2_B37, CHR1_B37), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, true, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHRM_HG19), OUT_OF_ORDER, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHRM_HG19, CHR1_HG19), OUT_OF_ORDER, false, true}, + { Arrays.asList(CHR1_B37, CHR2_B37), Arrays.asList(CHR2_B37, CHR1_B37), OUT_OF_ORDER, false, true}, // If checkContigOrdering == false, we should not get OUT_OF_ORDER: - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET,false, false}, // Dictionaries with a common subset in the same relative order, but with different indices. 
// This will only throw an exception during validation if checkContigOrdering is true // These have checkContigOrdering == true, so we expect DIFFERENT_INDICES and an exception: - { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, false, true}, // Setting requireSuperset == true should make no difference here (we should still get DIFFERENT_INDICES and an exception): - { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), 
Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, false, true}, // Same test cases as above, but these have checkContigOrdering == false, so we expect SUPERSET or COMMON_SUBSET instead of DIFFERENT_INDICES, and no exception: - { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, 
CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, null, false, false}, + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, false, false}, + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), SUPERSET, false, false}, + 
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, false, false}, // tests for SUPERSET - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, true}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, true, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), 
Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, false, false}, // Extended attributes should be ignored when determining whether a superset exists: - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES, CHR10_HG19), SUPERSET, null, false, false} + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES, CHR10_HG19), SUPERSET, false, false} }; } - @Test( dataProvider = "SequenceDictionaryDataProvider" ) - public void testSequenceDictionaryValidation( final List firstDictionaryContigs, - final List secondDictionaryContigs, - final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, //not needed by this test - final Class expectedExceptionUponValidation, - final boolean requireSuperset, - final boolean checkContigOrdering) { - final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); - final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); - final String testDescription = String.format("First dictionary: %s Second dictionary: %s", - SequenceDictionaryUtils.getDictionaryAsString(firstDictionary), - SequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); - Exception exceptionThrown = null; - try { - SequenceDictionaryUtils.validateDictionaries( - "firstDictionary", - firstDictionary, - "secondDictionary", - secondDictionary, - requireSuperset, - checkContigOrdering); - } - catch ( Exception e ) { - exceptionThrown = e; - } - if ( expectedExceptionUponValidation != null ) { - 
Assert.assertTrue(exceptionThrown != null && expectedExceptionUponValidation.isInstance(exceptionThrown), - String.format("Expected exception %s but saw %s instead. %s", - expectedExceptionUponValidation.getSimpleName(), - exceptionThrown == null ? "no exception" : exceptionThrown.getClass().getSimpleName(), - testDescription)); - } - else { - Assert.assertTrue(exceptionThrown == null, - String.format("Expected no exception but saw exception %s instead. %s", - exceptionThrown != null ? exceptionThrown.getClass().getSimpleName() : "none", - testDescription)); - } - } - @Test( dataProvider = "SequenceDictionaryDataProvider" ) public void testSequenceDictionaryComparison( final List firstDictionaryContigs, final List secondDictionaryContigs, - final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, - final Class expectedExceptionUponValidation, + final SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, final boolean requireSuperset, final boolean checkContigOrdering) { final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); final String testDescription = String.format("First dictionary: %s Second dictionary: %s", - SequenceDictionaryUtils.getDictionaryAsString(firstDictionary), - SequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); + SAMSequenceDictionaryUtils.getDictionaryAsString(firstDictionary), + SAMSequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); - final SequenceDictionaryUtils.SequenceDictionaryCompatibility reportedCompatibility = - SequenceDictionaryUtils.compareDictionaries(firstDictionary, secondDictionary, checkContigOrdering); + final SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility reportedCompatibility = + SAMSequenceDictionaryUtils.compareDictionaries(firstDictionary, secondDictionary, checkContigOrdering); 
Assert.assertTrue(reportedCompatibility == dictionaryCompatibility, String.format("Dictionary comparison should have returned %s but instead returned %s. %s", @@ -274,64 +221,8 @@ public Object[][] getStandardValidationIgnoresContigOrderData() { }; } - @Test(dataProvider = "StandardValidationIgnoresContigOrderData") - public void testStandardValidationIgnoresContigOrder( final List firstDictionaryContigs, final List secondDictionaryContigs ) { - final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); - final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); - - // Standard validation (the overload of validateDictionaries() that doesn't take any boolean args) - // should ignore differences in ordering of common contigs, so we shouldn't get an exception here - SequenceDictionaryUtils.validateDictionaries("first", firstDictionary, "second", secondDictionary); - } - - @DataProvider(name = "NonSupersetData") - public Object[][] getNonSupersetData() { - return new Object[][] { - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19) }, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19) } - }; - } - - @Test(dataProvider = "NonSupersetData") - public void testStandardValidationDoesNotRequireSuperset( final List firstDictionaryContigs, final List secondDictionaryContigs ) { - final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); - final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); - - // Standard validation (the overload of validateDictionaries() that doesn't take any boolean args) - // should not require a superset relationship, so we shouldn't get an exception here - SequenceDictionaryUtils.validateDictionaries("first", firstDictionary, "second", secondDictionary); - } - - @Test(dataProvider = "NonSupersetData", expectedExceptions = 
UserException.IncompatibleSequenceDictionaries.class) - public void testCRAMValidationDoesRequireSuperset( final List refDictionaryContigs, final List cramDictionaryContigs ) { - final SAMSequenceDictionary refDictionary = createSequenceDictionary(refDictionaryContigs); - final SAMSequenceDictionary cramDictionary = createSequenceDictionary(cramDictionaryContigs); - - // CRAM validation against the reference SHOULD require a superset relationship, so we should - // get an exception here - SequenceDictionaryUtils.validateCRAMDictionaryAgainstReference(refDictionary, cramDictionary); - } - - @DataProvider(name = "SupersetData") - public Object[][] getSupersetData() { - return new Object[][] { - { Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19)}, //exactly same - { Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19) }, - { Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19) } - }; - } - - @Test(dataProvider = "SupersetData") - public void testCRAMValidationDoesAcceptSuperset( final List refDictionaryContigs, final List cramDictionaryContigs ) { - final SAMSequenceDictionary refDictionary = createSequenceDictionary(refDictionaryContigs); - final SAMSequenceDictionary cramDictionary = createSequenceDictionary(cramDictionaryContigs); - - //In these inputs , cram contigs are subsets of ref contigs and so it should be accepted - SequenceDictionaryUtils.validateCRAMDictionaryAgainstReference(refDictionary, cramDictionary); - } - private SAMSequenceDictionary createSequenceDictionary( final List contigs ) { - final List clonedContigs = new ArrayList(contigs.size()); + final List clonedContigs = new ArrayList<>(contigs.size()); // Clone the individual SAMSequenceRecords to avoid contig-index issues with shared objects // across multiple dictionaries in tests diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java 
b/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java index 91804c48dc..95fb359446 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java @@ -34,9 +34,8 @@ import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFIDHeaderLine; +import htsjdk.variant.vcf.VCFHeaderVersion; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import htsjdk.variant.vcf.VCFSimpleHeaderLine; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -75,22 +74,22 @@ public void testCollapseExpandTest(final List in, final String expectedC public void testCreateDictionary() { final List inputLines = new ArrayList(); int counter = 0; + inputLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFInfoHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFInfoHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); inputLines.add(new VCFHeaderLine("x", "misc")); inputLines.add(new VCFHeaderLine("y", "misc")); - inputLines.add(new VCFSimpleHeaderLine("GATKCommandLine","z","misc")); - 
inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - final int inputLineCounter = counter; - final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet(inputLines)); + inputLines.add(new VCFFilterHeaderLine("aFilter", "misc")); + inputLines.add(new VCFFormatHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFFormatHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet<>(inputLines)); final ArrayList dict = BCF2Utils.makeDictionary(inputHeader); final int dict_size = dict.size(); - Assert.assertEquals(7,dict_size); + Assert.assertEquals(8,dict_size); } /** @@ -115,6 +114,7 @@ public Object[][] makeHeaderOrderTestProvider() { final List extraLines = new ArrayList(); int counter = 0; + inputLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); @@ -142,7 +142,7 @@ public Object[][] makeHeaderOrderTestProvider() { for ( final List permutation : permutations ) { for ( int i = -1; i < inputLines.size(); i++ ) { final List allLines = new ArrayList(inputLines); - if ( i >= 0 ) + if ( i >= 0 && !VCFHeaderVersion.isFormatString(allLines.get(i).getKey()) ) allLines.remove(i); allLines.addAll(permutation); final VCFHeader testHeader = new VCFHeader(new LinkedHashSet(allLines)); @@ -179,8 +179,8 @@ public Object[][] makeHeaderOrderTestProvider() { private static boolean expectedConsistent(final VCFHeader 
combinationHeader, final int minCounterForInputLines) { final List ids = new ArrayList(); for ( final VCFHeaderLine line : combinationHeader.getMetaDataInInputOrder() ) { - if ( line instanceof VCFIDHeaderLine) { - ids.add(Integer.valueOf(((VCFIDHeaderLine) line).getID())); + if ( line.isIDHeaderLine()) { + ids.add(Integer.valueOf(line.getID())); } } diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java index 7a99916c5b..17e2ae3257 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java @@ -75,6 +75,7 @@ private static VCFHeader createFakeHeader() { final SAMSequenceDictionary sequenceDict = createArtificialSequenceDictionary(); final Set metaData = new HashSet<>(); final Set additionalColumns = new HashSet<>(); + metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); metaData.add(new VCFHeaderLine("two", "2")); additionalColumns.add("extra1"); additionalColumns.add("extra2"); diff --git a/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java b/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java index 8cff545f78..e04910eb0e 100644 --- a/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java +++ b/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java @@ -221,6 +221,7 @@ private final static void addHeaderLine(final Set metaData, final private static void createSyntheticHeader() { Set metaData = new TreeSet<>(); + metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); addHeaderLine(metaData, "STRING1", 1, VCFHeaderLineType.String); addHeaderLine(metaData, "END", 1, VCFHeaderLineType.Integer); addHeaderLine(metaData, "STRING3", 3, VCFHeaderLineType.String); diff --git a/src/test/java/htsjdk/variant/variantcontext/writer/AsyncVariantContextWriterUnitTest.java 
b/src/test/java/htsjdk/variant/variantcontext/writer/AsyncVariantContextWriterUnitTest.java index 9e7f7e45cb..379130407c 100644 --- a/src/test/java/htsjdk/variant/variantcontext/writer/AsyncVariantContextWriterUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/writer/AsyncVariantContextWriterUnitTest.java @@ -89,7 +89,7 @@ public void testWriteAndReadAsyncVCFHeaderless() throws IOException { writer.add(createVC(header)); } final VCFCodec codec = new VCFCodec(); - codec.setVCFHeader(header, VCFHeaderVersion.VCF4_2); + codec.setVCFHeader(header); try (final FileInputStream fis = new FileInputStream(fakeVCFFile)) { final AsciiLineReaderIterator iterator = new AsciiLineReaderIterator(new AsciiLineReader(fis)); @@ -110,6 +110,7 @@ public void testWriteAndReadAsyncVCFHeaderless() throws IOException { */ public static VCFHeader createFakeHeader(final Set metaData, final Set additionalColumns, final SAMSequenceDictionary sequenceDict) { + metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); metaData.add(new VCFHeaderLine("two", "2")); additionalColumns.add("extra1"); additionalColumns.add("extra2"); diff --git a/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java b/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java index ca2afcbec0..ceac4f95a8 100644 --- a/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java @@ -42,11 +42,7 @@ import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; -import htsjdk.variant.vcf.VCFCodec; -import htsjdk.variant.vcf.VCFFileReader; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLine; -import htsjdk.variant.vcf.VCFHeaderVersion; +import htsjdk.variant.vcf.*; import java.io.File; import java.io.FileInputStream; @@ -154,7 +150,7 @@ 
public void testWriteAndReadVCFHeaderless(final String extension) throws IOExcep writer.add(createVC(header)); } final VCFCodec codec = new VCFCodec(); - codec.setVCFHeader(header, VCFHeaderVersion.VCF4_2); + codec.setVCFHeader(header); try (BlockCompressedInputStream bcis = new BlockCompressedInputStream(fakeVCFFile); FileInputStream fis = new FileInputStream(fakeVCFFile)) { @@ -228,7 +224,7 @@ public void testChangeHeaderAfterWritingBody() { */ private static VCFHeader createFakeHeader(final Set metaData, final Set additionalColumns, final SAMSequenceDictionary sequenceDict) { - metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString())); + metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString())); metaData.add(new VCFHeaderLine("two", "2")); additionalColumns.add("extra1"); additionalColumns.add("extra2"); @@ -330,6 +326,7 @@ public void TestWritingLargeVCF(final String extension) throws FileNotFoundExcep @DataProvider(name = "vcfExtensionsDataProvider") public Object[][]vcfExtensionsDataProvider() { return new Object[][] { + //TODO: fix this BCF problem! // TODO: BCF doesn't work because header is not properly constructed. 
// {".bcf"}, {FileExtensions.VCF}, diff --git a/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java b/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java index 273b0f24af..97e7493a6f 100644 --- a/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java +++ b/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java @@ -13,6 +13,7 @@ import java.util.Iterator; import java.util.List; + public class AbstractVCFCodecTest extends VariantBaseTest { @Test @@ -31,11 +32,28 @@ public void shouldPreserveSymbolicAlleleCase() { public void TestSpanDelParseAlleles() { final List list = VCF3Codec.parseAlleles("A", Allele.SPAN_DEL_STRING, 0); } + @DataProvider(name="AllVCFCodecs") + public Object[][] allVCFCodecs() { + return new Object[][] { + {new VCF3Codec() }, + {new VCFCodec() }, + }; + } + + @Test(dataProvider = "AllVCFCodecs") + public void TestSpanDelParseAlleles(final AbstractVCFCodec vcfCodec){ + // TODO: why is there no Assert here ?? + vcfCodec.parseAlleles("A", Allele.SPAN_DEL_STRING, 0); + } @Test(expectedExceptions = TribbleException.class) public void TestSpanDelParseAllelesException() { final List list1 = VCF3Codec.parseAlleles(Allele.SPAN_DEL_STRING, "A", 0); } + @Test(dataProvider = "AllVCFCodecs", expectedExceptions = TribbleException.class) + public void TestSpanDelParseAllelesException(final AbstractVCFCodec vcfCodec){ + vcfCodec.parseAlleles(Allele.SPAN_DEL_STRING, "A", 0); + } @DataProvider(name = "thingsToTryToDecode") public Object[][] getThingsToTryToDecode() { @@ -47,16 +65,49 @@ public Object[][] getThingsToTryToDecode() { }; } - @Test(dataProvider = "thingsToTryToDecode") - public void testCanDecodeFile(String potentialInput, boolean canDecode) { - Assert.assertEquals(AbstractVCFCodec.canDecodeFile(potentialInput, VCFCodec.VCF4_MAGIC_HEADER), canDecode); - } + @Test(dataProvider = "thingsToTryToDecode") + public void testCanDecodeFile(String potentialInput, boolean canDecode) { + //TODO: add VCF43Codec when available + //TODO: its 
not sufficient to test for ANY v4 prefix since it will succeed on 4.3 as well + Assert.assertEquals(AbstractVCFCodec.canDecodeFile(potentialInput, VCFCodec.VCF4_MAGIC_HEADER), canDecode); + } - @Test - public void testGetTabixFormat() { - Assert.assertEquals(new VCFCodec().getTabixFormat(), TabixFormat.VCF); - Assert.assertEquals(new VCF3Codec().getTabixFormat(), TabixFormat.VCF); - } + @Test(dataProvider = "AllVCFCodecs") + public void testGetTabixFormat(final AbstractVCFCodec vcfCodec) { + Assert.assertEquals(vcfCodec.getTabixFormat(), TabixFormat.VCF); + } + + @DataProvider(name="otherHeaderLines") + public Object[][] otherHeaderLines() { + return new Object[][] { + { "key=<", new VCFHeaderLine("key", "<") }, + // taken from Funcotator test file as ##ID= + // technically, this is invalid due to the lack of an "ID" attribute, but it should still parse + // into a VCFHeaderLine (but not a VCFSimpleHeaderLine) + { "ID=", + new VCFHeaderLine("ID", "") }, + }; + } + + @Test(dataProvider="otherHeaderLines") + public void testGetOtherHeaderLine(final String headerLineString, final VCFHeaderLine headerLine) { + Assert.assertEquals(new VCFCodec().getOtherHeaderLine(headerLineString, VCFHeaderVersion.VCF4_2), headerLine); + } + + @DataProvider(name="badOtherHeaderLines") + public Object[][] badOtherHeaderLines() { + return new Object[][] { + { "=" }, + { "=<" }, + { "=<>" }, + { "key" }, + }; + } + + @Test(dataProvider="badOtherHeaderLines", expectedExceptions=TribbleException.InvalidHeader.class) + public void testBadOtherHeaderLine(final String headerLineString) { + Assert.assertNull(new VCFCodec().getOtherHeaderLine(headerLineString, VCFHeaderVersion.VCF4_2)); + } @Test public void testGLnotOverridePL() { diff --git a/src/test/java/htsjdk/variant/vcf/VCFAltHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFAltHeaderLineUnitTest.java new file mode 100644 index 0000000000..ed6a1d2b96 --- /dev/null +++ 
b/src/test/java/htsjdk/variant/vcf/VCFAltHeaderLineUnitTest.java @@ -0,0 +1,43 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFAltHeaderLineUnitTest extends HtsjdkTest { + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + }; + } + + private static final String ALT_STRING = ""; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + final VCFAltHeaderLine vcfLine = new VCFAltHeaderLine(ALT_STRING, vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFAltHeaderLine(ALT_STRING, vcfAllowedVersion); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java b/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java index cbc027ab5d..8dbf6dd30d 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java @@ -43,23 +43,31 @@ public class VCFCodec43FeaturesTest extends VariantBaseTest { private Object[][] allVCF43Files() { return new Object[][] { // a .vcf, .vcf.gz, .vcf with UTF8 chars, and .vcf.gz with UTF8 chars - { TEST_43_FILE }, - { TEST_43_UTF8_FILE }, - { 
TEST_43_GZ_FILE }, - { TEST_43_UTF8_GZ_FILE } + + // these first two files have a duplicate INFO header line in them that differ + // from each other only by virtue of having different descriptions: + //WARNING 2021-02-23 15:37:13 VCFMetaDataLines Attempt to add header line (INFO=) collides with existing line header line (INFO=). + // The existing line will be retained + { TEST_43_FILE, 69 }, + { TEST_43_UTF8_FILE, 69 }, + + { TEST_43_GZ_FILE, 70 }, + { TEST_43_UTF8_GZ_FILE, 70 } }; } @Test(dataProvider="all43Files") - public void testReadAllVCF43Features(final Path testFile) { + public void testReadAllVCF43Features(final Path testFile, int expectedHeaderLineCount) { final Tuple> entireVCF = readEntireVCFIntoMemory(testFile); - Assert.assertEquals(entireVCF.a.getMetaDataInInputOrder().size(), 70); + Assert.assertEquals(entireVCF.a.getMetaDataInInputOrder().size(), expectedHeaderLineCount); Assert.assertEquals(entireVCF.b.size(), 25); } @Test(dataProvider="all43Files") - public void testVCF43SampleLine(final Path testFile) { + public void testVCF43SampleLine(final Path testFile, int ignored) { // ##SAMPLE= final VCFSampleHeaderLine sampleLine = getHeaderLineFromTestFile( @@ -77,7 +85,7 @@ public void testVCF43SampleLine(final Path testFile) { } @Test(dataProvider="all43Files") - public void testVCF43AltLine(final Path testFile) { + public void testVCF43AltLine(final Path testFile, int ignored) { // ##ALT= final VCFAltHeaderLine altLine = getHeaderLineFromTestFile( testFile, @@ -90,7 +98,7 @@ public void testVCF43AltLine(final Path testFile) { } @Test(dataProvider="all43Files") - public void testVCF43PedigreeLine(final Path testFile) { + public void testVCF43PedigreeLine(final Path testFile, int ignored) { // ##PEDIGREE= final VCFPedigreeHeaderLine pedigreeLine = getHeaderLineFromTestFile( testFile, @@ -116,7 +124,7 @@ public void testV43PedigreeParsing() { } @Test(dataProvider="all43Files") - public void testVCF43MetaLine(final Path testFile) { + public void 
testVCF43MetaLine(final Path testFile, int ignored) { // ##META= final VCFMetaHeaderLine metaLine = getHeaderLineFromTestFile( testFile, @@ -129,7 +137,7 @@ public void testVCF43MetaLine(final Path testFile) { } @Test(dataProvider="all43Files") - public void testVCF43PercentEncoding(final Path testFile) { + public void testVCF43PercentEncoding(final Path testFile, int ignored) { final Tuple> entireVCF = readEntireVCFIntoMemory(testFile); // 1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE @@ -142,7 +150,7 @@ public void testVCF43PercentEncoding(final Path testFile) { } @Test(dataProvider="all43Files") - public void testSymbolicAlternateAllele(final Path testFile) { + public void testSymbolicAlternateAllele(final Path testFile, int ignored) { final Tuple> entireVCF = readEntireVCFIntoMemory(testFile); // 1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE @@ -241,7 +249,7 @@ public void testVCF43PercentEncodingWithUTF8() { // given a vcf file, extract a header line with the given key and ID, cast to the target // header line type (T) via the transformer function - private static T getHeaderLineFromTestFile( + private static T getHeaderLineFromTestFile( final Path testVCFFile, final String key, final String ID, diff --git a/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java index f94435a833..96924b4e3a 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java @@ -25,22 +25,245 @@ package htsjdk.variant.vcf; +import htsjdk.tribble.TribbleException; import htsjdk.variant.VariantBaseTest; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; /** - * User: ebanks - * Date: Apr 2, 2014 + * Tests for VCFCompoundHeaderLine. 
+ * + * NOTE: This class uses VCFInfoHeaderLine instances to test shared VCFCompoundHeaderLine functionality since + * VCFCompoundHeaderLine is abstract. */ public class VCFCompoundHeaderLineUnitTest extends VariantBaseTest { + @DataProvider (name = "badOrMissingAttributes") + public Object[][] getMissingAttributes() { + return new Object[][] { + {""}, // no Type + {""}, // no Type + {""}, // no Number + {""}, // bogus Type + {""}, // bogus Number + }; + } + + @Test(dataProvider= "badOrMissingAttributes", expectedExceptions=TribbleException.class) + public void testBadOrMissingAttributes(final String lineString) { + new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + } + + @DataProvider (name = "acceptedAttributes") + public Object[][] getAcceptedAttributes() { + return new Object[][] { + {"", "Description", "foo"}, + //next two cases from https://github.com/samtools/htsjdk/issues/517 + {"", "Version", "3"}, + {"", "Source", "mySource"}, + }; + } + + @Test(dataProvider= "acceptedAttributes") + public void testAcceptedAttributes(final String lineString, final String attribute, final String expectedValue) { + final VCFCompoundHeaderLine headerline = new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(headerline.getGenericFieldValue(attribute), expectedValue); + } + + @DataProvider (name = "invalidIDs") + public Object[][] getInvalidLines() { + return new Object[][] { + // ID cannot start with number + {""}, + // ID cannot start with '.' + {""}, + // Test that IDs with the special thousand genomes key as a prefix are rejected + // The thousand genomes key is only accepted for VCFInfoHeaderLine and is tested in VCFInfoHeaderLineUnitTest + {""}, + // Contains invalid character '&' + {""}, + }; + } + + @Test(dataProvider = "invalidIDs", expectedExceptions = TribbleException.VersionValidationFailure.class) + public void testGetValidationError(final String lineString) { + // TODO change to VCFHeader.DEFAULT_VCF_VERSION + 
new VCFInfoHeaderLine(lineString, VCFHeaderVersion.VCF4_3); + } + + @DataProvider (name = "headerLineTypes") + public Object[][] getHeaderLineTypes() { + return new Object[][] { + {"", VCFHeaderLineType.Float}, + {"", VCFHeaderLineType.Integer}, + {"", VCFHeaderLineType.String}, + {"", VCFHeaderLineType.Character}, + // Number must be 0 for flag type + {"", VCFHeaderLineType.Flag}, + }; + } + + @Test(dataProvider = "headerLineTypes") + public void testGetType(final String lineString, final VCFHeaderLineType expectedType) { + final VCFCompoundHeaderLine headerline = new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(headerline.getType(), expectedType); + } + + @DataProvider (name = "headerLineCountTypes") + public Object[][] getLineCountTypes() { + return new Object[][] { + {"", VCFHeaderLineCount.A}, + {"", VCFHeaderLineCount.R}, + {"", VCFHeaderLineCount.G}, + {"", VCFHeaderLineCount.INTEGER}, + {"", VCFHeaderLineCount.UNBOUNDED}, + }; + } + + @Test(dataProvider= "headerLineCountTypes") + public void testGetLineCountType(final String lineString, final VCFHeaderLineCount expectedCountType) { + final VCFCompoundHeaderLine headerline = new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(headerline.getCountType(), expectedCountType); + Assert.assertEquals(headerline.isFixedCount(), expectedCountType == VCFHeaderLineCount.INTEGER); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectIntegerTypeWithNegativeCount() { + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION); + } + @Test - public void supportsVersionFields() { - final String line = ""; - new VCFInfoHeaderLine(line, VCFHeaderVersion.VCF4_2); - // if we don't support version fields then we should fail before we ever get here - Assert.assertTrue(true); + public void testRepairFlagTypeWithNegativeCount() { + final VCFInfoHeaderLine infoLine = new VCFInfoHeaderLine("", + VCFHeader.DEFAULT_VCF_VERSION); + 
Assert.assertEquals(infoLine.getCount(), 0); } + + @DataProvider (name = "equalsData") + public Object[][] getEqualsData() { + return new Object[][] { + //pos + {"", + "", true}, + {"", + "", true}, + {"", + "", true}, + {"", + "", true}, + {"", + "", true}, + + //neg + {"", + "", false}, // different ID + {"", + "", false}, // different Type + {"", + "", false}, // different Number + {"", + "", false}, // different integer Number + {"", + "", false}, // different description + {"", + "", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), // merged result, promote to float + }, + { + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION) // merged result, promote to float + }, + { + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION) // merged result, resolve as new unbounded + }, + }; + } + + @Test(dataProvider = "mergeCompatibleInfoLines") + public void testMergeIncompatibleInfoLines(final VCFInfoHeaderLine line1, final VCFInfoHeaderLine line2, final VCFInfoHeaderLine expectedLine) { + VCFCompoundHeaderLine mergedLine = VCFCompoundHeaderLine.getMergedCompoundHeaderLine( + line1, + line2, + new VCFHeaderMerger.HeaderMergeConflictWarnings(false), + (l1, l2) -> new VCFInfoHeaderLine( + l1.getID(), + VCFHeaderLineCount.UNBOUNDED, + l1.getType(), + l1.getDescription()) + ); + Assert.assertEquals(mergedLine, expectedLine); + } + + @DataProvider(name = "mergeIncompatibleInfoLines") + public Object[][] getMergeIncompatibleInfoLines() { + return new Object[][]{ + { + new VCFInfoHeaderLine("",VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + }, + { + new VCFInfoHeaderLine("", 
VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + }, + }; + } + + @Test(dataProvider = "mergeIncompatibleInfoLines", expectedExceptions=TribbleException.class) + public void testMergeIncompatibleInfoLines(final VCFInfoHeaderLine line1, final VCFInfoHeaderLine line2) { + VCFCompoundHeaderLine.getMergedCompoundHeaderLine( + line1, + line2, + new VCFHeaderMerger.HeaderMergeConflictWarnings(false), + (l1, l2) -> { throw new IllegalArgumentException("lambda should never execute - this exception should never be thrown"); } + ); + } + + @Test + public void testEncodeWithUnescapedQuotes() { + + VCFFilterHeaderLine unescapedFilterLine = new VCFFilterHeaderLine( + "aFilter", + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); + + final String encodedAttributes = unescapedFilterLine.toStringEncoding(); + assertNotNull(encodedAttributes); + + final String expectedEncoding = "FILTER="; + assertEquals(encodedAttributes, expectedEncoding); + } + + @Test + public void testEncodeWithEscapedQuotes() { + + VCFFilterHeaderLine escapedFilterLine = new VCFFilterHeaderLine("aFilter", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || ANNOTATION <= 2.0]"); + final String encodedAttributes = escapedFilterLine.toStringEncoding(); + assertNotNull(encodedAttributes); + + final String expectedEncoding = "FILTER="; + assertEquals(encodedAttributes, expectedEncoding); + } + } diff --git a/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java new file mode 100644 index 0000000000..ad33575bef --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java @@ -0,0 +1,184 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import 
org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.TreeSet; + +public class VCFContigHeaderLineUnitTest extends HtsjdkTest { + + @DataProvider(name = "allowedIDs") + public Object[][] getAllowedIDs() { + return new Object[][]{ + {"", "1"}, + {"", "10"}, + {"", "X"}, + {"", "Y"}, + {"", "MT"}, + {"", "NC_007605"}, + {"", "GL000191.1"}, + {"", "HLA-A*01:01:01:01"}, //https://github.com/samtools/hts-specs/issues/124 + }; + } + + @Test(dataProvider= "allowedIDs") + public void testAllowedIDs(final String lineString, final String expectedIDString) { + final VCFContigHeaderLine headerline = new VCFContigHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION, 0); + Assert.assertEquals(headerline.getID(), expectedIDString); + } + + @DataProvider(name = "invalidIDs") + public Object[][] getInvalidIDs() { + return new Object[][]{ + // IDs cannot start with '*' + {""}, + // IDs cannot start with '=' + // The parser cannot handle attributes starting with '=' so we cannot express this test case + // {""}, + // IDs cannot contain '{' + {""}, + }; + } + + @Test(dataProvider = "invalidIDs", expectedExceptions = TribbleException.VersionValidationFailure.class) + public void testInvalidIDs(final String lineString) { + // TODO change to VCFHeader.DEFAULT_VCF_VERSION + new VCFContigHeaderLine(lineString, VCFHeaderVersion.VCF4_3, 1); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectNegativeIndex() { + new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, -1); + } + + @DataProvider(name = "allowedAttributes") + public Object[][] getAllowedAttributes() { + return new Object[][] { + {"", "ID", "contig1"}, // https://github.com/samtools/htsjdk/issues/389 (no length) + {"", "length", "100"}, + {"", "taxonomy", "Homo sapiens"}, + {"", "assembly", "b37"}, + {"", "md5", "1a258fe76dfc8abd926f81f0e9b82ed7"}, + {"", + "URL", "http://www.refserve.org:8080/path/"}, + {"", + "species", 
"Homo sapiens"}, + }; + } + + @Test(dataProvider= "allowedAttributes") + public void testAllowedAttributes(final String lineString, final String attribute, final String expectedValue) { + final VCFContigHeaderLine headerline = new VCFContigHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION, 0); + Assert.assertEquals(headerline.getGenericFieldValue(attribute), expectedValue); + } + + @Test + public void testRoundTripThroughSequenceRecord() { + final VCFContigHeaderLine contigLine = new VCFContigHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION, + 0); + + final String lengthString = "100"; + final String assemblyString = "b37"; + final String md5String = "1a258fe76dfc8abd926f81f0e9b82ed7"; + final String URLString = "http://www.refserve.org:8080/path/"; + final String speciesString = "Homo sapiens"; + + final SAMSequenceRecord sequenceRecord = contigLine.getSAMSequenceRecord(); + + Assert.assertEquals(Integer.toString(sequenceRecord.getSequenceLength()), lengthString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.LENGTH_ATTRIBUTE), lengthString); + + Assert.assertEquals(sequenceRecord.getAssembly(), assemblyString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.ASSEMBLY_ATTRIBUTE), assemblyString); + + Assert.assertEquals(sequenceRecord.getMd5(), md5String); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.MD5_ATTRIBUTE), md5String); + + Assert.assertEquals(sequenceRecord.getAttribute(SAMSequenceRecord.URI_TAG), URLString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.URL_ATTRIBUTE), URLString); + + Assert.assertEquals(sequenceRecord.getAttribute(SAMSequenceRecord.SPECIES_TAG), speciesString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.SPECIES_ATTRIBUTE), speciesString); + + // now turn the SAMSequenceRecord back into a contig line, and compare the result to the + // original contig line + Assert.assertEquals( + new 
VCFContigHeaderLine(sequenceRecord, assemblyString), + contigLine); + } + + @DataProvider (name = "hashEqualsCompareData") + public Object[][] getHashEqualsCompareData() { + return new Object[][] { + + // For contig lines, equals and hash depend on the id, all other attributes, and the contig index, + // but compareTo only cares about the index. + + // line, index, line, line, index -> expected hash equals, expected equals, expected compare, + {"", 0, "", 0, true, true, 0 }, // identical + {"", 0, "", 1, false, false, -1 }, // identical except contig index + {"", 1, "", 0, false, false, 1 }, // identical except contig index + + {"", 0, "", 0, false, false, 0 }, // identical except attributes + {"", 0, "", 1, false, false, -1 }, // different attributes, different index + + {"", 0, "", 0, false, false, 0 }, // identical except ID + // different ID, same attributes and index, -> not equal, different hash, compare==0 + {"", 0, "", 0, false, false, 0 }, // different ID, attributes, same index + }; + } + + @Test(dataProvider = "hashEqualsCompareData") + public void testHashEqualsCompare( + final String line1, + final int index1, + final String line2, + final int index2, + final boolean expectedHashEquals, + final boolean expectedEquals, + final int expectedCompare) + { + final VCFContigHeaderLine headerLine1 = new VCFContigHeaderLine(line1, VCFHeader.DEFAULT_VCF_VERSION, index1); + final VCFContigHeaderLine headerLine2 = new VCFContigHeaderLine(line2, VCFHeader.DEFAULT_VCF_VERSION, index2); + + Assert.assertEquals(headerLine1.hashCode() == headerLine2.hashCode(), expectedHashEquals); + Assert.assertEquals(headerLine1.equals(headerLine2), expectedEquals); + Assert.assertEquals(headerLine1.compareTo(headerLine2), expectedCompare); + } + + @Test + public void testSortOrder() { + + final List expectedLineOrder = new ArrayList() {{ + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 1)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 2)); + 
add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 10)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 20)); + }}; + + final TreeSet sortedLines = new TreeSet<>( + new ArrayList() {{ + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 20)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 10)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 1)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 2)); + }} + ); + + final Iterator sortedIt = sortedLines.iterator(); + for (final VCFContigHeaderLine cl : expectedLineOrder) { + Assert.assertTrue(sortedIt.hasNext()); + Assert.assertEquals(cl, sortedIt.next()); + } + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java b/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java index 547549aa81..f51589783b 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java @@ -148,6 +148,7 @@ public void testMissingFormatFields(final VCFEncoder encoder, final VariantConte private static Set createSyntheticMetadata() { final Set metaData = new TreeSet<>(); + metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); metaData.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0)); metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "x")); diff --git a/src/test/java/htsjdk/variant/vcf/VCFFormatHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFFormatHeaderLineUnitTest.java new file mode 100644 index 0000000000..1e07ff9c2d --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFFormatHeaderLineUnitTest.java @@ -0,0 +1,19 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * Test conditions that are unique to FORMAT lines (not covered by VCFCompoundHeaderLineUnitTest). 
+ */ +public class VCFFormatHeaderLineUnitTest extends HtsjdkTest { + + // FORMAT lines aren't allowed to have type==Flag + @Test(expectedExceptions=TribbleException.class) + public void testRejectInfoLineWithFlagField() { + new VCFFormatHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java index 73116f53f0..94859c8717 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java @@ -102,12 +102,14 @@ private Object[][] getInvalidHeaderLines() { List sourceVersion = Arrays.asList("Source", "Version"); return new Object[][]{ // to parse, expected, recommended, error message - {"", idDesc, none, "Tag Description in wrong order (was #1, expected #2)"}, - {"", idDesc, none, "Unexpected tag Desc"}, - {"<>", idDesc, none, "Unexpected tag "}, - - {"", idDesc, sourceVersion, "Recommended tag Source must be listed after all expected tags"}, - {"", idDesc, sourceVersion, "Recommended tag Source must be listed after all expected tags"} + {"", idDesc, none, "Unexpected tag or tag order for tag \"Description\""}, + {"", idDesc, none, "Unexpected tag or tag order for tag \"Desc\""}, + {"<>", idDesc, none, "Unexpected tag or tag order for tag \"\""}, + + {"", idDesc, sourceVersion, + "Unexpected tag or tag order for tag \"Source\""}, + {"", idDesc, sourceVersion, + "Unexpected tag or tag order for tag \"Source\""} }; } @@ -119,7 +121,7 @@ private static void callTranslator(final String line, VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, line, expectedTagOrder); } else { - VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, line, expectedTagOrder, recommendedTags); + VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, line, expectedTagOrder); } } @@ -153,13 +155,4 @@ private Object[][] getVcfV3Versions() 
{ }; } - @Test(dataProvider = "vcfv3", expectedExceptions = TribbleException.class) - public void testVcfV3FailsRecommendedTags(final VCFHeaderVersion vcfVersion) { - VCFHeaderLineTranslator.parseLine( - vcfVersion, - "", - Arrays.asList("ID"), - Arrays.asList("Description") - ); - } } diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java index e04d3c69c8..d5d7e47ec9 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java @@ -1,6 +1,9 @@ package htsjdk.variant.vcf; +import htsjdk.tribble.TribbleException; import htsjdk.variant.VariantBaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.LinkedHashMap; @@ -9,46 +12,146 @@ import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertNotNull; + public class VCFHeaderLineUnitTest extends VariantBaseTest { @Test public void testEncodeVCFHeaderLineWithUnescapedQuotes() { - final Map attributes = new LinkedHashMap<>(); attributes.put("ID", "VariantFiltration"); attributes.put("CommandLineOptions", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); - final String encodedAttributes = VCFHeaderLine.toStringEncoding(attributes); + final VCFSimpleHeaderLine simpleHeaderLine = new VCFSimpleHeaderLine("someKey", attributes); + final String encodedAttributes = simpleHeaderLine.toStringEncoding(); assertNotNull(encodedAttributes); - final String expectedEncoding = ""; + final String expectedEncoding = "someKey="; assertEquals(encodedAttributes, expectedEncoding); } @Test public void testEncodeVCFHeaderLineWithEscapedQuotes() { - final Map attributes = new LinkedHashMap<>(); attributes.put("ID", "VariantFiltration"); attributes.put("CommandLineOptions", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || 
ANNOTATION <= 2.0]"); - final String encodedAttributes = VCFHeaderLine.toStringEncoding(attributes); + final VCFSimpleHeaderLine simpleHeaderLine = new VCFSimpleHeaderLine("someKey", attributes); + final String encodedAttributes = simpleHeaderLine.toStringEncoding(); assertNotNull(encodedAttributes); - final String expectedEncoding = ""; + final String expectedEncoding = "someKey="; assertEquals(encodedAttributes, expectedEncoding); } - @Test(expectedExceptions = { IllegalArgumentException.class }, expectedExceptionsMessageRegExp = "Invalid count number, with fixed count the number should be 1 or higher: .*") - public void testFormatNumberExeptions() { + @Test + public void testIsNotStructuredHeaderLine() { + VCFHeaderLine hl = new VCFHeaderLine("key", "value"); + Assert.assertFalse(hl.isIDHeaderLine()); + Assert.assertNull(hl.getID()); + } + + @Test + public void testStringEncoding() { + VCFHeaderLine hl = new VCFHeaderLine("key", "value"); + Assert.assertEquals(hl.toStringEncoding(), "key=value"); + } + + @DataProvider(name = "headerLineEquals") + public Object[][] headerLineEquals() { + return new Object[][]{ + { + new VCFHeaderLine("key", "value"), + new VCFHeaderLine("key", "value"), + true + }, + { + new VCFHeaderLine("key", "value1"), + new VCFHeaderLine("key", "value2"), + false + }, + { + new VCFHeaderLine("key1", "value"), + new VCFHeaderLine("key2", "value"), + false + }, + { + new VCFHeaderLine("key1", "value1"), + new VCFHeaderLine("key2", "value2"), + false + } + }; + } + + @Test(dataProvider = "headerLineEquals") + public void testEquals(final VCFHeaderLine hl1, final VCFHeaderLine hl2, final boolean expectedEquals) { + Assert.assertEquals(hl1.equals(hl2), expectedEquals); + } + + @DataProvider(name = "invalidHeaderLineKeys") + public Object[][] invalidHeaderLineKeys() { + return new Object[][]{ + {null}, + {"embedded<"}, + {"embedded="}}; + } + + @Test(dataProvider = "invalidHeaderLineKeys", expectedExceptions=TribbleException.class) + public void 
testInvalidKeys(final String testKey) { + new VCFHeaderLine(testKey, ""); + } + + @Test(dataProvider = "invalidHeaderLineKeys", expectedExceptions=TribbleException.class) + public void testValidateAsIdInvalid(final String testKey) { + VCFHeaderLine.validateKeyOrID(testKey, "test"); + } + + @DataProvider(name = "vcfVersions") + public Object[][] vcfVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } + + @Test(dataProvider = "vcfVersions") + public void testValidateForVersion(final VCFHeaderVersion vcfVersion) { + VCFHeaderLine headerLine = new VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString()); + headerLine.validateForVersion(vcfVersion); + } + + @DataProvider(name = "incompatibleVersions") + public Object[][] incompatibleVersionPairs() { + return new Object[][]{ + // each pair just has to be different + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2} + }; + } + + @Test(dataProvider="incompatibleVersions", expectedExceptions= TribbleException.VersionValidationFailure.class) + public void testValidateForVersionFails(final VCFHeaderVersion vcfVersion, final VCFHeaderVersion incompatibleVersion) { + VCFHeaderLine headerLine = new VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString()); + headerLine.validateForVersion(incompatibleVersion); + } + + @Test(expectedExceptions = { TribbleException.InvalidHeader.class }, expectedExceptionsMessageRegExp = ".*For fixed count, the count number must be 1 or higher.") + public void testFormatNumberExceptions() { new VCFFormatHeaderLine("test", 0, 
VCFHeaderLineType.Integer, ""); } - @Test(expectedExceptions = { IllegalArgumentException.class }, expectedExceptionsMessageRegExp = "Invalid count number, with fixed count the number should be 1 or higher: .*") - public void testInfoNumberExeptions() { + @Test(expectedExceptions = { TribbleException.InvalidHeader.class }, expectedExceptionsMessageRegExp = ".*For fixed count, the count number must be 1 or higher.") + public void testInfoNumberExceptions() { new VCFInfoHeaderLine("test", 0, VCFHeaderLineType.Integer, diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java new file mode 100644 index 0000000000..1be8bdf085 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java @@ -0,0 +1,554 @@ +package htsjdk.variant.vcf; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.tribble.TribbleException; +import htsjdk.variant.VariantBaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.IntStream; + +import static htsjdk.variant.vcf.VCFConstants.PEDIGREE_HEADER_KEY; + +public class VCFHeaderMergerUnitTest extends VariantBaseTest { + + @DataProvider(name="mergeValidVersions") + public Object[][] getMergeValidVersions() { + + // only v4.2+ headers can be merged, merge result version is always the highest version presented + return new Object[][] { + // headers to merge, expected result version + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_2}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + 
{Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_2}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_2 }, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + }; + } + + @DataProvider(name="mergeInvalidVersions") + public Object[][] getMergeInvalidVersions() { + // only v4.2+ headers can be merged + return new Object[][] { + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_1)}, + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_2)}, + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_3)}, + + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_1)}, + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_2)}, + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_3)}, + + 
{Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_1)}, + {Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_3)}, + + {Arrays.asList(VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_3)}, + + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_1)}, + + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_1)}, + }; + } + + @Test(dataProvider="mergeValidVersions") + public void testMergeValidVersions(final List headerVersions, final VCFHeaderVersion expectedVersion) { + // merge the headers, and then verify that the merged lines have the expected version by + // instantiating a VCFMetaDataLines instance to determine the resulting version + final Set mergedHeaderLines = doHeaderMergeForVersions(headerVersions); + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + metaDataLines.addMetaDataLines(mergedHeaderLines); + final VCFHeaderLine versionLine = metaDataLines.getFileFormatLine(); + Assert.assertEquals(VCFHeaderVersion.toHeaderVersion(versionLine.getValue()), expectedVersion); + + // now create a new header using the merged 
VersionLines, and make sure *it* has the expected version + final VCFHeader mergedHeader = new VCFHeader(mergedHeaderLines); + Assert.assertEquals(mergedHeader.getVCFHeaderVersion(), expectedVersion); + + // also verify that all the header lines in the merged set are also in the resulting header + Assert.assertEquals(mergedHeader.getMetaDataInInputOrder(), mergedHeaderLines); + } + + @Test(dataProvider="mergeInvalidVersions", expectedExceptions = TribbleException.class) + public void testMergeInvalidVersions(final List headerVersions) { + doHeaderMergeForVersions(headerVersions); + } + + @Test(expectedExceptions = TribbleException.class) + public void testMergeWithValidationFailure() { + // test mixing header versions where the old version header has a line that fails validation + // using the resulting (newer) version + + // create a 4.2 header with a 4.2 style pedigree line (one that has no ID) + final Set oldHeaderLines = VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_2); + oldHeaderLines.add(new VCFHeaderLine(PEDIGREE_HEADER_KEY, "")); + final VCFHeader oldHeader = new VCFHeader(oldHeaderLines); + Assert.assertEquals(oldHeader.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_2); + + // now create a simple 4.3 header; the merge should fail because the old PEDIGREE line isn't valid + // for 4.3 (for which pedigree lines mut have an ID) + final VCFHeader newHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_3)); + Assert.assertEquals(newHeader.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_3); + + VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(oldHeader, newHeader),true); + } + + private Set doHeaderMergeForVersions(final List headerVersions) { + // This is a somewhat sketchy way to write a test...for each header we create here, we're + // using the same fixed set of VCF42-conforming VCFHeader lines, and then we add a fileformat + // line with whatever VCFVersion the test calls for. 
Its conceivable that as time goes on + // and we add new versions, the VCFHeader constructor could throw if any of the lines don't + // conform to the requested version. + final List headerList = new ArrayList<>(headerVersions.size()); + for (final VCFHeaderVersion version : headerVersions) { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); + metaDataSet.add(VCFHeader.makeHeaderVersionLine(version)); + final VCFHeader header = new VCFHeader(metaDataSet); + Assert.assertEquals(header.getVCFHeaderVersion(), version); + headerList.add(header); + } + + return VCFUtils.smartMergeHeaders(headerList, false); + } + + @DataProvider(name = "subsetHeaders") + public Iterator getSubsetHeaders() { + final List headerLineList = new ArrayList<>(new VCFHeaderUnitTestData().getTestMetaDataLinesSet()); + final Collection mergeTestCase = new ArrayList<>(); + // For each header line in the list of test lines, create a test case consisting of a pair of headers, + // one of which is a header created with all of the lines, and one of which is a subset of the full header + // with one line removed. Skip the case where the line to be removed is a fileformat line, since thats + // required to create a header. + for (int i = 0; i < headerLineList.size(); i++) { + // take the header line set and remove the ith line, unless its a fileformat line, since if we remove + // that, then we won't be able to create a header using the resulting lines at all. 
+ final VCFHeaderLine candidateLine = headerLineList.get(i); + if (!VCFHeaderVersion.isFormatString(candidateLine.getKey())) { + List subsetList = new ArrayList<>(headerLineList); + subsetList.remove(i); + mergeTestCase.add( + new Object[] { + new VCFHeader(VCFHeaderUnitTestData.getTestMetaDataLinesSet()), + new VCFHeader(new LinkedHashSet<>(subsetList)) + }); + } + } + + return mergeTestCase.iterator(); + } + + @Test(dataProvider = "subsetHeaders") + public void testMergeSubsetHeaders( + final VCFHeader fullHeader, + final VCFHeader subsetHeader) + { + final List headerList = new ArrayList() {{ + add(fullHeader); + add(subsetHeader); + add(subsetHeader); + }}; + Assert.assertEquals( + VCFHeaderMerger.getMergedHeaderLines(headerList, false), + fullHeader.getMetaDataInSortedOrder()); + + // now again, in the reverse order + final List reverseHeaderList = new ArrayList() {{ + add(subsetHeader); + add(subsetHeader); + add(fullHeader); + }}; + Assert.assertEquals( + VCFHeaderMerger.getMergedHeaderLines(reverseHeaderList, false), + fullHeader.getMetaDataInSortedOrder()); + } + + @Test + public void testDictionaryMergeDuplicateFile() { + final VCFHeader headerOne = new VCFFileReader(new File(variantTestDataRoot + "diagnosis_targets_testfile.vcf"), false).getFileHeader(); + final VCFHeader headerTwo = new VCFHeader(headerOne); // deep copy + final List sampleList = new ArrayList<>(); + sampleList.addAll(headerOne.getSampleNamesInOrder()); + + // Check that the two dictionaries start out the same + headerOne.getSequenceDictionary().assertSameDictionary(headerTwo.getSequenceDictionary()); + + // Run the merge command + final VCFHeader mergedHeader = new VCFHeader(VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(headerOne, headerTwo), false), sampleList); + + // Check that the mergedHeader's sequence dictionary matches the first two + mergedHeader.getSequenceDictionary().assertSameDictionary(headerOne.getSequenceDictionary()); + } + + 
@DataProvider(name="dictionaryMergePositive") + private Object[][] getDictionaryMergePositive() { + return new Object[][] { + // input dictionary list, expected merged dictionary + { + // one dictionary + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // two identical dictionaries + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // three different subsets; superset first + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 10)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(7, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(3, 2)) + ), + createTestSAMDictionary(1, 10) + }, + { + // three different subsets; superset second + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(7, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 10)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(3, 2)) + ), + createTestSAMDictionary(1, 10) + }, + { + // three different subsets; superset third (requires the merge implementation to sort on dictionary size) + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(7, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(3, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 10)) + ), + createTestSAMDictionary(1, 10) + }, + { + // one non-null dictionary, one null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null) + ), + createTestSAMDictionary(1, 2) + }, + { + // one non-null dictionary, one null, in reverse direction + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(null), + 
createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: non-null, null, null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(null) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: null, non-null, null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: null, null, non-null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: non-null, null, non-null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: subset, null, superset + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 10)) + ), + createTestSAMDictionary(1, 10) + }, + { + // all null dictionaries + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(null) + ), + null + } + }; + } + + @Test(dataProvider = "dictionaryMergePositive") + private void testDictionaryMergePositive( + final List sourceHeaders, final SAMSequenceDictionary expectedDictionary) { + final Set mergedHeaderLines = VCFHeaderMerger.getMergedHeaderLines(sourceHeaders, false); + final VCFHeader mergedHeader = new 
VCFHeader(mergedHeaderLines); + Assert.assertEquals(mergedHeader.getSequenceDictionary(), expectedDictionary); + } + + @DataProvider(name="dictionaryMergeNegative") + private Object[][] getDictionaryMergeNegative() { + final SAMSequenceDictionary forwardDictionary = createTestSAMDictionary(1, 2); + final SAMSequenceDictionary reverseDictionary = createReverseDictionary(forwardDictionary); + + return new Object[][] { + { + // SequenceDictionaryCompatibility.NO_COMMON_CONTIGS + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(5, 2))) + }, + { + // SequenceDictionaryCompatibility.OUT_OF_ORDER + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(forwardDictionary), + createTestVCFHeaderWithSAMDictionary(reverseDictionary)) + }, + { + // SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS common subset has contigs that have the same name but different lengths + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createDictionaryWithLengths(100)), + createTestVCFHeaderWithSAMDictionary(createDictionaryWithLengths(200))) + }, + { + // SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER human reference detected but the order of the contigs is non-standard (lexicographic, for example) + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createDictionaryInCanonicalHumanOrder()), + createTestVCFHeaderWithSAMDictionary(createDictionaryInNonCanonicalHumanOrder())) + }, + { + // three mutually disjoint dictionaries, no superset + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(5, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(4, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(6, 2)) + ) + }, + }; + } + + @Test(dataProvider = "dictionaryMergeNegative", expectedExceptions = TribbleException.class) + private void testDictionaryMergeNegative(final List sourceHeaders) { + 
VCFHeaderMerger.getMergedHeaderLines(sourceHeaders, false); + } + + @Test + final void testDuplicateNonStructuredKeys() { + // merge 2 headers, one has "##sample=foo", one has "##sample=bar", both should survive the merge + final VCFHeaderLine fooLine = new VCFHeaderLine("sample", "foo"); + final Set fooLines = VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION); + fooLines.add(fooLine); + final VCFHeader fooHeader = new VCFHeader(fooLines); + + final VCFHeaderLine barLine = new VCFHeaderLine("sample", "bar"); + final Set barLines = VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION); + barLines.add(barLine); + final VCFHeader barHeader = new VCFHeader(barLines); + + final Set mergedLines = VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(fooHeader, barHeader), false); + Assert.assertEquals(mergedLines.size(), 3); + Assert.assertTrue(mergedLines.contains(fooLine)); + Assert.assertTrue(mergedLines.contains(barLine)); + } + + @DataProvider(name = "compatibleInfoLines") + public Object[][] getMergerData() { + return new Object[][]{ + // 2 lines to merge, expected result + { + // mixed number, promote to "." 
+ new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + { + // mixed number type, promote to float + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + { + // mixed number type in reverse direction, promote to float + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + }; + } + + @Test(dataProvider = "compatibleInfoLines") + public void testMergeCompatibleInfoLines(final VCFInfoHeaderLine line1, final VCFInfoHeaderLine line2, final VCFInfoHeaderLine expectedLine, final String id) { + final VCFHeader hdr1 = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION), Collections.EMPTY_SET); + hdr1.addMetaDataLine(line1); + + final VCFHeader hdr2 = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION), Collections.EMPTY_SET); + hdr2.addMetaDataLine(line2); + + final VCFHeader mergedHeader = new VCFHeader(VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(hdr1, hdr2), true)); + Assert.assertEquals(mergedHeader.getInfoHeaderLine(id), expectedLine); + } + + @DataProvider(name = "mergeIncompatibleInfoLines") + public Object[][] getMergeIncompatibleInfoLines() { + return new Object[][]{ + // 2 lines to merge, expected result + { + // mixed number AND number type (multiple different attributes) + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + { + // mixed number AND number type (multiple 
different attributes), reverse direction + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + }; + } + + @Test(dataProvider = "mergeIncompatibleInfoLines", expectedExceptions=TribbleException.class) + public void testMergeIncompatibleInfoLines(final VCFInfoHeaderLine line1, final VCFInfoHeaderLine line2, final VCFInfoHeaderLine expectedLine, final String id) { + final VCFHeader hdr1 = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION), Collections.EMPTY_SET); + hdr1.addMetaDataLine(line1); + final VCFHeader hdr2 = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION), Collections.EMPTY_SET); + hdr2.addMetaDataLine(line2); + new VCFHeader(VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(hdr1, hdr2), true)); + } + + private final SAMSequenceDictionary createTestSAMDictionary(final int startSequence, final int numSequences) { + final SAMSequenceDictionary samDictionary = new SAMSequenceDictionary(); + IntStream.range(startSequence, startSequence + numSequences).forEachOrdered( + i -> samDictionary.addSequence(new SAMSequenceRecord(Integer.toString(i), i))); + return samDictionary; + } + + private final VCFHeader createTestVCFHeaderWithSAMDictionary(final SAMSequenceDictionary samDictionary) { + final VCFHeader vcfHeader = createTestVCFHeader(); + vcfHeader.setSequenceDictionary(samDictionary); + return vcfHeader; + } + + private SAMSequenceDictionary createDictionaryInNonCanonicalHumanOrder() { + final List sequences = new ArrayList<>(); + sequences.add(new SAMSequenceRecord("1", 100)); + sequences.add(new SAMSequenceRecord("10", 100)); + sequences.add(new SAMSequenceRecord("2", 100)); + return new SAMSequenceDictionary(sequences); + } + + private SAMSequenceDictionary createDictionaryInCanonicalHumanOrder() { + final List sequences = new ArrayList<>(); 
+ sequences.add(new SAMSequenceRecord("1", 100)); + sequences.add(new SAMSequenceRecord("2", 100)); + sequences.add(new SAMSequenceRecord("10", 100)); + return new SAMSequenceDictionary(sequences); + } + + private SAMSequenceDictionary createDictionaryWithLengths(final int length) { + final List sequences = new ArrayList<>(); + sequences.add(new SAMSequenceRecord("1", length)); + sequences.add(new SAMSequenceRecord("2", length)); + sequences.add(new SAMSequenceRecord("3", length)); + return new SAMSequenceDictionary(sequences); + } + + private SAMSequenceDictionary createReverseDictionary(final SAMSequenceDictionary forwardDictionary){ + // its not sufficient to reuse the existing sequences by just reordering them, since + // SAMSequenceDictionary *mutates* the sequence indices to match the input order. So we need + // to create the new sequence dictionary using entirely new sequence records, and let + // SAMSequenceDictionary assign them indices that match the input order. + final List reverseSequences = new ArrayList<>(forwardDictionary.getSequences()); + Collections.reverse(reverseSequences); + final SAMSequenceDictionary reverseDictionary = new SAMSequenceDictionary(); + + int count = 0; + for (final SAMSequenceRecord samSequenceRecord : reverseSequences) { + final SAMSequenceRecord newSequenceRecord = new SAMSequenceRecord( + samSequenceRecord.getSequenceName(), + samSequenceRecord.getSequenceLength()); + reverseDictionary.addSequence(newSequenceRecord); + Assert.assertEquals(newSequenceRecord.getSequenceIndex(), count); + count++; + } + return reverseDictionary; + } + + private final VCFHeader createTestVCFHeader() { + return new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION)); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java index e4d5099eda..8ee9ccab26 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java +++ 
b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java @@ -29,7 +29,6 @@ import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.FileExtensions; -import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.TestUtil; import htsjdk.tribble.TribbleException; import htsjdk.tribble.readers.AsciiLineReader; @@ -42,66 +41,64 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import org.testng.Assert; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.*; -import java.math.BigInteger; import java.nio.charset.StandardCharsets; import java.nio.file.Files; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; import java.util.*; import java.util.stream.Collectors; -/** - * Created by IntelliJ IDEA. - * User: aaron - * Date: Jun 30, 2010 - * Time: 3:32:08 PM - * To change this template use File | Settings | File Templates. 
- */ public class VCFHeaderUnitTest extends VariantBaseTest { - private File tempDir; - - private VCFHeader createHeader(String headerStr) { - VCFCodec codec = new VCFCodec(); - VCFHeader header = (VCFHeader) codec.readActualHeader(new LineIteratorImpl(new SynchronousLineReader( - new StringReader(headerStr)))); - Assert.assertEquals(header.getMetaDataInInputOrder().size(), VCF4headerStringCount); - return header; - } - - @BeforeClass - private void createTemporaryDirectory() { - tempDir = TestUtil.getTempDirectory("VCFHeader", "VCFHeaderTest"); + @DataProvider(name="headerRoundTrip") + private Object[][] getHeaderRoundTrip() { + return new Object[][] { + { VCFHeaderUnitTestData.getVCFV42TestHeaderString() }, + { VCFHeaderUnitTestData.VCF42headerStrings_with_negativeOne } + }; } - @AfterClass - private void deleteTemporaryDirectory() { - for (File f : tempDir.listFiles()) { - f.delete(); - } - tempDir.delete(); + @Test(dataProvider = "headerRoundTrip") + public void test42HeaderRoundTrip(final String headerString) throws IOException { + final VCFHeader header = VCFHeaderUnitTestData.createHeaderFromString(headerString); + Assert.assertEquals(header.getMetaDataInSortedOrder(), getRoundTripEncoded(header)); } @Test - public void testVCF4ToVCF4() { - VCFHeader header = createHeader(VCF4headerStrings); - checkMD5ofHeaderFile(header, "91c33dadb92e01ea349bd4bcdd02d6be"); - } + public void test42FileRoundtrip() throws Exception { + // this test validates that source/version fields are round-tripped properly - @Test - public void testVCF4ToVCF4_alternate() { - VCFHeader header = createHeader(VCF4headerStrings_with_negativeOne); - checkMD5ofHeaderFile(header, "39318d9713897d55be5ee32a2119853f"); + // read an existing VCF + final File expectedFile = new File("src/test/resources/htsjdk/variant/Vcf4.2WithSourceVersionInfoFields.vcf"); + + // write the file out into a new copy + final File actualFile = File.createTempFile("testVcf4.2roundtrip.", FileExtensions.VCF); + 
actualFile.deleteOnExit(); + + try (final VCFFileReader originalFileReader = new VCFFileReader(expectedFile, false); + final VariantContextWriter copyWriter = new VariantContextWriterBuilder() + .setOutputFile(actualFile) + .setReferenceDictionary(createArtificialSequenceDictionary()) + .setOptions(EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Options.INDEX_ON_THE_FLY)) + .build() + ) { + final VCFHeader originalHeader = originalFileReader.getFileHeader(); + + copyWriter.writeHeader(originalHeader); + for (final VariantContext variantContext : originalFileReader) { + copyWriter.add(variantContext); + } + } + + final String actualContents = new String(Files.readAllBytes(actualFile.toPath()), StandardCharsets.UTF_8); + final String expectedContents = new String(Files.readAllBytes(expectedFile.toPath()), StandardCharsets.UTF_8); + Assert.assertEquals(actualContents, expectedContents); } @Test - public void testVCFHeaderSampleRenamingSingleSampleVCF() throws Exception { + public void testSampleRenamingSingleSample() throws Exception { final VCFCodec codec = new VCFCodec(); codec.setRemappedSampleName("FOOSAMPLE"); final AsciiLineReaderIterator vcfIterator = new AsciiLineReaderIterator(AsciiLineReader.from(new FileInputStream(variantTestDataRoot + "HiSeq.10000.vcf"))); @@ -120,57 +117,25 @@ public void testVCFHeaderSampleRenamingSingleSampleVCF() throws Exception { } } - @DataProvider - public Object[][] testVCFHeaderDictionaryMergingData() { + @DataProvider(name="testSampleRenamingFailsTests") + public Object[][] testSampleRenamingFailsTests() { return new Object[][]{ - {"diagnosis_targets_testfile.vcf"}, // numerically ordered contigs - {"dbsnp_135.b37.1000.vcf"} // lexicographically ordered contigs + {variantTestDataRoot + "ex2.vcf"}, // multi sample vcf + {variantTestDataRoot + "dbsnp_135.b37.1000.vcf"} // sites only vcf }; } - @Test(dataProvider = "testVCFHeaderDictionaryMergingData") - public void testVCFHeaderDictionaryMerging(final String vcfFileName) { - 
final VCFHeader headerOne = new VCFFileReader(new File(variantTestDataRoot + vcfFileName), false).getFileHeader(); - final VCFHeader headerTwo = new VCFHeader(headerOne); // deep copy - final List sampleList = new ArrayList(); - sampleList.addAll(headerOne.getSampleNamesInOrder()); - - // Check that the two dictionaries start out the same - headerOne.getSequenceDictionary().assertSameDictionary(headerTwo.getSequenceDictionary()); - - // Run the merge command - final VCFHeader mergedHeader = new VCFHeader(VCFUtils.smartMergeHeaders(Arrays.asList(headerOne, headerTwo), false), sampleList); - - // Check that the mergedHeader's sequence dictionary matches the first two - mergedHeader.getSequenceDictionary().assertSameDictionary(headerOne.getSequenceDictionary()); - } - - @Test(expectedExceptions = TribbleException.class) - public void testVCFHeaderSampleRenamingMultiSampleVCF() throws Exception { - final VCFCodec codec = new VCFCodec(); - codec.setRemappedSampleName("FOOSAMPLE"); - final AsciiLineReaderIterator vcfIterator = new AsciiLineReaderIterator(AsciiLineReader.from(new FileInputStream(variantTestDataRoot + "ex2.vcf"))); - final VCFHeader header = (VCFHeader) codec.readHeader(vcfIterator).getHeaderValue(); - } - - @Test(expectedExceptions = TribbleException.class) - public void testVCFHeaderSampleRenamingSitesOnlyVCF() throws Exception { + @Test(dataProvider = "testSampleRenamingFailsTests", expectedExceptions = TribbleException.class) + public void testSampleRenamingFails(final String fileName) throws IOException { final VCFCodec codec = new VCFCodec(); codec.setRemappedSampleName("FOOSAMPLE"); - final AsciiLineReaderIterator vcfIterator = new AsciiLineReaderIterator(AsciiLineReader.from(new FileInputStream(variantTestDataRoot + "dbsnp_135.b37.1000.vcf"))); - final VCFHeader header = (VCFHeader) codec.readHeader(vcfIterator).getHeaderValue(); - } - - private VCFHeader getHiSeqVCFHeader() { - final File vcf = new 
File("src/test/resources/htsjdk/variant/HiSeq.10000.vcf"); - final VCFFileReader reader = new VCFFileReader(vcf, false); - final VCFHeader header = reader.getFileHeader(); - reader.close(); - return header; + final AsciiLineReaderIterator vcfIterator = new AsciiLineReaderIterator( + AsciiLineReader.from(new FileInputStream(fileName))); + codec.readHeader(vcfIterator).getHeaderValue(); } @Test - public void testVCFHeaderAddInfoLine() { + public void testAddInfoLine() { final VCFHeader header = getHiSeqVCFHeader(); final VCFInfoHeaderLine infoLine = new VCFInfoHeaderLine("TestInfoLine", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "test info line"); header.addMetaDataLine(infoLine); @@ -185,13 +150,8 @@ public void testVCFHeaderAddInfoLine() { Assert.assertFalse(header.getOtherHeaderLines().contains(infoLine), "TestInfoLine present in other header lines"); } - private static Collection asCollectionOfVCFHeaderLine(Collection headers) { - // create a collection of VCFHeaderLine so that contains tests work correctly - return headers.stream().map(h -> (VCFHeaderLine) h).collect(Collectors.toList()); - } - @Test - public void testVCFHeaderAddFormatLine() { + public void testAddFormatLine() { final VCFHeader header = getHiSeqVCFHeader(); final VCFFormatHeaderLine formatLine = new VCFFormatHeaderLine("TestFormatLine", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "test format line"); header.addMetaDataLine(formatLine); @@ -207,11 +167,11 @@ public void testVCFHeaderAddFormatLine() { } @Test - public void testVCFHeaderAddFilterLine() { + public void testAddFilterLine() { final VCFHeader header = getHiSeqVCFHeader(); final String filterDesc = "TestFilterLine Description"; - final VCFFilterHeaderLine filterLine = new VCFFilterHeaderLine("TestFilterLine",filterDesc); - Assert.assertEquals(filterDesc,filterLine.getDescription()); + final VCFFilterHeaderLine filterLine = new VCFFilterHeaderLine("TestFilterLine", filterDesc); + Assert.assertEquals(filterDesc, 
filterLine.getDescription()); header.addMetaDataLine(filterLine); Assert.assertTrue(header.getFilterLines().contains(filterLine), "TestFilterLine not found in filter header lines"); @@ -225,10 +185,15 @@ public void testVCFHeaderAddFilterLine() { } @Test - public void testVCFHeaderAddContigLine() { + public void testAddContigLine() { final VCFHeader header = getHiSeqVCFHeader(); + // no contig lines in this header + Assert.assertTrue(header.getContigLines().isEmpty()); + final VCFContigHeaderLine contigLine = new VCFContigHeaderLine( - "", VCFHeaderVersion.VCF4_0, VCFHeader.CONTIG_KEY, 0); + "", VCFHeaderVersion.VCF4_0, 0); + Assert.assertEquals(contigLine.getKey(), VCFHeader.CONTIG_KEY); + Assert.assertEquals(contigLine.getID(), "chr1"); header.addMetaDataLine(contigLine); Assert.assertTrue(header.getContigLines().contains(contigLine), "Test contig line not found in contig header lines"); @@ -241,10 +206,70 @@ public void testVCFHeaderAddContigLine() { } @Test - public void testVCFHeaderContigLineMissingLength() { + public void testAddContigLineExactDuplicateSilentlyDropped() { + final File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); + + final VCFFileReader reader = new VCFFileReader(input, false); + final VCFHeader header = reader.getFileHeader(); + + final int numContigLinesBefore = header.getContigLines().size(); + // try to re-add the first contig line + header.addMetaDataLine(header.getContigLines().get(0)); + final int numContigLinesAfter = header.getContigLines().size(); + + // assert that we have the same number of contig lines before and after + Assert.assertEquals(numContigLinesBefore, numContigLinesAfter); + } + + @Test + public void testAddContigLineWithDifferentAttributesSilentlyDropped() { + final VCFContigHeaderLine contigOneNoAssembly = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "1"); + put("length", "123"); + }}, + 0); + final VCFContigHeaderLine contigOneWithAssembly = new VCFContigHeaderLine( + new 
LinkedHashMap() {{ + put("ID", "1"); + put("length", "123"); + put("assembly", "b37"); + }}, + 1); + Assert.assertNotEquals(contigOneNoAssembly.hashCode(), contigOneWithAssembly.hashCode()); + + final Set headerLineSet = VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION); + headerLineSet.add(contigOneNoAssembly); + headerLineSet.add(contigOneWithAssembly); + Assert.assertEquals(headerLineSet.size(), 3); // one fileformat line, plus 2 contig lines + + // silently drops contigOneNoAssembly since it has the same ID AND contig index as contigOneWithAssembly + final VCFHeader vcfHeader = new VCFHeader(headerLineSet); + final Set allMetaDataInput = vcfHeader.getMetaDataInInputOrder(); + Assert.assertEquals(allMetaDataInput.size(), 2); + final Set allMetaDataSorted = vcfHeader.getMetaDataInSortedOrder(); + Assert.assertEquals(allMetaDataSorted.size(), 2); + final List allContigLines = vcfHeader.getContigLines(); + Assert.assertEquals(allContigLines.size(), 1); // one contig + Assert.assertEquals(allContigLines.get(0).getGenericFieldValue("assembly"), "b37"); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddContigLineRejectDuplicateContigIndex() { + final VCFHeader header = new VCFHeader(); + // add two contig lines that share an index, but have different IDs and represent different contigs + final VCFContigHeaderLine contigLine1 = new VCFContigHeaderLine("", VCFHeaderVersion.VCF4_2, 0); + final VCFContigHeaderLine contigLine2 = new VCFContigHeaderLine("", VCFHeaderVersion.VCF4_2, 0); + + header.addMetaDataLine(contigLine1); + header.addMetaDataLine(contigLine2); + } + + @Test + public void testAddContigLineMissingLength() { final VCFHeader header = getHiSeqVCFHeader(); final VCFContigHeaderLine contigLine = new VCFContigHeaderLine( - "", VCFHeaderVersion.VCF4_0, VCFHeader.CONTIG_KEY, 0); + "", VCFHeaderVersion.VCF4_0, 0); header.addMetaDataLine(contigLine); Assert.assertTrue(header.getContigLines().contains(contigLine), "Test 
contig line not found in contig header lines"); Assert.assertTrue(header.getMetaDataInInputOrder().contains(contigLine), "Test contig line not found in set of all header lines"); @@ -252,58 +277,66 @@ public void testVCFHeaderContigLineMissingLength() { final SAMSequenceDictionary sequenceDictionary = header.getSequenceDictionary(); Assert.assertNotNull(sequenceDictionary); Assert.assertEquals(sequenceDictionary.getSequence("chr1").getSequenceLength(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH); - } - @Test - public void testVCFHeaderHonorContigLineOrder() throws IOException { + @Test + public void testGetContigLinesHonorsSortOrder() { + // NOTE: this test file has *lexicographically* ordered contigs try (final VCFFileReader vcfReader = new VCFFileReader(new File(variantTestDataRoot + "dbsnp_135.b37.1000.vcf"), false)) { // start with a header with a bunch of contig lines final VCFHeader header = vcfReader.getFileHeader(); - final List originalHeaderList = header.getContigLines(); - Assert.assertTrue(originalHeaderList.size() > 0); - - // copy the contig lines to a new list, sticking an extra contig line in the middle - final List orderedList = new ArrayList<>(); - final int splitInTheMiddle = originalHeaderList.size() / 2; - orderedList.addAll(originalHeaderList.subList(0, splitInTheMiddle)); - final VCFContigHeaderLine outrageousContigLine = new VCFContigHeaderLine( - "", + final List originalContigsInSortedOrder = header.getContigLines(); + Assert.assertTrue(originalContigsInSortedOrder.size() > 0); + + // copy the contig lines to a new list + final List confoundedList = new ArrayList<>(); + final int midPoint = originalContigsInSortedOrder.size() / 2; + confoundedList.addAll(originalContigsInSortedOrder.subList(0, midPoint)); + + // deliberately stick an extra contig line in the middle of the list, but using a contig index + // that will cause the line to sort to the end + final String newContigID = "newContigID"; + final int newContigIndex = 
originalContigsInSortedOrder.size(); + final VCFContigHeaderLine newContigLine = new VCFContigHeaderLine( + String.format( + "", newContigID), VCFHeaderVersion.VCF4_2, - VCFHeader.CONTIG_KEY, - 0); - orderedList.add(outrageousContigLine); - // make sure the extra contig line is outrageous enough to not collide with a real contig ID - Assert.assertTrue(orderedList.contains(outrageousContigLine)); - orderedList.addAll(originalHeaderList.subList(splitInTheMiddle, originalHeaderList.size())); - Assert.assertEquals(originalHeaderList.size() + 1, orderedList.size()); - - // crete a new header from the ordered list, and test that getContigLines honors the input order - final VCFHeader orderedHeader = new VCFHeader(); - orderedList.forEach(hl -> orderedHeader.addMetaDataLine(hl)); - Assert.assertEquals(orderedList, orderedHeader.getContigLines()); + newContigIndex); + confoundedList.add(newContigLine); + confoundedList.addAll(originalContigsInSortedOrder.subList(midPoint, originalContigsInSortedOrder.size())); + + // make sure the new contig line was actually added + Assert.assertEquals(originalContigsInSortedOrder.size() + 1, confoundedList.size()); + Assert.assertTrue(confoundedList.contains(newContigLine)); + + // create a new header from the confounded list, call getContigLines() on the header, and validate + // that the new line is included in the resulting list, and is at the end + final VCFHeader newHeader = new VCFHeader(); + confoundedList.forEach(hl -> newHeader.addMetaDataLine(hl)); + final List roundTrippedLines = newHeader.getContigLines(); + Assert.assertEquals(roundTrippedLines.size(), originalContigsInSortedOrder.size() + 1); + Assert.assertEquals(roundTrippedLines.get(roundTrippedLines.size() - 1), newContigLine); + + // make sure the sequence dictionary has the contig with the correct contig index, and in + // the same relative location in the dictionary (at the end of the list) + final SAMSequenceDictionary orderedSeqDict = 
newHeader.getSequenceDictionary(); + Assert.assertEquals( + orderedSeqDict.getSequence(newContigID).getSequenceIndex(), + roundTrippedLines.size() - 1); + Assert.assertEquals( + orderedSeqDict.getSequences().get(newHeader.getContigLines().size() - 1).getSequenceName(), + newContigID); } } @Test - public void testVCFSimpleHeaderLineGenericFieldGetter() { - VCFHeader header = createHeader(VCF4headerStrings); - List filters = header.getFilterLines(); - VCFFilterHeaderLine filterHeaderLine = filters.get(0); - Map genericFields = filterHeaderLine.getGenericFields(); - Assert.assertEquals(genericFields.get("ID"),"NoQCALL"); - Assert.assertEquals(genericFields.get("Description"),"Variant called by Dindel but not confirmed by QCALL"); - } - - @Test - public void testVCFHeaderAddOtherLine() { + public void testAddOtherLine() { final VCFHeader header = getHiSeqVCFHeader(); final VCFHeaderLine otherLine = new VCFHeaderLine("TestOtherLine", "val"); header.addMetaDataLine(otherLine); Assert.assertTrue(header.getOtherHeaderLines().contains(otherLine), "TestOtherLine not found in other header lines"); Assert.assertTrue(header.getMetaDataInInputOrder().contains(otherLine), "TestOtherLine not found in set of all header lines"); - Assert.assertNotNull(header.getOtherHeaderLine("TestOtherLine"), "Lookup for TestOtherLine by key failed"); Assert.assertFalse(asCollectionOfVCFHeaderLine(header.getInfoHeaderLines()).contains(otherLine), "TestOtherLine present in info header lines"); Assert.assertFalse(asCollectionOfVCFHeaderLine(header.getFormatHeaderLines()).contains(otherLine), "TestOtherLine present in format header lines"); @@ -312,15 +345,16 @@ public void testVCFHeaderAddOtherLine() { } @Test - public void testVCFHeaderAddMetaDataLineDoesNotDuplicateContigs() { - File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); + public void testAddMetaDataLineDoesNotDuplicateContigs() { + final File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); - VCFFileReader 
reader = new VCFFileReader(input, false); - VCFHeader header = reader.getFileHeader(); + final VCFFileReader reader = new VCFFileReader(input, false); + final VCFHeader header = reader.getFileHeader(); final int numContigLinesBefore = header.getContigLines().size(); - VCFInfoHeaderLine newInfoField = new VCFInfoHeaderLine("test", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "test info field"); + final VCFInfoHeaderLine newInfoField = new VCFInfoHeaderLine( + "test", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "test info field"); header.addMetaDataLine(newInfoField); // getting the sequence dictionary was failing due to duplicating contigs in issue #214, @@ -333,109 +367,280 @@ public void testVCFHeaderAddMetaDataLineDoesNotDuplicateContigs() { } @Test - public void testVCFHeaderAddDuplicateContigLine() { - File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); - - VCFFileReader reader = new VCFFileReader(input, false); - VCFHeader header = reader.getFileHeader(); - - - final int numContigLinesBefore = header.getContigLines().size(); - // try to readd the first contig line - header.addMetaDataLine(header.getContigLines().get(0)); - final int numContigLinesAfter = header.getContigLines().size(); - - // assert that we have the same number of contig lines before and after - Assert.assertEquals(numContigLinesBefore, numContigLinesAfter); - } - - @Test - public void testVCFHeaderAddDuplicateHeaderLine() { - File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); + public void testAddDuplicateKeyValueHeaderLine() { + final File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); - VCFFileReader reader = new VCFFileReader(input, false); - VCFHeader header = reader.getFileHeader(); + final VCFFileReader reader = new VCFFileReader(input, false); + final VCFHeader header = reader.getFileHeader(); - VCFHeaderLine newHeaderLine = new VCFHeaderLine("key", "value"); + final VCFHeaderLine newHeaderLine = new 
VCFHeaderLine("key", "value"); // add this new header line header.addMetaDataLine(newHeaderLine); final int numHeaderLinesBefore = header.getOtherHeaderLines().size(); - // readd the same header line + // add the same header line again header.addMetaDataLine(newHeaderLine); final int numHeaderLinesAfter = header.getOtherHeaderLines().size(); - // assert that we have the same number of other header lines before and after + // Note: we don't allow duplicate unstructured lines with the same key unless they have + // different content + // so the second add is dropped: assert the number of other header lines is unchanged Assert.assertEquals(numHeaderLinesBefore, numHeaderLinesAfter); } + @Test + public void testSimpleHeaderLineGenericFieldGetter() { + final VCFHeader header = VCFHeaderUnitTestData.createHeaderFromString(VCFHeaderUnitTestData.getVCFV42TestHeaderString()); + final List filters = header.getFilterLines(); + final VCFFilterHeaderLine filterHeaderLine = filters.get(0); + final Map genericFields = filterHeaderLine.getGenericFields(); + Assert.assertEquals(genericFields.get("ID"),"NoQCALL"); + Assert.assertEquals(genericFields.get("Description"),"Variant called by Dindel but not confirmed by QCALL"); + } + + @Test + public void testSerialization() throws Exception { + final VCFFileReader reader = new VCFFileReader(new File("src/test/resources/htsjdk/variant/HiSeq.10000.vcf"), false); + final VCFHeader originalHeader = reader.getFileHeader(); + reader.close(); + + final VCFHeader deserializedHeader = TestUtil.serializeAndDeserialize(originalHeader); + + Assert.assertEquals(deserializedHeader.getMetaDataInInputOrder(), originalHeader.getMetaDataInInputOrder(), "Header metadata does not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getContigLines(), originalHeader.getContigLines(), "Contig header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getFilterLines(), originalHeader.getFilterLines(), "Filter header lines do 
not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getFormatHeaderLines(), originalHeader.getFormatHeaderLines(), "Format header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getIDHeaderLines(), originalHeader.getIDHeaderLines(), "ID header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getInfoHeaderLines(), originalHeader.getInfoHeaderLines(), "Info header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getOtherHeaderLines(), originalHeader.getOtherHeaderLines(), "Other header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getGenotypeSamples(), originalHeader.getGenotypeSamples(), "Genotype samples not the same before/after serialization"); + Assert.assertEquals(deserializedHeader.samplesWereAlreadySorted(), originalHeader.samplesWereAlreadySorted(), "Sortedness of samples not the same before/after serialization"); + Assert.assertEquals(deserializedHeader.getSampleNamesInOrder(), originalHeader.getSampleNamesInOrder(), "Sorted list of sample names in header not the same before/after serialization"); + Assert.assertEquals(deserializedHeader.getSampleNameToOffset(), originalHeader.getSampleNameToOffset(), "Sample name to offset map not the same before/after serialization"); + Assert.assertEquals(deserializedHeader.toString(), originalHeader.toString(), "String representation of header not the same before/after serialization"); + } + @DataProvider(name="validHeaderVersionTransitions") public Object[][] validHeaderVersionTransitions() { - // v4.3 can never transition, all other version transitions are allowed + // all (forward) version transitions are allowed return new Object[][] { + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF3_2, 
VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_3}, + + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_0}, {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_1}, {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_1}, {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3} }; } @DataProvider(name="invalidHeaderVersionTransitions") public Object[][] invalidHeaderVersionTransitions() { - // v4.3 can never transition with, all other version transitions are allowed return new Object[][] { - {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_0}, - {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_1}, + //reject any attempt to go backwards in time {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2}, - {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_3}, - {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_3}, - {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF3_2}, + + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF3_2}, + + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_1, 
VCFHeaderVersion.VCF3_2}, + + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF3_2}, + + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF3_2}, }; } @Test(dataProvider="validHeaderVersionTransitions") - public void testValidHeaderVersionTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { - doHeaderTransition(fromVersion, toVersion); + public void testAddVersionLineValidTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { + final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(fromVersion), Collections.emptySet()); + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(toVersion)); + Assert.assertEquals(vcfHeader.getVCFHeaderVersion(), toVersion); } @Test(dataProvider="invalidHeaderVersionTransitions", expectedExceptions = TribbleException.class) - public void testInvalidHeaderVersionTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { - doHeaderTransition(fromVersion, toVersion); + public void testAddVersionInvalidTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { + new VCFHeader(VCFHeader.makeHeaderVersionLineSet(fromVersion), Collections.emptySet()) + .addMetaDataLine(VCFHeader.makeHeaderVersionLine(toVersion)); + } + + @DataProvider(name = "vcfVersions") + public Object[][] vcfVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } + + @Test(expectedExceptions = TribbleException.class) + public void testVersionUpgradeWithValidationFailure() { + // test mixing header versions where the old version header has a line that fails validation + // using the resulting (newer) version + + // create a 4.2 header with a 4.2 style pedigree line (one that has no ID) + final VCFHeader vcfHeader = new 
VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_2)); + vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFConstants.PEDIGREE_HEADER_KEY, "")); + + // now try to force a version upgrade to 4.3, old style pedigree line should cause a failure + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_3)); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddLineWithValidationFailure() { + // create a 4.3 header, and then try to add an old-style pedigree line (one that has no ID) + // which should cause a failure + final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_3)); + vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFConstants.PEDIGREE_HEADER_KEY, "")); + } + + + @Test(expectedExceptions = TribbleException.class) + public void testConstructorRequiresFileFormatLine() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // 4.2 header is compatible with all 4.x versions + // create a new header from this set (containing no fileformat line), no requested version in constructor + new VCFHeader(metaDataSet, Collections.emptySet()); //defaults to v4.2 } - private void doHeaderTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { - final VCFHeader vcfHeader = - fromVersion == null ? 
- new VCFHeader() : - new VCFHeader(fromVersion, Collections.EMPTY_SET, Collections.EMPTY_SET); - vcfHeader.setVCFHeaderVersion(toVersion); + @Test(dataProvider = "vcfVersions") + public void testConstructorWithSingleFileFormatLine(final VCFHeaderVersion vcfVersion) { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // 4.2 header is compatible with all 4.x versions + + // add in the corresponding fileformat line; create a new versioned header + // since the version requested in the constructor and the format lines are in sync, there is + // no conflict, and the resulting header's version should always match the requested version + metaDataSet.add(VCFHeader.makeHeaderVersionLine(vcfVersion)); + final VCFHeader vcfHeader = new VCFHeader(metaDataSet, Collections.emptySet()); + Assert.assertEquals(vcfHeader.getVCFHeaderVersion(), vcfVersion); } @Test - public void testVCFHeaderSerialization() throws Exception { - final VCFFileReader reader = new VCFFileReader(new File("src/test/resources/htsjdk/variant/HiSeq.10000.vcf"), false); - final VCFHeader originalHeader = reader.getFileHeader(); - reader.close(); + public void testConstructorWithMultipleFileFormatLines() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // this (4.2) header is compatible with all 4.x versions + final int beforeSize = metaDataSet.size(); - final VCFHeader deserializedHeader = TestUtil.serializeAndDeserialize(originalHeader); + // multiple version lines will be ignored, with only the last one retained + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_1)); + Assert.assertEquals(metaDataSet.size(), beforeSize + 2); - Assert.assertEquals(deserializedHeader.getMetaDataInInputOrder(), originalHeader.getMetaDataInInputOrder(), "Header metadata does not match before/after serialization"); - 
Assert.assertEquals(deserializedHeader.getContigLines(), originalHeader.getContigLines(), "Contig header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getFilterLines(), originalHeader.getFilterLines(), "Filter header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getFormatHeaderLines(), originalHeader.getFormatHeaderLines(), "Format header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getIDHeaderLines(), originalHeader.getIDHeaderLines(), "ID header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getInfoHeaderLines(), originalHeader.getInfoHeaderLines(), "Info header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getOtherHeaderLines(), originalHeader.getOtherHeaderLines(), "Other header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getGenotypeSamples(), originalHeader.getGenotypeSamples(), "Genotype samples not the same before/after serialization"); - Assert.assertEquals(deserializedHeader.samplesWereAlreadySorted(), originalHeader.samplesWereAlreadySorted(), "Sortedness of samples not the same before/after serialization"); - Assert.assertEquals(deserializedHeader.getSampleNamesInOrder(), originalHeader.getSampleNamesInOrder(), "Sorted list of sample names in header not the same before/after serialization"); - Assert.assertEquals(deserializedHeader.getSampleNameToOffset(), originalHeader.getSampleNameToOffset(), "Sample name to offset map not the same before/after serialization"); - Assert.assertEquals(deserializedHeader.toString(), originalHeader.toString(), "String representation of header not the same before/after serialization"); + // create a new versioned header from this set + final VCFHeader vcfHeader = new VCFHeader(metaDataSet, Collections.emptySet()); + 
Assert.assertEquals(vcfHeader.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_1); + } + + @Test(expectedExceptions = TribbleException.VersionValidationFailure.class) + public void testConstructorWithInvalidLineForVersion() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // this (4.2) header is compatible with all 4.x versions + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); + final Map attributes = new LinkedHashMap<>(); + attributes.put("ID", "id"); + metaDataSet.add(new VCFPedigreeHeaderLine(attributes)); + new VCFHeader(metaDataSet, Collections.emptySet()); + } + + @Test(expectedExceptions = TribbleException.VersionValidationFailure.class) + public void testAddMetaDataLineInvalidForVersion() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // this (4.2) header is compatible with all 4.x versions + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); + final VCFHeader header = new VCFHeader(metaDataSet, Collections.emptySet()); + Assert.assertEquals(header.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_2); + final Map attributes = new LinkedHashMap<>(); + attributes.put("ID", "id"); + header.addMetaDataLine(new VCFPedigreeHeaderLine(attributes)); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddMetaDataLineFileFormat() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // this (4.2) header is compatible with all 4.x versions + final int beforeSize = metaDataSet.size(); + + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_1)); + Assert.assertEquals(metaDataSet.size(), beforeSize + 1); + + // create a new versioned header from this set + final VCFHeader vcfHeader = new VCFHeader(metaDataSet, Collections.emptySet()); + Assert.assertEquals(vcfHeader.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_1); + + // add a new line that uses the same header version 
already established + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_1)); + + // add a new line that tries to move the version forward + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); + + // now try to go backwards (throws) + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_1)); + } + + @Test + public void testPreserveSequenceDictionaryAttributes() { + // Round trip a SAMSequenceDictionary with attributes, through a VCFHeader, and back + // to a SAMSequenceDictionary with the same attributes. + // https://github.com/samtools/htsjdk/issues/730 + + final String assemblyString = "hg37"; + final String md5String = "68b329da9893e34099c7d8ad5cb9c940"; + final String speciesString = "Home Sapiens"; + final String urlString = "http://www.refserve.org:8080/path/"; + + final SAMSequenceDictionary samDict = new SAMSequenceDictionary(); + + final SAMSequenceRecord seqRec1 = new SAMSequenceRecord("1", 1); + seqRec1.setAssembly(assemblyString); + seqRec1.setMd5(md5String); + seqRec1.setAttribute(SAMSequenceRecord.URI_TAG, urlString); + seqRec1.setSpecies(speciesString); + final SAMSequenceRecord seqRec2 = new SAMSequenceRecord("2", 1); + samDict.addSequence(seqRec1); + samDict.addSequence(seqRec2); + + final VCFHeader vcfHeader = new VCFHeader(); + vcfHeader.setSequenceDictionary(samDict); + final SAMSequenceDictionary roundTrippedDict = vcfHeader.getSequenceDictionary(); + + final SAMSequenceRecord rtRec1 = roundTrippedDict.getSequence("1"); + Assert.assertEquals(assemblyString, rtRec1.getAssembly()); + Assert.assertEquals(md5String, rtRec1.getMd5()); + Assert.assertEquals(urlString, rtRec1.getAttribute(SAMSequenceRecord.URI_TAG)); + Assert.assertEquals(speciesString, rtRec1.getSpecies()); + + Assert.assertEquals(seqRec1, roundTrippedDict.getSequence("1")); // somewhat redundant check on full record + Assert.assertEquals(seqRec2, roundTrippedDict.getSequence("2")); } + 
///////////////////////////////////////////////////////////////// + ////////////////************************* End new tests block... + ///////////////////////////////////////////////////////////////// + @Test public void testVCFHeaderQuoteEscaping() throws Exception { // this test ensures that the end-to-end process of quote escaping is stable when headers are @@ -449,10 +654,9 @@ public void testVCFHeaderQuoteEscaping() throws Exception { final VCFHeader originalHeader = originalFileReader.getFileHeader(); // add a header line with quotes to the header - final Map attributes = new LinkedHashMap<>(); - attributes.put("ID", "VariantFiltration"); - attributes.put("CommandLineOptions", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); - final VCFSimpleHeaderLine addedHeaderLine = new VCFSimpleHeaderLine("GATKCommandLine.Test", attributes); + final VCFSimpleHeaderLine addedHeaderLine = new VCFFilterHeaderLine( + "FakeFilter", + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); originalHeader.addMetaDataLine(addedHeaderLine); final VCFFilterHeaderLine originalCopyAnnotationLine1 = originalHeader.getFilterHeaderLine("ANNOTATION"); @@ -487,7 +691,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { firstCopyWriter.writeHeader(originalHeader); final CloseableIterator firstCopyVariantIterator = originalFileReader.iterator(); while (firstCopyVariantIterator.hasNext()) { - VariantContext variantContext = firstCopyVariantIterator.next(); + final VariantContext variantContext = firstCopyVariantIterator.next(); firstCopyWriter.add(variantContext); } originalFileReader.close(); @@ -496,7 +700,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { // read the copied file back in final VCFFileReader firstCopyReader = new VCFFileReader(firstCopyVCFFile, false); final VCFHeader firstCopyHeader = firstCopyReader.getFileHeader(); - final VCFHeaderLine firstCopyNewHeaderLine = 
firstCopyHeader.getOtherHeaderLine("GATKCommandLine.Test"); + final VCFFilterHeaderLine firstCopyNewHeaderLine = firstCopyHeader.getFilterHeaderLine("FakeFilter"); Assert.assertNotNull(firstCopyNewHeaderLine); final VCFFilterHeaderLine firstCopyAnnotationLine1 = firstCopyHeader.getFilterHeaderLine("ANNOTATION"); @@ -530,7 +734,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { secondCopyWriter.writeHeader(firstCopyHeader); final CloseableIterator secondCopyVariantIterator = firstCopyReader.iterator(); while (secondCopyVariantIterator.hasNext()) { - VariantContext variantContext = secondCopyVariantIterator.next(); + final VariantContext variantContext = secondCopyVariantIterator.next(); secondCopyWriter.add(variantContext); } secondCopyWriter.close(); @@ -539,7 +743,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { final VCFFileReader secondCopyReader = new VCFFileReader(secondCopyVCFFile, false); final VCFHeader secondCopyHeader = secondCopyReader.getFileHeader(); - final VCFHeaderLine secondCopyNewHeaderLine = secondCopyHeader.getOtherHeaderLine("GATKCommandLine.Test"); + final VCFFilterHeaderLine secondCopyNewHeaderLine = secondCopyHeader.getFilterHeaderLine("FakeFilter"); Assert.assertNotNull(secondCopyNewHeaderLine); final VCFFilterHeaderLine secondCopyAnnotationLine1 = secondCopyHeader.getFilterHeaderLine("ANNOTATION"); @@ -549,8 +753,8 @@ public void testVCFHeaderQuoteEscaping() throws Exception { Assert.assertNotNull(secondCopyAnnotationLine2); Assert.assertEquals(firstCopyNewHeaderLine, secondCopyNewHeaderLine); - Assert.assertEquals(firstCopyNewHeaderLine.toStringEncoding(), "GATKCommandLine.Test="); - Assert.assertEquals(secondCopyNewHeaderLine.toStringEncoding(), "GATKCommandLine.Test="); + Assert.assertEquals(firstCopyNewHeaderLine.toStringEncoding(), "FILTER="); + Assert.assertEquals(secondCopyNewHeaderLine.toStringEncoding(), "FILTER="); Assert.assertEquals(firstCopyAnnotationLine1, secondCopyAnnotationLine1); 
Assert.assertEquals(secondCopyAnnotationLine1.getGenericFieldValue("Description"), "ANNOTATION != \"NA\" || ANNOTATION <= 0.01"); @@ -574,136 +778,153 @@ public void testVCFHeaderQuoteEscaping() throws Exception { } - @Test - public void testVcf42Roundtrip() throws Exception { - // this test ensures that source/version fields are round-tripped properly + ///////////////////////////////////////////////////////////////////// + // Private helper methods + ///////////////////////////////////////////////////////////////////// - // read an existing VCF - File expectedFile = new File("src/test/resources/htsjdk/variant/Vcf4.2WithSourceVersionInfoFields.vcf"); + // Serialize/encode the header to a file, read metaData back in + private Set getRoundTripEncoded(final VCFHeader header) throws IOException { + final File myTempFile = File.createTempFile("VCFHeader", "vcf"); + try (final VariantContextWriter vcfWriter = + new VariantContextWriterBuilder() + .setOutputFile(myTempFile) + .setOutputFileType(VariantContextWriterBuilder.OutputType.VCF) + .setOptions(VariantContextWriterBuilder.NO_OPTIONS) + .build()) { + vcfWriter.writeHeader(header); + } + final VCFHeader vcfHeader = (VCFHeader) new VCFCodec().readActualHeader(new LineIteratorImpl( + new SynchronousLineReader(new FileReader(myTempFile.getAbsolutePath())))); + return vcfHeader.getMetaDataInSortedOrder(); + } - // write the file out into a new copy - final File actualFile = File.createTempFile("testVcf4.2roundtrip.", FileExtensions.VCF); - actualFile.deleteOnExit(); - try (final VCFFileReader originalFileReader = new VCFFileReader(expectedFile, false); - final VariantContextWriter copyWriter = new VariantContextWriterBuilder() - .setOutputFile(actualFile) - .setReferenceDictionary(createArtificialSequenceDictionary()) - .setOptions(EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Options.INDEX_ON_THE_FLY)) - .build() - ) { - final VCFHeader originalHeader = originalFileReader.getFileHeader(); - - 
copyWriter.writeHeader(originalHeader); - for (final VariantContext variantContext : originalFileReader) { - copyWriter.add(variantContext); - } - } + private VCFHeader getHiSeqVCFHeader() { + final File vcf = new File("src/test/resources/htsjdk/variant/HiSeq.10000.vcf"); + final VCFFileReader reader = new VCFFileReader(vcf, false); + final VCFHeader header = reader.getFileHeader(); + reader.close(); + return header; + } - final String actualContents = new String(Files.readAllBytes(actualFile.toPath()), StandardCharsets.UTF_8); - final String expectedContents = new String(Files.readAllBytes(expectedFile.toPath()), StandardCharsets.UTF_8); - Assert.assertEquals(actualContents, expectedContents); + private static Collection asCollectionOfVCFHeaderLine(final Collection headers) { + // create a collection of VCFHeaderLine so that contains tests work correctly + return headers.stream().map(h -> (VCFHeaderLine) h).collect(Collectors.toList()); } + @DataProvider(name="duplicateHeaderLineCases") + private Object[][] getDuplicateHeaderLineCases() { + return new Object[][] { - /** - * a little utility function for all tests to md5sum a file - * Shameless taken from: - *

- * http://www.javalobby.org/java/forums/t84420.html - * - * @param file the file - * @return a string - */ - private static String md5SumFile(File file) { - MessageDigest digest; - try { - digest = MessageDigest.getInstance("MD5"); - } catch (NoSuchAlgorithmException e) { - throw new RuntimeException("Unable to find MD5 digest"); - } - InputStream is; - try { - is = new FileInputStream(file); - } catch (FileNotFoundException e) { - throw new RuntimeException("Unable to open file " + file); - } - byte[] buffer = new byte[8192]; - int read; - try { - while ((read = is.read(buffer)) > 0) { - digest.update(buffer, 0, read); - } - byte[] md5sum = digest.digest(); - BigInteger bigInt = new BigInteger(1, md5sum); - return bigInt.toString(16); - - } catch (IOException e) { - throw new RuntimeException("Unable to process file for MD5", e); - } finally { - try { - is.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close input stream for MD5 calculation", e); - } - } + // these tests use VCFAltHeaderLine to test structured/ID lines, but the behavior should be the same + // for any header ID line + + // duplicate IDs, duplicate description; line is dropped due to duplicate ID + { new VCFAltHeaderLine("X", "description1"), + new VCFAltHeaderLine("X", "description1"), false }, + // duplicate IDs, different descriptions; line is dropped due to duplicate ID + { new VCFAltHeaderLine("X", "description1"), + new VCFAltHeaderLine("X", "description2"), false }, + // different IDs, different descriptions; line is retained + { new VCFAltHeaderLine("X", "description1"), + new VCFAltHeaderLine("Y", "description2"), true }, + // different IDs, duplicate descriptions; line is retained + { new VCFAltHeaderLine("X", "description"), + new VCFAltHeaderLine("Y", "description"), true }, + + // .......unstructured header lines........ 
+ + // duplicate key, duplicate value, line is dropped + { new VCFHeaderLine("CommandLine", "command"), new VCFHeaderLine("CommandLine", "command"), false }, + // duplicate key, different value, line is retained + { new VCFHeaderLine("CommandLine", "command1"), new VCFHeaderLine("CommandLine", "command2"), true }, + + /////////////////////////////////////////////////////////////////////////////////////////// + // since the VCFHeaderLine constructor is public, it can be used erroneously to model header + // lines that have structured syntax, but which will not obey structured header line rules, + // since those are enabled via VCFSimpleHeaderLine, and VCFHeaderLine is intended to be used + // for non-structured lines. so include some tests that simulate this + + // duplicate key, duplicate value (...duplicate ID), line is dropped + { new VCFHeaderLine("KEY", ""), new VCFHeaderLine("KEY", ""), false }, + // duplicate key, different value (different ID), line is retained + { new VCFHeaderLine("KEY", ""), new VCFHeaderLine("KEY", ""), true }, + + //NOTE: this case illustrates how its possible to use the API to cause two structured lines + // with duplicate IDs to be retained if they are not modeled as VCFStructuredHeaderLines + // duplicate key, different value (but IDENTICAL ID), line is RETAINED + { new VCFHeaderLine("KEY", ""), new VCFHeaderLine("KEY", ""), true }, + + // different key, duplicate value, line is retained + { new VCFHeaderLine("KEY1", ""), new VCFHeaderLine("KEY2", ""), true }, + // different key, different value, line is retained + { new VCFHeaderLine("KEY1", ""), new VCFHeaderLine("KEY2", ""), true }, + }; } - private void checkMD5ofHeaderFile(VCFHeader header, String md5sum) { - File myTempFile = null; - PrintWriter pw = null; - try { - myTempFile = File.createTempFile("VCFHeader", "vcf"); - myTempFile.deleteOnExit(); - pw = new PrintWriter(myTempFile); - } catch (IOException e) { - Assert.fail("Unable to make a temp file!"); - } - for 
(VCFHeaderLine line : header.getMetaDataInSortedOrder()) - pw.println(line); - pw.close(); - Assert.assertEquals(md5SumFile(myTempFile), md5sum); - } - - public static final int VCF4headerStringCount = 16; - - public static final String VCF4headerStrings = - "##fileformat=VCFv4.2\n" + - "##filedate=2010-06-21\n" + - "##reference=NCBI36\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##FILTER=\n" + - "##FORMAT=\n" + - "##FORMAT=\n" + - "##FORMAT=\n" + - "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; - - - public static final String VCF4headerStrings_with_negativeOne = - "##fileformat=VCFv4.2\n" + - "##filedate=2010-06-21\n" + - "##reference=NCBI36\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##FILTER=\n" + - "##FORMAT=\n" + - "##FORMAT=\n" + - "##FORMAT=\n" + - "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + @Test(dataProvider = "duplicateHeaderLineCases") + private void testDuplicateHeaderLine(final VCFHeaderLine hl1, final VCFHeaderLine hl2, final boolean expectHL2Retained) { + final Set lineSet = VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_2); + lineSet.add(hl1); + lineSet.add(hl2); + final VCFHeader vcfHeader = new VCFHeader(lineSet); + + Assert.assertEquals(vcfHeader.getMetaDataInInputOrder().size(), expectHL2Retained ? 
3 : 2); + } + + @Test + public void testAddOtherHeaderLineUnique() { + final String TEST_KEY = "testKey"; + final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION)); + final VCFHeaderLine otherLine1 = new VCFHeaderLine(TEST_KEY, "Test Value 1"); + vcfHeader.addMetaDataLine(otherLine1); + final List otherLines1 = vcfHeader.getOtherHeaderLines(TEST_KEY); + Assert.assertEquals(otherLines1.size(), 1); + Assert.assertTrue(otherLines1.contains(otherLine1)); + + // now add a second line + final VCFHeaderLine otherLine2 = new VCFHeaderLine(TEST_KEY, "Test Value 2"); + vcfHeader.addMetaDataLine(otherLine2); + final List otherLines2 = vcfHeader.getOtherHeaderLines(TEST_KEY); + Assert.assertEquals(otherLines2.size(), 2); + Assert.assertTrue(otherLines2.contains(otherLine1)); + Assert.assertTrue(otherLines2.contains(otherLine2)); + + // now call addOtherHeaderLineUnique with a 3rd line, the first two should be removed + final VCFHeaderLine otherLine3= new VCFHeaderLine(TEST_KEY, "Test Value 3"); + vcfHeader.addOtherHeaderLineUnique(otherLine3); + final List otherLines3 = vcfHeader.getOtherHeaderLines(TEST_KEY); + Assert.assertEquals(otherLines3.size(), 1); + Assert.assertFalse(otherLines3.contains(otherLine1)); + Assert.assertFalse(otherLines3.contains(otherLine2)); + Assert.assertTrue(otherLines3.contains(otherLine3)); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddOtherHeaderLineUniqueRejectsIDLines() { + final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION)); + final VCFSimpleHeaderLine simpleHeaderLine = new VCFSimpleHeaderLine("testKey", "testID","test description"); + vcfHeader.addOtherHeaderLineUnique(simpleHeaderLine); + } + + @Test(expectedExceptions = TribbleException.class) + public void testGetOtherHeaderLineUnique() { + final String TEST_KEY = "testKey"; + final VCFHeader vcfHeader = new 
VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION)); + + // now add two lines with the same key + final VCFHeaderLine otherLine1 = new VCFHeaderLine(TEST_KEY, "Test Value 1"); + vcfHeader.addMetaDataLine(otherLine1); + final VCFHeaderLine otherLine2 = new VCFHeaderLine(TEST_KEY, "Test Value 2"); + vcfHeader.addMetaDataLine(otherLine2); + + final List otherLines = vcfHeader.getOtherHeaderLines(TEST_KEY); + Assert.assertEquals(otherLines.size(), 2); + Assert.assertTrue(otherLines.contains(otherLine1)); + Assert.assertTrue(otherLines.contains(otherLine2)); + + // now call getOtherHeaderLineUnique, should throw + vcfHeader.getOtherHeaderLineUnique(TEST_KEY); + } } diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java new file mode 100644 index 0000000000..7b57a19b5a --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java @@ -0,0 +1,203 @@ +package htsjdk.variant.vcf; + +import htsjdk.tribble.TribbleException; +import htsjdk.tribble.readers.LineIteratorImpl; +import htsjdk.tribble.readers.SynchronousLineReader; +import org.testng.Assert; + +import java.io.StringReader; +import java.util.*; + +// Unit test data used by unit tests for VCFHeader, VCFMetaDataLines, and VCFHeaderLine hierarchy. 
+public class VCFHeaderUnitTestData { + public final static VCFHeaderVersion TEST_VERSION = VCFHeader.DEFAULT_VCF_VERSION; + + // fileformat line + public static List getTestDefaultFileFormatLine() { + return new ArrayList() {{ + add(VCFHeader.makeHeaderVersionLine(TEST_VERSION)); + }}; + } + + // FILTER lines + public static List getTestFilterLines() { + return new ArrayList() {{ + add(new VCFFilterHeaderLine("LowQual", "Description=\"Low quality\"")); + add(new VCFFilterHeaderLine("highDP", "Description=\"DP < 8\"")); + add(new VCFFilterHeaderLine("TruthSensitivityTranche98.50to98.80", "Truth sensitivity tranche level at VSQ Lod: -0.1106 <= x < 0.6654")); + }}; + } + + // FORMAT lines + public static List getTestFormatLines() { + return new ArrayList() {{ + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality")); + add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); + add(new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); + add(new VCFFormatHeaderLine("MLPSAF", VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the alternate allele fraction")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter")); + }}; + } + + // INFO lines + 
public static List getTestInfoLines() { + return new ArrayList() {{ + add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); + add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); + add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); + add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); + add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); + add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed")); + add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); + add(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads")); + add(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); + add(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); + }}; + } + + // CONTIG lines + public static List getTestContigLines() { + return new ArrayList() {{ + add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0)); + add(new VCFContigHeaderLine(Collections.singletonMap("ID", "2"), 1)); + add(new VCFContigHeaderLine(Collections.singletonMap("ID", "3"), 2)); + }}; + } + + //misc lines + public static List getTestMiscellaneousLines() { + return new ArrayList() {{ + add(new VCFHeaderLine("reference", "g37")); + add(new VCFHeaderLine("GATKCommandLine", "SelectVariants and such.")); + }}; + } + + //Return a full set of metadata lines, 
retaining order in a LinkedHashSet. + public static LinkedHashSet getTestMetaDataLinesSet() { + final LinkedHashSet allHeaderLines = new LinkedHashSet() {{ //preserve order + addAll(getTestDefaultFileFormatLine()); + addAll(getTestFilterLines()); + addAll(getTestFormatLines()); + addAll(getTestInfoLines()); + addAll(getTestContigLines()); + addAll(getTestMiscellaneousLines()); + }}; + Assert.assertEquals(allHeaderLines.size(), + 1 + // file format line + getTestFilterLines().size() + getTestFormatLines().size() + + getTestInfoLines().size() + getTestContigLines().size() + getTestMiscellaneousLines().size()); + return allHeaderLines; + } + + //Return a full set of metadata lines as a VCFMetaDataLines. + public static VCFMetaDataLines getTestMetaDataLines() { + final VCFMetaDataLines md = new VCFMetaDataLines(); + md.addMetaDataLines(getTestMetaDataLinesSet()); + return md; + } + + private static final int VCF_4_HEADER_STRING_COUNT = 16; // 17 -1 for the #CHROM... line + + public static String getVCFV42TestHeaderString() { + return "##fileformat=VCFv4.2\n" + + "##filedate=2010-06-21\n" + + "##reference=NCBI36\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##FILTER=\n" + + "##FORMAT=\n" + + "##FORMAT=\n" + + "##FORMAT=\n" + + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + } + + public static final String VCF42headerStrings_with_negativeOne = + "##fileformat=VCFv4.2\n" + + "##filedate=2010-06-21\n" + + "##reference=NCBI36\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##FILTER=\n" + + "##FORMAT=\n" + + "##FORMAT=\n" + + "##FORMAT=\n" + + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + + public static Set getV42HeaderLinesWITHOUTFormatString() { + // precondition - create a v42 VCFMetaDataLines and make sure its v42 + final Set metaDataSet = 
getV42HeaderLinesWITHFormatString(); + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + metaDataLines.addMetaDataLines(metaDataSet); + final VCFHeaderLine versionLine = metaDataLines.getFileFormatLine(); + Assert.assertEquals( + VCFHeaderVersion.toHeaderVersion(versionLine.getValue()), + VCFHeaderVersion.VCF4_2); + + // remove the 4.2 version line from the original set, verify, and return the set with no fileformat string + metaDataSet.remove(versionLine); + Assert.assertNull(getVersionLineFromHeaderLineSet(metaDataSet)); + return metaDataSet; + } + + public static Set getV42HeaderLinesWITHFormatString() { + // precondition - create a v42 header and make sure its v42 + final VCFHeader header = createHeaderFromString(getVCFV42TestHeaderString()); + Assert.assertEquals( + header.getVCFHeaderVersion(), + VCFHeaderVersion.VCF4_2); + + // return a mutable set for test use + return new LinkedHashSet<>(header.getMetaDataInInputOrder()); + } + + public static VCFHeader createHeaderFromString(final String headerStr) { + final VCFCodec codec = new VCFCodec(); + final VCFHeader header = (VCFHeader) codec.readActualHeader( + new LineIteratorImpl(new SynchronousLineReader(new StringReader(headerStr)))); + Assert.assertEquals(header.getMetaDataInInputOrder().size(), VCF_4_HEADER_STRING_COUNT); + return header; + } + + /** + * Find and return the VCF fileformat/version line + * + * Return null if no fileformat/version lines are found + */ + private static VCFHeaderLine getVersionLineFromHeaderLineSet(final Set metaDataLines) { + VCFHeaderLine versionLine = null; + final List formatLines = new ArrayList<>(); + for (final VCFHeaderLine headerLine : metaDataLines) { + if (VCFHeaderVersion.isFormatString(headerLine.getKey())) { + formatLines.add(headerLine); + } + } + + if (!formatLines.isEmpty()) { + if (formatLines.size() > 1) { + //throw if there are duplicate version lines + throw new TribbleException("Multiple version header lines found in header line list"); + } 
+ return formatLines.get(0); + } + + return versionLine; + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java new file mode 100644 index 0000000000..9e2a82f15a --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java @@ -0,0 +1,86 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +/** + * Test conditions that are unique to INFO lines (not covered by VCFCompoundHeaderLineUnitTest). + */ +public class VCFInfoHeaderLineUnitTest extends HtsjdkTest { + + @Test + public void testRepairInfoLineFlagTypeWithNonzeroCount() { + final VCFInfoHeaderLine infoLine = new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(0, infoLine.getCount()); + } + + @DataProvider(name = "mergeCompatibleInfoLines") + public Object[][] getMergeCompatibleInfoLines() { + return new Object[][]{ + { + new VCFInfoHeaderLine("INFO=", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", VCFHeader.DEFAULT_VCF_VERSION) + } + }; + } + + @Test(dataProvider = "mergeCompatibleInfoLines") + public void testMergeCompatibleInfoLines( + final VCFInfoHeaderLine infoHeaderLine1, + final VCFInfoHeaderLine infoHeaderLine2, + final VCFInfoHeaderLine expectedHeaderLine) { + Assert.assertEquals( + VCFInfoHeaderLine.getMergedInfoHeaderLine( + infoHeaderLine1, + infoHeaderLine2, + new VCFHeaderMerger.HeaderMergeConflictWarnings(true)), + expectedHeaderLine); + } + + @DataProvider(name = "mergeIncompatibleInfoLines") + public Object[][] getMergeIncompatibleInfoLines() { + return new Object[][]{ + // 2 lines to merge, expected result + { + // mixed number AND number type (multiple different attributes) + new VCFInfoHeaderLine("INFO=", + 
VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION) + }, + { + // mixed number AND number type (multiple different attributes), reverse direction + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION) + } + }; + } + + @Test + public void testAllow1000GKey() { + final VCFInfoHeaderLine line = new VCFInfoHeaderLine( + "INFO=", + VCFHeader.DEFAULT_VCF_VERSION + ); + + // TODO change to VCFHeader.DEFAULT_VCF_VERSION + Assert.assertFalse(line.getValidationFailure(VCFHeaderVersion.VCF4_3).isPresent()); + } + + @Test(dataProvider = "mergeIncompatibleInfoLines", expectedExceptions= TribbleException.class) + public void testMergeIncompatibleInfoLines( + final VCFInfoHeaderLine infoHeaderLine1, + final VCFInfoHeaderLine infoHeaderLine2) { + VCFInfoHeaderLine.getMergedInfoHeaderLine( + infoHeaderLine1, + infoHeaderLine2, + new VCFHeaderMerger.HeaderMergeConflictWarnings(true)); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java new file mode 100644 index 0000000000..2e41536abe --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java @@ -0,0 +1,354 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class VCFMetaDataLinesUnitTest extends HtsjdkTest { + + @DataProvider(name="keyCollisions") + public Object[][] keyCollisions() { + return new Object[][] { + // line 1, line 2, expected to collide + + // Unstructured key collisions + { // same key, same value + new VCFHeaderLine("key", "value"), + new VCFHeaderLine("key", "value"), true + }, + { // same key, different value + new VCFHeaderLine("key", "value"), + new VCFHeaderLine("key", "value1"), 
false + }, + { // different key, same value + new VCFHeaderLine("key1", "value"), + new VCFHeaderLine("key2", "value"), false + }, + { // different key, different value + new VCFHeaderLine("key1", "value1"), + new VCFHeaderLine("key2", "value2"), false + }, + + // Structured key collisions + { // same key, same ID, same (base VCFSimpleHeaderLine) class + new VCFSimpleHeaderLine("FILTER", Collections.singletonMap("ID", "id")), + new VCFSimpleHeaderLine("FILTER", Collections.singletonMap("ID", "id")), true + }, + { // same key, same ID, same (derived-VCFSimpleHeaderLine) class, same attributes + new VCFFilterHeaderLine("filterName", "unused description"), + new VCFFilterHeaderLine("filterName", "unused description"), true + }, + { // same key, same ID, same class, different attributes + new VCFFilterHeaderLine("filterName", "unused description"), + new VCFFilterHeaderLine("filterName", "different unused description"), true + }, + { // same key, different ID + new VCFFilterHeaderLine("filterName", "unused description"), + new VCFFilterHeaderLine("filterName2", "unused description"), false + }, + { // This is an unfortunate case that is allowed by the existing permissive VCFHeader + // APIs; two header lines that have identical content, one of which is modeled by the + // VCFSimpleHeaderLine base class, and one of which is modeled by the specialized , + // derived VCFFilterHeaderLine class + new VCFFilterHeaderLine("id", "unused description"), + new VCFSimpleHeaderLine("FILTER", new LinkedHashMap() {{ + put("ID", "id"); + put("Description", "unused description"); + }}), true } + }; + } + + @Test(dataProvider="keyCollisions") + public void testKeyCollisions(final VCFHeaderLine line1, final VCFHeaderLine line2, final boolean expectCollision) { + final VCFMetaDataLines mdLines = new VCFMetaDataLines(); + mdLines.addMetaDataLine(line1); + mdLines.addMetaDataLine(line2); + Assert.assertEquals(mdLines.getMetaDataInInputOrder().size(), expectCollision ? 
1 : 2); + } + + @Test + public void testRetainFullHeaderLines() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + + Assert.assertEquals(md.getMetaDataInInputOrder().size(), unitTestData.getTestMetaDataLinesSet().size()); + Assert.assertEquals(md.getMetaDataInSortedOrder().size(), unitTestData.getTestMetaDataLinesSet().size()); + + Assert.assertEquals(unitTestData.getTestFormatLines(), md.getFormatHeaderLines()); + Assert.assertEquals(unitTestData.getTestFilterLines(), md.getFilterLines()); + Assert.assertEquals(unitTestData.getTestInfoLines(), md.getInfoHeaderLines()); + Assert.assertEquals(unitTestData.getTestContigLines(), md.getContigLines()); + Assert.assertEquals(unitTestData.getTestFilterLines(), md.getFilterLines()); + + final Set otherLines = new LinkedHashSet<>(); + otherLines.addAll(unitTestData.getTestDefaultFileFormatLine()); + otherLines.addAll(unitTestData.getTestMiscellaneousLines()); + Assert.assertEquals(otherLines, md.getOtherHeaderLines()); + } + + @Test + public void testAddRemoveOtherMetaDataLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + + int beforeAllSize = md.getMetaDataInInputOrder().size(); + int beforeStructuredSize = md.getIDHeaderLines().size(); + int beforeOtherSize = md.getOtherHeaderLines().size(); + + final VCFHeaderLine newLine = new VCFHeaderLine("foo", "bar"); + + // add one other line + md.addMetaDataLine(newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize); // remains the same + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize + 1); + + // remove the other line and we're back to original size + Assert.assertEquals(md.removeMetaDataLine(newLine), newLine); + 
Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize); // still remains the same + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize); + } + + @Test + public void testAddRemoveUniqueStructuredLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + + final int beforeAllSize = md.getMetaDataInInputOrder().size(); + final int beforeStructuredSize = md.getIDHeaderLines().size(); + final int beforeFilterSize = md.getFilterLines().size(); + final int beforeOtherSize = md.getOtherHeaderLines().size(); + + // add a new, unique, structured line + final VCFFilterHeaderLine newLine = new VCFFilterHeaderLine("filterID", "unused desc"); + md.addMetaDataLine(newLine); + + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize + 1); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize + 1); + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize); // remains the same + + // remove the new line and we're back to original size + Assert.assertEquals(md.removeMetaDataLine(newLine), newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize); + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize); // still remains the same + } + + @Test + public void testAddRemoveDuplicateStructuredLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + + final int beforeAllSize = md.getMetaDataInInputOrder().size(); + final int beforeStructuredSize = md.getIDHeaderLines().size(); + final int 
beforeFilterSize = md.getFilterLines().size(); + + // add a new, unique, structured (filter) line + final VCFFilterHeaderLine newLine = new VCFFilterHeaderLine("filterID", "unused desc"); + md.addMetaDataLine(newLine); + + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize + 1); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize + 1); + + // now try to re-add the same structured filter line again, this second one is rejected, count remains the same + md.addMetaDataLine(newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize + 1); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize + 1); + Assert.assertEquals(md.getFilterHeaderLine("filterID"), newLine); + + // remove the first structured line and we're back to the original size + Assert.assertEquals(md.removeMetaDataLine(newLine), newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize); + } + +// @Test +// public void testAddRemoveContigLine() { +// final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); +// } + + @Test + public void testHasEquivalentHeaderLinePositive() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines sourceMetaDataLines = unitTestData.getTestMetaDataLines(); + + // for each headerLine in the set, make sure findEquivalentHeaderLine returns it + for (final VCFHeaderLine headerLine : sourceMetaDataLines.getMetaDataInInputOrder()) { + final VCFHeaderLine equivalentLine = sourceMetaDataLines.findEquivalentHeaderLine(headerLine); + Assert.assertTrue(equivalentLine.equals(headerLine)); + } + } + + @Test + public void 
testHasEquivalentHeaderLineNegative() { + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + // add a few test lines + metaDataLines.addMetaDataLine(new VCFHeaderLine("testkey1", "test value")); + metaDataLines.addMetaDataLine(new VCFHeaderLine("testkey1", "other value")); + metaDataLines.addMetaDataLine(new VCFHeaderLine("reference", "assembly37")); + + // for each other headerLine in the starting set, make another header line with the same key but a different + // value, and ensure findEquivalentHeaderLine does NOT return it + for (final VCFHeaderLine headerLine : metaDataLines.getMetaDataInInputOrder()) { + final VCFHeaderLine equivalentLine = metaDataLines.findEquivalentHeaderLine(headerLine); + Assert.assertTrue(equivalentLine.equals(headerLine)); + + final VCFHeaderLine modifiedHeaderLine = new VCFHeaderLine(headerLine.getKey(), headerLine.getValue() + "zzz"); + Assert.assertNull(metaDataLines.findEquivalentHeaderLine(modifiedHeaderLine)); + } + } + + @Test + public void testGetFilterHeaderLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + Assert.assertEquals(md.getFilterHeaderLine(unitTestData.getTestFilterLines().get(0).getID()), unitTestData.getTestFilterLines().get(0)); + } + + @Test + public void testGetInfoHeaderLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + Assert.assertEquals(md.getInfoHeaderLine(unitTestData.getTestInfoLines().get(0).getID()), unitTestData.getTestInfoLines().get(0)); + } + + @Test + public void testGetFormatHeaderLine() { + final VCFHeaderUnitTestData testData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = testData.getTestMetaDataLines(); + Assert.assertEquals(md.getFormatHeaderLine(testData.getTestFormatLines().get(0).getID()), testData.getTestFormatLines().get(0)); + } + + @Test + public void 
testAddRemoveVersionLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + Assert.assertEquals(md.getVCFVersion(), unitTestData.TEST_VERSION); + + final int originalMetaDataLineCount = md.getMetaDataInInputOrder().size(); + + // now, remove the version line, make sure the removed line is actually the version line, that the + // resulting metadataLines version is now null, and the line count drops by 1 + final VCFHeaderLine queryVersionLine = VCFHeader.makeHeaderVersionLine(unitTestData.TEST_VERSION); + final VCFHeaderLine oldVersionLine = md.removeMetaDataLine(queryVersionLine); + Assert.assertEquals(oldVersionLine, queryVersionLine); + Assert.assertNull(md.getVCFVersion()); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), originalMetaDataLineCount - 1); + + // now put it back... + md.addMetaDataLine(oldVersionLine); + Assert.assertEquals(md.getVCFVersion(), unitTestData.TEST_VERSION); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), originalMetaDataLineCount); + } + + @Test + public void testAddContigLineExactDuplicate() { + final VCFMetaDataLines md = new VCFMetaDataLines(); + final Set contigLines = new LinkedHashSet<>(); + + final VCFContigHeaderLine vcfContigLine1 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig1"); + }}, 0); + final VCFContigHeaderLine vcfContigLine2 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig2"); + }}, 1); + + contigLines.add(vcfContigLine1); + contigLines.add(vcfContigLine2); + md.addMetaDataLines(contigLines); + Assert.assertEquals(md.getContigLines(), contigLines); + + // add in the duplicate line + md.addMetaDataLine(vcfContigLine1); + Assert.assertEquals(md.getContigLines(), contigLines); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddContigLineConflicting() { + final VCFMetaDataLines md = new VCFMetaDataLines(); + + final Set contigLines 
= new LinkedHashSet<>(); + contigLines.add(new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig1"); + }}, 0)); + contigLines.add(new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig2"); + }}, 1)); + + md.addMetaDataLines(contigLines); + Assert.assertEquals(md.getContigLines(), contigLines); + + // try to add a contig line with a duplicate index, but with a different name than the existing line with that index + md.addMetaDataLine(new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig3"); + }}, 0)); + } + + @Test + public void testRemoveAndReplaceContigLines() { + final VCFMetaDataLines md = new VCFMetaDataLines(); + final Set contigLines = new LinkedHashSet<>(); + + final VCFContigHeaderLine vcfContigLine1 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig1"); + }}, 1); + final VCFContigHeaderLine vcfContigLine2 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig2"); + }}, 2); + + contigLines.add(vcfContigLine1); + contigLines.add(vcfContigLine2); + md.addMetaDataLines(contigLines); + Assert.assertEquals(md.getContigLines(), contigLines); + + //make sure the initial contig index order is honored; it happens to be the same as the input + // order at this point, but check anyway + final List sortedLines1 = md.getContigLines(); + Assert.assertEquals(sortedLines1.get(0), vcfContigLine1); + Assert.assertEquals(sortedLines1.get(1), vcfContigLine2); + + // now remove the first contig line; only one should remain + final VCFHeaderLine removedContigLine = md.removeMetaDataLine(vcfContigLine1); + Assert.assertEquals(removedContigLine, vcfContigLine1); + final List sortedContigHeaderLines = md.getContigLines(); + Assert.assertEquals(sortedContigHeaderLines.size(), 1); + + // now add the first line back in, so the input order is different than the sorted order, + // and make sure the order is honored + md.addMetaDataLine(vcfContigLine1); + final List sortedLines2 = md.getContigLines(); + 
Assert.assertEquals(sortedLines2.get(0), vcfContigLine1); + Assert.assertEquals(sortedLines2.get(1), vcfContigLine2); + + // now add in ANOTHER contig line at the end that has an index that puts it BEFORE the existing lines + final VCFContigHeaderLine vcfContigLine3 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig3"); + }}, 0); + md.addMetaDataLine(vcfContigLine3); + final List sortedLines3 = md.getContigLines(); + Assert.assertEquals(sortedLines3.size(), 3); + Assert.assertEquals(sortedLines3.get(0), vcfContigLine3); + Assert.assertEquals(sortedLines3.get(1), vcfContigLine1); + Assert.assertEquals(sortedLines3.get(2), vcfContigLine2); + } + +} + diff --git a/src/test/java/htsjdk/variant/vcf/VCFMetaHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFMetaHeaderLineUnitTest.java new file mode 100644 index 0000000000..518f6a6928 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFMetaHeaderLineUnitTest.java @@ -0,0 +1,44 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFMetaHeaderLineUnitTest extends HtsjdkTest { + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + }; + } + + private static final String META_STRING = ""; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + final VCFMetaHeaderLine vcfLine = new VCFMetaHeaderLine(META_STRING, vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + 
Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFMetaHeaderLine(META_STRING, vcfAllowedVersion); + } + + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFPedigreeHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFPedigreeHeaderLineUnitTest.java new file mode 100644 index 0000000000..43179c6862 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFPedigreeHeaderLineUnitTest.java @@ -0,0 +1,50 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFPedigreeHeaderLineUnitTest extends HtsjdkTest { + + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + }; + } + + private static final String PEDIGREE_STRING_4_2 = "PEDIGREE="; + private static final String PEDIGREE_STRING_4_3 = "PEDIGREE="; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + final VCFPedigreeHeaderLine vcfLine = new VCFPedigreeHeaderLine( + vcfAllowedVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3) ? 
+ PEDIGREE_STRING_4_3 : + PEDIGREE_STRING_4_2, + vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFPedigreeHeaderLine(PEDIGREE_STRING_4_2, vcfAllowedVersion); + } + + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFSampleHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFSampleHeaderLineUnitTest.java new file mode 100644 index 0000000000..355827e27b --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFSampleHeaderLineUnitTest.java @@ -0,0 +1,43 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFSampleHeaderLineUnitTest extends HtsjdkTest { + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + }; + } + + private static final String SAMPLE_STRING = "SAMPLE="; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + final VCFSampleHeaderLine vcfLine = new VCFSampleHeaderLine(SAMPLE_STRING, vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public 
void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFSampleHeaderLine(SAMPLE_STRING, vcfAllowedVersion); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFSimpleHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFSimpleHeaderLineUnitTest.java new file mode 100644 index 0000000000..c9f8841d3d --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFSimpleHeaderLineUnitTest.java @@ -0,0 +1,151 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import java.util.LinkedHashMap; + +public class VCFSimpleHeaderLineUnitTest extends HtsjdkTest { + + private VCFSimpleHeaderLine getStructuredHeaderLine() { + return new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put("attr1", "value1"); + put("attr2", "value2"); + }} + ); + } + + @Test + public void testConstructorFromStrings() { + final VCFSimpleHeaderLine hl = new VCFSimpleHeaderLine("testKey", "testId", "test description"); + Assert.assertEquals("testKey", hl.getKey()); + Assert.assertEquals("testId", hl.getID()); + Assert.assertEquals("test description", hl.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + Assert.assertEquals("testKey=", hl.toStringEncoding()); + } + + @Test + public void testConstructorFromEncodedLine() { + final VCFSimpleHeaderLine hLine = new VCFSimpleHeaderLine("key", "", VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(hLine.getKey(), "key"); + Assert.assertEquals(hLine.getID(), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("ID"), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("attr1"), "value1"); + } + + @Test + public void testConstructorFromAttributeMap() { + final VCFSimpleHeaderLine hLine = new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put("attr1", "value1"); + put("attr2", "value2"); + 
}}); + + Assert.assertEquals(hLine.getKey(), "key"); + Assert.assertEquals(hLine.getID(), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("ID"), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("attr1"), "value1"); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectIdMissingFromEncodedLine() { + new VCFSimpleHeaderLine("key", "", VCFHeader.DEFAULT_VCF_VERSION); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectIdMissingFromAttributeMap() { + new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("attr1", "value1"); + put("attr2", "value2"); + }}); + } + + @DataProvider(name = "violateIDRequirements") + public Object[][] getViolateIDRequirements() { + return new Object[][]{ + {""}, + {""}, + {""}, + {""} + }; + } + + @Test(dataProvider="violateIDRequirements",expectedExceptions=TribbleException.class) + public void testViolateIDRequirements(final String headerLine) { + new VCFSimpleHeaderLine("key", headerLine, VCFHeader.DEFAULT_VCF_VERSION); + } + + @Test + public void testGetID() { + Assert.assertEquals(getStructuredHeaderLine().getID(), "id"); + } + + @Test + public void testIsIDLine() { + Assert.assertTrue(getStructuredHeaderLine().isIDHeaderLine()); + } + + @Test + public void testGetGenericFieldValue() { + Assert.assertEquals(getStructuredHeaderLine().getGenericFieldValue("attr1"), "value1"); + } + + @Test + public void testStringEncoding() { + final VCFSimpleHeaderLine structuredHL = getStructuredHeaderLine(); + Assert.assertEquals(structuredHL.toStringEncoding(),"key="); + } + + @Test + public void testUnescapedQuotedStringEncoding() { + final VCFSimpleHeaderLine unescapedHeaderLine = new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE, + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); + put(VCFSimpleHeaderLine.SOURCE_ATTRIBUTE, + "filterName=[ANNOTATION] 
filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); + }} + ); + + final String encodedAttributes = unescapedHeaderLine.toStringEncoding(); + Assert.assertNotNull(encodedAttributes); + + final String expectedEncoding = "key="; + Assert.assertEquals(encodedAttributes, expectedEncoding); + } + + @Test + public void testEscapedQuotedStringEncoding() { + // test Source and Version attributes + final VCFSimpleHeaderLine unescapedHeaderLine = new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE, + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || ANNOTATION <= 2.0]"); + put(VCFSimpleHeaderLine.SOURCE_ATTRIBUTE, + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || ANNOTATION <= 2.0]"); + }} + ); + + final String encodedAttributes = unescapedHeaderLine.toStringEncoding(); + Assert.assertNotNull(encodedAttributes); + + final String expectedEncoding = "key="; + Assert.assertEquals(encodedAttributes, expectedEncoding); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java index c9efaa59ef..45009ce211 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java @@ -31,8 +31,9 @@ import org.testng.annotations.Test; import java.util.ArrayList; -import java.util.Collections; +import java.util.LinkedHashSet; import java.util.List; +import java.util.Set; /** * Created by IntelliJ IDEA. 
@@ -188,7 +189,11 @@ public Object[][] makeRepairHeaderTest() { @Test(dataProvider = "RepairHeaderTest") public void testRepairHeaderTest(final RepairHeaderTest cfg) { - final VCFHeader toRepair = new VCFHeader(Collections.singleton((VCFHeaderLine)cfg.original)); + final Set headerLines = new LinkedHashSet<>(); + headerLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + headerLines.add(cfg.original); + + final VCFHeader toRepair = new VCFHeader(headerLines); final VCFHeader repaired = VCFStandardHeaderLines.repairStandardHeaderLines(toRepair); VCFCompoundHeaderLine repairedLine = (VCFCompoundHeaderLine)repaired.getFormatHeaderLine(cfg.original.getID()); diff --git a/src/test/java/htsjdk/variant/vcf/VCFUtilsTest.java b/src/test/java/htsjdk/variant/vcf/VCFUtilsTest.java index ed943feac1..5629798c61 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFUtilsTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFUtilsTest.java @@ -1,6 +1,7 @@ package htsjdk.variant.vcf; import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -11,45 +12,55 @@ public class VCFUtilsTest extends HtsjdkTest { @DataProvider(name="validHeaderVersionMerger") public Object[][] validHeaderMergerVersions() { - // v4.3 can only merge with v4.3, all other version mergers are allowed + + // header version must be at least v4.2 to merge, result is always highest version return new Object[][] { - {Arrays.asList("VCFv4.0", "VCFv4.0")}, - {Arrays.asList("VCFv4.1", "VCFv4.1")}, - {Arrays.asList("VCFv4.2", "VCFv4.2")}, - {Arrays.asList("VCFv4.3", "VCFv4.3")}, - {Arrays.asList("VCFv4.2", "VCFv4.2")}, - {Arrays.asList("VCFv4.2", "VCFv4.2", "VCFv4.2")}, + // headers to merge, expected result version + {Arrays.asList("VCFv4.2", "VCFv4.2"), VCFHeaderVersion.VCF4_2}, + {Arrays.asList("VCFv4.3", "VCFv4.3"), VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.2", "VCFv4.3"), 
VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.3", "VCFv4.2"), VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.2", "VCFv4.2"), VCFHeaderVersion.VCF4_2 }, + {Arrays.asList("VCFv4.2", "VCFv4.2", "VCFv4.3"), VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.3", "VCFv4.3", "VCFv4.2"), VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.3", "VCFv4.2", "VCFv4.3"), VCFHeaderVersion.VCF4_3}, }; } @DataProvider(name="invalidHeaderVersionMerger") public Object[][] invalidHeaderVersionMerger() { - // v4.3 can only merge with v4.3, all other version mergers are allowed + // header version must be at least v4.2 to merge return new Object[][] { - {Arrays.asList("VCFv4.0", "VCFv4.3")}, - {Arrays.asList("VCFv4.1", "VCFv4.3")}, - {Arrays.asList("VCFv4.2", "VCFv4.3")}, - {Arrays.asList("VCFv4.0", "VCFv4.0", "VCFv4.2", "VCFv4.3")}, - {Arrays.asList("VCFv4.3", "VCFv4.0", "VCFv4.1", "VCFv4.2")}, + {Arrays.asList("VCFv4.0", "VCFv4.2")}, + {Arrays.asList("VCFv4.1", "VCFv4.2")}, + {Arrays.asList("VCFv4.0", "VCFv4.1", "VCFv4.2", "VCFv4.3")}, + {Arrays.asList("VCFv4.3", "VCFv4.2", "VCFv4.1", "VCFv4.0")}, }; } @Test(dataProvider="validHeaderVersionMerger") - public void testValidHeaderVersionMerger(final List headerVersions) { - final List headersToMerge = new ArrayList<>(headerVersions.size()); - headerVersions.forEach(hv -> headersToMerge.add( - new VCFHeader(VCFHeaderVersion.toHeaderVersion(hv), Collections.emptySet(), Collections.emptySet())) - ); - final Set resultHeaders = VCFUtils.smartMergeHeaders(headersToMerge, true); + public void testValidHeaderVersionMerger(final List headerVersions, final VCFHeaderVersion expectedVersion) { + final Set mergedHeaderLines = doHeaderMerge(headerVersions); + + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + metaDataLines.addMetaDataLines(mergedHeaderLines); + final VCFHeaderLine versionLine = metaDataLines.getFileFormatLine(); + Assert.assertEquals(VCFHeaderVersion.toHeaderVersion(versionLine.getValue()), expectedVersion); } - 
@Test(dataProvider="invalidHeaderVersionMerger", expectedExceptions = IllegalArgumentException.class) + @Test(dataProvider="invalidHeaderVersionMerger", expectedExceptions = TribbleException.class) public void testInvalidHeaderVersionMerger(final List headerVersions) { + doHeaderMerge(headerVersions); + } + + private Set doHeaderMerge(final List headerVersions) { final List headersToMerge = new ArrayList<>(headerVersions.size()); headerVersions.forEach(hv -> headersToMerge.add( - new VCFHeader(VCFHeaderVersion.toHeaderVersion(hv), Collections.emptySet(), Collections.emptySet())) + new VCFHeader( + VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.toHeaderVersion(hv)), + Collections.emptySet())) ); - VCFUtils.smartMergeHeaders(headersToMerge, true); + return VCFUtils.smartMergeHeaders(headersToMerge, true); } @DataProvider(name = "caseIntolerantDoubles") diff --git a/src/test/resources/htsjdk/variant/HiSeq.10000.vcf b/src/test/resources/htsjdk/variant/HiSeq.10000.vcf index a304ba24da..75c9f9b537 100644 --- a/src/test/resources/htsjdk/variant/HiSeq.10000.vcf +++ b/src/test/resources/htsjdk/variant/HiSeq.10000.vcf @@ -9,7 +9,6 @@ ##FILTER= ##FILTER= ##FILTER= -##FILTER= ##FILTER= ##FORMAT= ##FORMAT= diff --git a/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf b/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf index 9af0cb3e64..097d0b034f 100644 --- a/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf +++ b/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf @@ -9,7 +9,6 @@ ##FILTER= ##FILTER= ##FILTER= -##FILTER= ##FILTER= ##FILTER= ##FILTER= From 3d08ef8aeeac36efe8d1efa11f144b5b27ad9844 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 15 Nov 2021 08:42:06 -0500 Subject: [PATCH 03/22] Eliminate redundant modeling of VCFHeaderVersion in VCFHeader. 
--- .../java/htsjdk/variant/vcf/VCFHeader.java | 42 ++++++++----------- .../htsjdk/variant/vcf/VCFMetaDataLines.java | 5 ++- .../htsjdk/variant/vcf/VCFHeaderUnitTest.java | 17 ++++---- 3 files changed, 29 insertions(+), 35 deletions(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeader.java b/src/main/java/htsjdk/variant/vcf/VCFHeader.java index 637c04c4fc..1dcb5e07f9 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeader.java @@ -65,9 +65,6 @@ public enum HEADER_FIELDS { CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO } - // the VCF version for this header - private VCFHeaderVersion vcfHeaderVersion; - // header meta data private final VCFMetaDataLines mMetaData = new VCFMetaDataLines(); @@ -163,7 +160,7 @@ public VCFHeader(final Set metaData, final List genotypeS // lines are presented in the set, a warning will be issued, only the last one will be retained, // and the header version will be established using the last version line encountered mMetaData.addMetaDataLines(metaData); - vcfHeaderVersion = initializeHeaderVersion(); + final VCFHeaderVersion vcfHeaderVersion = initializeHeaderVersion(); mMetaData.validateMetaDataLines(vcfHeaderVersion); checkForDeprecatedGenotypeLikelihoodsKey(); @@ -180,7 +177,7 @@ public VCFHeader(final Set metaData, final List genotypeS * @return the VCFHeaderVersion for this header. 
will not be null */ public VCFHeaderVersion getVCFHeaderVersion() { - return vcfHeaderVersion; + return mMetaData.getVCFVersion(); } /** @@ -191,16 +188,12 @@ public VCFHeaderVersion getVCFHeaderVersion() { * @param headerLine header line to attempt to add */ public void addMetaDataLine(final VCFHeaderLine headerLine) { - // propagate the new line to the metadata lines object + // propagate the new line to the metadata lines object, and if the version changed, validate + // the lines against the new version + final VCFHeaderVersion oldHeaderVersion = mMetaData.getVCFVersion(); mMetaData.addMetaDataLine(headerLine); - - // update the current version in case this line triggered a version change final VCFHeaderVersion newHeaderVersion = mMetaData.getVCFVersion(); - if (!newHeaderVersion.equals(vcfHeaderVersion)) { - validateVersionTransition(vcfHeaderVersion, newHeaderVersion); - } - vcfHeaderVersion = newHeaderVersion; - headerLine.validateForVersion(vcfHeaderVersion); + validateVersionTransition(headerLine, oldHeaderVersion, newHeaderVersion); checkForDeprecatedGenotypeLikelihoodsKey(); } @@ -574,7 +567,6 @@ public boolean equals(final Object o) { if (samplesWereAlreadySorted != vcfHeader.samplesWereAlreadySorted) return false; if (writeEngineHeaders != vcfHeader.writeEngineHeaders) return false; if (writeCommandLine != vcfHeader.writeCommandLine) return false; - if (vcfHeaderVersion != vcfHeader.vcfHeaderVersion) return false; if (!mMetaData.equals(vcfHeader.mMetaData)) return false; if (mGenotypeSampleNames != null ? !mGenotypeSampleNames.equals(vcfHeader.mGenotypeSampleNames) : vcfHeader.mGenotypeSampleNames != null) @@ -588,8 +580,7 @@ public boolean equals(final Object o) { @Override public int hashCode() { - int result = vcfHeaderVersion.hashCode(); - result = 31 * result + mMetaData.hashCode(); + int result = mMetaData.hashCode(); result = 31 * result + (mGenotypeSampleNames != null ? 
mGenotypeSampleNames.hashCode() : 0); result = 31 * result + (samplesWereAlreadySorted ? 1 : 0); result = 31 * result + (sampleNamesInOrder != null ? sampleNamesInOrder.hashCode() : 0); @@ -614,26 +605,29 @@ private VCFHeaderVersion initializeHeaderVersion() { } private void validateVersionTransition( - final VCFHeaderVersion previousVersion, + final VCFHeaderLine newHeaderLine, + final VCFHeaderVersion currentVersion, final VCFHeaderVersion newVersion) { - final int compareTo = newVersion.compareTo(previousVersion); + final int compareTo = newVersion.compareTo(currentVersion); + + // We only allow going forward to a newer version, not backwards to an older one, since there + // is really no way to validate old header lines (pre vcfV4.2). If the version moved forward, + // revalidate all the lines, otherwise only validate the new header line. if (compareTo < 0) { - // We only allow going forward to a newer version, not backwards to an older one, since there - // is really no way to validate old header lines (pre vcfV4.2). The only way to create a header with - // an old version is to create it that way from the start. - // to be created with the old version from the start. 
throw new TribbleException(String.format( "When changing a header version, the new header version %s must be > the previous version %s", newVersion, - previousVersion)); + currentVersion)); } else if (compareTo > 0) { logger.debug(() -> String.format("Updating VCFHeader version from %s to %s", - previousVersion.getVersionString(), + currentVersion.getVersionString(), newVersion.getVersionString())); // the version moved forward, so validate ALL of the existing lines in the list to ensure // that the transition is valid mMetaData.validateMetaDataLines(newVersion); + } else { + newHeaderLine.validateForVersion(newVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java index 843fdf98cc..5f68a61113 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java @@ -144,10 +144,11 @@ public VCFHeaderLine findEquivalentHeaderLine(final VCFHeaderLine queryLine) { } /** - * Validate all metadata lines except the file format line against a target version. + * Validate all metadata lines, excluding the file format line, against a target version. * Throws {@link TribbleException.VersionValidationFailure} if any line is incompatible with the given version. 
* @param targetVersion the target version to validate against - * @throws TribbleException if any existing line fails to validate against {@code targetVersion} + * @throws TribbleException.VersionValidationFailure if any existing line fails to validate against + * {@code targetVersion} */ //TODO: we need to tell users how to resolve the case where this fails due to version validation //i.e, use a custom upgrade tool diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java index 8ee9ccab26..b604b91899 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java @@ -515,15 +515,6 @@ public void testVersionUpgradeWithValidationFailure() { vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_3)); } - @Test(expectedExceptions = TribbleException.class) - public void testAddLineWithValidationFailure() { - // create a 4.3 header, and then try to add an old-style pedigree line (one that has no ID) - // which should cause a failure - final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_3)); - vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFConstants.PEDIGREE_HEADER_KEY, "")); - } - - @Test(expectedExceptions = TribbleException.class) public void testConstructorRequiresFileFormatLine() { final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // 4.2 header is compatible with all 4.x versions @@ -579,6 +570,14 @@ public void testAddMetaDataLineInvalidForVersion() { header.addMetaDataLine(new VCFPedigreeHeaderLine(attributes)); } + @Test(expectedExceptions = TribbleException.class) + public void testAddMetaDataLineWithValidationFailure() { + // create a 4.3 header, and then try to add an old-style pedigree line (one that has no ID) + // which should cause a failure + final VCFHeader vcfHeader = new 
VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_3)); + vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFConstants.PEDIGREE_HEADER_KEY, "")); + } + @Test(expectedExceptions = TribbleException.class) public void testAddMetaDataLineFileFormat() { final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // this (4.2) header is compatible with all 4.x versions From ca31a2b017e069ecbe3ed7b02912a0fbd8eeb8ac Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 15 Nov 2021 11:16:20 -0500 Subject: [PATCH 04/22] Eliminate redundant modeling of file format lines in VCFMetaDataLines. --- .../htsjdk/variant/vcf/VCFMetaDataLines.java | 131 ++++++++++-------- .../variant/vcf/VCFHeaderMergerUnitTest.java | 3 +- .../htsjdk/variant/vcf/VCFHeaderUnitTest.java | 20 +++ .../variant/vcf/VCFHeaderUnitTestData.java | 2 +- .../variant/vcf/VCFMetaDataLinesUnitTest.java | 26 +++- 5 files changed, 113 insertions(+), 69 deletions(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java index 5f68a61113..97f208e7b4 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java @@ -13,16 +13,16 @@ * Class for managing the set of VCFHeaderLines maintained by a VCFHeader. * * Since this class is used to incrementally build up a set of header lines for use with a VCFHeader, - * it does not require that the list always contain a fileformat line (its VCFHeader's job to enforce + * it does not require that the list always contain a file format line (its VCFHeader's job to enforce * that condition). * * This class maintains several invariants: * - * - The list keeps track of the "current version" by tracking whether a version line (a line that - * establishes the VCFHeaderVersion, such as format/fileformat line) is contained in the list. 
If - * no version line has been added, the list will have a null current version, and contain 0 version - * lines. If a version line has been added, it will have a non-null version, and contain 1 version line. - * If the version line is manually removed, the "current version" is reset to null. + * - The "current version" of the lines is tracked by recording whether a version line (a line that + * establishes the VCFHeaderVersion, such as format/fileformat line) has been added to the list. If + * no version line has been added, the list will have a null current version; if a version line has + * been added, it will have a non-null version. If the version line is manually removed, the "current + * version" is reset to null. * * - Each contig line that is retained is guaranteed to have a unique contig index. This does * NOT guarantee that the contig indices are contiguous, or ordered, only that they are unique. @@ -34,7 +34,7 @@ * getInfoHeaderLines(), but would still be serialized on write.) * * This class does NOT validate that the lines contained are valid for the current version (that is - * the caller's responsibilty). + * the caller's responsibility). */ //Visible to allow disq Kryo registration for serialization @InternalAPI @@ -53,9 +53,10 @@ final class VCFMetaDataLines implements Serializable { private VCFHeaderVersion vcfVersion; /** - * Add all metadata lines from Set. If a duplicate line is encountered (duplicate content for - * unstructured lines with identical keys, or duplicate key/ID pair for structured lines), only - * the new line will be retained. + * Add all metadata lines from Set. If an equivalent line already exists (any existing file format + * line if the new line is an unstructured file format line; any existing identical line if the new + * line is an unstructured non-file format line; or any existing line with a duplicate key/ID pair + * if the new line is a structured line), only the new line will be retained. 
* * @param newMetaData Set of lines to be added to the list. * @throws IllegalArgumentException if a version is established or if any line fails validation for that version @@ -65,9 +66,10 @@ public void addMetaDataLines(final Set newMetaData) { } /** - * Add a metadata line to the list. If a duplicate line is encountered (duplicate content for - * unstructured lines with identical keys, or duplicate key/ID pair for structured lines), only - * the newest line will be retained. + * Add a metadata line to the list. If an equivalent line already exists (any existing file format + * line if the new line is an unstructured file format line; any existing identical line if the new + * line is an unstructured non-file format line; or any existing line with a duplicate key/ID pair + * if the new line is a structured line), only the new line will be retained. * * @param newMetaDataLine header line to attempt to add * @returns an existing (equivalent) header line that was replaced by newMetaDataLine, if any, @@ -94,25 +96,32 @@ public VCFHeaderLine addMetaDataLine(final VCFHeaderLine newMetaDataLine) { } /** - * Remove a metadata line from the list. This is the inverse of addMetaDataLine - it removes a - * line that has an identical key and value as lineToRemove if lineToRemove is an unstructured (non-ID) - * but if lineToRemove is a structured line, it will remove the line that has the same key/ID pair as - * lineToRemove, regardless of other content. + * Remove an equivalent metadata line from the list. This is the inverse of addMetaDataLine, and removes + * any equivalent line that already exists (any existing file format line if the line to be removed is + * an unstructured file format line; any existing identical line if the line to be removed is an unstructured + * non-file format line, or any existing line with a duplicate key/ID pair if the line to be removed is a + * structured line). 
* * The removed value is returned, and can be used by the caller to determine if the removed line has a * different value than the line presented. * * @param lineToRemove the header line to remove - * @return The actual headerline removed, or null of no equivalent headerline was found to remove + * @return The actual header line removed, or null of no equivalent header line was found to remove */ public VCFHeaderLine removeMetaDataLine(final VCFHeaderLine lineToRemove) { - final VCFHeaderLine removedLine = mMetaData.remove(makeKeyForLine(lineToRemove)); - if (removedLine != null) { - // only synchronize the dependent version and contig map variables if a line was ACTUALLY removed - if (VCFHeaderVersion.isFormatString(removedLine.getKey())) { + VCFHeaderLine removedLine = null; + if (VCFHeaderVersion.isFormatString(lineToRemove.getKey()) && vcfVersion != null) { + final VCFHeaderVersion versionToRemove = VCFHeaderVersion.toHeaderVersion(lineToRemove.getValue()); + if (versionToRemove.equals(vcfVersion)) { + // simulate "removal" of the line by recreating the line that we're dropping as the return value + removedLine = VCFHeader.makeHeaderVersionLine(versionToRemove); vcfVersion = null; - } else if (lineToRemove.isIDHeaderLine() && lineToRemove.getKey().equals(VCFHeader.CONTIG_KEY)) { - removeFromContigIndexMap((VCFContigHeaderLine) lineToRemove); + } + } else { + removedLine = mMetaData.remove(makeKeyForLine(lineToRemove)); + // only synchronize the dependent contig map variables if a line was ACTUALLY removed + if (removedLine != null && lineToRemove.isIDHeaderLine() && lineToRemove.getKey().equals(VCFHeader.CONTIG_KEY)) { + removeFromContigIndexMap((VCFContigHeaderLine) removedLine); } } return removedLine; @@ -128,19 +137,29 @@ public VCFHeaderVersion getVCFVersion() { /** * Return the existing line from the list that is "equivalent" to the query line, where - * equivalent is defined as having the same key and value for unstructured header lines, or the - * same 
key and ID, but not necessarily the same value (for structured header lines). The - * "equivalent" line returned by this method is not guaranteed to be equal to the queryLine, - * in the case where the queryLine is an ID line. + * equivalent is defined as having the same key and value for unstructured header lines, + * or the same key and ID, but not necessarily the same value, for structured header lines. + * The "equivalent" line returned by this method is not guaranteed to be equal to the + * queryLine, in the case where the queryLine is an ID line. * - * The method is a way to ask "if the queryLine were added to this object via addMetaDataLine, what - * line, if any, would it replace". + * The method is a way to ask "if the queryLine were added to this object via addMetaDataLine, + * what line, if any, would it replace". + * + * Note that for file format (VCF version) lines, this returns an existing file format line + * if there is one, even if the key is different than the query line (since that behavior + * mirrors the behavior of addMetaDataLine and removeMetaDataLine). * * @param queryLine the source line to use to check for equivalents * @return The existing header line of the type/key provided, otherwise NULL. */ public VCFHeaderLine findEquivalentHeaderLine(final VCFHeaderLine queryLine) { - return mMetaData.get(makeKeyForLine(queryLine)); + if (VCFHeaderVersion.isFormatString(queryLine.getKey())) { + return vcfVersion == null ? 
+ null : + VCFHeader.makeHeaderVersionLine(vcfVersion); + } else { + return mMetaData.get(makeKeyForLine(queryLine)); + } } /** @@ -183,7 +202,7 @@ public Collection getValidationErrors(final VCFHeaderVersi * @return a set of the meta data */ public Set getMetaDataInInputOrder() { - return Collections.unmodifiableSet(new LinkedHashSet<>(mMetaData.values())); + return makeMetaDataLineSet(mMetaData.values()); } /** @@ -197,7 +216,7 @@ public Set getMetaDataInSortedOrder() { // `contains` implementation based on comparator equality that can lead to inconsistent // results for header line types like VCFContigHeaderLine that have a compareTo // implementation that is inconsistent with equals. - return Collections.unmodifiableSet(new LinkedHashSet<>(new TreeSet<>(mMetaData.values()))); + return makeMetaDataLineSet(new TreeSet<>(mMetaData.values())); } /** @@ -286,7 +305,7 @@ public VCFFilterHeaderLine getFilterHeaderLine(final String id) { * VCFHeaderLine that is not a contig, info, format or filter header line. */ public Collection getOtherHeaderLines() { - return mMetaData.values().stream().filter( + return getMetaDataInInputOrder().stream().filter( hl -> !hl.getKey().equals(VCFConstants.CONTIG_HEADER_KEY) && !hl.getKey().equals(VCFConstants.INFO_HEADER_KEY) && @@ -297,31 +316,11 @@ public Collection getOtherHeaderLines() { } /** - * The version/fileformat header line if one exists, otherwise null. - * @return The version/fileformat header line if one exists, otherwise null. + * A version/fileformat header line representing the version for these lines, otherwise null. + * @return The version file format header line if a version has been established, otherwise null. */ public VCFHeaderLine getFileFormatLine() { - // find any existing version line(s). 
since there are multiple possible keys that - // represent version lines (old V3 specs used "format" instead of "fileformat") - final List existingVersionLines = mMetaData.values() - .stream() - .filter(line -> VCFHeaderVersion.isFormatString(line.getKey())) - .collect(Collectors.toList()); - - // This class doesn't mandate that the list it maintains always contains a fileformat line - // (its VCFHeader's job to maintain that condition for the header). - if (!existingVersionLines.isEmpty()) { - if (existingVersionLines.size() > 1) { - throw new IllegalStateException( - String.format("The metadata lines class contains more than one version line (%s)", - existingVersionLines.stream() - .map(VCFHeaderLine::toString) - .collect(Collectors.joining(",")))); - } - return existingVersionLines.get(0); - } else { - return null; - } + return vcfVersion == null ? null : VCFHeader.makeHeaderVersionLine(vcfVersion); } @Override @@ -469,7 +468,6 @@ private final VCFHeaderLine updateVersion(final VCFHeaderLine newMetaDataLine) { VCFHeaderVersion.isFormatString(newMetaDataLine.getKey()), "a file format line is required"); - final VCFHeaderLine currentVersionLine = getFileFormatLine(); final VCFHeaderVersion newVCFVersion = VCFHeaderVersion.toHeaderVersion(newMetaDataLine.getValue()); if (vcfVersion == null) { @@ -480,12 +478,23 @@ private final VCFHeaderLine updateVersion(final VCFHeaderLine newMetaDataLine) { vcfVersion + " to " + newVCFVersion); - removeFromMapOrThrow(currentVersionLine); } - mMetaData.put(makeKeyForLine(newMetaDataLine), newMetaDataLine); + final VCFHeaderLine oldVersionLine = getFileFormatLine(); vcfVersion = newVCFVersion; - return currentVersionLine; + return oldVersionLine; + } + + // make a new metadata line set to hand out to callers that includes + private Set makeMetaDataLineSet(final Collection orderedLines) { + if (vcfVersion != null) { + final Set orderedSet = new LinkedHashSet<>(orderedLines.size() + 1); + 
orderedSet.add(VCFHeader.makeHeaderVersionLine(vcfVersion)); + orderedSet.addAll(orderedLines); + return Collections.unmodifiableSet(orderedSet); + } else { + return Collections.unmodifiableSet(new LinkedHashSet<>(orderedLines)); + } } // composite keys used by the metadata lines map diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java index 1be8bdf085..818aae84a0 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java @@ -92,8 +92,7 @@ public void testMergeValidVersions(final List headerVersions, final Set mergedHeaderLines = doHeaderMergeForVersions(headerVersions); final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); metaDataLines.addMetaDataLines(mergedHeaderLines); - final VCFHeaderLine versionLine = metaDataLines.getFileFormatLine(); - Assert.assertEquals(VCFHeaderVersion.toHeaderVersion(versionLine.getValue()), expectedVersion); + Assert.assertEquals(metaDataLines.getVCFVersion(), expectedVersion); // now create a new header using the merged VersionLines, and make sure *it* has the expected version final VCFHeader mergedHeader = new VCFHeader(mergedHeaderLines); diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java index b604b91899..9f51901f91 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java @@ -600,6 +600,26 @@ public void testAddMetaDataLineFileFormat() { vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_1)); } + @Test + public void testFileFormatLineFirstInSet() { + final Set orderedLineSet = new LinkedHashSet<>(); + orderedLineSet.addAll(VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString()); + orderedLineSet.stream().forEach(l -> 
Assert.assertFalse(VCFHeaderVersion.isFormatString(l.getKey()))); + // add the file format line last + orderedLineSet.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + final VCFHeader vcfHeader = new VCFHeader(orderedLineSet, Collections.EMPTY_SET); + + final Collection inputOrderLines = vcfHeader.getMetaDataInInputOrder(); + final Optional optFirstInputOrderLine = inputOrderLines.stream().findFirst(); + Assert.assertTrue(optFirstInputOrderLine.isPresent()); + Assert.assertTrue(VCFHeaderVersion.isFormatString(optFirstInputOrderLine.get().getKey())); + + final Collection sortedOrderLines = vcfHeader.getMetaDataInSortedOrder(); + final Optional optFirstSortedOrderLine = sortedOrderLines.stream().findFirst(); + Assert.assertTrue(optFirstSortedOrderLine.isPresent()); + Assert.assertTrue(VCFHeaderVersion.isFormatString(optFirstSortedOrderLine.get().getKey())); + } + @Test public void testPreserveSequenceDictionaryAttributes() { // Round trip a SAMSequenceDictionary with attributes, through a VCFHeader, and back diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java index 7b57a19b5a..286fcecfa6 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java @@ -147,7 +147,7 @@ public static Set getV42HeaderLinesWITHOUTFormatString() { metaDataLines.addMetaDataLines(metaDataSet); final VCFHeaderLine versionLine = metaDataLines.getFileFormatLine(); Assert.assertEquals( - VCFHeaderVersion.toHeaderVersion(versionLine.getValue()), + metaDataLines.getVCFVersion(), VCFHeaderVersion.VCF4_2); // remove the 4.2 version line from the original set, verify, and return the set with no fileformat string diff --git a/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java index 2e41536abe..f79331a7eb 100644 --- 
a/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java @@ -172,11 +172,6 @@ public void testAddRemoveDuplicateStructuredLine() { Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize); } -// @Test -// public void testAddRemoveContigLine() { -// final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); -// } - @Test public void testHasEquivalentHeaderLinePositive() { final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); @@ -350,5 +345,26 @@ public void testRemoveAndReplaceContigLines() { Assert.assertEquals(sortedLines3.get(2), vcfContigLine2); } + @Test + public void testFileFormatLineFirstInSet() { + final Set orderedLineSet = new LinkedHashSet<>(); + orderedLineSet.addAll(VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString()); + orderedLineSet.stream().forEach(l -> Assert.assertFalse(VCFHeaderVersion.isFormatString(l.getKey()))); + // add the file format line last + orderedLineSet.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + metaDataLines.addMetaDataLines(orderedLineSet); + + final Collection inputOrderLines = metaDataLines.getMetaDataInInputOrder(); + final Optional optFirstInputOrderLine = inputOrderLines.stream().findFirst(); + Assert.assertTrue(optFirstInputOrderLine.isPresent()); + Assert.assertTrue(VCFHeaderVersion.isFormatString(optFirstInputOrderLine.get().getKey())); + + final Collection sortedOrderLines = metaDataLines.getMetaDataInInputOrder(); + final Optional optFirstSortedOrderLine = sortedOrderLines.stream().findFirst(); + Assert.assertTrue(optFirstSortedOrderLine.isPresent()); + Assert.assertTrue(VCFHeaderVersion.isFormatString(optFirstSortedOrderLine.get().getKey())); + } + } From d23389121ee87245c38c2737599dc3a4b37b8e0d Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 15 Nov 2021 15:16:54 -0500 Subject: [PATCH 05/22] More 
code review comments. --- src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java | 4 ++-- src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java index 60eb4fc90f..7f0f255883 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java @@ -323,11 +323,11 @@ private int decodeCount(final String countString, final VCFHeaderLineCount reque // This check is here on behalf of INFO lines (which are the only header line type allowed to have Flag // type). A Flag type with a count value other than 0 violates the spec (at least v4.2 and v4.3), but // to retain backward compatibility with previous implementations, we accept (and repair) and the line here. - updateGenericField(NUMBER_ATTRIBUTE, "0"); - lineCount = 0; logger.warn(String.format("FLAG fields must have a count value of 0, but saw count %d for header line %s. 
A value of 0 will be used", lineCount, getID())); + updateGenericField(NUMBER_ATTRIBUTE, "0"); + lineCount = 0; } } else if (lineCount <= 0) { throw new TribbleException.InvalidHeader( diff --git a/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java b/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java index 97e7493a6f..9709af8cc6 100644 --- a/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java +++ b/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java @@ -83,7 +83,7 @@ public Object[][] otherHeaderLines() { { "key=<", new VCFHeaderLine("key", "<") }, // taken from Funcotator test file as ##ID= // technically, this is invalid due to the lack of an "ID" attribute, but it should still parse - // into a VCFHeaderLine (but noa VCFSimpleHeaderLine + // into a VCFHeaderLine (just not a VCFSimpleHeaderLine) { "ID=", new VCFHeaderLine("ID", "") }, }; From e9178005150be55c81baa6438a6852407c8ea85c Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Thu, 15 Apr 2021 13:23:09 -0400 Subject: [PATCH 06/22] Add VCF 4.3 writing --- src/main/java/htsjdk/samtools/Defaults.java | 11 + .../variantcontext/GenotypeBuilder.java | 62 +++++- .../variantcontext/VariantContext.java | 116 +++++++--- .../variantcontext/VariantContextBuilder.java | 23 +- .../variantcontext/writer/BCF2Writer.java | 3 +- .../writer/VCFVersionUpgradePolicy.java | 40 ++++ .../variantcontext/writer/VCFWriter.java | 70 ++---- .../writer/VariantContextWriterBuilder.java | 24 +- .../htsjdk/variant/vcf/AbstractVCFCodec.java | 113 ++++++---- .../variant/vcf/VCFCompoundHeaderLine.java | 22 +- .../java/htsjdk/variant/vcf/VCFEncoder.java | 94 +++++--- .../htsjdk/variant/vcf/VCFFileReader.java | 63 +++++- .../java/htsjdk/variant/vcf/VCFHeader.java | 6 +- .../htsjdk/variant/vcf/VCFHeaderLine.java | 48 ++-- .../htsjdk/variant/vcf/VCFInfoHeaderLine.java | 13 +- .../htsjdk/variant/vcf/VCFMetaDataLines.java | 25 ++- .../vcf/VCFPassThruTextTransformer.java | 10 + 
.../vcf/VCFPercentEncodedTextTransformer.java | 207 +++++++++++++++--- .../variant/vcf/VCFSimpleHeaderLine.java | 62 +++++- .../variant/vcf/VCFTextTransformer.java | 8 + .../variant/vcf/VCFVersionUpgrader.java | 30 +++ .../java/htsjdk/variant/VariantBaseTest.java | 18 +- .../variant/bcf2/BCF2UtilsUnitTest.java | 36 +-- .../variantcontext/GenotypeBuilderTest.java | 69 ++++++ .../VariantContextBuilderTest.java | 5 +- .../VariantContextUnitTest.java | 5 +- .../variant/vcf/VCFCodec43FeaturesTest.java | 89 +++++++- .../vcf/VCFCompoundHeaderLineUnitTest.java | 3 +- .../vcf/VCFContigHeaderLineUnitTest.java | 3 +- .../variant/vcf/VCFHeaderLineUnitTest.java | 5 - .../htsjdk/variant/vcf/VCFHeaderUnitTest.java | 98 ++++++--- .../variant/vcf/VCFHeaderUnitTestData.java | 2 + .../vcf/VCFInfoHeaderLineUnitTest.java | 3 +- .../variant/vcf/VCFMetaDataLinesUnitTest.java | 19 ++ .../vcf/VCFStandardHeaderLinesUnitTest.java | 4 +- .../variant/vcf/VCFTextTransformerTest.java | 91 +++++--- .../variant/diagnosis_targets_testfile.vcf | 2 +- .../vcf43/42AutomaticallyConvertible.vcf | 90 ++++++++ .../htsjdk/variant/vcf43/42Pedigree.vcf | 91 ++++++++ .../variant/vcf43/invalid43ContigName.vcf | 90 ++++++++ .../variant/vcf43/valid43ContigName.vcf | 90 ++++++++ 41 files changed, 1485 insertions(+), 378 deletions(-) create mode 100644 src/main/java/htsjdk/variant/variantcontext/writer/VCFVersionUpgradePolicy.java create mode 100644 src/main/java/htsjdk/variant/vcf/VCFVersionUpgrader.java create mode 100644 src/test/resources/htsjdk/variant/vcf43/42AutomaticallyConvertible.vcf create mode 100644 src/test/resources/htsjdk/variant/vcf43/42Pedigree.vcf create mode 100644 src/test/resources/htsjdk/variant/vcf43/invalid43ContigName.vcf create mode 100644 src/test/resources/htsjdk/variant/vcf43/valid43ContigName.vcf diff --git a/src/main/java/htsjdk/samtools/Defaults.java b/src/main/java/htsjdk/samtools/Defaults.java index b3db211e20..5aa3e9052e 100644 --- a/src/main/java/htsjdk/samtools/Defaults.java 
+++ b/src/main/java/htsjdk/samtools/Defaults.java @@ -1,6 +1,7 @@ package htsjdk.samtools; import htsjdk.samtools.util.Log; +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; import java.io.File; import java.util.Collections; @@ -115,6 +116,11 @@ public class Defaults { */ public static final boolean STRICT_VCF_VERSION_VALIDATION; + /** + * How to treat files from VCF versions older than the current version. Default = UPGRADE_OR_FALLBACK + */ + public static final VCFVersionUpgradePolicy VCF_VERSION_TRANSITION_POLICY; + public static final String SAMJDK_PREFIX = "samjdk."; static { @@ -140,6 +146,10 @@ public class Defaults { SRA_LIBRARIES_DOWNLOAD = getBooleanProperty("sra_libraries_download", false); DISABLE_SNAPPY_COMPRESSOR = getBooleanProperty(DISABLE_SNAPPY_PROPERTY_NAME, false); STRICT_VCF_VERSION_VALIDATION = getBooleanProperty("strict_version_validation", true); + VCF_VERSION_TRANSITION_POLICY = VCFVersionUpgradePolicy.valueOf(getStringProperty( + "vcf_version_transition_policy", + VCFVersionUpgradePolicy.UPGRADE_OR_FALLBACK.name() + )); } /** @@ -163,6 +173,7 @@ public static SortedMap allDefaults(){ result.put("CUSTOM_READER_FACTORY", CUSTOM_READER_FACTORY); result.put("SAM_FLAG_FIELD_FORMAT", SAM_FLAG_FIELD_FORMAT); result.put("DISABLE_SNAPPY_COMPRESSOR", DISABLE_SNAPPY_COMPRESSOR); + result.put("VCF_VERSION_TRANSITION_POLICY", VCF_VERSION_TRANSITION_POLICY); return Collections.unmodifiableSortedMap(result); } diff --git a/src/main/java/htsjdk/variant/variantcontext/GenotypeBuilder.java b/src/main/java/htsjdk/variant/variantcontext/GenotypeBuilder.java index 483e1c617d..fd6bdd1fe6 100644 --- a/src/main/java/htsjdk/variant/variantcontext/GenotypeBuilder.java +++ b/src/main/java/htsjdk/variant/variantcontext/GenotypeBuilder.java @@ -25,15 +25,18 @@ package htsjdk.variant.variantcontext; -import htsjdk.tribble.util.ParsingUtils; +import htsjdk.tribble.TribbleException; import htsjdk.variant.vcf.VCFConstants; import java.util.ArrayList; 
import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; /** * A builder class for genotypes @@ -71,7 +74,7 @@ public final class GenotypeBuilder { private int[] AD = null; private int[] PL = null; private Map extendedAttributes = null; - private String filters = null; + private Set filters; private int initialAttributeMapSize = 5; private final static Map NO_ATTRIBUTES = @@ -199,7 +202,7 @@ public final void reset(final boolean keepSampleName) { */ public Genotype make() { final Map ea = (extendedAttributes == null) ? NO_ATTRIBUTES : extendedAttributes; - return new FastGenotype(sampleName, alleles, isPhased, GQ, DP, AD, PL, filters, ea); + return new FastGenotype(sampleName, alleles, isPhased, GQ, DP, AD, PL, buildFilterString(), ea); } /** @@ -216,7 +219,7 @@ public Genotype makeWithShallowCopy() { final List al = new ArrayList<>(alleles); final int[] copyAD = (AD == null) ? null : Arrays.copyOf(AD, AD.length); final int[] copyPL = (PL == null) ? null : Arrays.copyOf(PL, PL.length); - return new FastGenotype(sampleName, al, isPhased, GQ, DP, copyAD, copyPL, filters, ea); + return new FastGenotype(sampleName, al, isPhased, GQ, DP, copyAD, copyPL, buildFilterString(), ea); } /** @@ -373,12 +376,32 @@ public GenotypeBuilder attribute(final String key, final Object value) { * @return this builder */ public GenotypeBuilder filters(final List filters) { - if ( filters.isEmpty() ) - return filter(null); - else if ( filters.size() == 1 ) - return filter(filters.get(0)); - else - return filter(ParsingUtils.join(";", ParsingUtils.sortList(filters))); + for (final String filter : filters) { + if (!VariantContext.VALID_FILTER.matcher(filter).matches()) { + throw new TribbleException("Filter '" + filter + + "' contains an illegal character. 
It must conform to the regex ;'" + VariantContext.VALID_FILTER); + } else if (filter.equals("0")) { + throw new TribbleException("Filter cannot use reserved string '0'"); + } + } + // Filters must be unique + final Set uniqueFilters = new HashSet<>(filters.size()); + for (final String filter : filters) { + if (uniqueFilters.contains(filter)) { + throw new TribbleException("BUG: Attempting to add duplicate filter " + filter + " at " + this); + } else { + uniqueFilters.add(filter); + } + } + + final boolean hasUnfilteredString = uniqueFilters.contains(VCFConstants.UNFILTERED); + final boolean hasPassesString = uniqueFilters.contains(VCFConstants.PASSES_FILTERS_v4); + if ((hasUnfilteredString || hasPassesString) && uniqueFilters.size() > 1) { + throw new TribbleException("Filters cannot contain missing value '.' or passing value 'PASS' in addition to filters"); + } + + this.filters = hasPassesString ? null : uniqueFilters; + return this; } /** @@ -397,10 +420,27 @@ public GenotypeBuilder filters(final String ... filters) { * @return */ public GenotypeBuilder filter(final String filter) { - this.filters = VCFConstants.PASSES_FILTERS_v4.equals(filter) ? null : filter; + // TODO should this split the string on semicolon, or should it be in the function's contract + // that only one filter and no semicolons can be included in the passed in string + if (filter == null || filter.isEmpty() || VCFConstants.PASSES_FILTERS_v4.equals(filter)) { + this.filters = null; + } else { + // Internal adjacent separators such as a;;b produce an empty string in the split array, which is + // handled by the valid filter regular expression, which rejects empty filter strings + if (filter.startsWith(";") || filter.endsWith(";")) { + throw new TribbleException("Filter string cannot start or end with filter separator ';'"); + } + filters(filter.split(";")); + } return this; } + private String buildFilterString() { + return this.filters == null || this.filters.isEmpty() + ? 
null + : this.filters.stream().sorted().collect(Collectors.joining(";")); + } + /** * This genotype is unfiltered * diff --git a/src/main/java/htsjdk/variant/variantcontext/VariantContext.java b/src/main/java/htsjdk/variant/variantcontext/VariantContext.java index 2d50955bd1..a63d940670 100644 --- a/src/main/java/htsjdk/variant/variantcontext/VariantContext.java +++ b/src/main/java/htsjdk/variant/variantcontext/VariantContext.java @@ -1,6 +1,6 @@ /* * Copyright (c) 2012 The Broad Institute -* +* * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without @@ -9,10 +9,10 @@ * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: -* +* * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. -* +* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND @@ -26,6 +26,7 @@ package htsjdk.variant.variantcontext; import htsjdk.beta.plugin.HtsRecord; +import htsjdk.samtools.util.QualityUtil; import htsjdk.tribble.Feature; import htsjdk.tribble.TribbleException; import htsjdk.tribble.util.ParsingUtils; @@ -47,7 +48,7 @@ import java.util.stream.Collectors; /** - * + * *

High-level overview

* * The VariantContext object is a single general class system for representing genetic variation data composed of: @@ -89,7 +90,7 @@ *

* A [ref] / T at 10 *

- *
 
+ *
  * GenomeLoc snpLoc = GenomeLocParser.createGenomeLoc("chr1", 10, 10);
  *
*

@@ -208,7 +209,7 @@ * * * - *

Fully decoding.

+ *

Fully decoding.

* Currently VariantContexts support some fields, particularly those * stored as generic attributes, to be of any type. For example, a field AB might * be naturally a floating point number, 0.51, but when it's read into a VC its @@ -266,6 +267,8 @@ public class VariantContext implements HtsRecord, Feature, Serializable { /* cached monomorphic value: null -> not yet computed, False, True */ private Boolean monomorphic = null; + private final VCFHeaderVersion version; + /* * Determine which genotype fields are in use in the genotypes in VC * @return an ordered list of genotype fields in use in VC. If vc has genotypes this will always include GT first @@ -321,7 +324,7 @@ public List calcVCFGenotypeKeys(final VCFHeader header) { // // --------------------------------------------------------------------------------------------------------- - //no controls and white-spaces characters, no semicolon. + // No controls and white-spaces characters, no semicolon, filter string cannot be empty public static final Pattern VALID_FILTER = Pattern.compile("^[!-:<-~]+$"); public enum Validation { @@ -396,13 +399,15 @@ private static void validateFilters(final VariantContext variantContext) { return; } - for (String filter : filters) { + for (final String filter : filters) { if ( filter == null) { - throw new IllegalStateException("'null' is not a valid filter string."); + throw new TribbleException("'null' is not a valid filter string."); } if (!VALID_FILTER.matcher(filter).matches()) { - throw new IllegalStateException("Filter '" + filter + + throw new TribbleException("Filter '" + filter + "' contains an illegal character. 
It must conform to the regex ;'" + VALID_FILTER); + } else if (filter.equals("0")) { + throw new TribbleException("Filter cannot use reserved string '0'"); } } } @@ -421,12 +426,14 @@ private static void validateFilters(final VariantContext variantContext) { * * @param other the VariantContext to copy */ - protected VariantContext(VariantContext other) { + protected VariantContext(final VariantContext other) { this(other.getSource(), other.getID(), other.getContig(), other.getStart(), other.getEnd(), other.getAlleles(), other.getGenotypes(), other.getLog10PError(), other.getFiltersMaybeNull(), other.getAttributes(), - other.fullyDecoded, NO_VALIDATION); + other.fullyDecoded, + other.version, + NO_VALIDATION); } /** @@ -454,8 +461,9 @@ protected VariantContext(final String source, final Set filters, final Map attributes, final boolean fullyDecoded, + final VCFHeaderVersion version, final EnumSet validationToPerform ) { - if ( contig == null ) { throw new IllegalArgumentException("Contig cannot be null"); } + if ( contig == null || contig.isEmpty() ) { throw new IllegalArgumentException("Contig cannot be null or the empty string"); } this.contig = contig; this.start = start; this.stop = stop; @@ -488,6 +496,7 @@ protected VariantContext(final String source, } this.fullyDecoded = fullyDecoded; + this.version = version; if ( ! validationToPerform.isEmpty() ) { validate(validationToPerform); @@ -617,7 +626,7 @@ private final Set allelesOfGenotypes(Collection genotypes) { *
  • Mixed
  • *
  • Mix of other classes
  • * - * + * * Also supports NO_VARIATION type, used to indicate that the site isn't polymorphic in the population * * @@ -814,6 +823,10 @@ public String getID() { return ID; } + public VCFHeaderVersion getVersion() { + return version; + } + // --------------------------------------------------------------------------------------------------------- // @@ -1654,7 +1667,7 @@ private final Map fullyDecodeAttributes(final Map fullyDecodeAttributes(final Map values = new ArrayList<>(splits.length); - for ( int i = 0; i < splits.length; i++ ) - values.add(decodeOne(field, splits[i], format)); + for (final String split : splits) + values.add(decodeOne(field, split, format, percentDecode)); return values; } else { - return decodeOne(field, string, format); + return decodeOne(field, string, format, percentDecode); } - } else if ( value instanceof List && (((List) value).get(0)) instanceof String ) { + } else if ( value instanceof List && (((List) value).get(0)) instanceof String ) { final List asList = (List)value; final List values = new ArrayList<>(asList.size()); for ( final String s : asList ) - values.add(decodeOne(field, s, format)); + values.add(decodeOne(field, s, format, percentDecode)); return values; } else { return value; @@ -1703,7 +1726,7 @@ private final Object decodeValue(final String field, final Object value, final V // allowMissingValuesComparedToHeader } - private final Object decodeOne(final String field, final String string, final VCFCompoundHeaderLine format) { + private Object decodeOne(final String field, final String string, final VCFCompoundHeaderLine format, final boolean percentDecode) { try { if ( string.equals(VCFConstants.MISSING_VALUE_v4) ) return null; @@ -1711,12 +1734,12 @@ private final Object decodeOne(final String field, final String string, final VC switch ( format.getType() ) { case Character: return string; case Flag: - final boolean b = Boolean.valueOf(string) || string.equals("1"); - if ( b == false ) + final boolean b = 
Boolean.parseBoolean(string) || string.equals("1"); + if (!b) throw new TribbleException("VariantContext FLAG fields " + field + " cannot contain false values" + " as seen at " + getContig() + ":" + getStart()); - return b; - case String: return string; + return true; + case String: return percentDecode ? VCFPercentEncodedTextTransformer.percentDecode(string) : string; case Integer: return Integer.valueOf(string); case Float: return VCFUtils.parseVcfDouble(string); default: throw new TribbleException("Unexpected type for field" + field); @@ -1727,7 +1750,36 @@ private final Object decodeOne(final String field, final String string, final VC } } - private final void fullyDecodeGenotypes(final VariantContextBuilder builder, final VCFHeader header) { + private static List decodeGPKey(final String value, final VCFHeaderVersion version) { + final String[] splits = value.split(","); + // We need to special-case GP because there is a discrepancy in the scale used to record + // its values between pre-4.3 and 4.3+ VCF. Pre-4.3 GP is phred scale encoded while + // 4.3+ GP is a linear probability, bringing it in line with other standard keys that + // use the P suffix (c.f. VCF 4.3 spec section 7.2). + + // Some tools in the wild apparently already use linear scaled GP, so we have to + // be careful about converting inputs. We check whether GP values are already linear + // scaled by seeing if the values' sum is approximately equal to 1, like we + // would expect if the values were linear scale probabilities. + // c.f. 
https://sourceforge.net/p/vcftools/mailman/vcftools-spec/thread/CEBCD558.FA29%25browning%40u.washington.edu/ + double sum = 0; + + final List rawGPValues = new ArrayList<>(splits.length); + for (final String s : splits) { + final double GP = VCFUtils.parseVcfDouble(s); + rawGPValues.add(GP); + sum += GP; + } + + final boolean wasLinearScale = GeneralUtils.compareDoubles(sum, 1, VCFConstants.VCF_ENCODING_EPSILON) == 0; + if (!wasLinearScale && version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + rawGPValues.replaceAll(GP -> QualityUtil.getErrorProbabilityFromPhredScore((int) Math.round(GP))); + } + return rawGPValues; + + } + + private void fullyDecodeGenotypes(final VariantContextBuilder builder, final VCFHeader header) { final GenotypesContext gc = new GenotypesContext(); for ( final Genotype g : getGenotypes() ) { gc.add(fullyDecodeGenotypes(g, header)); @@ -1862,9 +1914,9 @@ public int[] getGLIndicesOfAlternateAllele(Allele targetAllele) { return GenotypeLikelihoods.getPLIndicesOfAlleles(0, index); } - /** - * Search for the INFO=SVTYPE and return the type of Structural Variant - * @return the StructuralVariantType of null if there is no property SVTYPE + /** + * Search for the INFO=SVTYPE and return the type of Structural Variant + * @return the StructuralVariantType of null if there is no property SVTYPE * */ public StructuralVariantType getStructuralVariantType() { final String svType = this.getAttributeAsString(VCFConstants.SVTYPE, null); diff --git a/src/main/java/htsjdk/variant/variantcontext/VariantContextBuilder.java b/src/main/java/htsjdk/variant/variantcontext/VariantContextBuilder.java index fae8d81514..b52ed0a936 100644 --- a/src/main/java/htsjdk/variant/variantcontext/VariantContextBuilder.java +++ b/src/main/java/htsjdk/variant/variantcontext/VariantContextBuilder.java @@ -25,7 +25,10 @@ package htsjdk.variant.variantcontext; +import htsjdk.variant.vcf.VCFCodec; import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFHeader; 
+import htsjdk.variant.vcf.VCFHeaderVersion; import java.io.Serializable; import java.util.ArrayList; @@ -80,6 +83,7 @@ public class VariantContextBuilder { private Map attributes = null; private boolean attributesCanBeModified = false; private boolean filtersCanBeModified = false; + private VCFHeaderVersion version = VCFHeader.DEFAULT_VCF_VERSION; /** enum of what must be validated */ final private EnumSet toValidate = EnumSet.noneOf(VariantContext.Validation.class); @@ -161,6 +165,10 @@ public Map getAttributes() { return attributes; } + public VCFHeaderVersion getVersion() { + return version; + } + /** * Returns a new builder based on parent -- the new VC will have all fields initialized * to their corresponding values in parent. This is the best way to create a derived VariantContext @@ -181,6 +189,7 @@ public VariantContextBuilder(final VariantContext parent) { this.start = parent.getStart(); this.stop = parent.getEnd(); this.fullyDecoded = parent.isFullyDecoded(); + this.version = parent.getVersion(); this.attributes(parent.getAttributes()); if (parent.filtersWereApplied()) { @@ -205,6 +214,7 @@ public VariantContextBuilder(final VariantContextBuilder parent) { this.start = parent.start; this.stop = parent.stop; this.fullyDecoded = parent.fullyDecoded; + this.version = parent.version; this.attributes(parent.attributes); this.filters(parent.filters); @@ -214,6 +224,17 @@ public VariantContextBuilder copy() { return new VariantContextBuilder(this); } + /** + * Tells this builder to create a VariantContext conforming to this version of VCF + * + * @param version the version of VCF to which the VariantContext produced by this builder conforms + * @return this builder + */ + public VariantContextBuilder version(final VCFHeaderVersion version) { + this.version = version; + return this; + } + /** * Tells this builder to use this collection of alleles for the resulting VariantContext * @@ -646,6 +667,6 @@ public VariantContext make(final boolean leaveModifyableAsIs) 
{ return new VariantContext(source, ID, contig, start, stop, alleles, genotypes, log10PError, filters, attributes, - fullyDecoded, toValidate); + fullyDecoded, version, toValidate); } } diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java index 07b2d0d41e..78990f5f3f 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java @@ -43,6 +43,7 @@ import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFContigHeaderLine; import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderVersion; import htsjdk.variant.vcf.VCFUtils; import java.io.ByteArrayOutputStream; @@ -169,7 +170,7 @@ public void writeHeader(VCFHeader header) { // write out the header into a byte stream, get its length, and write everything to the file final ByteArrayOutputStream capture = new ByteArrayOutputStream(); final OutputStreamWriter writer = new OutputStreamWriter(capture); - this.header = VCFWriter.writeHeader(this.header, writer, VCFWriter.getVersionLine(), "BCF2 stream"); + VCFWriter.writeHeader(this.header, writer, "BCF2 stream"); writer.append('\0'); // the header is null terminated by a byte writer.close(); diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/VCFVersionUpgradePolicy.java b/src/main/java/htsjdk/variant/variantcontext/writer/VCFVersionUpgradePolicy.java new file mode 100644 index 0000000000..1d43b1c486 --- /dev/null +++ b/src/main/java/htsjdk/variant/variantcontext/writer/VCFVersionUpgradePolicy.java @@ -0,0 +1,40 @@ +package htsjdk.variant.variantcontext.writer; + +/** + * The policy {@link VCFWriter} will use to determine the version of VCF to write from a given VCF file. + *

    + * htsjdk's behavior to this point has been to stamp the most recent version of VCF onto all VCF files + * written by VCFWriter regardless of the input VCF's original version. This had been possible as new versions + * of VCF were backwards compatible and version upgrading was infallible. VCF 4.3 is stricter than previous versions, + * meaning that some previously valid files are invalid 4.3 and upgrading from pre-4.3 to 4.3+ can sometimes fail. + *

    + * This class is a temporary workaround to allow opt-in 4.3 writing support in a way that does not break + * workflows that may process pre-4.3 files that are invalid 4.3, but should be removed once proper versioning + * support for VCF is incorporated into htsjdk. + */ +public enum VCFVersionUpgradePolicy { + /** + * Interpret VCF files with exactly the version that they have on read. The VCF is assumed to be valid + * for its version and no version validation will be performed. The written VCF will have the same version + * as the one which was read. + */ + DO_NOT_UPGRADE, + + /** + * Write pre-4.3 files as 4.2, to which automatic upgrading should always be possible, and + * write 4.3+ files as 4.3. + */ + ONLY_INFALLIBLE_UPGRADE, + + /** + * Inspect the headers of pre-4.3 files to determine if they can be automatically upgraded to 4.3, + * and if automatic upgrade is possible write them as 4.3, or else write them as 4.2. + */ + UPGRADE_OR_FALLBACK, + + /** + * Inspect the headers of pre 4.3 files to determine if they can be automatically upgraded to 4.3, + * and abort with an error if automatic upgrade is not possible + */ + UPGRADE_OR_FAIL, +} diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java b/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java index 1b6edae1d8..d9977a66d8 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java @@ -29,9 +29,7 @@ import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.RuntimeIOException; -import htsjdk.tribble.TribbleException; import htsjdk.tribble.index.IndexCreator; -import htsjdk.utils.ValidationUtils; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.vcf.VCFConstants; @@ -39,7 +37,6 @@ import htsjdk.variant.vcf.VCFHeader; import 
htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderVersion; -import htsjdk.variant.vcf.VCFUtils; import java.io.BufferedWriter; import java.io.ByteArrayOutputStream; @@ -57,8 +54,6 @@ class VCFWriter extends IndexingVariantContextWriter { protected final static Log logger = Log.getInstance(VCFWriter.class); - private static final String DEFAULT_VERSION_LINE = VCFHeader.DEFAULT_VCF_VERSION.toHeaderVersionLine(); - // Initialized when the header is written to the output stream private VCFEncoder vcfEncoder = null; @@ -155,12 +150,11 @@ private void writeAndResetBuffer() throws IOException { @Override public void writeHeader(final VCFHeader header) { - - // note we need to update the mHeader object after this call because they header + // note we need to update the mHeader object after this call because the header // may have genotypes trimmed out of it, if doNotWriteGenotypes is true setHeader(header); try { - writeHeader(this.mHeader, writer, getVersionLine(), getStreamName()); + writeHeader(this.mHeader, writer, getStreamName()); writeAndResetBuffer(); outputHasBeenWritten = true; } catch ( IOException e ) { @@ -168,24 +162,26 @@ public void writeHeader(final VCFHeader header) { } } - public static String getVersionLine() { - return DEFAULT_VERSION_LINE; - } - - public static VCFHeader writeHeader(VCFHeader header, + @Deprecated // starting after version 2.24.1 + public static VCFHeader writeHeader(final VCFHeader header, final Writer writer, final String versionLine, final String streamNameForError) { + // Determine requested version from versionLine + final VCFHeaderVersion requestedVersion = VCFHeaderVersion.fromHeaderVersionLine(versionLine); + final VCFHeaderLine requestedVersionLine = VCFHeader.makeHeaderVersionLine(requestedVersion); + // Set version inside header and validate lines + header.addMetaDataLine(requestedVersionLine); + return writeHeader(header, writer, streamNameForError); + } + public static VCFHeader writeHeader(final 
VCFHeader header, + final Writer writer, + final String streamNameForError) { try { - rejectVCFV43Headers(header); - - // Validate that the file version we're writing is version-compatible this header's version. - validateHeaderVersion(header, versionLine); - // The file format field needs to be written first; below any file format lines // embedded in the header will be removed - writer.write(versionLine + "\n"); + writer.write(header.getVCFHeaderVersion().toHeaderVersionLine() + "\n"); for (final VCFHeaderLine line : header.getMetaDataInSortedOrder() ) { // Remove the fileformat header lines @@ -201,8 +197,8 @@ public static VCFHeader writeHeader(VCFHeader header, // write out the column line writer.write(VCFHeader.HEADER_INDICATOR); writer.write(header.getHeaderFields().stream() - .map(f -> f.name()) - .collect(Collectors.joining(VCFConstants.FIELD_SEPARATOR)).toString()); + .map(Enum::name) + .collect(Collectors.joining(VCFConstants.FIELD_SEPARATOR))); if ( header.hasGenotypingData() ) { writer.write(VCFConstants.FIELD_SEPARATOR); @@ -266,42 +262,10 @@ public void add(final VariantContext context) { @Override public void setHeader(final VCFHeader header) { - rejectVCFV43Headers(header); - if (outputHasBeenWritten) { throw new IllegalStateException("The header cannot be modified after the header or variants have been written to the output stream."); } this.mHeader = doNotWriteGenotypes ? 
new VCFHeader(header.getMetaDataInSortedOrder()) : header; this.vcfEncoder = new VCFEncoder(this.mHeader, this.allowMissingFieldsInHeader, this.writeFullFormatField); } - - // writing vcf v4.3 is not implemented - private static void rejectVCFV43Headers(final VCFHeader targetHeader) { - if (targetHeader.getVCFHeaderVersion() != null && targetHeader.getVCFHeaderVersion().isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - throw new IllegalArgumentException(String.format("Writing VCF version %s is not implemented", targetHeader.getVCFHeaderVersion())); - } - } - - // Given a header and a requested target output version, see if the header's version is compatible with the - // requested version (where compatible means its ok to just declare that the header has the requested - // version). - private static void validateHeaderVersion(final VCFHeader header, final String requestedVersionLine) { - ValidationUtils.nonNull(header); - ValidationUtils.nonNull(requestedVersionLine); - - final VCFHeaderVersion vcfCurrentVersion = header.getVCFHeaderVersion(); - final VCFHeaderVersion vcfRequestedVersion = VCFHeaderVersion.fromHeaderVersionLine(requestedVersionLine); - if (!vcfCurrentVersion.equals(vcfRequestedVersion)) { - if (!VCFHeaderVersion.versionsAreCompatible(VCFHeaderVersion.fromHeaderVersionLine(requestedVersionLine), vcfCurrentVersion)) { - final String message = String.format("Attempting to write a %s VCF header to a %s VCFWriter", - vcfRequestedVersion, - vcfCurrentVersion.getVersionString()); - if (VCFUtils.isStrictVCFVersionValidation()) { - throw new TribbleException(message); - } - logger.warn(message); - } - } - } - } diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java b/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java index 67656fbe03..215eaf996b 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java +++ 
b/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java @@ -571,17 +571,19 @@ private static boolean isCompressedVCF(final Path outPath) { private VariantContextWriter createVCFWriter(final Path writerPath, final OutputStream writerStream) { if (idxCreator == null) { return new VCFWriter(writerPath, writerStream, refDict, - options.contains(Options.INDEX_ON_THE_FLY), - options.contains(Options.DO_NOT_WRITE_GENOTYPES), - options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER), - options.contains(Options.WRITE_FULL_FORMAT_FIELD)); - } - else { - return new VCFWriter(writerPath, writerStream, refDict, idxCreator, - options.contains(Options.INDEX_ON_THE_FLY), - options.contains(Options.DO_NOT_WRITE_GENOTYPES), - options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER), - options.contains(Options.WRITE_FULL_FORMAT_FIELD)); + options.contains(Options.INDEX_ON_THE_FLY), + options.contains(Options.DO_NOT_WRITE_GENOTYPES), + options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER), + options.contains(Options.WRITE_FULL_FORMAT_FIELD) + ); + } else { + return new VCFWriter( + writerPath, writerStream, refDict, idxCreator, + options.contains(Options.INDEX_ON_THE_FLY), + options.contains(Options.DO_NOT_WRITE_GENOTYPES), + options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER), + options.contains(Options.WRITE_FULL_FORMAT_FIELD) + ); } } diff --git a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java index 1a1267e5c8..cacff036b5 100644 --- a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java @@ -25,9 +25,11 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.Defaults; import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.QualityUtil; import htsjdk.tribble.AsciiFeatureCodec; import htsjdk.tribble.Feature; import 
htsjdk.tribble.NameAwareCodec; @@ -38,6 +40,7 @@ import htsjdk.utils.ValidationUtils; import htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.*; +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; import java.io.FileNotFoundException; import java.io.IOException; @@ -45,6 +48,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.*; +import java.util.stream.Collectors; import java.util.zip.GZIPInputStream; public abstract class AbstractVCFCodec extends AsciiFeatureCodec implements NameAwareCodec { @@ -58,11 +62,6 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec protected VCFHeader header = null; protected VCFHeaderVersion version = null; - private final static VCFTextTransformer percentEncodingTextTransformer = new VCFPercentEncodedTextTransformer(); - private final static VCFTextTransformer passThruTextTransformer = new VCFPassThruTextTransformer(); - //by default, we use the passThruTextTransformer (assume pre v4.3) - private VCFTextTransformer vcfTextTransformer = passThruTextTransformer; - // a mapping of the allele protected final Map> alleleMap = new HashMap<>(3); @@ -86,7 +85,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec /** * If true, then we'll magically fix up VCF headers on the fly when we read them in */ - protected boolean doOnTheFlyModifications = true; + protected VCFVersionUpgradePolicy policy = Defaults.VCF_VERSION_TRANSITION_POLICY; /** * If non-null, we will replace the sample name read from the VCF header with this sample name. 
This feature works @@ -191,8 +190,6 @@ protected VCFHeaderVersion readFormatVersionLine(final LineIterator headerLineIt * @return a VCFHeader object */ protected VCFHeader parseHeaderFromLines( final List headerStrings, final VCFHeaderVersion sourceVersion ) { - this.version = sourceVersion; - final Set metaData = new LinkedHashSet<>(); Set sampleNames = new LinkedHashSet<>(); int contigCounter = 0; @@ -471,22 +468,49 @@ public VCFHeader setVCFHeader(final VCFHeader newHeader, final VCFHeaderVersion */ public VCFHeader setVCFHeader(final VCFHeader newHeader) { ValidationUtils.nonNull(newHeader); - - if (this.doOnTheFlyModifications) { - // calling this with a header that has any pre-v4.3 version will always result in a header - // with version vcfV4.2, no matter what the header version originally was, since the "repair" - // operation is essentially a transform of the header so that it conforms with header line rules - // as of 4.2 - this.header = VCFStandardHeaderLines.repairStandardHeaderLines(newHeader); - } else { - this.header = newHeader; + final VCFHeaderVersion originalVersion = newHeader.getVCFHeaderVersion(); + + switch(this.policy) { + case DO_NOT_UPGRADE: + this.header = newHeader; + break; + case ONLY_INFALLIBLE_UPGRADE: + // Upgrade pre-4.3 versions to 4.2, and keep 4.3 at 4.3 + // calling this with a header that has any pre-v4.3 version will always result in a header + // with version vcfV4.2, no matter what the header version originally was, since the "repair" + // operation is essentially a transform of the header so that it conforms with header line rules + // as of 4.2 + this.header = VCFStandardHeaderLines.repairStandardHeaderLines(newHeader); + break; + case UPGRADE_OR_FAIL: + case UPGRADE_OR_FALLBACK: + this.header = VCFStandardHeaderLines.repairStandardHeaderLines(newHeader); + final Collection> errors = this.header.getValidationErrors(VCFHeader.DEFAULT_VCF_VERSION); + if (!errors.isEmpty()) { + final String message = String.format( + 
"Version transition from VCF version %s to %s failed with validation error(s):\n%s%s", + originalVersion.getVersionString(), VCFHeader.DEFAULT_VCF_VERSION.getVersionString(), + errors.stream() + .limit(5) + .map(VCFValidationFailure::getSourceMessage) + .collect(Collectors.joining("\n")), + errors.size() > 5 ? "\n+ " + (errors.size() - 5) + " additional error(s)" : "" + ); + if (this.policy == VCFVersionUpgradePolicy.UPGRADE_OR_FAIL) { + throw new TribbleException(message); + } else { + logger.info(message + ", header will be kept at original version: " + originalVersion.getVersionString()); + } + } else { + // Only upgrade if no errors resulting from version upgrading would occur + this.header.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + } + break; + default: + throw new TribbleException("Unrecognized VCF Version Upgrade Policy: " + this.policy); } - this.version = this.header.getVCFHeaderVersion(); - // Obtain a text transformer (technically, this should be based on the ORIGINAL header version, not - // the updated version after repairStandardHeaderLines is called), but it doesn't matter in practice - // since the transformer only differs starting with 4.3. - this.vcfTextTransformer = getTextTransformerForVCFVersion(this.version); + this.version = this.header.getVCFHeaderVersion(); return this.header; } @@ -509,18 +533,6 @@ public VariantContext decode(String line) { return decodeLine(line, true); } - /** - * For v4.3 up, attribute values can contain embedded percent-encoded characters which must be decoded - * on read. Return a version-aware text transformer that can decode encoded text. 
- * @param targetVersion the version for which a transformer is bing requested - * @return a {@link VCFTextTransformer} suitable for the targetVersion - */ - private VCFTextTransformer getTextTransformerForVCFVersion(final VCFHeaderVersion targetVersion) { - return targetVersion != null && targetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3) ? - percentEncodingTextTransformer : - passThruTextTransformer; - } - private VariantContext decodeLine(final String line, final boolean includeGenotypes) { // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; @@ -551,6 +563,7 @@ private VariantContext decodeLine(final String line, final boolean includeGenoty */ private VariantContext parseVCFLine(final String[] parts, final boolean includeGenotypes) { VariantContextBuilder builder = new VariantContextBuilder(); + builder.version(version); builder.source(getName()); // increment the line count @@ -728,16 +741,16 @@ protected Map parseInfo(String infoField) { String valueString = infoFields.get(i).substring(eqI + 1); // split on the INFO field separator - List infoValueSplit = ParsingUtils.split(valueString, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR); + final List infoValueSplit = ParsingUtils.split(valueString, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR); if ( infoValueSplit.size() == 1 ) { - value = vcfTextTransformer.decodeText(infoValueSplit.get(0)); + value = infoValueSplit.get(0); final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key); if ( headerLine != null && headerLine.getType() == VCFHeaderLineType.Flag && value.equals("0") ) { // deal with the case where a flag field has =0, such as DB=0, by skipping the add continue; } } else { - value = vcfTextTransformer.decodeText(infoValueSplit); + value = infoValueSplit; } } else { key = infoFields.get(i); @@ -884,8 +897,12 @@ private static void checkAllele(String allele, boolean 
isRef, int lineNo) { if ( allele == null || allele.isEmpty() ) generateException(generateExceptionTextForBadAlleleBases(""), lineNo); - if ( GeneralUtils.DEBUG_MODE_ENABLED && MAX_ALLELE_SIZE_BEFORE_WARNING != -1 && allele.length() > MAX_ALLELE_SIZE_BEFORE_WARNING ) { - System.err.println(String.format("Allele detected with length %d exceeding max size %d at approximately line %d, likely resulting in degraded VCF processing performance", allele.length(), MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo)); + if ( MAX_ALLELE_SIZE_BEFORE_WARNING != -1 && allele.length() > MAX_ALLELE_SIZE_BEFORE_WARNING ) { + logger.warn(String.format( + "Allele detected with length %d exceeding max size %d at approximately line %d, " + + "likely resulting in degraded VCF processing performance", + allele.length(), MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo + )); } if (Allele.wouldBeSymbolicAllele(allele.getBytes())) { @@ -996,8 +1013,7 @@ public LazyGenotypesContext.LazyData createGenotypeMap(final String str, // cycle through the genotype strings boolean PlIsSet = false; for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) { - List genotypeValues = ParsingUtils.split(genotypeParts[genotypeOffset], VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); - genotypeValues = vcfTextTransformer.decodeText(genotypeValues); + final List genotypeValues = ParsingUtils.split(genotypeParts[genotypeOffset], VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); final String sampleName = sampleNameIterator.next(); final GenotypeBuilder gb = new GenotypeBuilder(sampleName); @@ -1071,8 +1087,8 @@ public LazyGenotypesContext.LazyData createGenotypeMap(final String str, } private static final int[] decodeInts(final String string) { - List split = ParsingUtils.split(string, ','); - int [] values = new int[split.size()]; + final List split = ParsingUtils.split(string, ','); + final int [] values = new int[split.size()]; try { for (int i = 0; i < values.length; i++) { values[i] = Integer.parseInt(split.get(i)); @@ 
-1089,7 +1105,16 @@ private static final int[] decodeInts(final String string) { * raw VCF records */ public final void disableOnTheFlyModifications() { - doOnTheFlyModifications = false; + setVersionUpgradePolicy(VCFVersionUpgradePolicy.DO_NOT_UPGRADE); + } + + /** + * Sets the policy this codec uses to decide whether the version of a VCF header is upgraded + * when the header is set on the codec. + * @param policy the version upgrade policy to apply + */ + public final void setVersionUpgradePolicy(final VCFVersionUpgradePolicy policy) { + this.policy = policy; } /** diff --git a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java index 7f0f255883..028798757a 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java @@ -180,16 +180,16 @@ public Optional> getValidationFailure(final // However, the key values correspond to INFO/FORMAT header lines defining the attribute and its type, // so we do the validation here if (vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - if (!validHeaderID(getID())) { - final VCFValidationFailure validationFailure = new VCFValidationFailure<>( - vcfTargetVersion, - this, - String.format("ID tag \"%s\" does not conform to tag restrictions", getID())); + final Optional> validationFailure = validateKeyOrID(getID()) + .map(e -> new VCFValidationFailure<>(vcfTargetVersion, this, e)); + if (validationFailure.isPresent()) { + // TODO thinking that these getValidationFailure should be a pure function and its caller + // decides whether to pass the error up or just log if not using strict validation if (VCFUtils.isStrictVCFVersionValidation()) { - return Optional.of(validationFailure); + return validationFailure; } else { // warn for older versions - this line can't be used as a v4.3 line - logger.warn(validationFailure.getFailureMessage()); + 
logger.warn(validationFailure.get().getFailureMessage()); } } } @@ -201,9 +201,11 @@ public Optional> getValidationFailure(final * @param id the candidate ID * @return true if ID conforms to header line id requirements, otherwise false */ - //TODO: the existing VCFHeaderLine.validateKeyOrID method should be refactored so it can be used instead of this - protected boolean validHeaderID(final String id) { - return VALID_HEADER_ID_PATTERN.matcher(id).matches(); + @Override + protected Optional validateKeyOrID(final String id) { + return VALID_HEADER_ID_PATTERN.matcher(id).matches() + ? Optional.empty() + : Optional.of(String.format("Key: %s does not match header line key regex: %s", id, VALID_HEADER_ID_PATTERN)); } /** diff --git a/src/main/java/htsjdk/variant/vcf/VCFEncoder.java b/src/main/java/htsjdk/variant/vcf/VCFEncoder.java index 9cffb45837..7be6e32de6 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFEncoder.java +++ b/src/main/java/htsjdk/variant/vcf/VCFEncoder.java @@ -15,7 +15,9 @@ import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; @@ -26,10 +28,7 @@ */ public class VCFEncoder { - /** - * The encoding used for VCF files: ISO-8859-1. When writing VCF4.3 is implemented, this should change to UTF-8. - */ - public static final Charset VCF_CHARSET = StandardCharsets.ISO_8859_1; + public static final Charset VCF_CHARSET = StandardCharsets.UTF_8; private static final String QUAL_FORMAT_STRING = "%.2f"; private static final String QUAL_FORMAT_EXTENSION_TO_TRIM = ".00"; @@ -41,6 +40,8 @@ public class VCFEncoder { private boolean outputTrailingFormatFields = false; + private final VCFTextTransformer vcfTextTransformer; + /** * Prepare a VCFEncoder that will encode records appropriate to the given VCF header, optionally * allowing missing fields in the header. 
@@ -52,6 +53,9 @@ public VCFEncoder(final VCFHeader header, final boolean allowMissingFieldsInHead this.header = header; this.allowMissingFieldsInHeader = allowMissingFieldsInHeader; this.outputTrailingFormatFields = outputTrailingFormatFields; + this.vcfTextTransformer = header.getVCFHeaderVersion().isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3) + ? new VCFPercentEncodedTextTransformer() + : new VCFPassThruTextTransformer(); } /** @@ -148,7 +152,7 @@ public void write(final Appendable vcfOutput, final VariantContext context) thro fieldIsMissingFromHeaderError(context, field.getKey(), "INFO"); } - final String outputValue = formatVCFField(field.getValue()); + final String outputValue = formatVCFField(field.getValue(), context.isFullyDecoded()); if (outputValue != null) { infoFields.put(field.getKey(), outputValue); } @@ -218,34 +222,71 @@ private void fieldIsMissingFromHeaderError(final VariantContext vc, final String } } - @SuppressWarnings("rawtypes") - String formatVCFField(final Object val) { - final String result; + String formatVCFField(final Object val, final boolean fullyDecoded) { if (val == null) { - result = VCFConstants.MISSING_VALUE_v4; + return VCFConstants.MISSING_VALUE_v4; } else if (val instanceof Double) { - result = formatVCFDouble((Double) val); + return formatVCFDouble((Double) val); } else if (val instanceof Boolean) { - result = (Boolean) val ? "" : null; // empty string for true, null for false + return (Boolean) val ? "" : null; // empty string for true, null for false } else if (val instanceof List) { - result = formatVCFField(((List) val).toArray()); + return formatList((List) val, fullyDecoded); } else if (val.getClass().isArray()) { - final int length = Array.getLength(val); - if (length == 0) { - return formatVCFField(null); + return val.getClass().getComponentType().isPrimitive() + ? 
formatPrimitiveArray(val) + : formatList(Arrays.asList((Object[]) val), fullyDecoded); + } else if (val instanceof String) { + final String s = val.toString(); + // If the VariantContext from which this string was obtained was already fully decoded, + // its in-memory representation may contain special characters which must be re-encoded, + // while strings which have not been decoded yet represent the field as read directly + // from the source VCF, so they are written back out without encoding + return fullyDecoded ? vcfTextTransformer.encodeText(s) : s; + } else { + return val.toString(); + } + } + + private static String formatPrimitiveArray(final Object v) { + final int len = Array.getLength(v); + if (len == 0) return VCFConstants.MISSING_VALUE_v4; + int i = 0; + final StringBuilder s = new StringBuilder(); + if (v instanceof int[]) { + final int[] a = (int[]) v; + for (;;) { + s.append(a[i++]); + if (i == len) break; + s.append(','); } - final StringBuilder sb = new StringBuilder( - formatVCFField(Array.get(val, 0))); - for (int i = 1; i < length; i++) { - sb.append(','); - sb.append(formatVCFField(Array.get(val, i))); + } else if (v instanceof double[]) { + final double[] a = (double[]) v; + for (;;) { + s.append(formatVCFDouble(a[i++])); + if (i == len) break; + s.append(','); + } + } else if (v instanceof long[]) { + final long[] a = (long[]) v; + for (;;) { + s.append(a[i++]); + if (i == len) break; + s.append(','); } - result = sb.toString(); - } else { - result = val.toString(); } + return s.toString(); + } - return result; + private String formatList(final List list, final boolean fullyDecoded) { + if (list.isEmpty()) return VCFConstants.MISSING_VALUE_v4; + final StringBuilder s = new StringBuilder(); + final Iterator it = list.iterator(); + for (;;) { + s.append(formatVCFField(it.next(), fullyDecoded)); + if (!it.hasNext()) break; + s.append(','); + } + return s.toString(); } /** @@ -310,7 +351,8 @@ public void addGenotypeData(final VariantContext vc, 
final Map a * @param vcfoutput VCF output * @throws IOException */ - private void appendGenotypeData(final VariantContext vc, final Map alleleMap, final List genotypeFormatKeys, final Appendable vcfoutput) throws IOException {final int ploidy = vc.getMaxPloidy(2); + private void appendGenotypeData(final VariantContext vc, final Map alleleMap, final List genotypeFormatKeys, final Appendable vcfoutput) throws IOException { + final int ploidy = vc.getMaxPloidy(2); for (final String sample : this.header.getGenotypeSamples()) { vcfoutput.append(VCFConstants.FIELD_SEPARATOR); @@ -357,7 +399,7 @@ public void addGenotypeData(final VariantContext vc, final Map a } } else { Object val = g.hasExtendedAttribute(field) ? g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4; - outputValue = formatVCFField(val); + outputValue = formatVCFField(val, vc.isFullyDecoded()); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFFileReader.java b/src/main/java/htsjdk/variant/vcf/VCFFileReader.java index c6ff6158e0..2ab29ddcb4 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFileReader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFileReader.java @@ -24,6 +24,7 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.Defaults; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.FileExtensions; @@ -35,6 +36,7 @@ import htsjdk.tribble.TribbleException; import htsjdk.variant.bcf2.BCF2Codec; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; import java.io.File; import java.io.IOException; @@ -115,6 +117,23 @@ public VCFFileReader(final File file, final File indexFile, final boolean requir return isBCF(path) ? new BCF2Codec() : new VCFCodec(); } + /** + * Returns the correct FeatureCodec for the Path, depending on whether + * the name seems to indicate that it's a BCF. 
+ * + * @param path to vcf/bcf + * @return FeatureCodec for input Path + */ + private static FeatureCodec getCodecForPath(Path path, final VCFVersionUpgradePolicy policy) { + if (isBCF(path)) { + return new BCF2Codec(); + } else { + final VCFCodec codec = new VCFCodec(); + codec.setVersionUpgradePolicy(policy); + return codec; + } + } + /** * Returns the SAMSequenceDictionary from the provided VCF file. */ @@ -142,21 +161,49 @@ public VCFFileReader(final Path path, final Path indexPath) { * Allows construction of a VCFFileReader that will or will not assert the presence of an index as desired. */ public VCFFileReader(final Path path, final boolean requireIndex) { - this.reader = AbstractFeatureReader.getFeatureReader( - path.toUri().toString(), - getCodecForPath(path), - requireIndex); + this(path, requireIndex, Defaults.VCF_VERSION_TRANSITION_POLICY); } /** * Allows construction of a VCFFileReader with a specified index path. */ public VCFFileReader(final Path path, final Path indexPath, final boolean requireIndex) { + this(path, indexPath, requireIndex, Defaults.VCF_VERSION_TRANSITION_POLICY); + } + + /** + * Constructs a VCFFileReader that requires the index to be present. + */ + public VCFFileReader(final Path path, final VCFVersionUpgradePolicy policy) { + this(path, true, policy); + } + + /** + * Constructs a VCFFileReader with a specified index. + */ + public VCFFileReader(final Path path, final Path indexPath, final VCFVersionUpgradePolicy policy) { + this(path, indexPath, true, policy); + } + + /** + * Allows construction of a VCFFileReader that will or will not assert the presence of an index as desired. + */ + public VCFFileReader(final Path path, final boolean requireIndex, final VCFVersionUpgradePolicy policy) { + this.reader = AbstractFeatureReader.getFeatureReader( + path.toUri().toString(), + getCodecForPath(path, policy), + requireIndex); + } + + /** + * Allows construction of a VCFFileReader with a specified index path. 
+ */ + public VCFFileReader(final Path path, final Path indexPath, final boolean requireIndex, final VCFVersionUpgradePolicy policy) { this.reader = AbstractFeatureReader.getFeatureReader( - path.toUri().toString(), - indexPath.toUri().toString(), - getCodecForPath(path), - requireIndex); + path.toUri().toString(), + indexPath.toUri().toString(), + getCodecForPath(path, policy), + requireIndex); } /** diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeader.java b/src/main/java/htsjdk/variant/vcf/VCFHeader.java index 1dcb5e07f9..e1a0cf7a4e 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeader.java @@ -58,7 +58,7 @@ public class VCFHeader implements HtsHeader, Serializable { public static final long serialVersionUID = 1L; protected static final Log logger = Log.getInstance(VCFHeader.class); - public static final VCFHeaderVersion DEFAULT_VCF_VERSION = VCFHeaderVersion.VCF4_2; + public static final VCFHeaderVersion DEFAULT_VCF_VERSION = VCFHeaderVersion.VCF4_3; // the mandatory header fields public enum HEADER_FIELDS { @@ -604,6 +604,10 @@ private VCFHeaderVersion initializeHeaderVersion() { return metaDataVersion; } + public Collection> getValidationErrors(final VCFHeaderVersion targetVersion) { + return mMetaData.getValidationErrors(targetVersion); + } + private void validateVersionTransition( final VCFHeaderLine newHeaderLine, final VCFHeaderVersion currentVersion, diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java index 94a3a0849e..9214f7095f 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java @@ -85,6 +85,16 @@ public String getValue() { */ public String getID() { return null; } + /** + * Validate the state of this header line. Require the key be valid as an "id". 
+ */ + private void validate() { + final Optional validationFailure = validateKeyOrID(mKey); + if (validationFailure.isPresent()) { + throw new TribbleException(validationFailure.get()); + } + } + /** * Validates this header line against {@code vcfTargetVersion}. * Subclasses can override this to provide line type-specific version validation, and the @@ -138,19 +148,15 @@ public void validateForVersion(final VCFHeaderVersion vcfTargetVersion) { /** * Validate a string that is to be used as a unique id or key field. */ - protected static void validateKeyOrID(final String keyString, final String sourceName) { - ValidationUtils.nonNull(sourceName); + protected Optional validateKeyOrID(final String keyString) { if (keyString == null) { - throw new TribbleException( - String.format("VCFHeaderLine: %s cannot be null or empty", sourceName)); - } - if ( keyString.contains("<") || keyString.contains(">") ) { - throw new TribbleException( - String.format("VCFHeaderLine: %s cannot contain angle brackets", sourceName)); - } - if ( keyString.contains("=") ) { - throw new TribbleException( - String.format("VCFHeaderLine: %s cannot contain an equals sign", sourceName)); + return Optional.of("VCFHeaderLine: key cannot be null or empty"); + } else if ( keyString.contains("<") || keyString.contains(">") ) { + return Optional.of(String.format("VCFHeaderLine: key %s contains illegal character: angle brackets", keyString)); + } else if ( keyString.contains("=") ) { + return Optional.of(String.format("VCFHeaderLine: key %s contains illegal character: equals sign", keyString)); + } else { + return Optional.empty(); } } @@ -207,7 +213,8 @@ public int compareTo(Object other) { * @param line the line * @return true if the line is a VCF meta data line, or false if it is not */ - public static boolean isHeaderLine(String line) { + @Deprecated // starting after version 2.24.1 + static boolean isHeaderLine(String line) { return line != null && !line.isEmpty() && 
VCFHeader.HEADER_INDICATOR.equals(line.substring(0,1)); } @@ -230,22 +237,15 @@ public static String toStringEncoding(Map keyValues) { builder.append(entry.getKey()); builder.append('='); builder.append(entry.getValue().toString().contains(",") || - entry.getValue().toString().contains(" ") || - entry.getKey().equals("Description") || - entry.getKey().equals("Source") || // As per VCFv4.2, Source and Version should be surrounded by double quotes - entry.getKey().equals("Version") ? "\""+ escapeQuotes(entry.getValue().toString()) + "\"" : entry.getValue()); + entry.getValue().toString().contains(" ") || + entry.getKey().equals("Description") || + entry.getKey().equals("Source") || // As per VCFv4.2, Source and Version should be surrounded by double quotes + entry.getKey().equals("Version") ? "\""+ escapeQuotes(entry.getValue().toString()) + "\"" : entry.getValue()); } builder.append('>'); return builder.toString(); } - /** - * Validate the state of this header line. Require the key be valid as an "id". - */ - private void validate() { - validateKeyOrID(mKey, "key"); - } - private static String escapeQuotes(final String value) { // java escaping in a string literal makes this harder to read than it should be // without string literal escaping and quoting the regex would be: replaceAll( ([^\])" , $1\" ) diff --git a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java index 12a29a1f6c..4a116e1381 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java @@ -29,6 +29,8 @@ import htsjdk.samtools.util.Log; import htsjdk.utils.ValidationUtils; +import java.util.Optional; + /** *

    * Class VCFInfoHeaderLine @@ -102,14 +104,15 @@ public static VCFInfoHeaderLine getMergedInfoHeaderLine( } @Override - public boolean shouldBeAddedToDictionary() { - return true; + protected Optional validateKeyOrID(final String id) { + return id.equals(VCFConstants.THOUSAND_GENOMES_KEY) + ? Optional.empty() + : super.validateKeyOrID(id); } @Override - //TODO: integrate this with the existing validateKeyOrID method - protected boolean validHeaderID(final String id) { - return super.validHeaderID(id) || id.equals(VCFConstants.THOUSAND_GENOMES_KEY); + public boolean shouldBeAddedToDictionary() { + return true; } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java index 97f208e7b4..3055c93889 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java @@ -83,6 +83,18 @@ public VCFHeaderLine addMetaDataLine(final VCFHeaderLine newMetaDataLine) { // than the new line, since old VCF versions use a different format key than modern versions) return updateVersion(newMetaDataLine); } else { + // Enforce restriction that contig and ALT line IDs cannot share IDs (c.f. 
VCF 4.3 spec section 1.4.7) + // We do not store them in the same namespace so that we can distinguish cases of two lines + // of the same type clashing vs an ALT line clashing with an existing contig line or vice versa + switch (newMetaDataLine.getKey()) { + case VCFConstants.CONTIG_HEADER_KEY: + validateContigAndALTLinesDisjoint(VCFConstants.ALT_HEADER_KEY, newMetaDataLine.getID()); + break; + case VCFConstants.ALT_HEADER_KEY: + validateContigAndALTLinesDisjoint(VCFConstants.CONTIG_HEADER_KEY, newMetaDataLine.getID()); + break; + } + // otherwise, see if there is an equivalent line that the new line will replace final HeaderLineMapKey newMapKey = makeKeyForLine(newMetaDataLine); final VCFHeaderLine equivalentMetaDataLine = mMetaData.get(newMapKey); @@ -95,6 +107,13 @@ public VCFHeaderLine addMetaDataLine(final VCFHeaderLine newMetaDataLine) { } } + private void validateContigAndALTLinesDisjoint(final String namespace, final String id) { + if (mMetaData.containsKey(makeKey(namespace, id))) { + throw new IllegalStateException( + String.format("ALT and contig line IDs must be disjoint, but both were found for ID: %s", id)); + } + } + /** * Remove an equivalent metadata line from the list. This is the inverse of addMetaDataLine, and removes * any equivalent line that already exists (any existing file format line if the line to be removed is @@ -187,12 +206,12 @@ public void validateMetaDataLines(final VCFHeaderVersion targetVersion) { * @return an Collection describing the lines that failed to validate * incompatible with targetVersion. The collections is empty if validation succeeded for all lines. 
*/ - public Collection getValidationErrors(final VCFHeaderVersion targetVersion) { + public Collection> getValidationErrors(final VCFHeaderVersion targetVersion) { return mMetaData.values().stream() .filter(line -> !VCFHeaderVersion.isFormatString(line.getKey())) .map(l -> l.getValidationFailure(targetVersion)) - .filter(o -> o.isPresent()) - .map(o -> o.get()) + .filter(Optional::isPresent) + .map(Optional::get) .collect(Collectors.toList()); } diff --git a/src/main/java/htsjdk/variant/vcf/VCFPassThruTextTransformer.java b/src/main/java/htsjdk/variant/vcf/VCFPassThruTextTransformer.java index 24abed8eb0..55c172391c 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFPassThruTextTransformer.java +++ b/src/main/java/htsjdk/variant/vcf/VCFPassThruTextTransformer.java @@ -27,4 +27,14 @@ public String decodeText(final String rawPart) { public List decodeText(final List rawParts) { return rawParts; } + + /** + * No-op encoder for a single string + * @param rawPart the raw string to be encoded + * @return the raw string with no transformation done + */ + @Override + public String encodeText(final String rawPart) { + return rawPart; + } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFPercentEncodedTextTransformer.java b/src/main/java/htsjdk/variant/vcf/VCFPercentEncodedTextTransformer.java index 4c8015eaa5..b98b36e3f3 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFPercentEncodedTextTransformer.java +++ b/src/main/java/htsjdk/variant/vcf/VCFPercentEncodedTextTransformer.java @@ -1,8 +1,8 @@ package htsjdk.variant.vcf; -import htsjdk.tribble.TribbleException; - +import java.util.Arrays; import java.util.List; +import java.util.function.IntPredicate; import java.util.stream.Collectors; /** @@ -10,65 +10,204 @@ * of characters that have special meaning in VCF. 
*/ public class VCFPercentEncodedTextTransformer implements VCFTextTransformer { - final static private String ENCODING_SENTINEL_STRING = "%"; - final static private char ENCODING_SENTNEL_CHAR = '%'; - final static private int ENCODING_BASE_RADIX = 16; + private static final char ENCODING_SENTINEL_CHAR = '%'; + + private static final byte invalidHexEncoding = ~0; + private static final byte maxPossibleHexDigit = 'f' + 1; + private static final byte[] hexToBytes = new byte[maxPossibleHexDigit]; + private static final char[] bytesToHex = { + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', + }; + + static { + Arrays.fill(hexToBytes, invalidHexEncoding); + for (byte i = '0'; i <= '9'; i++) hexToBytes[i] = (byte) (i - '0'); + for (byte i = 'A'; i <= 'F'; i++) hexToBytes[i] = (byte) (10 + i - 'A'); + for (byte i = 'a'; i <= 'f'; i++) hexToBytes[i] = (byte) (10 + i - 'a'); + } /** - * Transform a single string, replacing % encoded values with their corresponding text. + * Transform a single string, replacing percent encoded values with their corresponding text. * * @param rawPart the raw string to be decoded * @return the decoded string - * @throws TribbleException if the the encoding is uninterpretable */ @Override public String decodeText(final String rawPart) { - return decodePercentEncodedChars(rawPart); + return percentDecode(rawPart); } /** - * Transform a list of strings, replacing % encoded values with their corresponding text in each string. + * Transform a list of strings, replacing percent encoded values with their corresponding text in each string. 
* - * @param rawParts a list of raw strings + * @param rawParts a list of raw strings * @return a list of decoded strings - * @throws TribbleException if the the encoding is uninterpretable */ @Override public List decodeText(final List rawParts) { - return rawParts.stream().map(this::decodeText).collect(Collectors.toList()); + return rawParts.stream().map(VCFPercentEncodedTextTransformer::percentDecode).collect(Collectors.toList()); } /** - * Transform input strings containing embedded percent=encoded characters. For example, when given the + * Transform input strings containing embedded percent encoded characters. For example, when given the * string '%3D%41' will return the string '=A'. + *

    + * This method is permissive in the input it accepts. Capitalized and lower case percent encoding are both + * accepted, although the VCF spec only allows capitalized encoding. Uninterpretable escape sequences + * (the % character followed by fewer than 2 characters before the end of the string, or the % sentinel + * followed by 2 characters either of which does not match the regular expression [0-9A-Fa-f]) are passed through + * uninterpreted. + *

    + If the input text contains no % character followed by at least 2 more characters, a new string is not + allocated, and the original string is returned. * - * @param rawText a string containing zero or more embedded encodings + * @param rawString a string containing zero or more embedded encodings * @return a string with all encoded characters replaced with the corresponding character - * @throws TribbleException if the the encoding is uninterpretable */ - protected static String decodePercentEncodedChars(final String rawText) { - if (rawText.contains(ENCODING_SENTINEL_STRING)) { - StringBuilder builder = new StringBuilder(rawText.length()); - for (int i = 0; i < rawText.length(); i++) { - final char c = rawText.charAt(i); - if (c == ENCODING_SENTNEL_CHAR && ((i + 2) < rawText.length())) { - try { - final char[] trans = Character.toChars(Integer.parseInt(rawText.substring(i + 1, i + 3), ENCODING_BASE_RADIX)); - if (trans.length != 1) { - throw new TribbleException(String.format("escape sequence '%c' corresponds to an invalid encoding in '%s'", c, rawText)); - } - builder.append(trans[0]); - i += 2; - } catch (IllegalArgumentException e) { - builder.append(c); + public static String percentDecode(final String rawString) { + int matches = 0; + final int length = rawString.length(); + // A valid percent encoding requires at least 3 characters (the % character and 2 hex digits) + // so we do not scan for % characters in the last 2 characters of the string + // The spec does not specify how "truncated" encodings (% followed by fewer than 2 hex digits + // before the string ends) should be interpreted, but we treat them as literal characters + // and append them uninterpreted + for (int i = 0, l = length - 2; i < l; i++) { + if (rawString.charAt(i) == ENCODING_SENTINEL_CHAR) matches++; + } + + if (matches == 0) { + return rawString; + } else { + final StringBuilder s = new StringBuilder(length - 2 * matches); + int lastMatchEnd = 0; + int matched = 0; + for (int i = 0; ; 
i++) { + if (rawString.charAt(i) == ENCODING_SENTINEL_CHAR) { + final int hiDecoded = hexDigitToInt(rawString.charAt(++i)); + final int loDecoded = hexDigitToInt(rawString.charAt(++i)); + // Only decode and append the character if both characters after the % were interpretable + // as hex digits + if ((hiDecoded | loDecoded) != invalidHexEncoding) { + // Append on the portion of the original string that came before this matching character + s.append(rawString, lastMatchEnd, i - 2); + s.append((char) ((hiDecoded << 4) | (loDecoded & 0x0F))); + lastMatchEnd = i + 1; + } + matched++; + + // Found all sequences to decode in the string, so append the rest of the original string + if (matched == matches) { + s.append(rawString, lastMatchEnd, length); + return s.toString(); } - } else { - builder.append(c); } } - return builder.toString(); } - return rawText; } + private static int hexDigitToInt(final char c) { + return c < maxPossibleHexDigit ? hexToBytes[c] : invalidHexEncoding; + } + + /** + * Transform a single string, percent encoding values that have special meanings in VCF. + * + * @param rawPart the raw string to be encoded + * @return the encoded string + */ + @Override + public String encodeText(final String rawPart) { + return percentEncode(rawPart); + } + + /** + * Transform a single string, percent encoding values that have special meanings in VCF. + * + * @param rawPart the raw string to be encoded + * @return the encoded string + */ + public static String percentEncode(final String rawPart) { + return percentEncode(rawPart, VCFPercentEncodedTextTransformer::isVCFSpecialChar); + } + + /** + * Transform a single string, percent encoding values that have special meanings in VCF. + *

    + * This method is suitable for encoding a header value in a key=value pair that is of type String (e.g. Description) + * which have fewer restrictions than fields in the body of the VCF such as INFO and FORMAT. + * + * @param rawString String to encode + * @return the encoded string + */ + public static String percentEncodeHeaderText(final String rawString) { + return percentEncode(rawString, VCFPercentEncodedTextTransformer::isHeaderSpecialChar); + } + + private static String percentEncode(final String rawString, final IntPredicate charPredicate) { + int matches = 0; + final int length = rawString.length(); + for (int i = 0; i < length; i++) { + if (charPredicate.test(rawString.charAt(i))) matches++; + } + + if (matches == 0) { + return rawString; + } else { + final StringBuilder s = new StringBuilder(length + 2 * matches); + int lastMatchEnd = 0; + int matched = 0; + for (int i = 0; ; i++) { + final char c = rawString.charAt(i); + if (charPredicate.test(c)) { + // Append on the portion of the original string that came before this matching character + s.append(rawString, lastMatchEnd, i); + s.append(ENCODING_SENTINEL_CHAR); + s.append(bytesToHex[c >>> 4]); + s.append(bytesToHex[c & 0x0F]); + + lastMatchEnd = i + 1; + matched++; + + // Found all matching characters in the string, so append the rest of the original string + if (matched == matches) { + s.append(rawString, lastMatchEnd, length); + return s.toString(); + } + } + } + } + } + + // Characters that have special meaning in the value part of a structured header line key=value pair. + // Note that this is less restrictive than the full set of characters with special meaning in VCF. 
+ // Space and comma are allowed due to the double-quoting introduced in VCF 4.2, and '=' is allowed because + // key=value pairs are comma-delimited, so internal '=' is unambiguously part of the value as long as ',' is quoted + private static boolean isHeaderSpecialChar(final int c) { + switch (c) { + case '\n': + case '\t': + case '\r': + case '%': + return true; + default: + return false; + } + } + + private static boolean isVCFSpecialChar(final int c) { + switch (c) { + case '\n': + case '\t': + case '\r': + case '%': + case ',': + case ':': + case ';': + case '=': + return true; + default: + return false; + } + } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java index c0a3abce5c..2c53899f1d 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java @@ -34,7 +34,7 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.stream.Collectors; +import java.util.Optional; /** * An abstract class representing a VCF metadata line with a key and attribute=value pairs, one of @@ -168,14 +168,27 @@ public int hashCode() { @Override protected String toStringEncoding() { //NOTE: this preserves/round-trips "extra" attributes such as SOURCE, VERSION, etc. 
- final StringBuilder builder = new StringBuilder(); - builder.append(getKey()); - builder.append("=<"); - builder.append(genericFields.entrySet().stream() - .map(e -> e.getKey() + "=" + quoteAttributeValueForSerialization(e.getKey(), e.getValue())) - .collect(Collectors.joining(","))); - builder.append('>'); - return builder.toString(); + final StringBuilder s = new StringBuilder(); + s.append(getKey()); + s.append('='); + s.append('<'); + boolean notFirst = false; + for (final Map.Entry e : genericFields.entrySet()) { + if (notFirst) { + s.append(','); + } else { + notFirst = true; + } + + final String k = e.getKey(); + final String v = e.getValue(); + s.append(k); + s.append('='); + s.append(encodeAttributeValueForSerialization(k, v)); + } + s.append('>'); + + return s.toString(); } // Called by VCFInfoHeaderLine to allow repairing of VCFInfoLines that have a Flag type and a non-zero count @@ -190,10 +203,10 @@ protected void updateGenericField(final String attributeName, final String value /** * Return true if the attribute name requires quotes. * @param attributeName name of the attribute being serialized - * @return boolean indicating whether the value should be embedded n quotes during serialization + * @return boolean indicating whether the value should be embedded in quotes during serialization */ protected boolean getIsQuotableAttribute(final String attributeName) { - // the (VF4.3) spec says that the DESCRIPTION, SOURCE, and VERSION attributes should be quoted + // the (VCF4.3) spec says that the DESCRIPTION, SOURCE, and VERSION attributes should be quoted // for INFO/FORMAT lines, but htsjdk seems to have historically quoted these for all structured // header lines return attributeName.equals(DESCRIPTION_ATTRIBUTE) || @@ -201,12 +214,37 @@ protected boolean getIsQuotableAttribute(final String attributeName) { attributeName.equals(VERSION_ATTRIBUTE); } + /** + * Return true if the attribute name allows percent encoding. 
+ * @param attributeName name of the attribute being serialized + * @return boolean indicating whether the value may be percent encoded during serialization + */ + protected boolean isPercentEncodableAttribute(final String attributeName) { + // As of VCF4.3 attribute values containing characters that have special meanings can be percent encoded. + // ID, NUMBER and TYPE values do not permit values that would require percent encoding, so they are excluded, + // but all other attributes may potentially be percent encoded. + return !(attributeName.equals(VCFSimpleHeaderLine.ID_ATTRIBUTE) || + attributeName.equals(VCFCompoundHeaderLine.NUMBER_ATTRIBUTE) || + attributeName.equals(VCFCompoundHeaderLine.TYPE_ATTRIBUTE)); + } + private void validate() { if ( genericFields.isEmpty() || !genericFields.keySet().stream().findFirst().get().equals(ID_ATTRIBUTE)) { throw new TribbleException( String.format("The required ID tag is missing or not the first attribute: key=%s", super.getKey())); } - validateKeyOrID(getGenericFieldValue(ID_ATTRIBUTE), "ID"); + final Optional validationFailure = validateKeyOrID(getGenericFieldValue(ID_ATTRIBUTE)); + if (validationFailure.isPresent()) { + throw new TribbleException.VersionValidationFailure(validationFailure.get()); + } + } + + // Perform all text transformations required to encode an attribute value + private String encodeAttributeValueForSerialization(final String attribute, final String originalValue) { + final String quotedAttributeValue = quoteAttributeValueForSerialization(attribute, originalValue); + return isPercentEncodableAttribute(attribute) + ? 
VCFPercentEncodedTextTransformer.percentEncodeHeaderText(quotedAttributeValue) + : quotedAttributeValue; } // Add quotes around any attribute value that contains a space or comma, or is supposed to be quoted by diff --git a/src/main/java/htsjdk/variant/vcf/VCFTextTransformer.java b/src/main/java/htsjdk/variant/vcf/VCFTextTransformer.java index 36f842b20a..f928507b01 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFTextTransformer.java +++ b/src/main/java/htsjdk/variant/vcf/VCFTextTransformer.java @@ -28,4 +28,12 @@ public interface VCFTextTransformer { */ List decodeText(final List rawParts); + /** + * Encode a single string. + * + * @param rawPart the raw string to be encoded + * @return the encoded string + * @throws TribbleException if the string cannot be encoded + */ + String encodeText(final String rawPart); } diff --git a/src/main/java/htsjdk/variant/vcf/VCFVersionUpgrader.java b/src/main/java/htsjdk/variant/vcf/VCFVersionUpgrader.java new file mode 100644 index 0000000000..8db6a1883c --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFVersionUpgrader.java @@ -0,0 +1,30 @@ +package htsjdk.variant.vcf; + +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; + +import java.util.Collection; + +final class VCFVersionUpgrader { + public static void getOutputVersion(final VCFHeader header, final VCFVersionUpgradePolicy policy) { + // Guaranteed to not be null + final VCFHeaderVersion currentVersion = header.getVCFHeaderVersion(); + switch (policy) { + case ONLY_INFALLIBLE_UPGRADE: + // 4.3+ lines are output as the latest version, pre-4.3 lines are output as 4.2 + final VCFHeaderVersion newVersion = currentVersion.isAtLeastAsRecentAs(VCFHeader.DEFAULT_VCF_VERSION) + ? 
VCFHeader.DEFAULT_VCF_VERSION + : VCFHeaderVersion.VCF4_2; + header.addMetaDataLine(VCFHeader.makeHeaderVersionLine(newVersion)); + case UPGRADE_OR_FALLBACK: + final Collection> failures = header.getValidationErrors(VCFHeader.DEFAULT_VCF_VERSION); + if (failures.isEmpty()) { + header.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + } + break; + case UPGRADE_OR_FAIL: + // If validation fails, simply pass the exception through + header.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + break; + } + } +} diff --git a/src/test/java/htsjdk/variant/VariantBaseTest.java b/src/test/java/htsjdk/variant/VariantBaseTest.java index dc59309e7b..749ffe69e9 100644 --- a/src/test/java/htsjdk/variant/VariantBaseTest.java +++ b/src/test/java/htsjdk/variant/VariantBaseTest.java @@ -29,11 +29,15 @@ import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.Tuple; +import htsjdk.tribble.AbstractFeatureReader; +import htsjdk.tribble.FeatureReader; +import htsjdk.tribble.TribbleException; import htsjdk.utils.ValidationUtils; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; +import htsjdk.variant.vcf.VCFCodec; import htsjdk.variant.vcf.VCFConstants; -import htsjdk.variant.vcf.VCFFileReader; import htsjdk.variant.vcf.VCFHeader; import org.testng.Assert; @@ -141,8 +145,16 @@ public static SAMSequenceDictionary createArtificialSequenceDictionary() { */ public static Tuple> readEntireVCFIntoMemory(final Path vcfPath) { ValidationUtils.nonNull(vcfPath); - try ( final VCFFileReader vcfReader = new VCFFileReader(vcfPath, false) ){ - return new Tuple<>(vcfReader.getFileHeader(), vcfReader.iterator().toList()); + final VCFCodec codec = new VCFCodec(); + codec.setVersionUpgradePolicy(VCFVersionUpgradePolicy.UPGRADE_OR_FALLBACK); + try (final FeatureReader 
reader = AbstractFeatureReader.getFeatureReader( + vcfPath.toUri().toString(), + codec, + false + )) { + return new Tuple<>((VCFHeader) reader.getHeader(), reader.iterator().toList()); + } catch (final IOException e) { + throw new TribbleException("Could not create an iterator from a feature reader.", e); } } diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java index 95fb359446..e18c0d9309 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java @@ -75,10 +75,10 @@ public void testCreateDictionary() { final List inputLines = new ArrayList(); int counter = 0; inputLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); - inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); - inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); - inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); + inputLines.add(new VCFFilterHeaderLine("l" + counter++)); + inputLines.add(new VCFFilterHeaderLine("l" + counter++)); + inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "l" + counter++), counter)); + inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "l" + counter++), counter)); inputLines.add(new VCFInfoHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); inputLines.add(new VCFInfoHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); inputLines.add(new VCFHeaderLine("x", "misc")); @@ -115,21 +115,22 @@ public Object[][] makeHeaderOrderTestProvider() { int counter = 0; inputLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); - inputLines.add(new 
VCFFilterHeaderLine(String.valueOf(counter++))); - inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); - inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + // We prefix all the line IDs with "l" because as of VCF 4.3, IDs cannot start with a number + inputLines.add(new VCFFilterHeaderLine("l" + counter++)); + inputLines.add(new VCFFilterHeaderLine("l" + counter++)); + inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "l" + counter++), counter)); + inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "l" + counter++), counter)); + inputLines.add(new VCFInfoHeaderLine("l" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFInfoHeaderLine("l" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFFormatHeaderLine("l" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFFormatHeaderLine("l" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); final int inputLineCounter = counter; final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet(inputLines)); - extraLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); - extraLines.add(new 
VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - extraLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - extraLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + extraLines.add(new VCFFilterHeaderLine("l" + counter++)); + extraLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "l" + counter++), counter)); + extraLines.add(new VCFInfoHeaderLine("l" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + extraLines.add(new VCFFormatHeaderLine("l" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); extraLines.add(new VCFHeaderLine("x", "misc")); extraLines.add(new VCFHeaderLine("y", "misc")); @@ -180,7 +181,8 @@ private static boolean expectedConsistent(final VCFHeader combinationHeader, fin final List ids = new ArrayList(); for ( final VCFHeaderLine line : combinationHeader.getMetaDataInInputOrder() ) { if ( line.isIDHeaderLine()) { - ids.add(Integer.valueOf(line.getID())); + // Strip off "l" prefix + ids.add(Integer.valueOf(line.getID().substring(1))); } } diff --git a/src/test/java/htsjdk/variant/variantcontext/GenotypeBuilderTest.java b/src/test/java/htsjdk/variant/variantcontext/GenotypeBuilderTest.java index 5e3f0b9eb8..caed6dbdf8 100644 --- a/src/test/java/htsjdk/variant/variantcontext/GenotypeBuilderTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/GenotypeBuilderTest.java @@ -25,16 +25,85 @@ package htsjdk.variant.variantcontext; +import htsjdk.tribble.TribbleException; import htsjdk.variant.VariantBaseTest; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; public class GenotypeBuilderTest extends VariantBaseTest { + 
@Test(expectedExceptions = TribbleException.class) + public void testRejectDuplicateFilters() { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + gb.filter("x;y;x"); + } + + @Test(expectedExceptions = TribbleException.class) + public void testRejectDuplicateFiltersCollection() { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + gb.filters(Arrays.asList("x", "y", "x")); + } + + @Test(expectedExceptions = TribbleException.class) + public void testRejectDuplicateFiltersArray() { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + gb.filters("x", "y", "x"); + } + + @DataProvider + public Object[][] illegalFilterNameProvider() { + return new Object[][]{ + // Reserved string 0 + {"0"}, + // Contains whitespace + {"a b"}, + // Contains separator + {"a;b"} + }; + } + + @Test(dataProvider = "illegalFilterNameProvider", expectedExceptions = TribbleException.class) + public void testRejectIllegalFilterName(final String filter) { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + gb.filters(Collections.singletonList(filter)); + } + + @DataProvider + public Object[][] illegalFilterSeparatorPlacementProvider() { + return new Object[][]{ + // Begins with ; + {";a"}, + // Ends with ; + {"a;"}, + // Contains adjacent internal ; + {"a;;b"} + }; + } + + @Test(dataProvider = "illegalFilterSeparatorPlacementProvider", expectedExceptions = TribbleException.class) + public void testRejectIllegalFilterSeparatorPlacement(final String filter) { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + gb.filters(filter); + } + + @Test(expectedExceptions = TribbleException.class) + public void testRejectMissingWithValueFilterString() { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + gb.filters("a;."); + } + + @Test + public void testAcceptMissingFilterString() { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + gb.filters("."); + } + @Test public void testMakeWithShallowCopy() { final GenotypeBuilder gb = new 
GenotypeBuilder("test"); diff --git a/src/test/java/htsjdk/variant/variantcontext/VariantContextBuilderTest.java b/src/test/java/htsjdk/variant/variantcontext/VariantContextBuilderTest.java index c8871bd2be..fdd95e1e14 100644 --- a/src/test/java/htsjdk/variant/variantcontext/VariantContextBuilderTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/VariantContextBuilderTest.java @@ -1,5 +1,6 @@ package htsjdk.variant.variantcontext; +import htsjdk.tribble.TribbleException; import htsjdk.variant.VariantBaseTest; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -258,7 +259,7 @@ public static Object[][] illegalFilterStrings() { }; } - @Test(dataProvider = "illegalFilterStrings", expectedExceptions = IllegalStateException.class) + @Test(dataProvider = "illegalFilterStrings", expectedExceptions = TribbleException.class) public void testFilterCannotUseBadFilters(final String filter) { final Set filters = new HashSet<>(); filters.add(filter); @@ -322,7 +323,7 @@ public void testCanResetFilters() { builder.filter("mayIPlease?"); } - @Test(expectedExceptions = IllegalStateException.class) + @Test(expectedExceptions = TribbleException.class) public void testCantCreateNullFilter(){ final VariantContextBuilder builder = new VariantContextBuilder("source", "contig", 1, 1, Arrays.asList(Tref, C, G)).filter("TEST"); builder.filters((String)null); diff --git a/src/test/java/htsjdk/variant/variantcontext/VariantContextUnitTest.java b/src/test/java/htsjdk/variant/variantcontext/VariantContextUnitTest.java index 8613be1e01..085bf6d10e 100644 --- a/src/test/java/htsjdk/variant/variantcontext/VariantContextUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/VariantContextUnitTest.java @@ -41,6 +41,7 @@ import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFFileReader; +import htsjdk.variant.vcf.VCFHeader; import org.testng.Assert; import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeSuite; @@ -1200,7 
+1201,7 @@ private VariantContext createTestVariantContext(final List alleles, fina // most of the fields are not important to the tests, we just need alleles and gc set properly return new VariantContext("genotypes", VCFConstants.EMPTY_ID_FIELD, snpLoc, snpLocStart, snpLocStop, alleles, gc, VariantContext.NO_LOG10_PERROR, filters, attributes, - fullyDecoded, toValidate); + fullyDecoded, VCFHeader.DEFAULT_VCF_VERSION, toValidate); } // validateReferenceBases: PASS conditions @@ -1296,7 +1297,7 @@ private VariantContext createTestVariantContextRsIds(final String rsId) { return new VariantContext("genotypes", rsId, snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T), GenotypesContext.NO_GENOTYPES, VariantContext.NO_LOG10_PERROR, filters, attributes, - fullyDecoded, toValidate); + fullyDecoded, VCFHeader.DEFAULT_VCF_VERSION, toValidate); } private Set makeRsIDsSet(final String... rsIds) { return new HashSet<>(Arrays.asList(rsIds)); diff --git a/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java b/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java index 8dbf6dd30d..d11af08105 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java @@ -10,6 +10,10 @@ import htsjdk.variant.VariantBaseTest; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.Options; +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -39,6 +43,11 @@ public class VCFCodec43FeaturesTest extends VariantBaseTest { private static final Path TEST_43_UTF8_FILE = TEST_PATH.resolve("all43Features.utf8.vcf"); private static final Path TEST_43_UTF8_GZ_FILE 
= TEST_PATH.resolve("all43FeaturesCompressed.utf8.vcf.gz"); + private static final Path TEST_42_PEDIGREE_FILE = TEST_PATH.resolve("42Pedigree.vcf"); + private static final Path TEST_INVALID_43_CONTIG_NAME_FILE = TEST_PATH.resolve("invalid43ContigName.vcf"); + private static final Path TEST_VALID_43_CONTIG_NAME_FILE = TEST_PATH.resolve("valid43ContigName.vcf"); + private static final Path TEST_42_AUTOMATICALLY_CONVERTIBLE_FILE = TEST_PATH.resolve("42AutomaticallyConvertible.vcf"); + @DataProvider(name="all43Files") private Object[][] allVCF43Files() { return new Object[][] { @@ -59,7 +68,7 @@ private Object[][] allVCF43Files() { } @Test(dataProvider="all43Files") - public void testReadAllVCF43Features(final Path testFile, int expectedHeaderLineCount) { + public void testReadAllVCF43Features(final Path testFile, final int expectedHeaderLineCount) { final Tuple> entireVCF = readEntireVCFIntoMemory(testFile); Assert.assertEquals(entireVCF.a.getMetaDataInInputOrder().size(), expectedHeaderLineCount); @@ -142,7 +151,7 @@ public void testVCF43PercentEncoding(final Path testFile, int ignored) { // 1 327 . 
T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE // AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=fil%3AteredInBoth - final VariantContext vc = entireVCF.b.get(0); + final VariantContext vc = entireVCF.b.get(0).fullyDecode(entireVCF.a, false); Assert.assertEquals(vc.getContig(), "1"); Assert.assertEquals(vc.getStart(), 327); // set=fil%3AteredInBoth @@ -166,6 +175,81 @@ public void testSymbolicAlternateAllele(final Path testFile, int ignored) { Assert.assertEquals(symbolicAlternateAllele, Allele.create(Allele.UNSPECIFIED_ALTERNATE_ALLELE_STRING)); } + @Test(dataProvider = "all43Files") + public void testReadWriteRoundTrip(final Path testFile, final int ignored) throws IOException { + // Make sure 4.3 files round trip through reading into memory, writing, then reading back in + final Tuple> readVCF = readEntireVCFIntoMemory(testFile); + final VCFHeader readHeader = readVCF.a; + + final File out = File.createTempFile("testReadWriteRoundTrip", testFile.getFileName().toString()); + out.deleteOnExit(); + + final VariantContextWriter writer = new VariantContextWriterBuilder() + .setOutputFile(out) + .unsetOption(Options.INDEX_ON_THE_FLY) + .unsetOption(Options.DO_NOT_WRITE_GENOTYPES) + .build(); + + writer.writeHeader(readHeader); + for (final VariantContext vc : readVCF.b) { + writer.add(vc.fullyDecode(readHeader, false)); + } + + writer.close(); + + final Tuple> writeVCF = readEntireVCFIntoMemory(out.toPath()); + final VCFHeader writeHeader = writeVCF.a; + + Assert.assertNotNull(readHeader.getVCFHeaderVersion()); + Assert.assertNotNull(writeHeader.getVCFHeaderVersion()); + + Assert.assertEquals(readHeader.getMetaDataInSortedOrder(), writeHeader.getMetaDataInSortedOrder()); + Assert.assertEquals(readHeader.getInfoHeaderLines(), writeHeader.getInfoHeaderLines()); + Assert.assertEquals(readHeader.getFormatHeaderLines(), writeHeader.getFormatHeaderLines()); + + Assert.assertEqualsNoOrder(readHeader.getFilterLines().toArray(), 
writeHeader.getFilterLines().toArray()); + Assert.assertEqualsNoOrder(readHeader.getContigLines().toArray(), writeHeader.getContigLines().toArray()); + + for (int i = 0; i < writeVCF.b.size(); i++) { + VariantBaseTest.assertVariantContextsAreEqual( + writeVCF.b.get(i).fullyDecode(writeHeader, false), + readVCF.b.get(i).fullyDecode(readHeader, false) + ); + } + } + + @DataProvider(name = "automaticUpConversionTestFiles") + private Object[][] automaticUpConversionTestFiles() { + return new Object[][]{ + {TEST_42_PEDIGREE_FILE, VCFHeaderVersion.VCF4_2}, + {TEST_INVALID_43_CONTIG_NAME_FILE, VCFHeaderVersion.VCF4_2}, + {TEST_VALID_43_CONTIG_NAME_FILE, VCFHeaderVersion.VCF4_3}, + {TEST_42_AUTOMATICALLY_CONVERTIBLE_FILE, VCFHeaderVersion.VCF4_3} + }; + } + + @Test(dataProvider = "automaticUpConversionTestFiles") + public void testAutomaticUpConversion(final Path testFile, final VCFHeaderVersion expectedVersion) throws IOException { + // Pre 4.3 files which can be automatically converted to 4.3 should be + // and files which cannot should be left as 4.2 + final Tuple> readVCF = readEntireVCFIntoMemory(testFile); + + final File out = File.createTempFile("test", testFile.getFileName().toString()); + out.deleteOnExit(); + + final VariantContextWriter writer = new VariantContextWriterBuilder() + .setOutputFile(out) + .unsetOption(Options.INDEX_ON_THE_FLY) + .unsetOption(Options.DO_NOT_WRITE_GENOTYPES) + .build(); + + writer.writeHeader(readVCF.a); + writer.close(); + + final Tuple> writeVCF = readEntireVCFIntoMemory(out.toPath()); + Assert.assertEquals(writeVCF.a.getVCFHeaderVersion(), expectedVersion); + } + @DataProvider(name="all43IndexableFiles") private Object[][] allVCF43IndexableFiles() { return new Object[][] { @@ -273,5 +357,4 @@ private static List getIDHeaderLinesWithKey(final VCFHeader heade .collect(Collectors.toList()); return headerLines; } - } diff --git a/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java 
b/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java index 96924b4e3a..2ba980fbb9 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java @@ -91,8 +91,7 @@ public Object[][] getInvalidLines() { @Test(dataProvider = "invalidIDs", expectedExceptions = TribbleException.VersionValidationFailure.class) public void testGetValidationError(final String lineString) { - // TODO change to VCFHeader.DEFAULT_VCF_VERSION - new VCFInfoHeaderLine(lineString, VCFHeaderVersion.VCF4_3); + new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); } @DataProvider (name = "headerLineTypes") diff --git a/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java index ad33575bef..8c4ef944f5 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java @@ -49,8 +49,7 @@ public Object[][] getInvalidIDs() { @Test(dataProvider = "invalidIDs", expectedExceptions = TribbleException.VersionValidationFailure.class) public void testInvalidIDs(final String lineString) { - // TODO change to VCFHeader.DEFAULT_VCF_VERSION - new VCFContigHeaderLine(lineString, VCFHeaderVersion.VCF4_3, 1); + new VCFContigHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION, 1); } @Test(expectedExceptions=TribbleException.class) diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java index d5d7e47ec9..2cd81e7ef9 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java @@ -100,11 +100,6 @@ public void testInvalidKeys(final String testKey) { new VCFHeaderLine(testKey, ""); } - @Test(dataProvider = "invalidHeaderLineKeys", expectedExceptions=TribbleException.class) - public void 
testValidateAsIdInvalid(final String testKey) { - VCFHeaderLine.validateKeyOrID(testKey, "test"); - } - @DataProvider(name = "vcfVersions") public Object[][] vcfVersions() { return new Object[][]{ diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java index 9f51901f91..188375ba58 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java @@ -27,7 +27,6 @@ import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; -import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.FileExtensions; import htsjdk.samtools.util.TestUtil; import htsjdk.tribble.TribbleException; @@ -38,16 +37,29 @@ import htsjdk.variant.VariantBaseTest; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.Options; +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.*; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.EnumSet; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; import java.util.stream.Collectors; public class VCFHeaderUnitTest extends VariantBaseTest { @@ -77,7 +89,7 @@ public void test42FileRoundtrip() throws Exception { final File actualFile = File.createTempFile("testVcf4.2roundtrip.", FileExtensions.VCF); 
actualFile.deleteOnExit(); - try (final VCFFileReader originalFileReader = new VCFFileReader(expectedFile, false); + try (final VCFFileReader originalFileReader = new VCFFileReader(expectedFile.toPath(), false, VCFVersionUpgradePolicy.DO_NOT_UPGRADE); final VariantContextWriter copyWriter = new VariantContextWriterBuilder() .setOutputFile(actualFile) .setReferenceDictionary(createArtificialSequenceDictionary()) @@ -289,9 +301,11 @@ public void testGetContigLinesHonorsSortOrder() { Assert.assertTrue(originalContigsInSortedOrder.size() > 0); // copy the contig lines to a new list - final List confoundedList = new ArrayList<>(); final int midPoint = originalContigsInSortedOrder.size() / 2; - confoundedList.addAll(originalContigsInSortedOrder.subList(0, midPoint)); + final List confoundedList = new ArrayList<>(originalContigsInSortedOrder.subList( + 0, + midPoint + )); // deliberately stick an extra contig line in the middle of the list, but using a contig index // that will cause the line to sort to the end @@ -312,7 +326,7 @@ public void testGetContigLinesHonorsSortOrder() { // create a new header from the confounded list, call getContigLines() on the header, and validate // that the new line is included in the resulting list, and is at the end final VCFHeader newHeader = new VCFHeader(); - confoundedList.forEach(hl -> newHeader.addMetaDataLine(hl)); + confoundedList.forEach(newHeader::addMetaDataLine); final List roundTrippedLines = newHeader.getContigLines(); Assert.assertEquals(roundTrippedLines.size(), originalContigsInSortedOrder.size() + 1); Assert.assertEquals(roundTrippedLines.get(roundTrippedLines.size() - 1), newContigLine); @@ -453,6 +467,8 @@ public Object[][] validHeaderVersionTransitions() { @DataProvider(name="invalidHeaderVersionTransitions") public Object[][] invalidHeaderVersionTransitions() { + // v4.3 can never be transitioned down to pre v4.3 + // Pre v4.3 might be able to be transitioned to 4.3, and this is tested in VCFCodec43FeaturesTest 
return new Object[][] { //reject any attempt to go backwards in time {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2}, @@ -602,12 +618,11 @@ public void testAddMetaDataLineFileFormat() { @Test public void testFileFormatLineFirstInSet() { - final Set orderedLineSet = new LinkedHashSet<>(); - orderedLineSet.addAll(VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString()); - orderedLineSet.stream().forEach(l -> Assert.assertFalse(VCFHeaderVersion.isFormatString(l.getKey()))); + final Set orderedLineSet = new LinkedHashSet<>(VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString()); + orderedLineSet.forEach(l -> Assert.assertFalse(VCFHeaderVersion.isFormatString(l.getKey()))); // add the file format line last orderedLineSet.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); - final VCFHeader vcfHeader = new VCFHeader(orderedLineSet, Collections.EMPTY_SET); + final VCFHeader vcfHeader = new VCFHeader(orderedLineSet, Collections.emptySet()); final Collection inputOrderLines = vcfHeader.getMetaDataInInputOrder(); final Optional optFirstInputOrderLine = inputOrderLines.stream().findFirst(); @@ -708,9 +723,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { .setOptions(EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Options.INDEX_ON_THE_FLY)) .build(); firstCopyWriter.writeHeader(originalHeader); - final CloseableIterator firstCopyVariantIterator = originalFileReader.iterator(); - while (firstCopyVariantIterator.hasNext()) { - final VariantContext variantContext = firstCopyVariantIterator.next(); + for (final VariantContext variantContext : originalFileReader) { firstCopyWriter.add(variantContext); } originalFileReader.close(); @@ -751,9 +764,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { .setOptions(EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Options.INDEX_ON_THE_FLY)) .build(); secondCopyWriter.writeHeader(firstCopyHeader); - final CloseableIterator secondCopyVariantIterator = firstCopyReader.iterator(); 
-        while (secondCopyVariantIterator.hasNext()) {
-            final VariantContext variantContext = secondCopyVariantIterator.next();
+        for (final VariantContext variantContext : firstCopyReader) {
             secondCopyWriter.add(variantContext);
         }
         secondCopyWriter.close();
@@ -802,23 +813,61 @@ public void testVCFHeaderQuoteEscaping() throws Exception {
 
     /////////////////////////////////////////////////////////////////////
     // Serialize/encode the header to a file, read metaData back in
-    private Set getRoundTripEncoded(final VCFHeader header) throws IOException {
+    private static Set getRoundTripEncoded(final VCFHeader header) throws IOException {
         final File myTempFile = File.createTempFile("VCFHeader", "vcf");
-        try (final VariantContextWriter vcfWriter =
-                     new VariantContextWriterBuilder()
-                             .setOutputFile(myTempFile)
-                             .setOutputFileType(VariantContextWriterBuilder.OutputType.VCF)
-                             .setOptions(VariantContextWriterBuilder.NO_OPTIONS)
-                             .build()) {
+        try (final VariantContextWriter vcfWriter = new VariantContextWriterBuilder()
+            .setOutputFile(myTempFile)
+            .setOutputFileType(VariantContextWriterBuilder.OutputType.VCF)
+            .setOptions(VariantContextWriterBuilder.NO_OPTIONS)
+            .build()
+        ) {
             vcfWriter.writeHeader(header);
         }
 
-        final VCFHeader vcfHeader = (VCFHeader) new VCFCodec().readActualHeader(new LineIteratorImpl(
+        final VCFCodec codec = new VCFCodec();
+        codec.setVersionUpgradePolicy(VCFVersionUpgradePolicy.DO_NOT_UPGRADE);
+        final VCFHeader vcfHeader = (VCFHeader) codec.readActualHeader(new LineIteratorImpl(
                 new SynchronousLineReader(new FileReader(myTempFile.getAbsolutePath()))));
         return vcfHeader.getMetaDataInSortedOrder();
     }
 
+    @Test
+    public void testVcf42Roundtrip() throws Exception {
+        // this test ensures that source/version fields are round-tripped properly
+
+        // read an existing VCF
+        final File expectedFile = new File("src/test/resources/htsjdk/variant/Vcf4.2WithSourceVersionInfoFields.vcf");
+
+        // write the file out into a new copy
+        final File actualFile = File.createTempFile("testVcf4.2roundtrip.", FileExtensions.VCF);
+        actualFile.deleteOnExit();
+
+        try (final VCFFileReader originalFileReader = new VCFFileReader(expectedFile, false);
+             final VariantContextWriter copyWriter = new VariantContextWriterBuilder()
+                 .setOutputFile(actualFile)
+                 .setReferenceDictionary(createArtificialSequenceDictionary())
+                 .setOptions(EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Options.INDEX_ON_THE_FLY))
+                 .build()
+        ) {
+            final VCFHeader originalHeader = originalFileReader.getFileHeader();
+
+            copyWriter.writeHeader(originalHeader);
+            for (final VariantContext variantContext : originalFileReader) {
+                copyWriter.add(variantContext);
+            }
+        }
+
+        final String actualContents = new String(Files.readAllBytes(actualFile.toPath()), StandardCharsets.UTF_8);
+        final String expectedContents = new String(Files.readAllBytes(expectedFile.toPath()), StandardCharsets.UTF_8);
+        // Compare everything after the first line, locating that line break in each file independently
+        Assert.assertEquals(
+            actualContents.substring(actualContents.indexOf('\n')),
+            expectedContents.substring(expectedContents.indexOf('\n')));
+    }
+
+    private static final int VCF4headerStringCount = 16; // 17 -1 for the #CHROM...
line + - private VCFHeader getHiSeqVCFHeader() { + private static VCFHeader getHiSeqVCFHeader() { final File vcf = new File("src/test/resources/htsjdk/variant/HiSeq.10000.vcf"); final VCFFileReader reader = new VCFFileReader(vcf, false); final VCFHeader header = reader.getFileHeader(); diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java index 286fcecfa6..6c197f1c30 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java @@ -3,6 +3,7 @@ import htsjdk.tribble.TribbleException; import htsjdk.tribble.readers.LineIteratorImpl; import htsjdk.tribble.readers.SynchronousLineReader; +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; import org.testng.Assert; import java.io.StringReader; @@ -169,6 +170,7 @@ public static Set getV42HeaderLinesWITHFormatString() { public static VCFHeader createHeaderFromString(final String headerStr) { final VCFCodec codec = new VCFCodec(); + codec.setVersionUpgradePolicy(VCFVersionUpgradePolicy.DO_NOT_UPGRADE); final VCFHeader header = (VCFHeader) codec.readActualHeader( new LineIteratorImpl(new SynchronousLineReader(new StringReader(headerStr)))); Assert.assertEquals(header.getMetaDataInInputOrder().size(), VCF_4_HEADER_STRING_COUNT); diff --git a/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java index 9e2a82f15a..0ea2c8f1e8 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java @@ -69,8 +69,7 @@ public void testAllow1000GKey() { VCFHeader.DEFAULT_VCF_VERSION ); - // TODO change to VCFHeader.DEFAULT_VCF_VERSION - Assert.assertFalse(line.getValidationFailure(VCFHeaderVersion.VCF4_3).isPresent()); + Assert.assertFalse(line.getValidationFailure(VCFHeader.DEFAULT_VCF_VERSION).isPresent()); } 
@Test(dataProvider = "mergeIncompatibleInfoLines", expectedExceptions= TribbleException.class) diff --git a/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java index f79331a7eb..f88f0fd0ba 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java @@ -70,6 +70,25 @@ public void testKeyCollisions(final VCFHeaderLine line1, final VCFHeaderLine lin Assert.assertEquals(mdLines.getMetaDataInInputOrder().size(), expectCollision ? 1 : 2); } + @DataProvider(name = "contigALTCollisions") + public Object[][] contigALTCollisions() { + return new Object[][] { + { + new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 0), new VCFAltHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION) + }, + { + new VCFAltHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 0) + }, + }; + } + + @Test(dataProvider = "contigALTCollisions", expectedExceptions = IllegalStateException.class) + public void testContigALTCollision(final VCFHeaderLine line1, final VCFHeaderLine line2) { + final VCFMetaDataLines mdLines = new VCFMetaDataLines(); + mdLines.addMetaDataLine(line1); + mdLines.addMetaDataLine(line2); + } + @Test public void testRetainFullHeaderLines() { final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); diff --git a/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java index 45009ce211..38a8c983f7 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java @@ -190,7 +190,9 @@ public Object[][] makeRepairHeaderTest() { @Test(dataProvider = "RepairHeaderTest") public void testRepairHeaderTest(final RepairHeaderTest cfg) { final Set headerLines = new LinkedHashSet<>(); - 
headerLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + // The standard header line repair facility is not sufficiently powerful to fix broken lines + // starting from version 4.3, so it is only used for versions <= 4.2, and we use version 4.2 for this test + headerLines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString())); headerLines.add(cfg.original); final VCFHeader toRepair = new VCFHeader(headerLines); diff --git a/src/test/java/htsjdk/variant/vcf/VCFTextTransformerTest.java b/src/test/java/htsjdk/variant/vcf/VCFTextTransformerTest.java index 8bb9927de0..0f7d9f5963 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFTextTransformerTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFTextTransformerTest.java @@ -1,56 +1,75 @@ package htsjdk.variant.vcf; import htsjdk.HtsjdkTest; -import htsjdk.tribble.TribbleException; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.Arrays; +import java.util.stream.Stream; + public class VCFTextTransformerTest extends HtsjdkTest { - @DataProvider(name="validPercentEncodings") + @DataProvider(name = "validPercentEncodings") public Object[][] validPercentEncodings() { - return new Object[][] { - { "", ""}, - { "%3A", ":"}, - { "%3B", ";"}, - { "%3D", "="}, - { "%25", "%"}, - { "%2C", ","}, - { "%0D", "\r"}, - { "%0A", "\n"}, - { "%09", "\t"}, - { "%3AA", ":A"}, - { "abc%3A", "abc:"}, - { "%3Aabc", ":abc"}, - { "%3Aabc%3A", ":abc:"}, - - // valid text containing % encodings that are not valid, and are passed through in raw form (no decoding) - { "%3", "%3"}, - { "%d", "%d"}, - { "%a", "%a"}, - { "abcdefg%", "abcdefg%"}, - { "%3Aabcdefg%", ":abcdefg%"}, - { "abcdefg%0", "abcdefg%0"}, - { "abcdefg%1", "abcdefg%1"}, - { "abcdefg%a", "abcdefg%a"}, - { "abcdefg%d", "abcdefg%d"}, - { "abcdefg%g", "abcdefg%g"}, - { "abcdefg%gg", "abcdefg%gg"}, - { "abcdefg%-1", "abcdefg%-1"}, + 
return new Object[][]{ + {"", ""}, + {"%3A", ":"}, + {"%3B", ";"}, + {"%3D", "="}, + {"%25", "%"}, + {"%2C", ","}, + {"%0D", "\r"}, + {"%0A", "\n"}, + {"%09", "\t"}, + {"%3AA", ":A"}, + {"abc%3A", "abc:"}, + {"%3Aabc", ":abc"}, + {"%3Aabc%3A", ":abc:"}, }; } - @Test(dataProvider="validPercentEncodings") - public void testDecodeValidEncodings(final String rawText, final String decodedText) { + @DataProvider(name = "truncatedPercentEncodings") + public Object[][] truncatedPercentEncodings() { + return new Object[][]{ + // valid text containing % encodings that are not valid, and are passed through in raw form (no decoding) + {"%3", "%3"}, + {"%d", "%d"}, + {"%a", "%a"}, + {"abcdefg%", "abcdefg%"}, + {"%3Aabcdefg%", ":abcdefg%"}, + {"abcdefg%0", "abcdefg%0"}, + {"abcdefg%1", "abcdefg%1"}, + {"abcdefg%a", "abcdefg%a"}, + {"abcdefg%d", "abcdefg%d"}, + {"abcdefg%g", "abcdefg%g"}, + {"abcdefg%gg", "abcdefg%gg"}, + {"abcdefg%-1", "abcdefg%-1"}, + }; + } + + @DataProvider(name = "allPercentEncodings") + public Object[][] allPercentEncodings() { + return Stream.concat(Arrays.stream(validPercentEncodings()), Arrays.stream(truncatedPercentEncodings())) + .toArray(Object[][]::new); + } + + @Test(dataProvider = "allPercentEncodings") + public void testDecodeValidEncodings(final String encodedText, final String decodedText) { final VCFTextTransformer vcfTextTransformer = new VCFPercentEncodedTextTransformer(); - Assert.assertEquals(vcfTextTransformer.decodeText(rawText), decodedText); + Assert.assertEquals(vcfTextTransformer.decodeText(encodedText), decodedText); } - @Test(dataProvider = "validPercentEncodings") - public void testPassThruValidEncodings(final String rawText, final String unused) { + @Test(dataProvider = "allPercentEncodings") + public void testPassThruValidEncodings(final String encodedText, final String unused) { final VCFPassThruTextTransformer vcfPassThruTransformer = new VCFPassThruTextTransformer(); - 
Assert.assertEquals(vcfPassThruTransformer.decodeText(rawText), rawText); + Assert.assertEquals(vcfPassThruTransformer.decodeText(encodedText), encodedText); } + @Test(dataProvider = "validPercentEncodings") + public void testInverseComposition(final String encodedText, final String decodedText) { + final VCFTextTransformer vcfTextTransformer = new VCFPercentEncodedTextTransformer(); + Assert.assertEquals(vcfTextTransformer.encodeText(vcfTextTransformer.decodeText(encodedText)), encodedText); + Assert.assertEquals(vcfTextTransformer.decodeText(vcfTextTransformer.encodeText(decodedText)), decodedText); + } } diff --git a/src/test/resources/htsjdk/variant/diagnosis_targets_testfile.vcf b/src/test/resources/htsjdk/variant/diagnosis_targets_testfile.vcf index fbe8d1e405..9f96ce09ed 100644 --- a/src/test/resources/htsjdk/variant/diagnosis_targets_testfile.vcf +++ b/src/test/resources/htsjdk/variant/diagnosis_targets_testfile.vcf @@ -14,7 +14,7 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##INFO= +##INFO= ##INFO= ##INFO= ##contig= diff --git a/src/test/resources/htsjdk/variant/vcf43/42AutomaticallyConvertible.vcf b/src/test/resources/htsjdk/variant/vcf43/42AutomaticallyConvertible.vcf new file mode 100644 index 0000000000..1d248d2ae9 --- /dev/null +++ b/src/test/resources/htsjdk/variant/vcf43/42AutomaticallyConvertible.vcf @@ -0,0 +1,90 @@ +##fileformat=VCFv4.2 +##COMMENT=This file has 0 embedded UTF8 characters, but we need this fake comment line to keep the file aligned with it's utf8 companion test file. 
+##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##SAMPLE= +##SAMPLE= +##SAMPLE= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 NA19239 NA19240 +1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=fil%3AteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +2 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +3 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +5 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +6 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +7 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +8 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +9 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +10 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +11 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +12 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +13 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +14 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +15 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +16 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +17 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +18 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +19 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +20 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +21 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +22 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +X 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +Y 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 diff --git a/src/test/resources/htsjdk/variant/vcf43/42Pedigree.vcf b/src/test/resources/htsjdk/variant/vcf43/42Pedigree.vcf new file mode 100644 index 0000000000..c9689a922c --- /dev/null +++ b/src/test/resources/htsjdk/variant/vcf43/42Pedigree.vcf @@ -0,0 +1,91 @@ +##fileformat=VCFv4.2 +##COMMENT=This file has 0 embedded UTF8 characters, but we need this fake comment line to keep the file aligned with it's utf8 companion test file. +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##PEDIGREE= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##SAMPLE= +##SAMPLE= +##SAMPLE= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 NA19239 NA19240 +1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=fil%3AteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +2 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +3 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +5 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +6 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +7 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +8 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +9 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +10 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +11 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +12 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +13 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +14 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +15 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +16 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +17 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +18 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +19 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +20 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +21 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +22 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +X 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +Y 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 diff --git a/src/test/resources/htsjdk/variant/vcf43/invalid43ContigName.vcf b/src/test/resources/htsjdk/variant/vcf43/invalid43ContigName.vcf new file mode 100644 index 0000000000..a9aac29ed0 --- /dev/null +++ b/src/test/resources/htsjdk/variant/vcf43/invalid43ContigName.vcf @@ -0,0 +1,90 @@ +##fileformat=VCFv4.2 +##COMMENT=This file has 0 embedded UTF8 characters, but we need this fake comment line to keep the file aligned with it's utf8 companion test file. +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##SAMPLE= +##SAMPLE= +##SAMPLE= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 NA19239 NA19240 +1 327 . 
T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=fil%3AteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +2 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +3 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +5 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +6 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +7 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +8 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +9 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +10 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +11 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +12 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +13 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +14 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +15 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +16 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +17 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +18 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +19 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +20 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +21 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +22 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +X 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +Y 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 diff --git a/src/test/resources/htsjdk/variant/vcf43/valid43ContigName.vcf b/src/test/resources/htsjdk/variant/vcf43/valid43ContigName.vcf new file mode 100644 index 0000000000..e2c2945beb --- /dev/null +++ b/src/test/resources/htsjdk/variant/vcf43/valid43ContigName.vcf @@ -0,0 +1,90 @@ +##fileformat=VCFv4.2 +##COMMENT=This file has 0 embedded UTF8 characters, but we need this fake comment line to keep the file aligned with it's utf8 companion test file. 
+##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##SAMPLE= +##SAMPLE= +##SAMPLE= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 NA19239 NA19240 +1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=fil%3AteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +2 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +3 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +5 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +6 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +7 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +8 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +9 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +10 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +11 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +12 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +13 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +14 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +15 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +16 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +17 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +18 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +19 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +20 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +21 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +22 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +X 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +Y 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 From 6a2c193955c9710cfacb678af8735573d800168f Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Thu, 15 Apr 2021 15:56:19 -0400 Subject: [PATCH 07/22] BCF 2.2 writing WIP --- .travis.yml | 2 + scripts/install-bcftools.sh | 5 + .../htsjdk/samtools/util/FileExtensions.java | 3 + .../util/ListByteBufferOutputStream.java | 138 +++++ .../tribble/TribbleIndexedFeatureReader.java | 10 +- .../htsjdk/tribble/util/ParsingUtils.java | 4 +- .../java/htsjdk/variant/bcf2/BCF2Codec.java | 314 ++++++----- .../java/htsjdk/variant/bcf2/BCF2Decoder.java | 288 +++++++--- .../htsjdk/variant/bcf2/BCF2Dictionary.java | 283 ++++++++++ .../java/htsjdk/variant/bcf2/BCF2Encoder.java | 386 +++++++++++++ .../BCF2FieldWriter/BCF2FieldEncoder.java | 314 +++++++++++ .../bcf2/BCF2FieldWriter/BCF2FieldWriter.java | 515 ++++++++++++++++++ .../BCF2FieldWriterManager.java | 106 ++++ .../bcf2/BCF2GenotypeFieldDecoders.java | 137 ++--- .../bcf2/BCF2LazyGenotypesDecoder.java | 3 +- .../java/htsjdk/variant/bcf2/BCF2Type.java | 168 ++++-- .../java/htsjdk/variant/bcf2/BCF2Utils.java | 216 ++------ .../java/htsjdk/variant/bcf2/BCFVersion.java | 8 + .../variantcontext/VariantContext.java | 25 +- .../variantcontext/writer/BCF2Encoder.java | 261 --------- .../writer/BCF2FieldEncoder.java | 455 ---------------- .../writer/BCF2FieldWriter.java | 324 ----------- .../writer/BCF2FieldWriterManager.java | 180 ------ .../variantcontext/writer/BCF2Writer.java | 301 +++++----- .../writer/VariantContextWriterBuilder.java | 5 +- .../htsjdk/variant/vcf/AbstractVCFCodec.java | 7 +- .../htsjdk/variant/vcf/VCFFileReader.java | 11 +- .../variant/vcf/VCFFilterHeaderLine.java | 5 - .../variant/vcf/VCFFormatHeaderLine.java | 6 +- .../htsjdk/variant/vcf/VCFHeaderLine.java | 12 +- .../variant/vcf/VCFHeaderLineTranslator.java | 11 +- 
.../htsjdk/variant/vcf/VCFInfoHeaderLine.java | 5 - .../variant/vcf/VCFSimpleHeaderLine.java | 4 - .../variant/vcf/VCFStandardHeaderLines.java | 32 +- .../java/htsjdk/samtools/SamStreamsTest.java | 3 +- .../java/htsjdk/utils/BCFToolsTestUtils.java | 136 +++++ .../htsjdk/utils/BCFToolsTestUtilsTest.java | 35 ++ .../variant/bcf2/BCF2DictionaryTest.java | 101 ++++ .../bcf2/BCF2EncoderDecoderUnitTest.java | 446 +++++++-------- .../BCF2FieldWriter/BCF2FieldEncoderTest.java | 464 ++++++++++++++++ .../variant/bcf2/BCF2UtilsUnitTest.java | 116 +--- .../variant/bcf2/BCF2WriterUnitTest.java | 309 +++++++---- .../htsjdk/variant/bcf2/BCFCodecTest.java | 15 +- .../VariantContextTestProvider.java | 35 +- .../writer/VCFWriterUnitTest.java | 64 ++- .../variant/vcf/AbstractVCFCodecTest.java | 2 +- .../vcf/VCFCompoundHeaderLineUnitTest.java | 43 ++ .../htsjdk/variant/vcf/VCFFileReaderTest.java | 5 +- .../vcf/VCFHeaderLineTranslatorUnitTest.java | 2 - .../htsjdk/variant/vcf/VCFIteratorTest.java | 24 +- src/test/resources/htsjdk/variant/bcfV22.bcf | Bin 0 -> 613 bytes .../resources/htsjdk/variant/bcfV22.bcf.gz | Bin 0 -> 613 bytes .../htsjdk/variant/structuralvariants.vcf | 2 +- src/test/resources/htsjdk/variant/test1.vcf | 2 +- 54 files changed, 3913 insertions(+), 2435 deletions(-) create mode 100644 scripts/install-bcftools.sh create mode 100644 src/main/java/htsjdk/samtools/util/ListByteBufferOutputStream.java create mode 100644 src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java create mode 100644 src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java create mode 100644 src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java create mode 100644 src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java create mode 100644 src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriterManager.java delete mode 100644 src/main/java/htsjdk/variant/variantcontext/writer/BCF2Encoder.java delete mode 100644 
src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldEncoder.java delete mode 100644 src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldWriter.java delete mode 100644 src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldWriterManager.java create mode 100644 src/test/java/htsjdk/utils/BCFToolsTestUtils.java create mode 100644 src/test/java/htsjdk/utils/BCFToolsTestUtilsTest.java create mode 100644 src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java create mode 100644 src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java create mode 100644 src/test/resources/htsjdk/variant/bcfV22.bcf create mode 100644 src/test/resources/htsjdk/variant/bcfV22.bcf.gz diff --git a/.travis.yml b/.travis.yml index f00fe8b27e..dab05066b8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,7 @@ cache: env: global: - HTSJDK_SAMTOOLS_BIN=/usr/bin/samtools + - HTSJDK_BCFTOOLS_BIN=/usr/bin/bcftools jdk: - oraclejdk8 - openjdk8 @@ -32,6 +33,7 @@ matrix: before_install: - scripts/install-samtools.sh + - scripts/install-bcftools.sh - scripts/htsget-scripts/start-htsget-test-server.sh script: diff --git a/scripts/install-bcftools.sh b/scripts/install-bcftools.sh new file mode 100644 index 0000000000..fca5a62134 --- /dev/null +++ b/scripts/install-bcftools.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +wget https://github.com/samtools/bcftools/releases/download/1.13/bcftools-1.13.tar.bz2 +tar -xjvf bcftools-1.13.tar.bz2 +cd bcftools-1.13 && ./configure --prefix=/usr && make && sudo make install diff --git a/src/main/java/htsjdk/samtools/util/FileExtensions.java b/src/main/java/htsjdk/samtools/util/FileExtensions.java index fc2e37d6c6..dcb8c889f9 100755 --- a/src/main/java/htsjdk/samtools/util/FileExtensions.java +++ b/src/main/java/htsjdk/samtools/util/FileExtensions.java @@ -65,6 +65,9 @@ public final class FileExtensions { public static final String VCF = ".vcf"; public static final String VCF_INDEX = TRIBBLE_INDEX; public static final String BCF = 
".bcf"; + // Note that .bcf on its own may be gzip compressed and usually is, + // but files with the extension .bcf.gz to seem to exist in the wild and should be supported + public static final String COMPRESSED_BCF = ".bcf.gz"; public static final String COMPRESSED_VCF = ".vcf.gz"; public static final String COMPRESSED_VCF_INDEX = ".tbi"; public static final List VCF_LIST = Collections.unmodifiableList(Arrays.asList(VCF, COMPRESSED_VCF, BCF)); diff --git a/src/main/java/htsjdk/samtools/util/ListByteBufferOutputStream.java b/src/main/java/htsjdk/samtools/util/ListByteBufferOutputStream.java new file mode 100644 index 0000000000..d183a3b90f --- /dev/null +++ b/src/main/java/htsjdk/samtools/util/ListByteBufferOutputStream.java @@ -0,0 +1,138 @@ +package htsjdk.samtools.util; + +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Growable byte buffer backed by a list of byte arrays, which can + * be used to buffer data without reallocating an underlying array. + * Once data is accumulated, it can either be retrieved by converting + * into a byte[] for interfaces that require a contiguous block of bytes, + * or written directly to an OutputStream to avoid array copies. 
+ */ +public class ListByteBufferOutputStream extends OutputStream { + + private final int blockSize; + private final ArrayList blocks; + private byte[] currentBlock; + private int nextBlockIndex; + private int nextBytePosition; + private int size; + + public ListByteBufferOutputStream(final int blockSize) { + this.blockSize = blockSize; + blocks = new ArrayList<>(); + nextBlockIndex = 0; + advanceBlock(); + size = 0; + } + + @Override + public void write(final int b) { + if (nextBytePosition == blockSize) { + advanceBlock(); + } + currentBlock[nextBytePosition++] = (byte) b; + size++; + } + + public void write(final byte b, final int nCopies) { + assert nCopies >= 0; + + int bytesRemaining = nCopies; + while (bytesRemaining > 0) { + if (nextBytePosition == blockSize) { + advanceBlock(); + } + final int toIndex = Math.min(nextBytePosition + bytesRemaining, blockSize); + Arrays.fill(currentBlock, nextBytePosition, toIndex, b); + bytesRemaining -= toIndex - nextBytePosition; + nextBytePosition = toIndex; + } + size += nCopies; + } + + @Override + public void write(final byte[] b) { + write(b, 0, b.length); + } + + @Override + public void write(final byte[] b, int off, final int len) { + assert b != null; + assert off >= 0; + assert len >= 0; + assert off + len <= b.length; + + int bytesRemaining = len; + while (bytesRemaining > 0) { + if (nextBytePosition == blockSize) { + advanceBlock(); + } + final int lengthToWrite = Math.min(bytesRemaining, blockSize - nextBytePosition); + System.arraycopy(b, off, currentBlock, nextBytePosition, lengthToWrite); + nextBytePosition += lengthToWrite; + off += lengthToWrite; + bytesRemaining -= lengthToWrite; + } + size += len; + } + + public int size() { + return size; + } + + public void writeTo(final OutputStream out) throws IOException { + for (final byte[] b : blocks) { + if (b == currentBlock) { + out.write(b, 0, nextBytePosition); + break; + } else { + out.write(b); + } + } + } + + public byte[] toByteArray() { + final byte[] 
bytes = new byte[size]; + final ByteBuffer buff = ByteBuffer.wrap(bytes); + for (final byte[] b : blocks) { + if (b == currentBlock) { + buff.put(b, 0, nextBytePosition); + break; + } else { + buff.put(b); + } + } + return bytes; + } + + public void reset() { + currentBlock = blocks.get(0); + nextBytePosition = 0; + nextBlockIndex = 1; + size = 0; + } + + public void clear() { + reset(); + // blocks always has at least 1 element + blocks.subList(1, blocks.size()).clear(); + } + + private void advanceBlock() { + if (nextBlockIndex == blocks.size()) { + // Need to add a new block + currentBlock = new byte[blockSize]; + blocks.add(currentBlock); + } else { + // Reuse old block + currentBlock = blocks.get(nextBlockIndex); + } + nextBytePosition = 0; + nextBlockIndex++; + } +} diff --git a/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java b/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java index 768c797ac0..7e2c10ebc0 100644 --- a/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java +++ b/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java @@ -33,6 +33,7 @@ import htsjdk.tribble.index.IndexFactory; import htsjdk.tribble.readers.PositionalBufferedStream; import htsjdk.tribble.util.ParsingUtils; +import htsjdk.variant.vcf.VCFFileReader; import java.io.BufferedInputStream; import java.io.IOException; @@ -252,7 +253,11 @@ private void readHeader() throws IOException { PositionalBufferedStream pbs = null; try { is = ParsingUtils.openInputStream(path, wrapper); - if (IOUtil.hasBlockCompressedExtension(new URI(URLEncoder.encode(path, "UTF-8")))) { + // BCFs are usually gzipped but do not have the .gz extension, + // so we explicitly check for the presence of a gzip header + if (IOUtil.hasBlockCompressedExtension(new URI(URLEncoder.encode(path, "UTF-8"))) + || (VCFFileReader.isBCF(path) && IOUtil.isGZIPInputStream(is)) + ) { // TODO: TEST/FIX THIS! 
https://github.com/samtools/htsjdk/issues/944 // TODO -- warning I don't think this can work, the buffered input stream screws up position is = new GZIPInputStream(new BufferedInputStream(is)); @@ -326,7 +331,8 @@ public WFIterator() throws IOException { final InputStream inputStream = ParsingUtils.openInputStream(path, wrapper); final PositionalBufferedStream pbs; - if (IOUtil.hasBlockCompressedExtension(path)) { + // BCFs can be gzipped but usually do not have a compressed extension, so an extra check is needed + if (IOUtil.hasBlockCompressedExtension(path) || (VCFFileReader.isBCF(path) && IOUtil.isGZIPInputStream(inputStream))) { // Gzipped -- we need to buffer the GZIPInputStream methods as this class makes read() calls, // and seekableStream does not support single byte reads final InputStream is = new GZIPInputStream(new BufferedInputStream(inputStream, 512000)); diff --git a/src/main/java/htsjdk/tribble/util/ParsingUtils.java b/src/main/java/htsjdk/tribble/util/ParsingUtils.java index 6b4470a72a..bca147e7ca 100644 --- a/src/main/java/htsjdk/tribble/util/ParsingUtils.java +++ b/src/main/java/htsjdk/tribble/util/ParsingUtils.java @@ -101,8 +101,8 @@ public static InputStream openInputStream(final String uri, final Function { + private static final Log log = Log.getInstance(BCF2Codec.class); + + public static String IDXField = "IDX"; // BCF2.2 IDX field name + protected final static int ALLOWED_MAJOR_VERSION = 2; - protected final static int ALLOWED_MINOR_VERSION = 1; + protected final static int ALLOWED_MINOR_VERSION = 2; public static final BCFVersion ALLOWED_BCF_VERSION = new BCFVersion(ALLOWED_MAJOR_VERSION, ALLOWED_MINOR_VERSION); - /** sizeof a BCF header (+ min/max version). Used when trying to detect when a streams starts with a bcf header */ - public static final int SIZEOF_BCF_HEADER = BCFVersion.MAGIC_HEADER_START.length + 2*Byte.BYTES; - + /** + * sizeof a BCF header (+ min/max version). 
Used when trying to detect when a streams starts with a bcf header + */ + public static final int SIZEOF_BCF_HEADER = BCFVersion.MAGIC_HEADER_START.length + 2 * Byte.BYTES; + private BCFVersion bcfVersion = null; private VCFHeader header = null; @@ -70,19 +86,19 @@ public class BCF2Codec extends BinaryFeatureCodec { /** * Maps offsets (encoded in BCF) into contig names (from header) for the CHROM field */ - private final ArrayList contigNames = new ArrayList(); + private BCF2Dictionary contigDictionary; /** * Maps header string names (encoded in VCF) into strings found in the BCF header - * + *

    * Initialized when processing the header */ - private ArrayList dictionary; + private BCF2Dictionary stringDictionary; /** * Our decoder that reads low-level objects from the BCF2 records */ - private final BCF2Decoder decoder = new BCF2Decoder(); + private BCF2Decoder decoder; /** * Provides some sanity checking on the header @@ -96,7 +112,7 @@ public class BCF2Codec extends BinaryFeatureCodec { /** * A cached array of GenotypeBuilders for efficient genotype decoding. - * + *

    * Caching it allows us to avoid recreating this intermediate data * structure each time we decode genotypes */ @@ -114,12 +130,12 @@ public class BCF2Codec extends BinaryFeatureCodec { // ---------------------------------------------------------------------- @Override - public Feature decodeLoc( final PositionalBufferedStream inputStream ) { + public Feature decodeLoc(final PositionalBufferedStream inputStream) { return decode(inputStream); } @Override - public VariantContext decode( final PositionalBufferedStream inputStream ) { + public VariantContext decode(final PositionalBufferedStream inputStream) { try { recordNo++; final VariantContextBuilder builder = new VariantContextBuilder(); @@ -134,7 +150,7 @@ public VariantContext decode( final PositionalBufferedStream inputStream ) { decoder.readNextBlock(genotypeBlockSize, inputStream); createLazyGenotypesDecoder(info, builder); return builder.fullyDecoded(true).make(); - } catch ( IOException e ) { + } catch (final IOException e) { throw new TribbleException("Failed to read BCF file", e); } } @@ -153,10 +169,13 @@ public Class getFeatureType() { * The default policy is to require an exact version match. 
* @param supportedVersion the current BCF implementation version * @param actualVersion the actual version - * @thows TribbleException if the version policy determines that {@code actualVersion} is not compatible + * @throws TribbleException if the version policy determines that {@code actualVersion} is not compatible * with {@code supportedVersion} */ - protected void validateVersionCompatibility(final BCFVersion supportedVersion, final BCFVersion actualVersion) { + protected void validateVersionCompatibility( + final BCFVersion supportedVersion, + final BCFVersion actualVersion + ) throws TribbleException { if ( actualVersion.getMajorVersion() != ALLOWED_MAJOR_VERSION ) { error("BCF2Codec can only process BCF2 files, this file has major version " + bcfVersion.getMajorVersion()); } @@ -168,26 +187,24 @@ protected void validateVersionCompatibility(final BCFVersion supportedVersion, f } @Override - public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) { + public FeatureCodecHeader readHeader(final PositionalBufferedStream inputStream) { try { // note that this reads the magic as well, and so does double duty bcfVersion = BCFVersion.readBCFVersion(inputStream); - if ( bcfVersion == null ) { + if (bcfVersion == null) { error("Input stream does not contain a BCF encoded file; BCF magic header info not found"); } - validateVersionCompatibility(BCF2Codec.ALLOWED_BCF_VERSION, bcfVersion); - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Parsing data stream with BCF version " + bcfVersion); - } + decoder = BCF2Decoder.getDecoder(bcfVersion); + log.debug("Parsing data stream with BCF version " + bcfVersion); final int headerSizeInBytes = BCF2Type.INT32.read(inputStream); - if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 8 MB - error("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE); + if (headerSizeInBytes <= 0 || headerSizeInBytes > 
MAX_HEADER_SIZE) // no bigger than 8 MB + error("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < " + MAX_HEADER_SIZE); final byte[] headerBytes = new byte[headerSizeInBytes]; - if ( inputStream.read(headerBytes) != headerSizeInBytes ) + if (inputStream.read(headerBytes) != headerSizeInBytes) error("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes); final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes)); @@ -195,24 +212,21 @@ public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream final VCFCodec headerParser = new VCFCodec(); this.header = (VCFHeader) headerParser.readActualHeader(lineIterator); bps.close(); - } catch ( IOException e ) { + } catch (final IOException e) { throw new TribbleException("I/O error while reading BCF2 header"); } - // create the config offsets - if ( ! header.getContigLines().isEmpty() ) { - contigNames.clear(); - for ( final VCFContigHeaderLine contig : header.getContigLines()) { - if ( contig.getID() == null || contig.getID().equals("") ) - error("found a contig with an invalid ID " + contig); - contigNames.add(contig.getID()); - } - } else { - error("Didn't find any contig lines in BCF2 file header"); + // TODO should follow up on hts-specs and clarify the relationship between ##dictionary and IDX fields + // Error on ##dictionary lines, we don't know what to do with them + if (this.header.getMetaDataInInputOrder().stream().anyMatch(line -> line.getKey().equals("dictionary"))) { + throw new TribbleException("Use of the ##dictionary line is not supported"); } + // create the contig dictionary + contigDictionary = makeContigDictionary(bcfVersion); + // create the string dictionary - dictionary = parseDictionary(header); + stringDictionary = makeStringDictionary(bcfVersion); // prepare the genotype field decoders gtFieldDecoders = new BCF2GenotypeFieldDecoders(header); @@ -220,7 +234,7 @@ public 
FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream // create and initialize the genotype builder array final int nSamples = header.getNGenotypeSamples(); builders = new GenotypeBuilder[nSamples]; - for ( int i = 0; i < nSamples; i++ ) { + for (int i = 0; i < nSamples; i++) { builders[i] = new GenotypeBuilder(header.getGenotypeSamples().get(i)); } @@ -229,11 +243,20 @@ public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream } @Override - public boolean canDecode( final String path ) { - try (InputStream fis = Files.newInputStream(IOUtil.getPath(path)) ){ - final BCFVersion version = BCFVersion.readBCFVersion(fis); - return version != null && version.getMajorVersion() == ALLOWED_MAJOR_VERSION; - } catch ( final IOException e ) { + public boolean canDecode(final String path) { + try (final InputStream fis = Files.newInputStream(IOUtil.getPath(path))) { + final InputStream is = IOUtil.isGZIPInputStream(fis) ? new GZIPInputStream(fis) : fis; + final BCFVersion version = BCFVersion.readBCFVersion(is); + if (version == null) { + return false; + } else { + // Validation will throw a TribbleException for incompatible versions + // The default policy is to require an exact major and minor version match + // but subclasses can implement more permissive policies + validateVersionCompatibility(ALLOWED_BCF_VERSION, version); + return true; + } + } catch (final IOException | TribbleException e) { return false; } } @@ -264,8 +287,8 @@ private final void decodeSiteLoc(final VariantContextBuilder builder) throws IOE this.pos = decoder.decodeInt(BCF2Type.INT32) + 1; // GATK is one based, BCF2 is zero-based final int refLength = decoder.decodeInt(BCF2Type.INT32); - builder.start((long)pos); - builder.stop((long)(pos + refLength - 1)); // minus one because GATK has closed intervals but BCF2 is open + builder.start(pos); + builder.stop(pos + refLength - 1); // minus one because GATK has closed intervals but BCF2 is open } /** @@ -276,21 
+299,22 @@ private final void decodeSiteLoc(final VariantContextBuilder builder) throws IOE */ private final SitesInfoForDecoding decodeSitesExtendedInfo(final VariantContextBuilder builder) throws IOException { final Object qual = decoder.decodeSingleValue(BCF2Type.FLOAT); - if ( qual != null ) { - builder.log10PError(((Double)qual) / -10.0); + if (qual != null) { + builder.log10PError(((Double) qual) / -10.0); } final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32); final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32); - final int nAlleles = nAlleleInfo >> 16; + // Use logical shift to not introduce leading 1s + final int nAlleles = nAlleleInfo >>> 16; final int nInfo = nAlleleInfo & 0x0000FFFF; - final int nFormatFields = nFormatSamples >> 24; + final int nFormatFields = nFormatSamples >>> 24; final int nSamples = nFormatSamples & 0x00FFFFF; - if ( header.getNGenotypeSamples() != nSamples ) + if (header.getNGenotypeSamples() != nSamples) error("Reading BCF2 files with different numbers of samples per record " + - "is not currently supported. Saw " + header.getNGenotypeSamples() + - " samples in header but have a record with " + nSamples + " samples"); + "is not currently supported. Saw " + header.getNGenotypeSamples() + + " samples in header but have a record with " + nSamples + " samples"); decodeID(builder); final List alleles = decodeAlleles(builder, pos, nAlleles); @@ -298,7 +322,7 @@ private final SitesInfoForDecoding decodeSitesExtendedInfo(final VariantContextB decodeInfo(builder, nInfo); final SitesInfoForDecoding info = new SitesInfoForDecoding(nFormatFields, nSamples, alleles); - if ( ! info.isValid() ) + if (!info.isValid()) error("Sites info is malformed: " + info); return info; } @@ -316,8 +340,8 @@ private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final public boolean isValid() { return nFormatFields >= 0 && - nSamples >= 0 && - alleles != null && ! 
alleles.isEmpty() && alleles.get(0).isReference(); + nSamples >= 0 && + alleles != null && !alleles.isEmpty() && alleles.get(0).isReference(); } @Override @@ -328,12 +352,13 @@ public String toString() { /** * Decode the id field in this BCF2 file and store it in the builder + * * @param builder */ - private void decodeID( final VariantContextBuilder builder ) throws IOException { - final String id = (String)decoder.decodeTypedValue(); + private void decodeID(final VariantContextBuilder builder) throws IOException { + final String id = decoder.decodeUnexplodedString(); - if ( id == null ) + if (id == null || id.isEmpty()) builder.noID(); else builder.id(id); @@ -341,54 +366,67 @@ private void decodeID( final VariantContextBuilder builder ) throws IOException /** * Decode the alleles from this BCF2 file and put the results in builder + * * @param builder * @param pos * @param nAlleles * @return the alleles */ - private List decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) throws IOException { - // TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes - List alleles = new ArrayList(nAlleles); - String ref = null; - - for ( int i = 0; i < nAlleles; i++ ) { - final String alleleBases = (String)decoder.decodeTypedValue(); + private List decodeAlleles(final VariantContextBuilder builder, final int pos, final int nAlleles) throws IOException { + final List alleles = new ArrayList<>(nAlleles); + byte[] ref = null; + + for (int i = 0; i < nAlleles; i++) { + // Some decoder functionality is inlined here to avoid conversion from bytes -> string -> bytes + final byte typeDescriptor = decoder.readTypeDescriptor(); + final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); + if (type != BCF2Type.CHAR) { + error("Expected to find vector of type CHAR while decoding Allele bases, found type " + type); + } + final int size = decoder.decodeNumberOfElements(typeDescriptor); + final 
byte[] alleleBases = decoder.decodeRawBytes(size); final boolean isRef = i == 0; + if (isRef) { + ref = alleleBases; + } + final Allele allele = Allele.create(alleleBases, isRef); - if ( isRef ) ref = alleleBases; alleles.add(allele); } + assert ref != null; + assert ref.length > 0; builder.alleles(alleles); - - assert !ref.isEmpty(); - return alleles; } /** * Decode the filter field of this BCF2 file and store the result in the builder + * * @param builder */ - private void decodeFilter( final VariantContextBuilder builder ) throws IOException { - final Object value = decoder.decodeTypedValue(); + private void decodeFilter(final VariantContextBuilder builder) throws IOException { + final byte typeDescriptor = decoder.readTypeDescriptor(); + final int size = decoder.decodeNumberOfElements(typeDescriptor); + final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); - if ( value == null ) + if (size == 0) { + // No filters builder.unfiltered(); - else { - if ( value instanceof Integer ) { - // fast path for single integer result - final String filterString = getDictionaryString((Integer)value); - if ( VCFConstants.PASSES_FILTERS_v4.equals(filterString)) - builder.passFilters(); - else - builder.filter(filterString); + } else if (size == 1) { + final int i = decoder.decodeInt(type); + if (i == 0) { + // PASS is always implicitly encoded as 0 + builder.passFilters(); } else { - for ( final int offset : (List)value ) - builder.filter(getDictionaryString(offset)); + builder.filter(getDictionaryString(i)); + } + } else { + for (final int offset : decoder.decodeIntArray(size, type, null)) { + builder.filter(getDictionaryString(offset)); } } } @@ -399,17 +437,23 @@ private void decodeFilter( final VariantContextBuilder builder ) throws IOExcept * @param builder * @param numInfoFields */ - private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) throws IOException { - if ( numInfoFields == 0 ) + private void decodeInfo(final 
VariantContextBuilder builder, final int numInfoFields) throws IOException { + if (numInfoFields == 0) // fast path, don't bother doing any work if there are no fields return; - final Map infoFieldEntries = new HashMap(numInfoFields); - for ( int i = 0; i < numInfoFields; i++ ) { + final Map infoFieldEntries = new HashMap<>(numInfoFields); + for (int i = 0; i < numInfoFields; i++) { final String key = getDictionaryString(); Object value = decoder.decodeTypedValue(); final VCFCompoundHeaderLine metaData = VariantContextUtils.getMetaDataForField(header, key); - if ( metaData.getType() == VCFHeaderLineType.Flag ) value = true; // special case for flags + if (metaData.getType() == VCFHeaderLineType.Flag) { + // Despite contradictory language in the spec, bcftools/htslib encode the "payload" of + // FLAG as 0x00 (MISSING type) which we would normally decode as MISSING/null, + // so we consider this value to be Boolean TRUE simply based on the presence of the key + // See https://github.com/samtools/hts-specs/issues/384 + value = Boolean.TRUE; // special case for flags + } infoFieldEntries.put(key, value); } @@ -429,17 +473,17 @@ private void decodeInfo( final VariantContextBuilder builder, final int numInfoF * @param siteInfo * @param builder */ - private void createLazyGenotypesDecoder( final SitesInfoForDecoding siteInfo, - final VariantContextBuilder builder ) { + private void createLazyGenotypesDecoder(final SitesInfoForDecoding siteInfo, + final VariantContextBuilder builder) { if (siteInfo.nSamples > 0) { final LazyGenotypesContext.LazyParser lazyParser = - new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders); + new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders); final LazyData lazyData = new LazyData(header, siteInfo.nFormatFields, decoder.getRecordBytes()); final LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, lazyData, 
header.getNGenotypeSamples()); // did we resort the sample names? If so, we need to load the genotype data - if ( !header.samplesWereAlreadySorted() ) + if (!header.samplesWereAlreadySorted()) lazy.decode(); builder.genotypesNoValidation(lazy); @@ -458,12 +502,22 @@ public LazyData(final VCFHeader header, final int nGenotypeFields, final byte[] } } - private final String getDictionaryString() throws IOException { + private String getDictionaryString() throws IOException { return getDictionaryString((Integer) decoder.decodeTypedValue()); } protected final String getDictionaryString(final int offset) { - return dictionary.get(offset); + return stringDictionary.get(offset); + } + + private BCF2Dictionary makeStringDictionary(final BCFVersion bcfVersion) { + final BCF2Dictionary dict = BCF2Dictionary.makeBCF2StringDictionary(header, bcfVersion); + + // if we got here we never found a dictionary, or there are no elements in the dictionary + if (dict.isEmpty()) + error("Dictionary header element was absent or empty"); + + return dict; } /** @@ -473,18 +527,16 @@ protected final String getDictionaryString(final int offset) { * @param contigOffset * @return */ - private final String lookupContigName( final int contigOffset ) { - return contigNames.get(contigOffset); + private String lookupContigName(final int contigOffset) { + return contigDictionary.get(contigOffset); } - private final ArrayList parseDictionary(final VCFHeader header) { - final ArrayList dict = BCF2Utils.makeDictionary(header); - - // if we got here we never found a dictionary, or there are no elements in the dictionary - if ( dict.isEmpty() ) - error("Dictionary header element was absent or empty"); + private BCF2Dictionary makeContigDictionary(final BCFVersion bcfVersion) { + // create the config offsets + if (header.getContigLines().isEmpty()) + error("Didn't find any contig lines in BCF2 file header"); - return dict; + return BCF2Dictionary.makeBCF2ContigDictionary(header, bcfVersion); } /** @@ -501,8 
+553,9 @@ protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String protected void error(final String message) throws RuntimeException { throw new TribbleException(String.format("%s, at record %d with position %d:", message, recordNo, pos)); } - - /** try to read a BCFVersion from an uncompressed BufferedInputStream. + + /** + * Try to read a BCFVersion from an uncompressed BufferedInputStream. * The buffer must be large enough to contain {@link #SIZEOF_BCF_HEADER} * * @param uncompressedBufferedInput the uncompressed input stream @@ -515,5 +568,8 @@ public static BCFVersion tryReadBCFVersion(final BufferedInputStream uncompresse uncompressedBufferedInput.reset(); return bcfVersion; } - + + public BCFVersion getBCFVersion() { + return bcfVersion; + } } diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java index 0dd166eef6..1544d9ed6c 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java @@ -31,24 +31,35 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; +import java.util.List; -public final class BCF2Decoder { - byte[] recordBytes = null; - ByteArrayInputStream recordStream = null; +public abstract class BCF2Decoder { + protected byte[] recordBytes = null; + protected ByteArrayInputStream recordStream = null; - public BCF2Decoder() { + private BCF2Decoder() { // nothing to do } - /** - * Create a new decoder ready to read BCF2 data from the byte[] recordBytes, for testing purposes - * - * @param recordBytes - */ - protected BCF2Decoder(final byte[] recordBytes) { - setRecordBytes(recordBytes); + public static BCF2Decoder getDecoder(final BCFVersion version) { + switch (version.getMinorVersion()) { + case 1: + return new 
BCF2Decoder.BCF2_1Decoder(); + case 2: + return new BCF2Decoder.BCF2_2Decoder(); + default: + throw new TribbleException("BCF2Codec can only process BCF2 files with minor version <= " + BCF2Codec.ALLOWED_MINOR_VERSION + " but this file has minor version " + version.getMinorVersion()); + } + } + + public static BCF2Decoder getDecoder(final BCFVersion version, final byte[] recordBytes) { + final BCF2Decoder decoder = BCF2Decoder.getDecoder(version); + decoder.setRecordBytes(recordBytes); + return decoder; } // ---------------------------------------------------------------------- @@ -63,7 +74,7 @@ protected BCF2Decoder(final byte[] recordBytes) { * @param stream */ public void readNextBlock(final int blockSizeInBytes, final InputStream stream) { - if ( blockSizeInBytes < 0 ) throw new TribbleException("Invalid block size " + blockSizeInBytes); + if (blockSizeInBytes < 0) throw new TribbleException("Invalid block size " + blockSizeInBytes); setRecordBytes(readRecordBytes(blockSizeInBytes, stream)); } @@ -74,9 +85,9 @@ public void readNextBlock(final int blockSizeInBytes, final InputStream stream) */ public void skipNextBlock(final int blockSizeInBytes, final InputStream stream) { try { - final int bytesRead = (int)stream.skip(blockSizeInBytes); + final int bytesRead = (int) stream.skip(blockSizeInBytes); validateReadBytes(bytesRead, 1, blockSizeInBytes); - } catch ( IOException e ) { + } catch (final IOException e) { throw new TribbleException("I/O error while reading BCF2 file", e); } this.recordBytes = null; @@ -85,6 +96,7 @@ public void skipNextBlock(final int blockSizeInBytes, final InputStream stream) /** * Returns the byte[] for the block of data we are currently decoding + * * @return */ public byte[] getRecordBytes() { @@ -131,41 +143,54 @@ public final Object decodeTypedValue(final byte typeDescriptor) throws IOExcepti } public final Object decodeTypedValue(final byte typeDescriptor, final int size) throws IOException { - if ( size == 0 ) { + if (size == 0) { 
// missing value => null in java return null; } else { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); - if ( type == BCF2Type.CHAR ) { // special case string decoding for efficiency - return decodeLiteralString(size); - } else if ( size == 1 ) { - return decodeSingleValue(type); + if (type == BCF2Type.CHAR) { // special case string decoding for efficiency + final List strings = decodeExplodedStrings(size, ','); + if (strings.isEmpty()) { + return null; + } else if (strings.size() == 1) { + return strings.get(0); + } else { + return strings; + } + } else if (size == 1) { + final Object o = decodeSingleValue(type); + return o == BCF2Type.EOVValue() ? null : o; } else { - final ArrayList ints = new ArrayList(size); - for ( int i = 0; i < size; i++ ) { + final ArrayList ints = new ArrayList<>(size); + for (int i = 0; i < size; i++) { final Object val = decodeSingleValue(type); - if ( val == null ) continue; // auto-pruning. We remove trailing nulls + if (val == BCF2Type.EOVValue()) continue; ints.add(val); } - return ints.isEmpty() ? null : ints; // return null when all of the values are null + return ints.isEmpty() ? 
null : ints; } } } public final Object decodeSingleValue(final BCF2Type type) throws IOException { - // TODO -- decodeTypedValue should integrate this routine final int value = decodeInt(type); - if ( value == type.getMissingBytes() ) + if (value == type.getMissingBytes()) { return null; - else { + } else if (value == type.getEOVBytes()) { + return BCF2Type.EOVValue(); + } else { switch (type) { case INT8: case INT16: - case INT32: return value; - case FLOAT: return rawFloatToFloat(value); - case CHAR: return value & 0xFF; // TODO -- I cannot imagine why we'd get here, as string needs to be special cased - default: throw new TribbleException("BCF2 codec doesn't know how to decode type " + type ); + case INT32: + return value; + case FLOAT: + return rawFloatToFloat(value); + case CHAR: + return value & 0xFF; // TODO -- I cannot imagine why we'd get here, as string needs to be special cased + default: + throw new TribbleException("BCF2 codec doesn't know how to decode type " + type); } } } @@ -176,31 +201,8 @@ public final Object decodeSingleValue(final BCF2Type type) throws IOException { // // ---------------------------------------------------------------------- - private final Object decodeLiteralString(final int size) { - assert size > 0; - - // TODO -- assumes size > 0 - final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array - try { - recordStream.read(bytes); - - int goodLength = 0; - for ( ; goodLength < bytes.length ; goodLength++ ) - if ( bytes[goodLength] == 0 ) break; - - if ( goodLength == 0 ) - return null; - else { - final String s = new String(bytes, 0, goodLength); - return BCF2Utils.isCollapsedString(s) ? 
BCF2Utils.explodeStringList(s) : s; - } - } catch ( IOException e ) { - throw new TribbleException("readByte failure", e); - } - } - public final int decodeNumberOfElements(final byte typeDescriptor) throws IOException { - if ( BCF2Utils.sizeIsOverflow(typeDescriptor) ) + if (BCF2Utils.sizeIsOverflow(typeDescriptor)) // -1 ensures we explode immediately with a bad size if the result is missing return decodeInt(readTypeDescriptor(), -1); else @@ -228,14 +230,22 @@ public final int decodeInt(final BCF2Type type) throws IOException { /** * Low-level reader for int[] - * + *

    * Requires a typeDescriptor so the function knows how many elements to read, * and how they are encoded. - * + *

    + * Note that this method is only suitable for reading arrays which are known + * to not contain any internal MISSING values (e.g. filter or GT, + * in the case of GT in BCF 2.1, the vector may be MISSING padded if the + * sample ploidy is less than the maximum, but these missing values are + * not considered to be part of the array, and will not be returned). + * Parts of the decoder that require missing values to be preserved should + * use decodeTyped + *

    * If size == 0 => result is null * If size > 0 => result depends on the actual values in the stream - * -- If the first element read is MISSING, result is null (all values are missing) - * -- Else result = int[N] where N is the first N non-missing values decoded + * -- If the first element read is MISSING, result is null (all values are missing) + * -- Else result = int[N] where N is the first N non-missing values decoded * * @param maybeDest if not null we'll not allocate space for the vector, but instead use * the externally allocated array of ints to store values. If the @@ -244,45 +254,131 @@ public final int decodeInt(final BCF2Type type) throws IOException { * int elements are still forced to do a fresh allocation as well. * @return see description */ - public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) throws IOException { - if ( size == 0 ) { + public int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) throws IOException { + if (size == 0) { return null; } else { - if ( maybeDest != null && maybeDest.length < size ) + if (maybeDest != null && maybeDest.length < size) maybeDest = null; // by nulling this out we ensure that we do fresh allocations as maybeDest is too small final int val1 = decodeInt(type); - if ( val1 == type.getMissingBytes() ) { - // fast path for first element being missing - for ( int i = 1; i < size; i++ ) decodeInt(type); + if (val1 == getPaddingValue(type)) { + // Fast path for first element being padding, meaning the whole array is empty + final int bytesToDrop = type.getSizeInBytes() * (size - 1); + // Skip the rest of the padding values + recordStream.skip(bytesToDrop); return null; } else { // we know we will have at least 1 element, so making the int[] is worth it final int[] ints = maybeDest == null ? 
new int[size] : maybeDest; - ints[0] = val1; // we already read the first one - for ( int i = 1; i < size; i++ ) { + ints[0] = val1; + for (int i = 1; i < size; i++) { ints[i] = decodeInt(type); - if ( ints[i] == type.getMissingBytes() ) { - // read the rest of the missing values, dropping them - for ( int j = i + 1; j < size; j++ ) decodeInt(type); + if (ints[i] == getPaddingValue(type)) { + final int bytesToDrop = type.getSizeInBytes() * (size - (i + 1)); + // Skip the rest of the padding values + recordStream.skip(bytesToDrop); // deal with auto-pruning by returning an int[] containing - // only the non-MISSING values. We do this by copying the first + // only the non-padding values. We do this by copying the first // i elements, as i itself is missing return Arrays.copyOf(ints, i); } } - return ints; // all of the elements were non-MISSING + return ints; // all of the elements were non-padding } } } + public byte[] decodeRawBytes(final int size) throws IOException { + final byte[] bytes = new byte[size]; + recordStream.read(bytes); + return bytes; + } + + /** + * Decode a single ASCII encoded string which may be padded with NULL bytes. + * Multiple strings which were encoded as a single comma separated string are + * returned unexploded. + *

    + * Reads directly from underlying byte buffer to avoid unnecessary array copies. + * + * @param size + * @return + */ + public String decodeUnexplodedString(final int size) { + // Get our current position in the buffer so we can index directly into it + final int currentBufferPosition = recordBytes.length - recordStream.available(); + + // Jump over all bytes, including NULL padding + recordStream.skip(size); + + // Scan for first NULL padding byte + int realLength = 0; + for (; realLength < size; realLength++) + if (recordBytes[currentBufferPosition + realLength] == '\0') break; + + // The BCF spec states that strings are ASCII encoded, but we use UTF-8 for future proofing + return new String(recordBytes, currentBufferPosition, realLength, StandardCharsets.UTF_8); + } + + public String decodeUnexplodedString() throws IOException { + final byte typeDescriptor = readTypeDescriptor(); + final int size = decodeNumberOfElements(typeDescriptor); + + return size > 0 ? decodeUnexplodedString(size) : ""; + } + + /** + * Decode a list of ASCII encoded strings. + * Multiple strings as a single separator delimited string are + * exploded. If only a single string was encoded with no separators, returns a + * list of length 1. + *

    + * Reads directly from underlying byte buffer to avoid unnecessary array copies. + * + * @param size + * @return + */ + public List decodeExplodedStrings(final int size, final char separator) { + // Get our current position in the buffer so we can index directly into it + final int currentBufferPosition = recordBytes.length - recordStream.available(); + + // Jump over all bytes + recordStream.skip(size); + + if (size == 0 || recordBytes[currentBufferPosition] == '\0') return Collections.emptyList(); + + int numStrings = 1; + // Start at offset 1 to avoid counting optional leading comma + // Real length may be shorter than provided one because of NULL padding + int realLength = 1; + for (; realLength < size; realLength++) { + final byte currentByte = recordBytes[currentBufferPosition + realLength]; + if (currentByte == separator) numStrings++; + else if (currentByte == '\0') break; + } + + final List strings = new ArrayList<>(numStrings); + int currentStringStart = recordBytes[currentBufferPosition] == separator ? 
1 : 0; + for (int i = 1; i < realLength; i++) { + if (recordBytes[currentBufferPosition + i] == separator) { + strings.add(new String(recordBytes, currentBufferPosition + currentStringStart, i - currentStringStart, StandardCharsets.UTF_8)); + currentStringStart = i + 1; + } + } + // Add final string + strings.add(new String(recordBytes, currentBufferPosition + currentStringStart, realLength - currentStringStart, StandardCharsets.UTF_8)); + + return strings; + } + public final int[] decodeIntArray(final byte typeDescriptor, final int size) throws IOException { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); return decodeIntArray(size, type, null); } private double rawFloatToFloat(final int rawFloat) { - return (double)Float.intBitsToFloat(rawFloat); + return Float.intBitsToFloat(rawFloat); } // ---------------------------------------------------------------------- @@ -303,11 +399,11 @@ public final int readBlockSize(final InputStream inputStream) throws IOException /** * Read all bytes for a BCF record block into a byte[], and return it - * + *

    * Is smart about reading from the stream multiple times to fill the buffer, if necessary * * @param blockSizeInBytes number of bytes to read - * @param inputStream the stream to read from + * @param inputStream the stream to read from * @return a non-null byte[] containing exactly blockSizeInBytes bytes from the inputStream */ private static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) { @@ -316,23 +412,23 @@ private static byte[] readRecordBytes(final int blockSizeInBytes, final InputStr final byte[] record = new byte[blockSizeInBytes]; try { int bytesRead = 0; - int nReadAttempts = 0; // keep track of how many times we've read + final int nReadAttempts = 0; // keep track of how many times we've read // because we might not read enough bytes from the file in a single go, do it in a loop until we get EOF - while ( bytesRead < blockSizeInBytes ) { + while (bytesRead < blockSizeInBytes) { final int read1 = inputStream.read(record, bytesRead, blockSizeInBytes - bytesRead); - if ( read1 == -1 ) + if (read1 == -1) validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes); else bytesRead += read1; } - if ( GeneralUtils.DEBUG_MODE_ENABLED && nReadAttempts > 1 ) { // TODO -- remove me + if (GeneralUtils.DEBUG_MODE_ENABLED && nReadAttempts > 1) { // TODO -- remove me System.err.println("Required multiple read attempts to actually get the entire BCF2 block, unexpected behavior"); } validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes); - } catch ( IOException e ) { + } catch (final IOException e) { throw new TribbleException("I/O error while reading BCF2 file", e); } @@ -349,14 +445,40 @@ private static byte[] readRecordBytes(final int blockSizeInBytes, final InputStr private static void validateReadBytes(final int actuallyRead, final int nReadAttempts, final int expected) { assert expected >= 0; - if ( actuallyRead < expected ) { + if (actuallyRead < expected) { throw new TribbleException( - String.format("Failed to read 
next complete record: expected %d bytes but read only %d after %d iterations", - expected, actuallyRead, nReadAttempts)); + String.format("Failed to read next complete record: expected %d bytes but read only %d after %d iterations", + expected, actuallyRead, nReadAttempts)); } } public final byte readTypeDescriptor() throws IOException { - return BCF2Utils.readByte(recordStream); + return (byte) recordStream.read(); + } + + + // ---------------------------------------------------------------------- + // + // Version specific behavior + // + // ---------------------------------------------------------------------- + + + public abstract int getPaddingValue(final BCF2Type type); + + public static class BCF2_1Decoder extends BCF2Decoder { + + @Override + public int getPaddingValue(final BCF2Type type) { + return type.getMissingBytes(); + } + } + + public static class BCF2_2Decoder extends BCF2Decoder { + + @Override + public int getPaddingValue(final BCF2Type type) { + return type.getEOVBytes(); + } } } diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java b/src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java new file mode 100644 index 0000000000..7b30da8643 --- /dev/null +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java @@ -0,0 +1,283 @@ +package htsjdk.variant.bcf2; + +import htsjdk.tribble.TribbleException; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFSimpleHeaderLine; + +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.BiConsumer; +import java.util.stream.Collectors; + +/** + * Dictionary of strings or contigs for use with a BCF file. + *

    + * Provides an Integer -> String map interface, but determines during construction whether + * mapping can be stored as an array (if it can be stored as a dense array) or + * it must be stored using a map. + *

    + * This class validates that IDX fields are used as required by the BCF 2.2 spec, namely + * that either all lines of a given dictionary type (contig or FORMAT/INFO/FILTER) have + * IDX fields or none do. + *

    + * The spec does not require a 1-to-1 IDX-to-string mapping, but logically a header with a + * 1-to-n IDX-to-string mapping would be unparsable, and we reject such headers, while an + * n-to-1 IDX-to-string mapping might result from tools that do not deduplicate IDXs, so + * we accept them. + */ +public abstract class BCF2Dictionary extends AbstractMap { + + /** + * Create and return a BCF string dictionary + * The dictionary is an ordered list of common VCF identifiers (FILTER, INFO, and FORMAT) fields. + *

    + * Note that it's critical that the list be dedupped and sorted in a consistent manner each time, + * as the BCF2 offsets are encoded relative to this dictionary, and if it isn't determined exactly + * the same way as in the header each time it's very bad + * + * @param vcfHeader VCFHeader containing the strings to be stored + * @param version BCF version for which the dictionary will be used + * @return BCF2Dictionary suitable for use with a BCF file + */ + public static BCF2Dictionary makeBCF2StringDictionary(final VCFHeader vcfHeader, final BCFVersion version) { + final List headerLines = vcfHeader.getMetaDataInInputOrder().stream() + .filter(BCF2Dictionary::isStringDictionaryDefining) + .map(l -> (VCFSimpleHeaderLine) l) + .collect(Collectors.toList()); + + return BCF2Dictionary.makeDictionary(headerLines, version, true); + } + + private static boolean isStringDictionaryDefining(final VCFHeaderLine line) { + switch (line.getKey()) { + case VCFConstants.INFO_HEADER_KEY: + case VCFConstants.FORMAT_HEADER_KEY: + case VCFConstants.FILTER_HEADER_KEY: + return true; + default: + return false; + } + } + + /** + * Create and return a BCF contig dictionary + * + * @param vcfHeader VCFHeader containing the contig header lines to be stored + * @param version BCF version for which the dictionary will be used + * @return BCF2Dictionary suitable for use with a BCF file + */ + public static BCF2Dictionary makeBCF2ContigDictionary(final VCFHeader vcfHeader, final BCFVersion version) { + return BCF2Dictionary.makeDictionary(vcfHeader.getContigLines(), version, false); + } + + private static BCF2Dictionary makeDictionary( + final List headerLines, + final BCFVersion version, + final boolean stringDictionary + ) { + if (headerLines.isEmpty()) { + return new BCF2DenseDictionary(Collections.emptyList()); + } + + // Note that we count FILTER/FORMAT/INFO header lines with the same ID but different key + // (e.g. 
a FORMAT line and an INFO line both with ID "A") to define the same string + // for the purposes of building the dictionary + // c.f. https://github.com/samtools/hts-specs/issues/591#issuecomment-904487133 + final Set seen = new HashSet<>(headerLines.size() + 1); + + if (stringDictionary) { + // Special case the special PASS field which may not show up in the FILTER field definitions + seen.add(VCFConstants.PASSES_FILTERS_v4); + } + + // Check version and possibly peek at first value to see if lines should contain IDX fields or not + final boolean shouldHaveIDX = version.getMinorVersion() > 1 && + headerLines.get(0).getGenericFieldValue(BCF2Codec.IDXField) != null; + + // Validate + for (final VCFSimpleHeaderLine headerLine : headerLines) { + final String idxString = headerLine.getGenericFieldValue(BCF2Codec.IDXField); + if ((idxString == null) == shouldHaveIDX) { + // If any line had an IDX then they all should + throw new TribbleException.InvalidHeader(String.format( + "Inconsistent IDX field usage in BCF file %s header line %s, %s", + headerLine.getKey(), + headerLine.getID(), + shouldHaveIDX ? "did not find expected IDX field" : "unexpected IDX field" + )); + } + } + + if (shouldHaveIDX) { + final HashMap strings = new HashMap<>(headerLines.size() + 1); + int maxIDX = 0; + if (stringDictionary) { + strings.put(0, VCFConstants.PASSES_FILTERS_v4); + } + + for (final VCFSimpleHeaderLine line : headerLines) { + final String id = line.getID(); + final int IDX = Integer.parseUnsignedInt(line.getGenericFieldValue(BCF2Codec.IDXField)); + if (!seen.contains(id)) { + seen.add(id); + maxIDX = Math.max(maxIDX, IDX); + strings.put(IDX, line.getID()); + } + + // Have we seen this IDX before with a different string? 
+ if (strings.containsKey(IDX)) { + final String oldString = strings.get(IDX); + if (!oldString.equals(id)) { + throw new TribbleException.InvalidHeader(String.format( + "IDX %d associated with multiple dictionary defining strings: %s and %s", + IDX, oldString, id + )); + } + } + } + if (maxIDX == seen.size() - 1) { + // By the pigeonhole principle, if we have N unique non-negative IDXs numbered starting from 0 + // (possibly including 0 -> PASS implicitly) and (N - 1) is the highest IDX we have seen, + // we have all the IDXs in [0, N), which we can represent as a length N dense array. + // This check is useful because bcftools will always add IDX fields to headers even when not + // strictly necessary, so we can avoid the cost of the hash map in many cases. + final ArrayList stringsList = new ArrayList<>(seen.size()); + strings.forEach(stringsList::add); + return new BCF2DenseDictionary(stringsList); + } else { + return new BCF2SparseDictionary(strings); + } + } else { + final ArrayList strings = new ArrayList<>(headerLines.size() + 1); + if (stringDictionary) { + strings.add(VCFConstants.PASSES_FILTERS_v4); + } + + for (final VCFSimpleHeaderLine line : headerLines) { + final String id = line.getID(); + if (!seen.contains(id)) { + strings.add(line.getID()); + seen.add(id); + } + } + return new BCF2DenseDictionary(strings); + } + } + + /** + * Additional method in interface to avoid boxing when indexing into a + * dictionary backed by a List + * + * @param i index + * @return the string associated with the index or null + */ + public abstract String get(final int i); + + /** + * BCF 2.2 dense sequence dictionary. Strings are assigned an index corresponding to its position in a 0-indexed + * array. This dictionary is used if no IDX fields are present in the header, or they are present, but they + * represent a set of indices that are of the form 0, 1, ..., n, that is, the set has no gaps and is numbered + * starting at 0. 
+ */ + private static class BCF2DenseDictionary extends BCF2Dictionary { + + private final List dictionary; + + private BCF2DenseDictionary(final List dictionary) { + this.dictionary = dictionary; + } + + @Override + public Set> entrySet() { + final Set> set = new HashSet<>(dictionary.size()); + int i = 0; + for (final String s : dictionary) { + set.add(new AbstractMap.SimpleEntry<>(i, s)); + i++; + } + return set; + } + + @Override + public String get(final int i) { + return i < 0 || i >= dictionary.size() ? null : dictionary.get(i); + } + + @Override + public String get(final Object key) { + return dictionary.get((Integer) key); + } + + @Override + public int size() { + return dictionary.size(); + } + + @Override + public boolean isEmpty() { + return dictionary.isEmpty(); + } + + @Override + public void forEach(final BiConsumer action) { + int i = 0; + for (final String s : dictionary) { + action.accept(i, s); + i++; + } + } + } + + /** + * BCF 2.2 sparse dictionary. Strings are assigned an index corresponding to its line's IDX field. + * This dictionary is used if IDX fields are present in the header, and they represent a set of + * indices that is not of the form 0, 1, ..., n, that is, the set has gaps or is not numbered starting + * at 0. 
+ */ + private static class BCF2SparseDictionary extends BCF2Dictionary { + + private final Map dictionary; + + private BCF2SparseDictionary(final Map dictionary) { + this.dictionary = dictionary; + } + + @Override + public Set> entrySet() { + return dictionary.entrySet(); + } + + @Override + public String get(final int i) { + return dictionary.get(i); + } + + @Override + public String get(final Object key) { + return dictionary.get(key); + } + + @Override + public int size() { + return dictionary.size(); + } + + @Override + public boolean isEmpty() { + return dictionary.isEmpty(); + } + + @Override + public void forEach(final BiConsumer action) { + dictionary.forEach(action); + } + } +} diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java new file mode 100644 index 0000000000..ae99f8e7e1 --- /dev/null +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java @@ -0,0 +1,386 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package htsjdk.variant.bcf2; + +import htsjdk.samtools.util.ListByteBufferOutputStream; +import htsjdk.tribble.TribbleException; + +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +/** + * See #BCFWriter for documentation on this classes role in encoding BCF2 files + * + * @author Mark DePristo + * @since 06/12 + */ +public abstract class BCF2Encoder { + // TODO -- increase default size? + public static final int WRITE_BUFFER_INITIAL_SIZE = 16_384; + protected final ListByteBufferOutputStream encodeStream = new ListByteBufferOutputStream(WRITE_BUFFER_INITIAL_SIZE); + + public static BCF2Encoder getEncoder(final BCFVersion version) { + switch (version.getMinorVersion()) { + case 1: + return new BCF2_1Encoder(); + case 2: + return new BCF2_2Encoder(); + default: + throw new TribbleException("BCF2Codec can only process BCF2 files with minor version <= " + 2 + " but this file has minor version " + version.getMinorVersion()); + } + } + + + // -------------------------------------------------------------------------------- + // + // Functions to return the data being encoded here + // + // -------------------------------------------------------------------------------- + + /** + * This allocates a new array and copies the stream's contents over so it + * should not be used in the actual encoder, but may be useful for testing + */ + public byte[] getRecordBytes() { + final byte[] bytes = encodeStream.toByteArray(); + encodeStream.reset(); + return bytes; + } + + public final int getSize() { + return encodeStream.size(); + } + + public 
final void write(final OutputStream out) throws IOException { + encodeStream.writeTo(out); + encodeStream.reset(); + } + + + // -------------------------------------------------------------------------------- + // + // Writing typed values (writes out typing byte(s) first) + // + // -------------------------------------------------------------------------------- + + public final void encodeTypedMissing(final BCF2Type type) throws IOException { + encodeType(0, type); + } + + public final void encodeTyped(final Object value, final BCF2Type type) throws IOException { + if (value == null) + encodeTypedMissing(type); + else { + switch (type) { + case INT8: + case INT16: + case INT32: + encodeTypedInt((Integer) value, type); + break; + case FLOAT: + encodeTypedFloat((Double) value); + break; + case CHAR: + encodeTypedString((String) value); + break; + default: + throw new IllegalArgumentException("Illegal type encountered " + type); + } + } + } + + public final void encodeTypedInt(final int v) throws IOException { + final BCF2Type type = BCF2Utils.determineIntegerType(v); + encodeTypedInt(v, type); + } + + public final void encodeTypedInt(final int v, final BCF2Type type) throws IOException { + encodeType(1, type); + encodeRawInt(v, type); + } + + public final void encodeTypedFloat(final double v) throws IOException { + encodeType(1, BCF2Type.FLOAT); + encodeRawFloat(v); + } + + public final void encodeTypedString(final String s) throws IOException { + encodeTypedString(s.getBytes(StandardCharsets.UTF_8)); + } + + public final void encodeTypedString(final byte[] s) throws IOException { + encodeType(s.length, BCF2Type.CHAR); + encodeStream.write(s); + } + + public final void encodeTypedVecInt(final int[] vs) throws IOException { + final int size = vs.length; + final BCF2Type type = BCF2Utils.determineIntegerType(vs); + encodeType(size, type); + encodeRawVecInt(vs, size, type); + } + + + public final void encodeTypedVecInt(final int[] vs, final int paddedSize) throws 
IOException { + final BCF2Type type = BCF2Utils.determineIntegerType(vs); + encodeType(paddedSize, type); + encodeRawVecInt(vs, paddedSize, type); + } + + // TODO only used in testing, should remove and update tests + public final void encodeTyped(final List v, final BCF2Type type) throws IOException { + if (type == BCF2Type.CHAR && !v.isEmpty()) { + encodeTypedString(compactStrings((List) v)); + } else { + encodeType(v.size(), type); + encodeRawValues(v, type); + } + } + + + // -------------------------------------------------------------------------------- + // + // Writing raw values (does not write out typing byte(s)) + // + // -------------------------------------------------------------------------------- + + public final void encodeRawValues(final Collection v, final BCF2Type type) throws IOException { + for (final T v1 : v) { + encodeRawValue(v1, type); + } + } + + public final void encodeRawValue(final T value, final BCF2Type type) throws IOException { + try { + if (value == type.getMissingJavaValue()) + encodeRawMissingValue(type); + else { + switch (type) { + case INT8: + case INT16: + case INT32: + encodeRawBytes((Integer) value, type); + break; + case FLOAT: + encodeRawFloat((Double) value); + break; + case CHAR: + encodeRawChar((Byte) value); + break; + default: + throw new IllegalArgumentException("Illegal type encountered " + type); + } + } + } catch (final ClassCastException e) { + throw new ClassCastException("BUG: invalid type cast to " + type + " from " + value); + } + } + + public final void encodeRawMissingValue(final BCF2Type type) throws IOException { + encodeRawBytes(type.getMissingBytes(), type); + } + + + // -------------------------------------------------------------------------------- + // + // Low-level encoders + // + // -------------------------------------------------------------------------------- + + public final void encodeType(final int size, final BCF2Type type) throws IOException { + if (size <= BCF2Utils.MAX_INLINE_ELEMENTS) 
{ + final int typeByte = BCF2Utils.encodeTypeDescriptor(size, type); + encodeStream.write(typeByte); + } else { + final int typeByte = BCF2Utils.encodeTypeDescriptor(BCF2Utils.OVERFLOW_ELEMENT_MARKER, type); + encodeStream.write(typeByte); + // write in the overflow size + encodeTypedInt(size); + } + } + + public final void encodeRawBytes(final int v, final BCF2Type type) throws IOException { + type.write(v, encodeStream); + } + + public final void encodeRawInt(final int v, final BCF2Type type) throws IOException { + type.write(v, encodeStream); + } + + public final void encodeRawFloat(final double v) throws IOException { + encodeRawBytes(Float.floatToIntBits((float) v), BCF2Type.FLOAT); + } + + public final void encodeRawChar(final byte c) { + encodeStream.write(c); + } + + public final void encodeRawString(final byte[] s, final int paddedSize) { + encodeStream.write(s); + final int padding = paddedSize - s.length; + if (padding > 0) { + // Pad with zeros, see https://github.com/samtools/hts-specs/issues/232 + encodeStream.write((byte) 0, padding); + } + } + + public final void encodeRawVecInt(final int[] vs, final int paddedSize, final BCF2Type type) throws IOException { + for (final int v : vs) { + type.write(v, encodeStream); + } + encodePaddingValues(paddedSize - vs.length, type); + } + + public final void encodeRawVecInt(final List vs, final BCF2Type type) throws IOException { + for (final Integer v : vs) { + if (v == null) { + type.write(type.getMissingBytes(), encodeStream); + } else { + type.write(v, encodeStream); + } + } + } + + public final void encodeRawVecInt(final List vs, final int paddedSize, final BCF2Type type) throws IOException { + encodeRawVecInt(vs, type); + encodePaddingValues(paddedSize - vs.size(), type); + } + + public final void encodeRawVecFloat(final double[] vs, final int paddedSize) throws IOException { + for (final double v : vs) { + encodeRawFloat(v); + } + encodePaddingValues(paddedSize - vs.length, BCF2Type.FLOAT); + } + + public 
final void encodeRawVecFloat(final List vs) throws IOException { + for (final Double v : vs) { + if (v == null) { + encodeRawMissingValue(BCF2Type.FLOAT); + } else { + encodeRawFloat(v); + } + } + } + + public final void encodeRawVecFloat(final List vs, final int paddedSize) throws IOException { + encodeRawVecFloat(vs); + encodePaddingValues(paddedSize - vs.size(), BCF2Type.FLOAT); + } + + public final void encodePaddingValues(final int size, final BCF2Type type) throws IOException { + for (int i = 0; i < size; i++) { + encodePaddingValue(type); + } + } + + public abstract void encodePaddingValue(final BCF2Type type) throws IOException; + + // -------------------------------------------------------------------------------- + // + // Utility Functions + // + // -------------------------------------------------------------------------------- + + public final byte[] compactStrings(final String[] strings) { + return compactStrings(Arrays.asList(strings)); + } + + public abstract byte[] compactStrings(final List strings); + + + // -------------------------------------------------------------------------------- + // + // Version specific behavior + // + // -------------------------------------------------------------------------------- + + public static class BCF2_1Encoder extends BCF2Encoder { + + @Override + public void encodePaddingValue(final BCF2Type type) throws IOException { + type.write(type.getMissingBytes(), encodeStream); + } + + @Override + public byte[] compactStrings(final List strings) { + if (strings.isEmpty()) return new byte[0]; + + // 1 comma for each string, then add on individual string lengths + int size = strings.size(); + final byte[][] bytes = new byte[strings.size()][]; + int i = 0; + for (final String s : strings) { + final byte[] b = s.getBytes(StandardCharsets.UTF_8); + size += b.length; + bytes[i++] = b; + } + final ByteBuffer buff = ByteBuffer.allocate(size); + for (final byte[] bs : bytes) { + buff.put((byte) ','); + buff.put(bs); + } + + 
return buff.array(); + } + } + + public static class BCF2_2Encoder extends BCF2Encoder { + + @Override + public void encodePaddingValue(final BCF2Type type) throws IOException { + type.write(type.getEOVBytes(), encodeStream); + } + + @Override + public byte[] compactStrings(final List strings) { + if (strings.isEmpty()) return new byte[0]; + + // 1 comma for each string except the first, then add on individual string lengths + int size = strings.size() - 1; + final byte[][] bytes = new byte[strings.size()][]; + int i = 0; + for (final String s : strings) { + final byte[] b = s.getBytes(StandardCharsets.UTF_8); + size += b.length; + bytes[i++] = b; + } + final ByteBuffer buff = ByteBuffer.allocate(size); + buff.put(bytes[0]); + for (int j = 1; j < strings.size(); j++) { + buff.put((byte) ','); + buff.put(bytes[j]); + } + + return buff.array(); + } + } +} diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java new file mode 100644 index 0000000000..3a6aeae2cb --- /dev/null +++ b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java @@ -0,0 +1,314 @@ +package htsjdk.variant.bcf2.BCF2FieldWriter; + +import htsjdk.tribble.TribbleException; +import htsjdk.variant.bcf2.BCF2Encoder; +import htsjdk.variant.bcf2.BCF2Type; +import htsjdk.variant.bcf2.BCF2Utils; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFCompoundHeaderLine; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +abstract class BCF2FieldEncoder { + + final BCF2Encoder encoder; + + BCF2Type type; + + /* + The number of VCF values this encoder has seen, taking the maximum over all objects loaded. 
+ This value is not identical to either the number of Java objects loaded or the BCF2 typing byte length + but is primarily useful for checking that the number of VCF values matches the header's declared count. + + For example, for a writer of type Character having loaded the String "abc", nValues is 3 matching its typing byte, + while for a writer of type String having loaded the String "abc", nValues is 1, but its typing byte length is 3. + */ + int nValues; + + BCF2FieldEncoder(final BCF2Encoder encoder) { + this.encoder = encoder; + } + + abstract void load(final Object o); + + void encodeType() throws IOException { + encoder.encodeType(nValues, type); + } + + void checkNValues(final VCFCompoundHeaderLine headerLine, final VariantContext vc) { + final int expectedValues = headerLine.getCount(vc); + if (nValues > expectedValues) + throw BCF2FieldWriter.tooManyValues(nValues, expectedValues, headerLine.getKey(), vc); + nValues = expectedValues; + } + + abstract void encode() throws IOException; + + + static class AtomicIntFieldEncoder extends BCF2FieldEncoder { + + private final List vs = new ArrayList<>(); + + AtomicIntFieldEncoder(final BCF2Encoder encoder) { + super(encoder); + type = BCF2Type.INT8; + nValues = 1; + } + + @Override + void load(final Object o) { + if (o == null) { + vs.add(null); + } else if (o instanceof Integer) { + final Integer v = (Integer) o; + type = BCF2Utils.maxIntegerType(type, BCF2Utils.determineIntegerType(v)); + vs.add(v); + } else { + throw BCF2FieldEncoder.incompatibleType(o, type); + } + } + + @Override + void encode() throws IOException { + encoder.encodeRawVecInt(vs, type); + vs.clear(); + type = BCF2Type.INT8; + } + } + + static class AtomicFloatFieldEncoder extends BCF2FieldEncoder { + + private final List vs = new ArrayList<>(); + + AtomicFloatFieldEncoder(final BCF2Encoder encoder) { + super(encoder); + type = BCF2Type.FLOAT; + nValues = 1; + } + + @Override + void load(final Object o) { + if (o == null) { + vs.add(null); 
+ } else if (o instanceof Double) { + vs.add((Double) o); + } else { + throw BCF2FieldEncoder.incompatibleType(o, type); + } + } + + @Override + void encode() throws IOException { + encoder.encodeRawVecFloat(vs); + vs.clear(); + } + } + + static class CharFieldEncoder extends BCF2FieldEncoder { + + private static final byte[] EMPTY = new byte[0]; + + private final List vs = new ArrayList<>(); + + CharFieldEncoder(final BCF2Encoder encoder) { + super(encoder); + type = BCF2Type.CHAR; + nValues = 0; + } + + @Override + void load(final Object o) { + if (o == null) { + vs.add(EMPTY); + } else if (o instanceof String) { + final byte[] b = ((String) o).getBytes(StandardCharsets.UTF_8); + nValues = Math.max(nValues, b.length); + vs.add(b); + } else { + throw BCF2FieldEncoder.incompatibleType(o, type); + } + } + + @Override + void encode() { + for (final byte[] v : vs) { + encoder.encodeRawString(v, nValues); + } + vs.clear(); + nValues = 0; + } + } + + static class StringFieldEncoder extends BCF2FieldEncoder { + + private static final byte[] EMPTY = new byte[0]; + + private final List vs = new ArrayList<>(); + private int charLength; + + StringFieldEncoder(final BCF2Encoder encoder) { + super(encoder); + type = BCF2Type.CHAR; + nValues = 0; + charLength = 0; + } + + @Override + void load(final Object o) { + if (o == null) { + vs.add(EMPTY); + } else { + final byte[] v; + final int stringsSeen; + if (o instanceof String) { + v = ((String) o).getBytes(StandardCharsets.UTF_8); + stringsSeen = 1; + } else if (o instanceof List) { + final List strings = (List) o; + v = encoder.compactStrings(strings); + stringsSeen = strings.size(); + } else if (o instanceof String[]) { + final String[] strings = (String[]) o; + v = encoder.compactStrings(strings); + stringsSeen = strings.length; + } else { + throw BCF2FieldEncoder.incompatibleType(o, type); + } + + vs.add(v); + nValues = Math.max(nValues, stringsSeen); + charLength = Math.max(charLength, v.length); + } + } + + @Override + 
void encodeType() throws IOException { + encoder.encodeType(charLength, type); + } + + @Override + void encode() { + for (final byte[] v : vs) { + encoder.encodeRawString(v, charLength); + } + vs.clear(); + nValues = 0; + charLength = 0; + } + } + + static class VecIntFieldEncoder extends BCF2FieldEncoder { + + private final List vs = new ArrayList<>(); + + VecIntFieldEncoder(final BCF2Encoder encoder) { + super(encoder); + type = BCF2Type.INT8; + nValues = 0; + } + + @Override + void load(final Object o) { + if (o != null) { + if (o instanceof List) { + final List v = (List) o; + type = BCF2Utils.maxIntegerType(type, BCF2Utils.determineIntegerType(v)); + nValues = Math.max(nValues, v.size()); + } else if (o instanceof Integer) { + final Integer v = (Integer) o; + type = BCF2Utils.maxIntegerType(type, BCF2Utils.determineIntegerType(v)); + nValues = Math.max(nValues, 1); + } else if (o instanceof int[]) { + final int[] v = (int[]) o; + type = BCF2Utils.maxIntegerType(type, BCF2Utils.determineIntegerType(v)); + nValues = Math.max(nValues, v.length); + } else { + // TODO do we need to support Integer[] ? 
+ throw BCF2FieldEncoder.incompatibleType(o, type); + } + } + vs.add(o); + } + + @Override + void encode() throws IOException { + for (final Object o : vs) { + if (o == null) { + encoder.encodePaddingValues(nValues, type); + } else if (o instanceof List) { + final List v = (List) o; + encoder.encodeRawVecInt(v, nValues, type); + } else if (o instanceof Integer) { + final Integer v = (Integer) o; + encoder.encodeRawInt(v, type); + encoder.encodePaddingValues(nValues - 1, type); + } else if (o instanceof int[]) { + final int[] v = (int[]) o; + encoder.encodeRawVecInt(v, nValues, type); + } + } + vs.clear(); + type = BCF2Type.INT8; + nValues = 0; + } + } + + static class VecFloatFieldEncoder extends BCF2FieldEncoder { + + private final List vs = new ArrayList<>(); + + VecFloatFieldEncoder(final BCF2Encoder encoder) { + super(encoder); + type = BCF2Type.FLOAT; + nValues = 0; + } + + @Override + void load(final Object o) { + if (o != null) { + if (o instanceof List) { + final List v = (List) o; + nValues = Math.max(nValues, v.size()); + } else if (o instanceof Double) { + nValues = Math.max(nValues, 1); + } else if (o instanceof double[]) { + final double[] v = (double[]) o; + nValues = Math.max(nValues, v.length); + } else { + // TODO do we need to support Double[] ? 
+ throw BCF2FieldEncoder.incompatibleType(o, type); + } + } + vs.add(o); + } + + @Override + void encode() throws IOException { + for (final Object o : vs) { + if (o == null) { + encoder.encodePaddingValues(nValues, type); + } else if (o instanceof List) { + final List v = (List) o; + encoder.encodeRawVecFloat(v, nValues); + } else if (o instanceof Double) { + final Double v = (Double) o; + encoder.encodeRawFloat(v); + encoder.encodePaddingValues(nValues - 1, BCF2Type.FLOAT); + } else if (o instanceof double[]) { + final double[] v = (double[]) o; + encoder.encodeRawVecFloat(v, nValues); + } + } + vs.clear(); + nValues = 0; + } + } + + static TribbleException incompatibleType(final Object o, final BCF2Type type) { + final String error = "Could not write object: %s whose type is incompatible with declared header of type: %s"; + return new TribbleException(String.format(error, o, type)); + } +} diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java new file mode 100644 index 0000000000..3b645bf981 --- /dev/null +++ b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java @@ -0,0 +1,515 @@ +package htsjdk.variant.bcf2.BCF2FieldWriter; + +import htsjdk.tribble.TribbleException; +import htsjdk.variant.bcf2.BCF2Type; +import htsjdk.variant.bcf2.BCF2Utils; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.bcf2.BCF2Encoder; +import htsjdk.variant.vcf.VCFCompoundHeaderLine; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import 
java.util.List; + +/** + * INFO and FORMAT writers + */ +class BCF2FieldWriter { + final VCFCompoundHeaderLine headerLine; + final int dictionaryOffset; + final BCF2Type dictionaryOffsetType; + final String key; + final BCF2Encoder encoder; + + BCF2FieldWriter(final VCFCompoundHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + this.headerLine = headerLine; + this.dictionaryOffset = dictionaryOffset; + this.dictionaryOffsetType = BCF2Utils.determineIntegerType(dictionaryOffset); + this.key = headerLine.getID(); + this.encoder = encoder; + } + + /** + * This should be called before encoding every VariantContext in both INFO and FORMAT writers + */ + void encodeKey() throws IOException { + encoder.encodeTypedInt(dictionaryOffset, dictionaryOffsetType); + } + + + ////////////////////////////////////////////////// + // Factory Methods // + ////////////////////////////////////////////////// + static SiteWriter createSiteWriter( + final VCFInfoHeaderLine line, + final int offset, + final BCF2Encoder encoder + ) { + return line.getType() == VCFHeaderLineType.Flag + ? 
new SiteFlagWriter(line, offset, encoder) + : new SiteAttributeWriter(line, offset, encoder); + } + + static GenotypeWriter createGenotypeWriter( + final VCFFormatHeaderLine line, + final int offset, + final BCF2Encoder encoder + ) { + // Specialized writers for fields stored inline in the Genotype and not in its attributes map + switch (line.getID()) { + case VCFConstants.GENOTYPE_KEY: + return new GTWriter(line, offset, encoder); + case VCFConstants.GENOTYPE_FILTER_KEY: + return new FTWriter(line, offset, encoder); + case VCFConstants.DEPTH_KEY: + return new DPWriter(line, offset, encoder); + case VCFConstants.GENOTYPE_QUALITY_KEY: + return new GQWriter(line, offset, encoder); + case VCFConstants.GENOTYPE_ALLELE_DEPTHS: + return new ADWriter(line, offset, encoder); + case VCFConstants.GENOTYPE_PL_KEY: + return new PLWriter(line, offset, encoder); + } + + if (line.getType() == VCFHeaderLineType.Flag) { + throw new TribbleException("Format lines cannot have type Flag"); + } else { + return new GenotypeAttributeWriter(line, offset, encoder); + } + } + + private static BCF2FieldEncoder getEncoder(final VCFCompoundHeaderLine line, final BCF2Encoder encoder) { + switch (line.getType()) { + case Integer: + return line.isFixedCount() && line.getCount() == 1 + ? new BCF2FieldEncoder.AtomicIntFieldEncoder(encoder) + : new BCF2FieldEncoder.VecIntFieldEncoder(encoder); + case Float: + return line.isFixedCount() && line.getCount() == 1 + ? 
new BCF2FieldEncoder.AtomicFloatFieldEncoder(encoder) + : new BCF2FieldEncoder.VecFloatFieldEncoder(encoder); + case String: + return new BCF2FieldEncoder.StringFieldEncoder(encoder); + case Character: + return new BCF2FieldEncoder.CharFieldEncoder(encoder); + default: + throw new TribbleException("Unrecognized line type: " + line.getType()); + } + } + + + /** + * Class that writes one field specified by a {@link VCFInfoHeaderLine} + * contained the attributes map of a {@link VariantContext} + */ + abstract static class SiteWriter extends BCF2FieldWriter { + + SiteWriter(final VCFInfoHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + abstract void encode(final VariantContext vc) throws IOException; + } + + /** + * INFO writer that accesses variant context fields stored in the VC's attributes map + */ + static class SiteAttributeWriter extends SiteWriter { + + private final BCF2FieldEncoder siteEncoder; + private final boolean boundedNonAtomic; + + SiteAttributeWriter(final VCFInfoHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + this.siteEncoder = BCF2FieldWriter.getEncoder(headerLine, encoder); + + // If this line's count is unbounded, or the inner encoder is one of the atomic specializations, + // the inner encoder can always figure out the correct number of BCF2 values to write out by itself. 
+ // Otherwise we need to inspect the context to determine the number of values to encode + // and possibly error if too many values were provided + this.boundedNonAtomic = headerLine.getCountType() != VCFHeaderLineCount.UNBOUNDED && !( + siteEncoder instanceof BCF2FieldEncoder.AtomicIntFieldEncoder || siteEncoder instanceof BCF2FieldEncoder.AtomicFloatFieldEncoder + ); + } + + @Override + void encode(final VariantContext vc) throws IOException { + final Object o = vc.getAttribute(key); + if (o == null) { + encoder.encodeTypedMissing(siteEncoder.type); + } else { + siteEncoder.load(o); + if (boundedNonAtomic) { + siteEncoder.checkNValues(headerLine, vc); + } + + siteEncoder.encodeType(); + siteEncoder.encode(); + } + } + } + + /** + * INFO writer that accesses Flags stored in the VariantContext's attributes map + */ + static class SiteFlagWriter extends SiteWriter { + + SiteFlagWriter(final VCFInfoHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + void encode(final VariantContext vc) throws IOException { + // This seems unintuitive, but it matches the behavior of htslib/bcftools + // See https://github.com/samtools/hts-specs/issues/384 + encoder.encodeRawBytes(0, BCF2Type.INT8); + } + } + + + // TODO in the genotype writers, a missing genotype (one where variantContext.getGenotype(sampleName) == null) + // is treated like one where all its attributes/inline fields are missing, this matches the behavior + // of the old writer, which previously created a new empty Genotype object for each missing genotypes, is this right? + // For example, should the FT string of a missing genotype be PASS or a padded empty string + + /** + * Class that writes one field specified by a {@link VCFFormatHeaderLine} + * from all Genotypes contained inside a {@link VariantContext}, iterating through each Genotype in order. + *

    + * Writing occurs in two passes: first all the attribute objects are loaded into the lower level + * {@link BCF2FieldEncoder} then the attributes are written out. This is necessary as some aspects of the BCF + * encoding such as type and sometimes count can only be determined by inspecting all elements to be written. + */ + abstract static class GenotypeWriter extends BCF2FieldWriter { + + GenotypeWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + abstract void encode(final VariantContext vc, final List sampleNames) throws IOException; + } + + /** + * FORMAT writer that accesses genotype fields stored in the Genotype object's attributes map + */ + static class GenotypeAttributeWriter extends GenotypeWriter { + + private final BCF2FieldEncoder siteEncoder; + private final boolean boundedNonAtomic; + + GenotypeAttributeWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + this.siteEncoder = BCF2FieldWriter.getEncoder(headerLine, encoder); + + // If this line's count is unbounded, or the inner encoder is one of the atomic specializations, + // the inner encoder can always figure out the correct number of BCF2 values to write out by itself. + // Otherwise we need to inspect the context to determine the number of values to encode + // and possibly error if too many values were provided + this.boundedNonAtomic = headerLine.getCountType() != VCFHeaderLineCount.UNBOUNDED && !( + siteEncoder instanceof BCF2FieldEncoder.AtomicIntFieldEncoder || siteEncoder instanceof BCF2FieldEncoder.AtomicFloatFieldEncoder + ); + } + + @Override + void encode(final VariantContext vc, final List sampleNames) throws IOException { + for (final String s : sampleNames) { + final Genotype g = vc.getGenotype(s); + siteEncoder.load(g == null ? 
null : g.getExtendedAttribute(key)); + } + + if (boundedNonAtomic) { + siteEncoder.checkNValues(headerLine, vc); + } + + siteEncoder.encodeType(); + siteEncoder.encode(); + } + } + + /** + * Base class for FORMAT writers that access genotype fields stored directly + * as int fields in the Genotype object and not inside the attributes map. + */ + abstract static class GenotypeInlineAtomicIntWriter extends GenotypeWriter { + + // Used to store values to write out to avoid boxing + private int[] vs; + + GenotypeInlineAtomicIntWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + void encode(final VariantContext vc, final List sampleNames) throws IOException { + if (vs == null || vs.length < sampleNames.size()) { + vs = new int[sampleNames.size()]; + } + + BCF2Type type = BCF2Type.INT8; + int i = 0; + + for (final String s : sampleNames) { + final Genotype g = vc.getGenotype(s); + final int v = g == null ? 
-1 : get(g); + if (v != -1) { + type = BCF2Utils.maxIntegerType(type, BCF2Utils.determineIntegerType(v)); + } + vs[i++] = v; + } + + encoder.encodeType(1, type); + + for (int j = 0; j < i; j++) { + final int v = vs[j]; + if (v == -1) { + encoder.encodeRawMissingValue(type); + } else { + encoder.encodeRawInt(v, type); + } + } + } + + abstract int get(final Genotype g); + } + + static class DPWriter extends GenotypeInlineAtomicIntWriter { + + DPWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + int get(final Genotype g) { + return g.getDP(); + } + } + + static class GQWriter extends GenotypeInlineAtomicIntWriter { + + GQWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + int get(final Genotype g) { + return g.getGQ(); + } + } + + /** + * Base class for FORMAT writers that access genotype fields stored directly + * as int[] fields in the Genotype object and not inside the attributes map. + */ + abstract static class GenotypeInlineVecIntWriter extends GenotypeWriter { + + private final List vs = new ArrayList<>(); + + GenotypeInlineVecIntWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + void encode(final VariantContext vc, final List sampleNames) throws IOException { + BCF2Type type = BCF2Type.INT8; + + // For both vector of int types represented as inline fields by htsjdk (AD and PL), + // the count type can be determined by inspecting the header + final int nValues = headerLine.getCount(vc); + + // Find narrowest integer type that fits all values + for (final String s : sampleNames) { + final Genotype g = vc.getGenotype(s); + final int[] v = g == null ? 
null : get(g); + vs.add(v); + + if (v == null) continue; + if (v.length > nValues) + throw BCF2FieldWriter.tooManyValues(v.length, nValues, key, vc); + + type = BCF2Utils.maxIntegerType(type, BCF2Utils.determineIntegerType(v)); + } + + encoder.encodeType(nValues, type); + + for (final int[] vs : vs) { + if (vs == null) { + encoder.encodePaddingValues(nValues, type); + } else { + encoder.encodeRawVecInt(vs, nValues, type); + } + } + vs.clear(); + } + + abstract int[] get(final Genotype g); + } + + static class ADWriter extends GenotypeInlineVecIntWriter { + + ADWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + int[] get(final Genotype g) { + return g.getAD(); + } + } + + static class PLWriter extends GenotypeInlineVecIntWriter { + + PLWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + int[] get(final Genotype g) { + return g.getPL(); + } + } + + /** + * Writer for the FT or filter field. This is a special case of the String writer + * where the type of the value is known to be String (and not List) + * and null values must be specially handled by encoding them as PASS. 
+ */ + static class FTWriter extends GenotypeWriter { + + private static final byte[] PASS = "PASS".getBytes(StandardCharsets.US_ASCII); + + private final List vs = new ArrayList<>(); + + FTWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + void encode(final VariantContext vc, final List sampleNames) throws IOException { + int nValues = 0; + + for (final String s : sampleNames) { + final Genotype g = vc.getGenotype(s); + final String f; + final byte[] v; + if (g == null || (f = g.getFilters()) == null) { + v = FTWriter.PASS; + } else { + v = f.getBytes(StandardCharsets.UTF_8); + } + nValues = Math.max(nValues, v.length); + vs.add(v); + } + + encoder.encodeType(nValues, BCF2Type.CHAR); + for (final byte[] v : vs) { + encoder.encodeRawString(v, nValues); + } + vs.clear(); + } + } + + /** + * Specialized writer for GT field. + */ + static class GTWriter extends GenotypeWriter { + + private final HashMap alleleMapForTriPlus = new HashMap<>(5); + private Allele ref, alt1; + + GTWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + void encode(final VariantContext vc, final List sampleNames) throws IOException { + buildAlleleMap(vc); + final int nValues = vc.getMaxPloidy(2); + // Offsets should always fit into a signed 8-bit integer but do this check anyway for spec compliance + final BCF2Type type = BCF2Utils.determineIntegerType(vc.getNAlleles() << 1); + + encoder.encodeType(nValues, type); + + for (final String s : sampleNames) { + final Genotype g = vc.getGenotype(s); + if (g != null) { + boolean notFirst = false; + for (final Allele a : g.getAlleles()) { + // TODO Genotype and Allele classes can't properly store phasing information for ploidy > 2 + // Currently all non ref alleles are assumed to have the same phasing + final int 
encoded = encodeAlleleWithoutPhasing(a) | ((g.isPhased() && notFirst) ? 0x01 : 0x00); + encoder.encodeRawInt(encoded, type); + notFirst = true; + } + // Pad with missing values if sample ploidy is less than maximum + final int padding = nValues - g.getPloidy(); + if (padding > 0) { + encoder.encodePaddingValues(padding, type); + } + } else { + // TODO read the spec more closely, look at htslib, this may not be correct + // Entirely missing genotype, which we encode as vector of no call + for (int i = 0; i < nValues; i++) { + encoder.encodeRawInt(0, type); + } + } + } + } + + /** + * Fast path code to encode an allele without phasing information. + * Inline tests for == against ref (most common, first test) + * == alt1 (second most common, second test) + * == NO_CALL (third) + * and finally in the map from allele => offset for all alt 2+ alleles + * + * @param a the allele we want to encode + * @return the encoded allele without phasing information + */ + private int encodeAlleleWithoutPhasing(final Allele a) { + if (a == ref) return 2; // ( 0 + 1) << 1 + else if (a == alt1) return 4; // ( 1 + 1) << 1 + else if (a == Allele.NO_CALL) return 0; // (-1 + 1) << 1 + else { + final Integer i = alleleMapForTriPlus.get(a); + if (i == null) throw new IllegalStateException("BUG: Couldn't find allele offset for allele " + a); + return i; + } + } + + private void buildAlleleMap(final VariantContext vc) { + // ref and alt1 are handled by a fast path when determining the offset + // so they do not need to be placed in the map + final int nAlleles = vc.getNAlleles(); + ref = vc.getReference(); + alt1 = nAlleles > 1 ? 
vc.getAlternateAllele(0) : null; + + if (nAlleles > 2) { + // for multi-allelics we need to clear the map, and add additional looks + alleleMapForTriPlus.clear(); + final List alleles = vc.getAlleles(); + for (int i = 2; i < alleles.size(); i++) { + // Perform encoding here so we only do it once instead of after every lookup + alleleMapForTriPlus.put(alleles.get(i), (i + 1) << 1); + } + } + } + } + + + ////////////////////////////////////////////////// + // Exception utilities // + ////////////////////////////////////////////////// + static TribbleException tooManyValues(final int observed, final int expected, final String key, final VariantContext vc) { + final String error = "Observed number of values: %d exceeds expected number: %d for attribute: %s in VariantContext: %s"; + return new TribbleException(String.format(error, observed, expected, key, vc)); + } +} diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriterManager.java b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriterManager.java new file mode 100644 index 0000000000..b73a88036d --- /dev/null +++ b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriterManager.java @@ -0,0 +1,106 @@ +package htsjdk.variant.bcf2.BCF2FieldWriter; + +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; +import htsjdk.variant.bcf2.BCF2Encoder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFCompoundHeaderLine; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import htsjdk.variant.vcf.VCFStandardHeaderLines; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class BCF2FieldWriterManager { + private static final Log log = Log.getInstance(BCF2FieldWriterManager.class); + + private 
final Map infoWriters; + private final Map formatWriters; + private final List sampleNames; + + public BCF2FieldWriterManager(final VCFHeader header, final Map dict, final BCF2Encoder encoder) { + infoWriters = new HashMap<>(header.getInfoHeaderLines().size()); + for (final VCFInfoHeaderLine line : header.getInfoHeaderLines()) { + final String field = line.getID(); + validateStandardHeader(line, VCFStandardHeaderLines.getInfoLine(field, false)); + final int offset = dict.get(field); + final BCF2FieldWriter.SiteWriter writer = BCF2FieldWriter.createSiteWriter(line, offset, encoder); + infoWriters.put(field, writer); + } + + formatWriters = new HashMap<>(header.getFormatHeaderLines().size()); + for (final VCFFormatHeaderLine line : header.getFormatHeaderLines()) { + final String field = line.getID(); + validateStandardHeader(line, VCFStandardHeaderLines.getFormatLine(field, false)); + final int offset = dict.get(field); + final BCF2FieldWriter.GenotypeWriter writer = BCF2FieldWriter.createGenotypeWriter(line, offset, encoder); + formatWriters.put(field, writer); + } + + sampleNames = header.getGenotypeSamples(); + } + + public void writeInfo(final VariantContext vc) throws IOException { + for (final String field : vc.getAttributes().keySet()) { + final BCF2FieldWriter.SiteWriter writer = infoWriters.get(field); + if (writer == null) errorUnexpectedFieldToWrite(vc, field, "INFO"); + writer.encodeKey(); + writer.encode(vc); + } + } + + public void writeFormat(final VariantContext vc, final List genotypeFields) throws IOException { + for (final String field : genotypeFields) { + final BCF2FieldWriter.GenotypeWriter writer = formatWriters.get(field); + if (writer == null) errorUnexpectedFieldToWrite(vc, field, "FORMAT"); + writer.encodeKey(); + writer.encode(vc, sampleNames); + } + } + + private static void validateStandardHeader( + final T actualLine, + final T expectedLine + ) { + if (expectedLine == null) return; + final VCFHeaderLineType actualType = 
actualLine.getType(); + final VCFHeaderLineType expectedType = expectedLine.getType(); + if (actualType != expectedType) { + log.error(String.format( + "Header with standard key: `%s` has type: %s which does not match standard type: %s", + actualLine.getID(), + actualType, + expectedType + )); + } + + final VCFHeaderLineCount actualCountType = actualLine.getCountType(); + final VCFHeaderLineCount expectedCountType = expectedLine.getCountType(); + if (actualCountType != expectedCountType || actualLine.isFixedCount() && actualLine.getCount() != expectedLine.getCount()) { + log.error(String.format( + "Header with standard key: `%s` has count: %s which does not match standard count: %s", + actualLine.getID(), + actualLine.isFixedCount() ? actualLine.getCount() : actualCountType, + expectedLine.isFixedCount() ? expectedLine.getCount() : expectedCountType + )); + } + } + + private static void errorUnexpectedFieldToWrite( + final VariantContext vc, + final String field, + final String fieldType + ) { + throw new TribbleException(String.format( + "Found %s field %s of VariantContext at %s:%d from %s that has not been defined in the VCFHeader", + fieldType, field, + vc.getContig(), vc.getStart(), vc.getSource() + )); + } +} diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2GenotypeFieldDecoders.java b/src/main/java/htsjdk/variant/bcf2/BCF2GenotypeFieldDecoders.java index c406b6602d..173e095687 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2GenotypeFieldDecoders.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2GenotypeFieldDecoders.java @@ -33,6 +33,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -50,14 +51,13 @@ public class BCF2GenotypeFieldDecoders { private final static int MIN_SAMPLES_FOR_FASTPATH_GENOTYPES = 0; // TODO -- update to reasonable number // initialized once per writer to allow parallel writers to work - private final HashMap 
genotypeFieldDecoder = new HashMap(); + private final HashMap genotypeFieldDecoder = new HashMap<>(); private final Decoder defaultDecoder = new GenericDecoder(); public BCF2GenotypeFieldDecoders(final VCFHeader header) { // TODO -- fill in appropriate decoders for each FORMAT field in the header genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder()); - // currently the generic decoder handles FILTER values properly, in so far as we don't tolerate multiple filter field values per genotype genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FTDecoder()); genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder()); genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder()); @@ -74,41 +74,41 @@ public BCF2GenotypeFieldDecoders(final VCFHeader header) { /** * Return decoder appropriate for field, or the generic decoder if no * specialized one is bound + * * @param field the GT field to decode * @return a non-null decoder */ public Decoder getDecoder(final String field) { - final Decoder d = genotypeFieldDecoder.get(field); - return d == null ? defaultDecoder : d; + return genotypeFieldDecoder.getOrDefault(field, defaultDecoder); } /** * Decoder a field (implicit from creation) encoded as * typeDescriptor in the decoder object in the GenotypeBuilders * one for each sample in order. - * + *

    * The way this works is that this decode method * iterates over the builders, decoding a genotype field * in BCF2 for each sample from decoder. - * + *

    * This system allows us to easily use specialized * decoders for specific genotype field values. For example, * we use a special decoder to directly read the BCF2 data for * the PL field into a int[] rather than the generic List of Integer */ public interface Decoder { - public void decode(final List siteAlleles, - final String field, - final BCF2Decoder decoder, - final byte typeDescriptor, - final int numElements, - final GenotypeBuilder[] gbs) throws IOException; + void decode(final List siteAlleles, + final String field, + final BCF2Decoder decoder, + final byte typeDescriptor, + final int numElements, + final GenotypeBuilder[] gbs) throws IOException; } - private class GTDecoder implements Decoder { + private static class GTDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && numElements == 2 && gbs.length >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES ) + if (ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && numElements == 2 && gbs.length >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES) fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs); else { generalDecode(siteAlleles, numElements, decoder, typeDescriptor, gbs); @@ -117,44 +117,47 @@ public void decode(final List siteAlleles, final String field, final BCF /** * fast path for many samples with diploid genotypes - * + *

    * The way this would work is simple. Create a List diploidGenotypes[] object * After decoding the offset, if that sample is diploid compute the * offset into the alleles vector which is simply offset = allele0 * nAlleles + allele1 * if there's a value at diploidGenotypes[offset], use it, otherwise create the genotype * cache it and use that - * + *

    * Some notes. If there are nAlleles at the site, there are implicitly actually - * n + 1 options including + * n + 1 options including ref */ @SuppressWarnings({"unchecked"}) - private final void fastBiallelicDiploidDecode(final List siteAlleles, - final BCF2Decoder decoder, - final byte typeDescriptor, - final GenotypeBuilder[] gbs) throws IOException { + private void fastBiallelicDiploidDecode(final List siteAlleles, + final BCF2Decoder decoder, + final byte typeDescriptor, + final GenotypeBuilder[] gbs) throws IOException { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); final int nPossibleGenotypes = 3 * 3; - final Object allGenotypes[] = new Object[nPossibleGenotypes]; + final Object[] allGenotypes = new Object[nPossibleGenotypes]; - for ( final GenotypeBuilder gb : gbs ) { + for (final GenotypeBuilder gb : gbs) { final int a1 = decoder.decodeInt(type); final int a2 = decoder.decodeInt(type); - if ( a1 == type.getMissingBytes() ) { - assert a2 == type.getMissingBytes(); + final boolean phased; + if (a1 == decoder.getPaddingValue(type)) { + assert a2 == decoder.getPaddingValue(type); // no called sample GT = . gb.alleles(null); - } else if ( a2 == type.getMissingBytes() ) { - gb.alleles(Arrays.asList(getAlleleFromEncoded(siteAlleles, a1))); + phased = false; + } else if (a2 == decoder.getPaddingValue(type)) { + gb.alleles(Collections.singletonList(getAlleleFromEncoded(siteAlleles, a1))); + phased = (a1 & 0x01) == 1; } else { // downshift to remove phase final int offset = (a1 >> 1) * 3 + (a2 >> 1); assert offset < allGenotypes.length; // TODO -- how can I get rid of this cast? 
- List gt = (List)allGenotypes[offset]; - if ( gt == null ) { + List gt = (List) allGenotypes[offset]; + if (gt == null) { final Allele allele1 = getAlleleFromEncoded(siteAlleles, a1); final Allele allele2 = getAlleleFromEncoded(siteAlleles, a2); gt = Arrays.asList(allele1, allele2); @@ -162,116 +165,120 @@ private final void fastBiallelicDiploidDecode(final List siteAlleles, } gb.alleles(gt); + phased = (a2 & 0x01) == 1; } - final boolean phased = (a2 & 0x01) == 1; gb.phased(phased); } } - private final void generalDecode(final List siteAlleles, - final int ploidy, - final BCF2Decoder decoder, - final byte typeDescriptor, - final GenotypeBuilder[] gbs) throws IOException { + private void generalDecode(final List siteAlleles, + final int ploidy, + final BCF2Decoder decoder, + final byte typeDescriptor, + final GenotypeBuilder[] gbs) throws IOException { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); // a single cache for the encoded genotypes, since we don't actually need this vector final int[] tmp = new int[ploidy]; - for ( final GenotypeBuilder gb : gbs ) { + for (final GenotypeBuilder gb : gbs) { final int[] encoded = decoder.decodeIntArray(ploidy, type, tmp); - if ( encoded == null ) + if (encoded == null) // no called sample GT = . gb.alleles(null); else { assert encoded.length > 0; // we have at least some alleles to decode - final List gt = new ArrayList(encoded.length); + final List gt = new ArrayList<>(encoded.length); // note that the auto-pruning of fields magically handles different // ploidy per sample at a site - for ( final int encode : encoded ) + for (final int encode : encoded) gt.add(getAlleleFromEncoded(siteAlleles, encode)); gb.alleles(gt); + // TODO htsjdk's Genotype class cannot properly encode phasing for ploidy > 2 + // See https://github.com/samtools/htsjdk/issues/1044 final boolean phased = ((encoded.length > 1 ? 
encoded[1] : encoded[0]) & 0x01) == 1; gb.phased(phased); } } } - private final Allele getAlleleFromEncoded(final List siteAlleles, final int encode) { - final int offset = encode >> 1; + private Allele getAlleleFromEncoded(final List siteAlleles, final int encode) { + final int offset = encode >>> 1; return offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1); } } - private class DPDecoder implements Decoder { + private static class DPDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { + for (final GenotypeBuilder gb : gbs) { // the -1 is for missing gb.DP(decoder.decodeInt(typeDescriptor, -1)); } } } - private class GQDecoder implements Decoder { + private static class GQDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { + for (final GenotypeBuilder gb : gbs) { // the -1 is for missing gb.GQ(decoder.decodeInt(typeDescriptor, -1)); } } } - private class ADDecoder implements Decoder { + private static class ADDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { + for (final GenotypeBuilder gb : gbs) { gb.AD(decoder.decodeIntArray(typeDescriptor, numElements)); } } } - private class PLDecoder implements Decoder { + private static class PLDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, 
final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { + for (final GenotypeBuilder gb : gbs) { gb.PL(decoder.decodeIntArray(typeDescriptor, numElements)); } } } - private class GenericDecoder implements Decoder { + private static class GenericDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { - Object value = decoder.decodeTypedValue(typeDescriptor, numElements); - if ( value != null ) { // don't add missing values - if ( value instanceof List && ((List)value).size() == 1) { - // todo -- I really hate this, and it suggests that the code isn't completely right - // the reason it's here is that it's possible to prune down a vector to a singleton - // value and there we have the contract that the value comes back as an atomic value - // not a vector of size 1 - value = ((List)value).get(0); - } + for (final GenotypeBuilder gb : gbs) { + final Object value = decoder.decodeTypedValue(typeDescriptor, numElements); + if (value == null) continue; + if (value instanceof List && ((List) value).size() == 1) { + // TODO not sure what this refers to, htsjdk itself doesn't make any assumptions about + // the concrete type of the data contained in the attributes map. + // Maybe there are upstream consumers who have this contract. 
+ + // todo -- I really hate this, and it suggests that the code isn't completely right + // the reason it's here is that it's possible to prune down a vector to a singleton + // value and there we have the contract that the value comes back as an atomic value + // not a vector of size 1 + gb.attribute(field, ((List) value).get(0)); + } else { gb.attribute(field, value); } } } } - private class FTDecoder implements Decoder { + private static class FTDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { - Object value = decoder.decodeTypedValue(typeDescriptor, numElements); - assert value == null || value instanceof String; - gb.filter((String)value); + for (final GenotypeBuilder gb : gbs) { + gb.filters(decoder.decodeExplodedStrings(numElements, ';')); } } } diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2LazyGenotypesDecoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2LazyGenotypesDecoder.java index aadea53dfb..a23c74c091 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2LazyGenotypesDecoder.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2LazyGenotypesDecoder.java @@ -65,7 +65,8 @@ public LazyGenotypesContext.LazyData parse(final Object data) { try { // load our byte[] data into the decoder - final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes); + final BCF2Decoder decoder = BCF2Decoder.getDecoder(codec.getBCFVersion()); + decoder.setRecordBytes(((BCF2Codec.LazyData)data).bytes); for ( int i = 0; i < nSamples; i++ ) builders[i].reset(true); diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Type.java b/src/main/java/htsjdk/variant/bcf2/BCF2Type.java index 11c8edf6c5..89610c7569 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Type.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Type.java @@ -1,27 +1,27 @@ /* -* 
Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ package htsjdk.variant.bcf2; @@ -39,62 +39,73 @@ public enum BCF2Type { // the actual values themselves MISSING(0, 0, 0x00) { - @Override public int read(final InputStream in) throws IOException { + @Override + public int read(final InputStream in) throws IOException { throw new IllegalArgumentException("Cannot read MISSING type"); } - @Override public void write(final int value, final OutputStream out) throws IOException { + + @Override + public void write(final int value, final OutputStream out) throws IOException { throw new IllegalArgumentException("Cannot write MISSING type"); } }, - INT8 (1, 1, 0xFFFFFF80, -127, 127) { + INT8(1, 1, 0xFFFFFF80, 0xFFFFFF81, -120, 127) { @Override public int read(final InputStream in) throws IOException { - return BCF2Utils.readByte(in); + // This cast to byte then implicit cast back to int is needed so that negative + // integers are sign extended to their proper 32 bit representation. + // The integer read from the stream before truncating to byte is an 32-bit integer + // with the 3 high bytes 0, and the widening conversion performs sign extension, + // the same applies for the read method of INT16. + return (byte) in.read(); } @Override public void write(final int value, final OutputStream out) throws IOException { - out.write(0xFF & value); // TODO -- do we need this operation? + // Do not need to mask off higher bytes because Java's OutputStream contract is to + // only write the bottom byte of the passed in int, the same applies to the write + // methods of the larger int sizes below. 
+ out.write(value); } }, - INT16(2, 2, 0xFFFF8000, -32767, 32767) { + INT16(2, 2, 0xFFFF8000, 0xFFFF8001, -32760, 32767) { @Override public int read(final InputStream in) throws IOException { - final int b2 = BCF2Utils.readByte(in) & 0xFF; - final int b1 = BCF2Utils.readByte(in) & 0xFF; - return (short)((b1 << 8) | b2); + final int b2 = in.read(); + final int b1 = in.read(); + return (short) ((b1 << 8) | b2); } @Override public void write(final int value, final OutputStream out) throws IOException { // TODO -- optimization -- should we put this in a local buffer? - out.write((0x00FF & value)); - out.write((0xFF00 & value) >> 8); + out.write(value); + out.write(value >> 8); } }, - INT32(3, 4, 0x80000000, -2147483647, 2147483647) { + INT32(3, 4, 0x80000000, 0x80000001, -2147483640, 2147483647) { @Override public int read(final InputStream in) throws IOException { - final int b4 = BCF2Utils.readByte(in) & 0xFF; - final int b3 = BCF2Utils.readByte(in) & 0xFF; - final int b2 = BCF2Utils.readByte(in) & 0xFF; - final int b1 = BCF2Utils.readByte(in) & 0xFF; - return (int)(b1 << 24 | b2 << 16 | b3 << 8 | b4); + final int b4 = in.read(); + final int b3 = in.read(); + final int b2 = in.read(); + final int b1 = in.read(); + return b1 << 24 | b2 << 16 | b3 << 8 | b4; } @Override public void write(final int value, final OutputStream out) throws IOException { - out.write((0x000000FF & value)); - out.write((0x0000FF00 & value) >> 8); - out.write((0x00FF0000 & value) >> 16); - out.write((0xFF000000 & value) >> 24); + out.write(value); + out.write(value >> 8); + out.write(value >> 16); + out.write(value >> 24); } }, - FLOAT(5, 4, 0x7F800001) { + FLOAT(5, 4, 0x7F800001, 0x7F800002, 0, 0) { @Override public int read(final InputStream in) throws IOException { return INT32.read(in); @@ -106,7 +117,10 @@ public void write(final int value, final OutputStream out) throws IOException { } }, - CHAR (7, 1, 0x00000000) { + // CHAR isn't given a MISSING or EOV value in the spec, but for the 
purposes of + // padding strings (i.e. variable length vectors of chars), it is treated as if + // '\0' or NULL is both the MISSING and EOV value of CHAR + CHAR(7, 1, 0x00000000) { @Override public int read(final InputStream in) throws IOException { return INT8.read(in); @@ -120,25 +134,40 @@ public void write(final int value, final OutputStream out) throws IOException { private final int id; private final Object missingJavaValue; + + /* + Note that the values for these fields for INT8 and IN16 differ from those given in the spec + The values given here are as if they have been sign-extended to 32 bits from their native + integer width (meaning they have all bits above that width set, as the missing and EOV + values all have their highest bit set in their native width) + + This is so that they compare equal to the values returned by the various + integer types' read methods, which must also sign-extend their return values so + we can return a uniformly sized 32-bit int + */ private final int missingBytes; + private final int EOVBytes; private final int sizeInBytes; + private final long minValue, maxValue; BCF2Type(final int id, final int sizeInBytes, final int missingBytes) { - this(id, sizeInBytes, missingBytes, 0, 0); + this(id, sizeInBytes, missingBytes, 0, 0, 0); } - BCF2Type(final int id, final int sizeInBytes, final int missingBytes, final long minValue, final long maxValue) { + BCF2Type(final int id, final int sizeInBytes, final int missingBytes, final int EOVBytes, final long minValue, final long maxValue) { this.id = id; this.sizeInBytes = sizeInBytes; this.missingJavaValue = null; this.missingBytes = missingBytes; + this.EOVBytes = EOVBytes; this.minValue = minValue; this.maxValue = maxValue; } /** * How many bytes are used to represent this type on disk? 
+ * * @return */ public int getSizeInBytes() { @@ -147,19 +176,24 @@ public int getSizeInBytes() { /** * The ID according to the BCF2 specification + * * @return */ - public int getID() { return id; } + public int getID() { + return id; + } /** * Can we encode value v in this type, according to its declared range. - * + *

    * Only makes sense for integer values * * @param v * @return */ - public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; } + public final boolean withinRange(final long v) { + return v <= maxValue && v >= minValue; + } /** * Return the java object (aka null) that is used to represent a missing value for this @@ -167,7 +201,9 @@ public int getSizeInBytes() { * * @return */ - public Object getMissingJavaValue() { return missingJavaValue; } + public Object getMissingJavaValue() { + return missingJavaValue; + } /** * The bytes (encoded as an int) that are used to represent a missing value @@ -175,7 +211,19 @@ public int getSizeInBytes() { * * @return */ - public int getMissingBytes() { return missingBytes; } + public int getMissingBytes() { + return missingBytes; + } + + /** + * The bytes (encoded as an int) that are used to represent an end of vector value + * for this type in BCF2 + * + * @return + */ + public int getEOVBytes() { + return EOVBytes; + } /** * An enum set of the types that might represent Integer values @@ -195,7 +243,7 @@ public boolean isIntegerType() { /** * Read a value from in stream of this BCF2 type as an int [32 bit] collection of bits - * + *

    * For intX and char values this is just the int / byte value of the underlying data represented as a 32 bit int * For a char the result must be converted to a char by (char)(byte)(0x0F & value) * For doubles it's necessary to convert subsequently this value to a double via Double.bitsToDouble() @@ -211,4 +259,16 @@ public int read(final InputStream in) throws IOException { public void write(final int value, final OutputStream out) throws IOException { throw new IllegalArgumentException("Not implemented"); } + + private enum Special { + MISSING, + EOV, + } + + /** + * @return a unique End Of Vector object used by the low level decoder + */ + public static Object EOVValue() { + return Special.EOV; + } } diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java b/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java index 545ede7497..f64f49a9b6 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java @@ -27,36 +27,27 @@ import htsjdk.samtools.util.FileExtensions; import htsjdk.tribble.TribbleException; -import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFIDHeaderLine; import htsjdk.variant.vcf.VCFSimpleHeaderLine; import java.io.File; -import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; -import java.io.InputStream; -import java.lang.reflect.Array; -import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; -import java.util.HashSet; import java.util.Iterator; import java.util.List; -import java.util.Set; /** * Common utilities for working with BCF2 files - * + *

    * Includes convenience methods for encoding, decoding BCF2 type descriptors (size + type) * * @author depristo * @since 5/12 */ public final class BCF2Utils { - public static final int MAX_ALLELES_IN_GENOTYPES = 127; public static final int OVERFLOW_ELEMENT_MARKER = 15; public static final int MAX_INLINE_ELEMENTS = 14; @@ -66,55 +57,16 @@ public final class BCF2Utils { static { int maxID = -1; - for ( BCF2Type v : BCF2Type.values() ) maxID = Math.max(v.getID(), maxID); - ID_TO_ENUM = new BCF2Type[maxID+1]; - for ( BCF2Type v : BCF2Type.values() ) ID_TO_ENUM[v.getID()] = v; + for (final BCF2Type v : BCF2Type.values()) maxID = Math.max(v.getID(), maxID); + ID_TO_ENUM = new BCF2Type[maxID + 1]; + for (final BCF2Type v : BCF2Type.values()) ID_TO_ENUM[v.getID()] = v; } - private BCF2Utils() {} - - /** - * Create a strings dictionary from the VCF header - * - * The dictionary is an ordered list of common VCF identifers (FILTER, INFO, and FORMAT) - * fields. - * - * Note that its critical that the list be dedupped and sorted in a consistent manner each time, - * as the BCF2 offsets are encoded relative to this dictionary, and if it isn't determined exactly - * the same way as in the header each time it's very bad - * - * @param header the VCFHeader from which to build the dictionary - * @return a non-null dictionary of elements, may be empty - */ - public static ArrayList makeDictionary(final VCFHeader header) { - final Set seen = new HashSet(); - final ArrayList dict = new ArrayList(); - - // special case the special PASS field which doesn't show up in the FILTER field definitions - seen.add(VCFConstants.PASSES_FILTERS_v4); - dict.add(VCFConstants.PASSES_FILTERS_v4); - - // set up the strings dictionary - for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) { - if ( line.shouldBeAddedToDictionary() ) { - if (!line.isIDHeaderLine()) { - //is there a better way to ensure that shouldBeAddedToDictionary==true only when isIDHeaderLine==true - throw new 
TribbleException(String.format( - "The header line %s cannot be added to the BCF dictionary since its not an ID header line", - line)); - } - if ( ! seen.contains(line.getID())) { - dict.add(line.getID()); - seen.add(line.getID()); - } - } - } - - return dict; + private BCF2Utils() { } - public static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) { - return (byte)((0x0F & nElements) << 4 | (type.getID() & 0x0F)); + public static byte encodeTypeDescriptor(final int nElements, final BCF2Type type) { + return (byte) ((0x0F & nElements) << 4 | (type.getID() & 0x0F)); } public static int decodeSize(final byte typeDescriptor) { @@ -133,58 +85,12 @@ public static boolean sizeIsOverflow(final byte typeDescriptor) { return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER; } - public static byte readByte(final InputStream stream) throws IOException { - return (byte)(stream.read() & 0xFF); - } - - /** - * Collapse multiple strings into a comma separated list - * - * ["s1", "s2", "s3"] => ",s1,s2,s3" - * - * @param strings size > 1 list of strings - * @return - */ - public static String collapseStringList(final List strings) { - if ( strings.isEmpty() ) return ""; - else if ( strings.size() == 1 ) return strings.get(0); - else { - final StringBuilder b = new StringBuilder(); - for ( final String s : strings ) { - if ( s != null ) { - assert s.indexOf(",") == -1; // no commas in individual strings - b.append(',').append(s); - } - } - return b.toString(); - } - } - - /** - * Inverse operation of collapseStringList. 
- * - * ",s1,s2,s3" => ["s1", "s2", "s3"] - * - * - * @param collapsed - * @return - */ - public static List explodeStringList(final String collapsed) { - assert isCollapsedString(collapsed); - final String[] exploded = collapsed.substring(1).split(","); - return Arrays.asList(exploded); - } - - public static boolean isCollapsedString(final String s) { - return !s.isEmpty() && s.charAt(0) == ','; - } - /** * Returns a good name for a shadow BCF file for vcfFile. - * + *

    * foo.vcf => foo.bcf * foo.xxx => foo.xxx.bcf - * + *

    * If the resulting BCF file cannot be written, return null. Happens * when vcfFile = /dev/null for example * @@ -193,11 +99,11 @@ public static boolean isCollapsedString(final String s) { */ public static final File shadowBCF(final File vcfFile) { final String path = vcfFile.getAbsolutePath(); - if ( path.contains(FileExtensions.VCF) ) + if (path.contains(FileExtensions.VCF)) return new File(path.replace(FileExtensions.VCF, FileExtensions.BCF)); else { - final File bcf = new File( path + FileExtensions.BCF ); - if ( bcf.canRead() ) + final File bcf = new File(path + FileExtensions.BCF); + if (bcf.canRead()) return bcf; else { try { @@ -206,9 +112,7 @@ public static final File shadowBCF(final File vcfFile) { o.close(); bcf.delete(); return bcf; - } catch ( FileNotFoundException e ) { - return null; - } catch ( IOException e ) { + } catch (final IOException e) { return null; } } @@ -216,8 +120,8 @@ public static final File shadowBCF(final File vcfFile) { } public static BCF2Type determineIntegerType(final int value) { - for ( final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) { - if ( potentialType.withinRange(value) ) + for (final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) { + if (potentialType.withinRange(value)) return potentialType; } @@ -227,9 +131,9 @@ public static BCF2Type determineIntegerType(final int value) { public static BCF2Type determineIntegerType(final int[] values) { // find the min and max values in the array int max = 0, min = 0; - for ( final int v : values ) { - if ( v > max ) max = v; - if ( v < min ) min = v; + for (final int v : values) { + if (v > max) max = v; + if (v < min) min = v; } final BCF2Type maxType = determineIntegerType(max); @@ -241,7 +145,7 @@ public static BCF2Type determineIntegerType(final int[] values) { /** * Returns the maximum BCF2 integer size of t1 and t2 - * + *

    * For example, if t1 == INT8 and t2 == INT16 returns INT16 * * @param t1 @@ -249,64 +153,49 @@ public static BCF2Type determineIntegerType(final int[] values) { * @return */ public static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) { - switch ( t1 ) { - case INT8: return t2; - case INT16: return t2 == BCF2Type.INT32 ? t2 : t1; - case INT32: return t1; - default: throw new TribbleException("BUG: unexpected BCF2Type " + t1); + switch (t1) { + case INT8: + return t2; + case INT16: + return t2 == BCF2Type.INT32 ? t2 : t1; + case INT32: + return t1; + default: + throw new TribbleException("BUG: unexpected BCF2Type " + t1); } } public static BCF2Type determineIntegerType(final List values) { BCF2Type maxType = BCF2Type.INT8; - for ( final int value : values ) { + for (final Integer value : values) { + if (value == null) continue; final BCF2Type type1 = determineIntegerType(value); - switch ( type1 ) { - case INT8: break; - case INT16: maxType = BCF2Type.INT16; break; - case INT32: return BCF2Type.INT32; // fast path for largest possible value - default: throw new TribbleException("Unexpected integer type " + type1 ); + switch (type1) { + case INT8: + break; + case INT16: + maxType = BCF2Type.INT16; + break; + case INT32: + return BCF2Type.INT32; // fast path for largest possible value + default: + throw new TribbleException("Unexpected integer type " + type1); } } return maxType; } - /** - * Helper function that takes an object and returns a list representation - * of it: - * - * o == null => [] - * o is a list => o - * else => [o] - * - * @param c the class of the object - * @param o the object to convert to a Java List - * @return - */ - public static List toList(final Class c, final Object o) { - if ( o == null ) return Collections.emptyList(); - else if ( o instanceof List ) return (List)o; - else if ( o.getClass().isArray() ) { - final int arraySize = Array.getLength(o); - final List list = new ArrayList(arraySize); - for (int i=0; i * If the 
order of INFO, FILTER, or contig elements in the output header is different than * in the input header we must decode the blocks using the input header and then recode them * based on the new output order. - * + *

    * If they are consistent, we can simply pass through the raw genotypes block bytes, which is * a *huge* performance win for large blocks. - * + *

    * Many common operations on BCF2 files (merging them for -nt, selecting a subset of records, etc) * don't modify the ordering of the header fields and so can safely pass through the genotypes * undecoded. Some operations -- those at add filters or info fields -- can change the ordering @@ -314,28 +203,25 @@ else if ( o.getClass().isArray() ) { */ public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHeader, final VCFHeader genotypesBlockHeader) { // first, we have to have the same samples in the same order - if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) ) + if (!nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder()))) return false; - final Iterator outputLinesIt = outputHeader.getIDHeaderLines().iterator(); - final Iterator inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator(); + final Iterator outputLinesIt = outputHeader.getIDHeaderLines().iterator(); - while ( inputLinesIt.hasNext() ) { - if ( ! outputLinesIt.hasNext() ) // missing lines in output + for (final VCFHeaderLine headerLine : genotypesBlockHeader.getIDHeaderLines()) { + if (!outputLinesIt.hasNext()) // missing lines in output return false; - final VCFSimpleHeaderLine outputLine = outputLinesIt.next(); - final VCFSimpleHeaderLine inputLine = inputLinesIt.next(); - - if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! 
inputLine.getID().equals(outputLine.getID()) ) + final VCFHeaderLine outputLine = outputLinesIt.next(); + if (!headerLine.getClass().equals(outputLine.getClass()) || !headerLine.getID().equals(outputLine.getID())) return false; } return true; } - private static List nullAsEmpty(List l) { - if ( l == null ) + private static List nullAsEmpty(final List l) { + if (l == null) return Collections.emptyList(); else return l; diff --git a/src/main/java/htsjdk/variant/bcf2/BCFVersion.java b/src/main/java/htsjdk/variant/bcf2/BCFVersion.java index b18b83e4aa..7bec9ef192 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCFVersion.java +++ b/src/main/java/htsjdk/variant/bcf2/BCFVersion.java @@ -29,6 +29,8 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; /** * Simple holder for BCF version information @@ -44,6 +46,12 @@ public final class BCFVersion { */ public static final byte[] MAGIC_HEADER_START = "BCF".getBytes(); + public static final BCFVersion BCF2_1Version = new BCFVersion(2, 1); + public static final BCFVersion BCF2_2Version = new BCFVersion(2, 2); + + public static final Set SUPPORTED_VERSIONS = new HashSet<>(Arrays.asList(BCF2_1Version, BCF2_2Version)); + + final int majorVersion; final int minorVersion; diff --git a/src/main/java/htsjdk/variant/variantcontext/VariantContext.java b/src/main/java/htsjdk/variant/variantcontext/VariantContext.java index a63d940670..a2c47f9e7a 100644 --- a/src/main/java/htsjdk/variant/variantcontext/VariantContext.java +++ b/src/main/java/htsjdk/variant/variantcontext/VariantContext.java @@ -273,7 +273,6 @@ public class VariantContext implements HtsRecord, Feature, Serializable { * Determine which genotype fields are in use in the genotypes in VC * @return an ordered list of genotype fields in use in VC. 
If vc has genotypes this will always include GT first */ - public List calcVCFGenotypeKeys(final VCFHeader header) { final Set keys = new HashSet<>(); @@ -299,21 +298,23 @@ public List calcVCFGenotypeKeys(final VCFHeader header) { if ( sawPL ) keys.add(VCFConstants.GENOTYPE_PL_KEY); if ( sawGenotypeFilter ) keys.add(VCFConstants.GENOTYPE_FILTER_KEY); - List sortedList = ParsingUtils.sortList(new ArrayList<>(keys)); - - // make sure the GT is first + final List list = new ArrayList<>(6 + keys.size()); + // Make sure the GT is first if present if (sawGoodGT) { - final List newList = new ArrayList<>(sortedList.size() + 1); - newList.add(VCFConstants.GENOTYPE_KEY); - newList.addAll(sortedList); - sortedList = newList; + list.add(VCFConstants.GENOTYPE_KEY); + list.addAll(keys); + // Sort, skipping GT which will be at the first position of the list + Collections.sort(list.subList(1, list.size())); + } else { + list.addAll(keys); + Collections.sort(list); } - if (sortedList.isEmpty() && header.hasGenotypingData()) { + if (list.isEmpty() && header.hasGenotypingData()) { // this needs to be done in case all samples are no-calls return Collections.singletonList(VCFConstants.GENOTYPE_KEY); } else { - return sortedList; + return list; } } @@ -469,7 +470,9 @@ protected VariantContext(final String source, this.stop = stop; // intern for efficiency. equals calls will generate NPE if ID is inappropriately passed in as null - if ( ID == null || ID.equals("") ) throw new IllegalArgumentException("ID field cannot be the null or the empty string"); + if ( ID == null || ID.equals("") ) { + throw new IllegalArgumentException("ID field cannot be the null or the empty string"); + } this.ID = ID.equals(VCFConstants.EMPTY_ID_FIELD) ? 
VCFConstants.EMPTY_ID_FIELD : ID; this.commonInfo = new CommonInfo(source, log10PError, filters, attributes); diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Encoder.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Encoder.java deleted file mode 100644 index 495cd93ec9..0000000000 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Encoder.java +++ /dev/null @@ -1,261 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package htsjdk.variant.variantcontext.writer; - -import htsjdk.variant.bcf2.BCF2Type; -import htsjdk.variant.bcf2.BCF2Utils; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; - -/** - * See #BCFWriter for documentation on this classes role in encoding BCF2 files - * - * @author Mark DePristo - * @since 06/12 - */ -public final class BCF2Encoder { - // TODO -- increase default size? - public static final int WRITE_BUFFER_INITIAL_SIZE = 16384; - private ByteArrayOutputStream encodeStream = new ByteArrayOutputStream(WRITE_BUFFER_INITIAL_SIZE); - - // -------------------------------------------------------------------------------- - // - // Functions to return the data being encoded here - // - // -------------------------------------------------------------------------------- - - public byte[] getRecordBytes() { - byte[] bytes = encodeStream.toByteArray(); - encodeStream.reset(); - return bytes; - } - - // -------------------------------------------------------------------------------- - // - // Writing typed values (have type byte) - // - // -------------------------------------------------------------------------------- - - public final void encodeTypedMissing(final BCF2Type type) throws IOException { - encodeType(0, type); - } - - public final void encodeTyped(final Object value, final BCF2Type type) throws IOException { - if ( value == null ) - encodeTypedMissing(type); - else { - switch ( type ) { - case INT8: - case INT16: - case INT32: encodeTypedInt((Integer)value, type); break; - case FLOAT: encodeTypedFloat((Double) value); break; - case CHAR: encodeTypedString((String) value); break; - default: throw new IllegalArgumentException("Illegal type encountered " + type); - } - } - } - - public final void encodeTypedInt(final int v) throws IOException { - final BCF2Type type = BCF2Utils.determineIntegerType(v); - 
encodeTypedInt(v, type); - } - - public final void encodeTypedInt(final int v, final BCF2Type type) throws IOException { - encodeType(1, type); - encodeRawInt(v, type); - } - - public final void encodeTypedString(final String s) throws IOException { - encodeTypedString(s.getBytes()); - } - - public final void encodeTypedString(final byte[] s) throws IOException { - if ( s == null ) - encodeType(0, BCF2Type.CHAR); - else { - encodeType(s.length, BCF2Type.CHAR); - for ( int i = 0; i < s.length; i++ ) { - encodeRawChar(s[i]); - } - } - } - - public final void encodeTypedFloat(final double d) throws IOException { - encodeType(1, BCF2Type.FLOAT); - encodeRawFloat(d); - } - - public final void encodeTyped(List v, final BCF2Type type) throws IOException { - if ( type == BCF2Type.CHAR && !v.isEmpty()) { - final String s = BCF2Utils.collapseStringList((List) v); - v = stringToBytes(s); - } - - encodeType(v.size(), type); - encodeRawValues(v, type); - } - - // -------------------------------------------------------------------------------- - // - // Writing raw values (don't have a type byte) - // - // -------------------------------------------------------------------------------- - - public final void encodeRawValues(final Collection v, final BCF2Type type) throws IOException { - for ( final T v1 : v ) { - encodeRawValue(v1, type); - } - } - - public final void encodeRawValue(final T value, final BCF2Type type) throws IOException { - try { - if ( value == type.getMissingJavaValue() ) - encodeRawMissingValue(type); - else { - switch (type) { - case INT8: - case INT16: - case INT32: encodeRawBytes((Integer) value, type); break; - case FLOAT: encodeRawFloat((Double) value); break; - case CHAR: encodeRawChar((Byte) value); break; - default: throw new IllegalArgumentException("Illegal type encountered " + type); - } - } - } catch ( ClassCastException e ) { - throw new ClassCastException("BUG: invalid type cast to " + type + " from " + value); - } - } - - public final void 
encodeRawMissingValue(final BCF2Type type) throws IOException { - encodeRawBytes(type.getMissingBytes(), type); - } - - public final void encodeRawMissingValues(final int size, final BCF2Type type) throws IOException { - for ( int i = 0; i < size; i++ ) - encodeRawMissingValue(type); - } - - // -------------------------------------------------------------------------------- - // - // low-level encoders - // - // -------------------------------------------------------------------------------- - - public final void encodeRawChar(final byte c) throws IOException { - encodeStream.write(c); - } - - public final void encodeRawFloat(final double value) throws IOException { - encodeRawBytes(Float.floatToIntBits((float) value), BCF2Type.FLOAT); - } - - public final void encodeType(final int size, final BCF2Type type) throws IOException { - if ( size <= BCF2Utils.MAX_INLINE_ELEMENTS ) { - final int typeByte = BCF2Utils.encodeTypeDescriptor(size, type); - encodeStream.write(typeByte); - } else { - final int typeByte = BCF2Utils.encodeTypeDescriptor(BCF2Utils.OVERFLOW_ELEMENT_MARKER, type); - encodeStream.write(typeByte); - // write in the overflow size - encodeTypedInt(size); - } - } - - public final void encodeRawInt(final int value, final BCF2Type type) throws IOException { - type.write(value, encodeStream); - } - - public final void encodeRawBytes(final int value, final BCF2Type type) throws IOException { - type.write(value, encodeStream); - } - - // -------------------------------------------------------------------------------- - // - // utility functions - // - // -------------------------------------------------------------------------------- - - public void encodeRawString(final String s, final int sizeToWrite) throws IOException { - final byte[] bytes = s.getBytes(); - for ( int i = 0; i < sizeToWrite; i++ ) - if ( i < bytes.length ) - encodeRawChar(bytes[i]); - else - encodeRawMissingValue(BCF2Type.CHAR); - } - - /** - * Totally generic encoder that examines o, 
determines the best way to encode it, and encodes it - * - * This method is incredibly slow, but it's only used for UnitTests so it doesn't matter - * - * @param o - * @return - */ - public final BCF2Type encode(final Object o) throws IOException { - if ( o == null ) throw new IllegalArgumentException("Generic encode cannot deal with null values"); - - if ( o instanceof List ) { - final BCF2Type type = determineBCFType(((List) o).get(0)); - encodeTyped((List) o, type); - return type; - } else { - final BCF2Type type = determineBCFType(o); - encodeTyped(o, type); - return type; - } - } - - private final BCF2Type determineBCFType(final Object arg) { - final Object toType = arg instanceof List ? ((List)arg).get(0) : arg; - - if ( toType instanceof Integer ) - return BCF2Utils.determineIntegerType((Integer) toType); - else if ( toType instanceof String ) - return BCF2Type.CHAR; - else if ( toType instanceof Double ) - return BCF2Type.FLOAT; - else - throw new IllegalArgumentException("No native encoding for Object of type " + arg.getClass().getSimpleName()); - } - - private final List stringToBytes(final String v) throws IOException { - if ( v == null || v.equals("") ) - return Collections.emptyList(); - else { - // TODO -- this needs to be optimized away for efficiency - final byte[] bytes = v.getBytes(); - final List l = new ArrayList(bytes.length); - for ( int i = 0; i < bytes.length; i++) l.add(bytes[i]); - return l; - } - } -} \ No newline at end of file diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldEncoder.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldEncoder.java deleted file mode 100644 index 7d1f0de43d..0000000000 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldEncoder.java +++ /dev/null @@ -1,455 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the 
"Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package htsjdk.variant.variantcontext.writer; - -import htsjdk.variant.bcf2.BCF2Type; -import htsjdk.variant.bcf2.BCF2Utils; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFCompoundHeaderLine; -import htsjdk.variant.vcf.VCFHeaderLineCount; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; - -/** - * See #BCFWriter for documentation on this classes role in encoding BCF2 files - * - * @author Mark DePristo - * @since 06/12 - */ -public abstract class BCF2FieldEncoder { - /** - * The header line describing the field we will encode values of - */ - final VCFCompoundHeaderLine headerLine; - - /** - * The BCF2 type we'll use to encoder this field, if it can be determined statically. - * If not, this variable must be null - */ - final BCF2Type staticType; - - /** - * The integer offset into the strings map of the BCF2 file corresponding to this - * field. 
- */ - final int dictionaryOffset; - - /** - * The integer type we use to encode our dictionary offset in the BCF2 file - */ - final BCF2Type dictionaryOffsetType; - - // ---------------------------------------------------------------------- - // - // Constructor - // - // ---------------------------------------------------------------------- - - private BCF2FieldEncoder(final VCFCompoundHeaderLine headerLine, final Map dict, final BCF2Type staticType) { - this.headerLine = headerLine; - this.staticType = staticType; - - final Integer offset = dict.get(getField()); - if ( offset == null ) throw new IllegalStateException("Format error: could not find string " + getField() + " in header as required by BCF"); - this.dictionaryOffset = offset; - dictionaryOffsetType = BCF2Utils.determineIntegerType(offset); - } - - // ---------------------------------------------------------------------- - // - // Basic accessors - // - // ---------------------------------------------------------------------- - - public final String getField() { return headerLine.getID(); } - - /** - * Write the field key (dictionary offset and type) into the BCF2Encoder stream - * - * @param encoder where we write our dictionary offset - * @throws IOException - */ - public final void writeFieldKey(final BCF2Encoder encoder) throws IOException { - encoder.encodeTypedInt(dictionaryOffset, dictionaryOffsetType); - } - - @Override - public String toString() { - return "BCF2FieldEncoder for " + getField() + " with count " + getCountType() + " encoded with " + getClass().getSimpleName(); - } - - // ---------------------------------------------------------------------- - // - // methods to determine the number of encoded elements - // - // ---------------------------------------------------------------------- - - protected final VCFHeaderLineCount getCountType() { - return headerLine.getCountType(); - } - - /** - * @return True if this field has a constant, fixed number of elements (such as 1 for an atomic 
integer) - */ - public boolean hasConstantNumElements() { - return getCountType() == VCFHeaderLineCount.INTEGER; - } - - /** - * @return True if the only way to determine how many elements this field contains is by - * inspecting the actual value directly, such as when the number of elements - * is a variable length list per site or per genotype. - */ - public boolean hasValueDeterminedNumElements() { - return getCountType() == VCFHeaderLineCount.UNBOUNDED; - } - - /** - * @return True if this field has a non-fixed number of elements that depends only on the properties - * of the current VariantContext, such as one value per Allele or per genotype configuration. - */ - public boolean hasContextDeterminedNumElements() { - return ! hasConstantNumElements() && ! hasValueDeterminedNumElements(); - } - - /** - * @return the number of elements, assuming this field has a constant number of elements. - */ - public int numElements() { - return headerLine.getCount(); - } - - /** - * @return the number of elements by looking at the actual value provided - */ - public int numElements(final Object value) { - return numElementsFromValue(value); - } - - /** - * @return the number of elements, assuming this field has context-determined number of elements. - */ - public int numElements(final VariantContext vc) { - return headerLine.getCount(vc); - } - - /** - * A convenience access for the number of elements. - * @param vc - * @param value - * @return the number of encoded elements, either from the fixed number - * it has, from the VC, or from the value itself. - */ - public final int numElements(final VariantContext vc, final Object value) { - if ( hasConstantNumElements() ) return numElements(); - else if ( hasContextDeterminedNumElements() ) return numElements(vc); - else return numElements(value); - } - - /** - * Given a value, return the number of elements we will encode for it. 
- * - * Assumes the value is encoded as a List - * - * @param value - * @return the number of elements we will encode for {@param value}. - */ - protected int numElementsFromValue(final Object value) { - if ( value == null ) return 0; - else if ( value instanceof List ) return ((List) value).size(); - else return 1; - } - - // ---------------------------------------------------------------------- - // - // methods to determine the BCF2 type of the encoded values - // - // ---------------------------------------------------------------------- - - /** - * Is the BCF2 type of this field static, or does it have to be determine from - * the actual field value itself? - * @return true if the field is static - */ - public final boolean isStaticallyTyped() { return ! isDynamicallyTyped(); } - - /** - * Is the BCF2 type of this field static, or does it have to be determine from - * the actual field value itself? - * @return true if the field is not static - */ - public final boolean isDynamicallyTyped() { return staticType == null; } - - /** - * Get the BCF2 type for this field, either from the static type of the - * field itself or by inspecting the value itself. - * - * @return the BCF2 type for this field - */ - public final BCF2Type getType(final Object value) { - return isDynamicallyTyped() ? getDynamicType(value) : getStaticType(); - } - - public final BCF2Type getStaticType() { - return staticType; - } - - public BCF2Type getDynamicType(final Object value) { - throw new IllegalStateException("BUG: cannot get dynamic type for statically typed BCF2 field " + getField()); - } - - // ---------------------------------------------------------------------- - // - // methods to encode values, including the key abstract method - // - // ---------------------------------------------------------------------- - - /** - * Key abstract method that should encode a value of the given type into the encoder. - * - * Value will be of a type appropriate to the underlying encoder. 
If the genotype field is represented as - * an int[], this will be value, and the encoder needs to handle encoding all of the values in the int[]. - * - * The argument should be used, not the getType() method in the superclass as an outer loop might have - * decided a more general type (int16) to use, even through this encoder could have been done with int8. - * - * If minValues > 0, then encodeValue must write in at least minValues items from value. If value is atomic, - * this means that minValues - 1 MISSING values should be added to the encoder. If minValues is a collection - * type (int[]) then minValues - values.length should be added. This argument is intended to handle padding - * of values in genotype fields. - * - * @param encoder - * @param value - * @param type - * @param minValues - * @throws IOException - */ - public abstract void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException; - - // ---------------------------------------------------------------------- - // - // Subclass to encode Strings - // - // ---------------------------------------------------------------------- - - public static class StringOrCharacter extends BCF2FieldEncoder { - public StringOrCharacter(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, BCF2Type.CHAR); - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - final String s = javaStringToBCF2String(value); - encoder.encodeRawString(s, Math.max(s.length(), minValues)); - } - - // - // Regardless of what the header says, BCF2 strings and characters are always encoded - // as arrays of CHAR type, which has a variable number of elements depending on the - // exact string being encoded - // - @Override public boolean hasConstantNumElements() { return false; } - @Override public boolean hasContextDeterminedNumElements() { return 
false; } - @Override public boolean hasValueDeterminedNumElements() { return true; } - @Override protected int numElementsFromValue(final Object value) { - return value == null ? 0 : javaStringToBCF2String(value).length(); - } - - /** - * Recode the incoming object to a String, compacting it into a - * BCF2 string if the value is a list. - * - * @param value a String or List to encode, or null - * @return a non-null string to encode - */ - private String javaStringToBCF2String(final Object value) { - if ( value == null ) - return ""; - else if (value instanceof List) { - final List l = (List)value; - return BCF2Utils.collapseStringList(l); - } else if ( value.getClass().isArray() ) { - final List l = new ArrayList(); - Collections.addAll(l, (String[])value); - return BCF2Utils.collapseStringList(l); - } else - return (String)value; - } - } - - // ---------------------------------------------------------------------- - // - // Subclass to encode FLAG - // - // ---------------------------------------------------------------------- - - public static class Flag extends BCF2FieldEncoder { - public Flag(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, BCF2Type.INT8); - if ( ! 
headerLine.isFixedCount() || headerLine.getCount() != 0 ) - throw new IllegalStateException("Flag encoder only supports atomic flags for field " + getField()); - } - - @Override - public int numElements() { - return 1; // the header says 0 but we will write 1 value - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - encoder.encodeRawBytes(1, getStaticType()); - } - } - - // ---------------------------------------------------------------------- - // - // Subclass to encode FLOAT - // - // ---------------------------------------------------------------------- - - public static class Float extends BCF2FieldEncoder { - final boolean isAtomic; - - public Float(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, BCF2Type.FLOAT); - isAtomic = hasConstantNumElements() && numElements() == 1; - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - int count = 0; - // TODO -- can be restructured to avoid toList operation - if ( isAtomic ) { - // fast path for fields with 1 fixed float value - if ( value != null ) { - encoder.encodeRawFloat((Double)value); - count++; - } - } else { - // handle generic case - final List doubles = BCF2Utils.toList(Double.class, value); - for ( final Double d : doubles ) { - if ( d != null ) { // necessary because .,. 
=> [null, null] in VC - encoder.encodeRawFloat(d); - count++; - } - } - } - for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); - } - } - - // ---------------------------------------------------------------------- - // - // Subclass to encode int[] - // - // ---------------------------------------------------------------------- - - public static class IntArray extends BCF2FieldEncoder { - public IntArray(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, null); - } - - @Override - protected int numElementsFromValue(final Object value) { - return value == null ? 0 : ((int[])value).length; - } - - @Override - public BCF2Type getDynamicType(final Object value) { - return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType((int[])value); - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - int count = 0; - if ( value != null ) { - for ( final int i : (int[])value ) { - encoder.encodeRawInt(i, type); - count++; - } - } - for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); - } - } - - // ---------------------------------------------------------------------- - // - // Subclass to encode List - // - // ---------------------------------------------------------------------- - - /** - * Specialized int encoder for atomic (non-list) integers - */ - public static class AtomicInt extends BCF2FieldEncoder { - public AtomicInt(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, null); - } - - @Override - public BCF2Type getDynamicType(final Object value) { - return value == null ? 
BCF2Type.INT8 : BCF2Utils.determineIntegerType((Integer)value); - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - int count = 0; - if ( value != null ) { - encoder.encodeRawInt((Integer)value, type); - count++; - } - for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); - } - } - - public static class GenericInts extends BCF2FieldEncoder { - public GenericInts(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, null); - } - - @Override - public BCF2Type getDynamicType(final Object value) { - return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType(BCF2Utils.toList(Integer.class, value)); - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - int count = 0; - for ( final Integer i : BCF2Utils.toList(Integer.class, value) ) { - if ( i != null ) { // necessary because .,. 
=> [null, null] in VC - encoder.encodeRawInt(i, type); - count++; - } - } - for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); - } - } -} diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldWriter.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldWriter.java deleted file mode 100644 index 857cedfe3a..0000000000 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldWriter.java +++ /dev/null @@ -1,324 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package htsjdk.variant.variantcontext.writer; - -import htsjdk.variant.bcf2.BCF2Type; -import htsjdk.variant.bcf2.BCF2Utils; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.Genotype; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFHeader; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * See #BCFWriter for documentation on this classes role in encoding BCF2 files - * - * @author Mark DePristo - * @since 06/12 - */ -public abstract class BCF2FieldWriter { - private final VCFHeader header; - private final BCF2FieldEncoder fieldEncoder; - - protected BCF2FieldWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - this.header = header; - this.fieldEncoder = fieldEncoder; - } - - protected VCFHeader getHeader() { return header; } - protected BCF2FieldEncoder getFieldEncoder() { - return fieldEncoder; - } - protected String getField() { return getFieldEncoder().getField(); } - - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - fieldEncoder.writeFieldKey(encoder); - } - - public void done(final BCF2Encoder encoder, final VariantContext vc) throws IOException { } // TODO -- overload done so that we null out values and test for correctness - - @Override - public String toString() { - return "BCF2FieldWriter " + getClass().getSimpleName() + " with encoder " + getFieldEncoder(); - } - - // -------------------------------------------------------------------------------- - // - // Sites writers - // - // -------------------------------------------------------------------------------- - - public static abstract class SiteWriter extends BCF2FieldWriter { - protected SiteWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - public abstract void site(final BCF2Encoder encoder, final VariantContext 
vc) throws IOException; - } - - public static class GenericSiteWriter extends SiteWriter { - public GenericSiteWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - @Override - public void site(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - final Object rawValue = vc.getAttribute(getField(), null); - final BCF2Type type = getFieldEncoder().getType(rawValue); - if ( rawValue == null ) { - // the value is missing, just write in null - encoder.encodeType(0, type); - } else { - final int valueCount = getFieldEncoder().numElements(vc, rawValue); - encoder.encodeType(valueCount, type); - getFieldEncoder().encodeValue(encoder, rawValue, type, valueCount); - } - } - } - - // -------------------------------------------------------------------------------- - // - // Genotypes writers - // - // -------------------------------------------------------------------------------- - - public static abstract class GenotypesWriter extends BCF2FieldWriter { - int nValuesPerGenotype = -1; - BCF2Type encodingType = null; - - protected GenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - - if ( fieldEncoder.hasConstantNumElements() ) { - nValuesPerGenotype = getFieldEncoder().numElements(); - } - } - - @Override - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - // writes the key information - super.start(encoder, vc); - - // only update if we need to - if ( ! 
getFieldEncoder().hasConstantNumElements() ) { - if ( getFieldEncoder().hasContextDeterminedNumElements() ) - // we are cheap -- just depends on genotype of allele counts - nValuesPerGenotype = getFieldEncoder().numElements(vc); - else - // we have to go fishing through the values themselves (expensive) - nValuesPerGenotype = computeMaxSizeOfGenotypeFieldFromValues(vc); - } - - encoder.encodeType(nValuesPerGenotype, encodingType); - } - - public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { - final Object fieldValue = g.getExtendedAttribute(getField(), null); - getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype); - } - - protected int numElements(final VariantContext vc, final Genotype g) { - return getFieldEncoder().numElements(vc, g.getExtendedAttribute(getField())); - } - - private final int computeMaxSizeOfGenotypeFieldFromValues(final VariantContext vc) { - int size = -1; - - for ( final Genotype g : vc.getGenotypes() ) { - size = Math.max(size, numElements(vc, g)); - } - - return size; - } - } - - public static class StaticallyTypeGenotypesWriter extends GenotypesWriter { - public StaticallyTypeGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - encodingType = getFieldEncoder().getStaticType(); - } - } - - public static class IntegerTypeGenotypesWriter extends GenotypesWriter { - public IntegerTypeGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - @Override - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - // the only value that is dynamic are integers - final List values = new ArrayList(vc.getNSamples()); - for ( final Genotype g : vc.getGenotypes() ) { - for ( final Integer i : BCF2Utils.toList(Integer.class, g.getExtendedAttribute(getField(), null)) ) { - if ( i != null ) values.add(i); - } - } 
- - encodingType = BCF2Utils.determineIntegerType(values); - super.start(encoder, vc); - } - } - - public static class IGFGenotypesWriter extends GenotypesWriter { - final IntGenotypeFieldAccessors.Accessor ige; - - public IGFGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder, final IntGenotypeFieldAccessors.Accessor ige) { - super(header, fieldEncoder); - this.ige = ige; - - if ( ! (fieldEncoder instanceof BCF2FieldEncoder.IntArray) ) - throw new IllegalArgumentException("BUG: IntGenotypesWriter requires IntArray encoder for field " + getField()); - } - - @Override - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - // TODO - // TODO this piece of code consumes like 10% of the runtime alone because fo the vc.getGenotypes() iteration - // TODO - encodingType = BCF2Type.INT8; - for ( final Genotype g : vc.getGenotypes() ) { - final int[] pls = ige.getValues(g); - final BCF2Type plsType = getFieldEncoder().getType(pls); - encodingType = BCF2Utils.maxIntegerType(encodingType, plsType); - if ( encodingType == BCF2Type.INT32 ) - break; // stop early - } - - super.start(encoder, vc); - } - - @Override - public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { - getFieldEncoder().encodeValue(encoder, ige.getValues(g), encodingType, nValuesPerGenotype); - } - - @Override - protected int numElements(final VariantContext vc, final Genotype g) { - return ige.getSize(g); - } - } - - public static class FTGenotypesWriter extends StaticallyTypeGenotypesWriter { - public FTGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - @Override - public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { - final String fieldValue = g.getFilters(); - getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype); - } - - 
@Override - protected int numElements(final VariantContext vc, final Genotype g) { - return getFieldEncoder().numElements(vc, g.getFilters()); - } - } - - public static class GTWriter extends GenotypesWriter { - final Map alleleMapForTriPlus = new HashMap(5); - Allele ref, alt1; - - public GTWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - @Override - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES ) - throw new IllegalStateException("Current BCF2 encoder cannot handle sites " + - "with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have " - + vc.getNAlleles() + " at " + vc.getContig() + ":" + vc.getStart()); - - encodingType = BCF2Type.INT8; - buildAlleleMap(vc); - nValuesPerGenotype = vc.getMaxPloidy(2); - - super.start(encoder, vc); - } - - @Override - public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { - final int samplePloidy = g.getPloidy(); - for ( int i = 0; i < nValuesPerGenotype; i++ ) { - if ( i < samplePloidy ) { - // we encode the actual allele - final Allele a = g.getAllele(i); - final int offset = getAlleleOffset(a); - final int encoded = ((offset+1) << 1) | ((g.isPhased() && i!=0) ? 0x01 : 0x00); - encoder.encodeRawBytes(encoded, encodingType); - } else { - // we need to pad with missing as we have ploidy < max for this sample - encoder.encodeRawBytes(encodingType.getMissingBytes(), encodingType); - } - } - } - - /** - * Fast path code to determine the offset. 
- * - * Inline tests for == against ref (most common, first test) - * == alt1 (second most common, second test) - * == NO_CALL (third) - * and finally in the map from allele => offset for all alt 2+ alleles - * - * @param a the allele whose offset we wish to determine - * @return the offset (from 0) of the allele in the list of variant context alleles (-1 means NO_CALL) - */ - private final int getAlleleOffset(final Allele a) { - if ( a == ref ) return 0; - else if ( a == alt1 ) return 1; - else if ( a == Allele.NO_CALL ) return -1; - else { - final Integer o = alleleMapForTriPlus.get(a); - if ( o == null ) throw new IllegalStateException("BUG: Couldn't find allele offset for allele " + a); - return o; - } - } - - private final void buildAlleleMap(final VariantContext vc) { - // these are fast path options to determine the offsets for - final int nAlleles = vc.getNAlleles(); - ref = vc.getReference(); - alt1 = nAlleles > 1 ? vc.getAlternateAllele(0) : null; - - if ( nAlleles > 2 ) { - // for multi-allelics we need to clear the map, and add additional looks - alleleMapForTriPlus.clear(); - final List alleles = vc.getAlleles(); - for ( int i = 2; i < alleles.size(); i++ ) { - alleleMapForTriPlus.put(alleles.get(i), i); - } - } - } - } -} - diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldWriterManager.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldWriterManager.java deleted file mode 100644 index 20f9ce6aa4..0000000000 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldWriterManager.java +++ /dev/null @@ -1,180 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, 
and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package htsjdk.variant.variantcontext.writer; - -import htsjdk.variant.utils.GeneralUtils; -import htsjdk.variant.vcf.VCFCompoundHeaderLine; -import htsjdk.variant.vcf.VCFConstants; -import htsjdk.variant.vcf.VCFFormatHeaderLine; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFInfoHeaderLine; - -import java.util.HashMap; -import java.util.Map; - -/** - * See #BCFWriter for documentation on this classes role in encoding BCF2 files - * - * @author Mark DePristo - * @since 06/12 - */ -public class BCF2FieldWriterManager { - final Map siteWriters = new HashMap(); - final Map genotypesWriters = new HashMap(); - final IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors(); - - public BCF2FieldWriterManager() { } - - /** - * Setup the FieldWriters appropriate to each INFO and FORMAT in the VCF header - * - * Must be called before any of the getter methods will work - * - * @param header a VCFHeader containing description for every INFO and FORMAT field we'll attempt to write out to BCF - * @param encoder the encoder we are going to use to write out the BCF2 data - * @param stringDictionary a map from VCFHeader strings to their offsets for encoding - 
*/ - public void setup(final VCFHeader header, final BCF2Encoder encoder, final Map stringDictionary) { - for (final VCFInfoHeaderLine line : header.getInfoHeaderLines()) { - final String field = line.getID(); - final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, line, encoder, stringDictionary); - add(siteWriters, field, writer); - } - - for (final VCFFormatHeaderLine line : header.getFormatHeaderLines()) { - final String field = line.getID(); - final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, line, encoder, stringDictionary); - add(genotypesWriters, field, writer); - } - } - - private final void add(final Map map, final String field, final T writer) { - if ( map.containsKey(field) ) - throw new IllegalStateException("BUG: field " + field + " already seen in VCFHeader while building BCF2 field encoders"); - map.put(field, writer); - } - - // ----------------------------------------------------------------- - // - // Master routine to look at the header, a specific line, and - // build an appropriate SiteWriter for that header element - // - // ----------------------------------------------------------------- - - private BCF2FieldWriter.SiteWriter createInfoWriter(final VCFHeader header, - final VCFInfoHeaderLine line, - final BCF2Encoder encoder, - final Map dict) { - return new BCF2FieldWriter.GenericSiteWriter(header, createFieldEncoder(line, encoder, dict, false)); - } - - private BCF2FieldEncoder createFieldEncoder(final VCFCompoundHeaderLine line, - final BCF2Encoder encoder, - final Map dict, - final boolean createGenotypesEncoders ) { - - if ( createGenotypesEncoders && intGenotypeFieldAccessors.getAccessor(line.getID()) != null ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED && line.getType() != VCFHeaderLineType.Integer ) - System.err.println("Warning: field " + line.getID() + " expected to encode an integer but saw " + line.getType() + " for record " + line); - return new BCF2FieldEncoder.IntArray(line, dict); - } 
else if ( createGenotypesEncoders && line.getID().equals(VCFConstants.GENOTYPE_KEY) ) { - return new BCF2FieldEncoder.GenericInts(line, dict); - } else { - switch ( line.getType() ) { - case Character: - case String: - return new BCF2FieldEncoder.StringOrCharacter(line, dict); - case Flag: - return new BCF2FieldEncoder.Flag(line, dict); - case Float: - return new BCF2FieldEncoder.Float(line, dict); - case Integer: - if ( line.isFixedCount() && line.getCount() == 1 ) - return new BCF2FieldEncoder.AtomicInt(line, dict); - else - return new BCF2FieldEncoder.GenericInts(line, dict); - default: - throw new IllegalArgumentException("Unexpected type for field " + line.getID()); - } - } - } - - // ----------------------------------------------------------------- - // - // Master routine to look at the header, a specific line, and - // build an appropriate Genotypes for that header element - // - // ----------------------------------------------------------------- - - private BCF2FieldWriter.GenotypesWriter createGenotypesWriter(final VCFHeader header, - final VCFFormatHeaderLine line, - final BCF2Encoder encoder, - final Map dict) { - final String field = line.getID(); - final BCF2FieldEncoder fieldEncoder = createFieldEncoder(line, encoder, dict, true); - - if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { - return new BCF2FieldWriter.GTWriter(header, fieldEncoder); - } else if ( line.getID().equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { - return new BCF2FieldWriter.FTGenotypesWriter(header, fieldEncoder); - } else if ( intGenotypeFieldAccessors.getAccessor(field) != null ) { - return new BCF2FieldWriter.IGFGenotypesWriter(header, fieldEncoder, intGenotypeFieldAccessors.getAccessor(field)); - } else if ( line.getType() == VCFHeaderLineType.Integer ) { - return new BCF2FieldWriter.IntegerTypeGenotypesWriter(header, fieldEncoder); - } else { - return new BCF2FieldWriter.StaticallyTypeGenotypesWriter(header, fieldEncoder); - } - } - - // 
----------------------------------------------------------------- - // - // Accessors to get site / genotype writers - // - // ----------------------------------------------------------------- - - /** - * Get a site writer specialized to encode values for site info field - * @param field key found in the VCF header INFO records - * @return non-null writer if one can be found, or null if none exists for field - */ - public BCF2FieldWriter.SiteWriter getSiteFieldWriter(final String field) { - return getWriter(field, siteWriters); - } - - /** - * Get a genotypes writer specialized to encode values for genotypes field - * @param field key found in the VCF header FORMAT records - * @return non-null writer if one can be found, or null if none exists for field - */ - public BCF2FieldWriter.GenotypesWriter getGenotypeFieldWriter(final String field) { - return getWriter(field, genotypesWriters); - } - - public T getWriter(final String key, final Map map) { - return map.get(key); - } -} diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java index 78990f5f3f..fd95161be2 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java @@ -28,22 +28,22 @@ import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.RuntimeIOException; +import htsjdk.tribble.TribbleException; import htsjdk.tribble.index.IndexCreator; import htsjdk.variant.bcf2.BCF2Codec; +import htsjdk.variant.bcf2.BCF2Dictionary; +import htsjdk.variant.bcf2.BCF2Encoder; import htsjdk.variant.bcf2.BCF2Type; import htsjdk.variant.bcf2.BCF2Utils; import htsjdk.variant.bcf2.BCFVersion; import htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.Genotype; -import htsjdk.variant.variantcontext.GenotypeBuilder; import 
htsjdk.variant.variantcontext.LazyGenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; -import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.bcf2.BCF2FieldWriter.BCF2FieldWriterManager; import htsjdk.variant.vcf.VCFContigHeaderLine; import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderVersion; import htsjdk.variant.vcf.VCFUtils; import java.io.ByteArrayOutputStream; @@ -52,19 +52,16 @@ import java.io.OutputStream; import java.io.OutputStreamWriter; import java.nio.file.Path; -import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; import java.util.HashMap; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; /** * VariantContextWriter that emits BCF2 binary encoding - * + *

    * Overall structure of this writer is complex for efficiency reasons - * + * <p>
    * -- The BCF2Writer manages the low-level BCF2 encoder, the mappings * from contigs and strings to offsets, the VCF header, and holds the * lower-level encoders that map from VC and Genotype fields to their @@ -72,29 +69,23 @@ * like POS, contig, the size of info and genotype data, QUAL, etc. It * has loops over the INFO and GENOTYPES to encode each individual datum * with the generic field encoders, but the actual encoding work is - * done with by the FieldWriters classes themselves - * + * done by the FieldWriters classes themselves. The piece of code + * that determines which FieldWriters to associate with each SITE and + * GENOTYPE field is the BCF2FieldWriterManager. + * <p>
    * -- BCF2FieldWriter are specialized classes for writing out SITE and * genotype information for specific SITE/GENOTYPE fields (like AC for * sites and GQ for genotypes). These are objects in themselves because - * the manage all of the complexity of relating the types in the VCF header + * they manage all of the complexity of relating the types in the VCF header * with the proper encoding in BCF as well as the type representing this * in java. Relating all three of these pieces of information together - * is the main complexity challenge in the encoder. The piece of code - * that determines which FieldWriters to associate with each SITE and - * GENOTYPE field is the BCF2FieldWriterManager. These FieldWriters - * are specialized for specific combinations of encoders (see below) - * and contexts (genotypes) for efficiency, so they smartly manage - * the writing of PLs (encoded as int[]) directly into the lowest - * level BCFEncoder. - * - * -- At the third level is the BCF2FieldEncoder, relatively simple - * pieces of code that handle the task of determining the right - * BCF2 type for specific field values, as well as reporting back - * information such as the number of elements used to encode it - * (simple for atomic values like Integer but complex for PLs - * or lists of strings) - * + * is the main complexity challenge in the encoder. These classes are + * responsible for extracting the necessary data from the VariantContext + * or Genotype, determining its BCF type and size, and writing it out. + * These FieldWriters are specialized for specific combinations of VCF type + * and contexts for efficiency, so they smartly manage the writing of PLs + * (encoded as int[]) directly into the lowest level BCF2Encoder. + * <p>
    * -- At the lowest level is the BCF2Encoder itself. This provides * just the limited encoding methods specified by the BCF2 specification. This encoder * doesn't do anything but make it possible to conveniently write out valid low-level @@ -105,19 +96,22 @@ */ class BCF2Writer extends IndexingVariantContextWriter { public static final int MAJOR_VERSION = 2; - public static final int MINOR_VERSION = 1; + public static final int MINOR_VERSION = 2; + + public static final BCFVersion VERSION = new BCFVersion(MAJOR_VERSION, MINOR_VERSION); final private static boolean ALLOW_MISSING_CONTIG_LINES = false; private final OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support private VCFHeader header; - private final Map contigDictionary = new HashMap(); - private final Map stringDictionaryMap = new LinkedHashMap(); + private final Map contigDictionary = new HashMap<>(); + private final Map stringDictionaryMap = new HashMap<>(); private final boolean doNotWriteGenotypes; - private String[] sampleNames = null; + private final Map> genotypeKeys = new HashMap<>(); + + private BCF2Encoder encoder; // initialized after the header arrives - private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives - final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager(); + private BCF2FieldWriterManager fieldWriterManager; /** * cached results for whether we can write out raw genotypes data. 
@@ -135,15 +129,15 @@ public BCF2Writer(final File location, final OutputStream output, final SAMSeque } public BCF2Writer(final Path location, final OutputStream output, final SAMSequenceDictionary refDict, - final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) { + final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) { super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing); this.outputStream = getOutputStream(); this.doNotWriteGenotypes = doNotWriteGenotypes; } public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict, - final IndexCreator indexCreator, - final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) { + final IndexCreator indexCreator, + final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) { this(IOUtil.toPath(location), output, refDict, indexCreator, enableOnTheFlyIndexing, doNotWriteGenotypes); } @@ -163,7 +157,7 @@ public BCF2Writer(final Path location, final OutputStream output, final SAMSeque // -------------------------------------------------------------------------------- @Override - public void writeHeader(VCFHeader header) { + public void writeHeader(final VCFHeader header) { setHeader(header); try { @@ -175,33 +169,54 @@ public void writeHeader(VCFHeader header) { writer.close(); final byte[] headerBytes = capture.toByteArray(); - new BCFVersion(MAJOR_VERSION, MINOR_VERSION).write(outputStream); + BCF2Writer.VERSION.write(outputStream); BCF2Type.INT32.write(headerBytes.length, outputStream); outputStream.write(headerBytes); outputHasBeenWritten = true; - } catch (IOException e) { + } catch (final IOException e) { throw new RuntimeIOException("BCF2 stream: Got IOException while trying to write BCF2 header", e); } } @Override - public void add( VariantContext vc ) { - if ( doNotWriteGenotypes ) + public void add(VariantContext vc) { + if (doNotWriteGenotypes) vc = new 
VariantContextBuilder(vc).noGenotypes().make(); vc = vc.fullyDecode(header, false); super.add(vc); // allow on the fly indexing try { - final byte[] infoBlock = buildSitesData(vc); - final byte[] genotypesBlock = buildSamplesData(vc); + // Sites data + buildSitesData(vc); + final int sitesLength = encoder.getSize(); + + // Genotypes data + final int genotypesLength; + final BCF2Codec.LazyData lazyData = getLazyData(vc); // has critical side effects + if (lazyData != null) { + // we never decoded any data from this BCF file so we don't need to re-encode the samples data + genotypesLength = lazyData.bytes.length; + } else { + // we have to do work to convert the VC into a BCF2 byte stream + buildSamplesData(vc); + genotypesLength = encoder.getSize() - sitesLength; + } + + // Write lengths + BCF2Type.INT32.write(sitesLength, outputStream); + BCF2Type.INT32.write(genotypesLength, outputStream); - // write the two blocks to disk - writeBlock(infoBlock, genotypesBlock); + // Write the encoder's buffer into the output stream + // If there was no lazy data, this also contains the genotypes data + encoder.write(outputStream); + if (lazyData != null) { + // The encoder only contained sites data, so we need to write the lazy data + outputStream.write(lazyData.bytes); + } outputHasBeenWritten = true; - } - catch ( IOException e ) { - throw new RuntimeIOException("Error writing record to BCF2 file: " + vc.toString(), e); + } catch (final IOException e) { + throw new RuntimeIOException("Error writing record to BCF2 file: " + vc, e); } } @@ -209,8 +224,7 @@ public void add( VariantContext vc ) { public void close() { try { outputStream.flush(); - } - catch ( IOException e ) { + } catch (final IOException e) { throw new RuntimeIOException("Failed to flush BCF2 file"); } super.close(); @@ -221,39 +235,50 @@ public void setHeader(final VCFHeader header) { if (outputHasBeenWritten) { throw new IllegalStateException("The header cannot be modified after the header or variants have been 
written to the output stream."); } + + // TODO we default to 2.2 here, is this alright? + encoder = BCF2Encoder.getEncoder(BCF2Codec.ALLOWED_BCF_VERSION); + // make sure the header is sorted correctly - this.header = doNotWriteGenotypes ? new VCFHeader(header.getMetaDataInSortedOrder()) : new VCFHeader( - header.getMetaDataInSortedOrder(), header.getGenotypeSamples()); + this.header = doNotWriteGenotypes + ? new VCFHeader(header.getMetaDataInSortedOrder()) + : new VCFHeader(header.getMetaDataInSortedOrder(), header.getGenotypeSamples()); + + // TODO should follow up on hts-specs and clarify the relationship between ##dictionary and IDX fields + // Error on ##dictionary lines, we don't know what to do with them + if (this.header.getMetaDataInInputOrder().stream().anyMatch(line -> line.getKey().equals("dictionary"))) { + throw new TribbleException("Use of the ##dictionary line is not supported"); + } + // create the config offsets map - if ( this.header.getContigLines().isEmpty() ) { - if ( ALLOW_MISSING_CONTIG_LINES ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { + if (this.header.getContigLines().isEmpty()) { + if (ALLOW_MISSING_CONTIG_LINES) { + if (GeneralUtils.DEBUG_MODE_ENABLED) { System.err.println("No contig dictionary found in header, falling back to reference sequence dictionary"); } + // The reference sequence dictionary should never contain IDX fields createContigDictionary(VCFUtils.makeContigHeaderLines(getRefDict(), null)); } else { throw new IllegalStateException("Cannot write BCF2 file with missing contig lines"); } } else { - createContigDictionary(this.header.getContigLines()); - } - // set up the map from dictionary string values -> offset - final ArrayList dict = BCF2Utils.makeDictionary(this.header); - for ( int i = 0; i < dict.size(); i++ ) { - stringDictionaryMap.put(dict.get(i), i); + final BCF2Dictionary dict = BCF2Dictionary.makeBCF2ContigDictionary(header, BCF2Writer.VERSION); + dict.forEach((offset, string) -> contigDictionary.put(string, 
offset)); } - sampleNames = this.header.getGenotypeSamples().toArray(new String[this.header.getNGenotypeSamples()]); - // setup the field encodings - fieldManager.setup(this.header, encoder, stringDictionaryMap); + // Create offset -> string map then turn inside-out + final BCF2Dictionary dict = BCF2Dictionary.makeBCF2StringDictionary(this.header, BCF2Writer.VERSION); + dict.forEach((offset, string) -> stringDictionaryMap.put(string, offset)); + // Set up the field encodings + fieldWriterManager = new BCF2FieldWriterManager(header, stringDictionaryMap, encoder); } // -------------------------------------------------------------------------------- // // implicit block // - // The first four records of BCF are inline untype encoded data of: + // The first four records of BCF are inline untyped encoded data of: // // 4 byte integer chrom offset // 4 byte integer start @@ -261,23 +286,23 @@ public void setHeader(final VCFHeader header) { // 4 byte float qual // // -------------------------------------------------------------------------------- - private byte[] buildSitesData( VariantContext vc ) throws IOException { + private void buildSitesData(final VariantContext vc) throws IOException { final int contigIndex = contigDictionary.get(vc.getContig()); - if ( contigIndex == -1 ) + if (contigIndex == -1) throw new IllegalStateException(String.format("Contig %s not found in sequence dictionary from reference", vc.getContig())); - // note use of encodeRawValue to not insert the typing byte - encoder.encodeRawValue(contigIndex, BCF2Type.INT32); + // note use of encodeRawInt to not insert the typing byte + encoder.encodeRawInt(contigIndex, BCF2Type.INT32); // pos. GATK is 1 based, BCF2 is 0 based - encoder.encodeRawValue(vc.getStart() - 1, BCF2Type.INT32); + encoder.encodeRawInt(vc.getStart() - 1, BCF2Type.INT32); // ref length. 
GATK is closed, but BCF2 is open so the ref length is GATK end - GATK start + 1 // for example, a SNP is in GATK at 1:10-10, which has ref length 10 - 10 + 1 = 1 - encoder.encodeRawValue(vc.getEnd() - vc.getStart() + 1, BCF2Type.INT32); + encoder.encodeRawInt(vc.getEnd() - vc.getStart() + 1, BCF2Type.INT32); // qual - if ( vc.hasLog10PError() ) + if (vc.hasLog10PError()) encoder.encodeRawFloat((float) vc.getPhredScaledQual()); else encoder.encodeRawMissingValue(BCF2Type.FLOAT); @@ -295,14 +320,12 @@ private byte[] buildSitesData( VariantContext vc ) throws IOException { buildAlleles(vc); buildFilter(vc); buildInfo(vc); - - return encoder.getRecordBytes(); } /** * Can we safely write on the raw (undecoded) genotypes of an input VC? - * + *

    * The cache depends on the undecoded lazy data header == lastVCFHeaderOfUnparsedGenotypes, in * which case we return the previous result. If it's not cached, we use the BCF2Util to * compare the VC header with our header (expensive) and cache it. @@ -311,9 +334,9 @@ private byte[] buildSitesData( VariantContext vc ) throws IOException { * @return */ private boolean canSafelyWriteRawGenotypesBytes(final BCF2Codec.LazyData lazyData) { - if ( lazyData.header != lastVCFHeaderOfUnparsedGenotypes ) { + if (lazyData.header != lastVCFHeaderOfUnparsedGenotypes) { // result is already cached - canPassOnUnparsedGenotypeDataForLastVCFHeader = BCF2Utils.headerLinesAreOrderedConsistently(this.header,lazyData.header); + canPassOnUnparsedGenotypeDataForLastVCFHeader = BCF2Utils.headerLinesAreOrderedConsistently(this.header, lazyData.header); lastVCFHeaderOfUnparsedGenotypes = lazyData.header; } @@ -321,12 +344,12 @@ private boolean canSafelyWriteRawGenotypesBytes(final BCF2Codec.LazyData lazyDat } private BCF2Codec.LazyData getLazyData(final VariantContext vc) { - if ( vc.getGenotypes().isLazyWithData() ) { - final LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes(); + if (vc.getGenotypes().isLazyWithData()) { + final LazyGenotypesContext lgc = (LazyGenotypesContext) vc.getGenotypes(); - if ( lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData && - canSafelyWriteRawGenotypesBytes((BCF2Codec.LazyData) lgc.getUnparsedGenotypeData())) { - return (BCF2Codec.LazyData)lgc.getUnparsedGenotypeData(); + if (lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData && + canSafelyWriteRawGenotypesBytes((BCF2Codec.LazyData) lgc.getUnparsedGenotypeData())) { + return (BCF2Codec.LazyData) lgc.getUnparsedGenotypeData(); } else { lgc.decode(); // WARNING -- required to avoid keeping around bad lazy data for too long } @@ -337,7 +360,7 @@ private BCF2Codec.LazyData getLazyData(final VariantContext vc) { /** * Try to get the nGenotypeFields as efficiently as possible. 
- * + *

    * If this is a lazy BCF2 object just grab the field count from there, * otherwise do the whole counting by types test in the actual data * @@ -346,80 +369,46 @@ private BCF2Codec.LazyData getLazyData(final VariantContext vc) { */ private int getNGenotypeFormatFields(final VariantContext vc) { final BCF2Codec.LazyData lazyData = getLazyData(vc); - return lazyData != null ? lazyData.nGenotypeFields : vc.calcVCFGenotypeKeys(header).size(); + if (lazyData == null) { + // Calculate genotype keys of a VariantContext and cache result + // This computation can be expensive as it needs to inspect every genotype in the VC, + // so we cache the result as it will be needed again when writing the genotype information + return genotypeKeys.computeIfAbsent(vc, v -> v.calcVCFGenotypeKeys(header)).size(); + } else { + return lazyData.nGenotypeFields; + } } - private void buildID( VariantContext vc ) throws IOException { + private void buildID(final VariantContext vc) throws IOException { encoder.encodeTypedString(vc.getID()); } - private void buildAlleles( VariantContext vc ) throws IOException { - for ( Allele allele : vc.getAlleles() ) { + private void buildAlleles(final VariantContext vc) throws IOException { + for (final Allele allele : vc.getAlleles()) { final byte[] s = allele.getDisplayBases(); - if ( s == null ) + if (s == null) throw new IllegalStateException("BUG: BCF2Writer encountered null padded allele" + allele); encoder.encodeTypedString(s); } } - private void buildFilter( VariantContext vc ) throws IOException { - if ( vc.isFiltered() ) { + private void buildFilter(final VariantContext vc) throws IOException { + if (vc.isFiltered()) { encodeStringsByRef(vc.getFilters()); - } else if ( vc.filtersWereApplied() ) { - encodeStringsByRef(Collections.singleton(VCFConstants.PASSES_FILTERS_v4)); + } else if (vc.filtersWereApplied()) { + // PASS is always implicitly encoded as 0 + encoder.encodeTypedInt(0, BCF2Type.INT8); } else { 
encoder.encodeTypedMissing(BCF2Type.INT8); } } - private void buildInfo( VariantContext vc ) throws IOException { - for ( Map.Entry infoFieldEntry : vc.getAttributes().entrySet() ) { - final String field = infoFieldEntry.getKey(); - final BCF2FieldWriter.SiteWriter writer = fieldManager.getSiteFieldWriter(field); - if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "INFO"); - writer.start(encoder, vc); - writer.site(encoder, vc); - writer.done(encoder, vc); - } + private void buildInfo(final VariantContext vc) throws IOException { + fieldWriterManager.writeInfo(vc); } - private byte[] buildSamplesData(final VariantContext vc) throws IOException { - final BCF2Codec.LazyData lazyData = getLazyData(vc); // has critical side effects - if ( lazyData != null ) { - // we never decoded any data from this BCF file, so just pass it back - return lazyData.bytes; - } - - // we have to do work to convert the VC into a BCF2 byte stream - final List genotypeFields = vc.calcVCFGenotypeKeys(header); - for ( final String field : genotypeFields ) { - final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field); - if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT"); - - assert writer != null; - - writer.start(encoder, vc); - for ( final String name : sampleNames ) { - Genotype g = vc.getGenotype(name); - if ( g == null ) g = GenotypeBuilder.createMissing(name, writer.nValuesPerGenotype); - writer.addGenotype(encoder, vc, g); - } - writer.done(encoder, vc); - } - return encoder.getRecordBytes(); - } - - /** - * Throws a meaningful error message when a field (INFO or FORMAT) is found when writing out a file - * but there's no header line for it. 
- * - * @param vc - * @param field - * @param fieldType - */ - private void errorUnexpectedFieldToWrite(final VariantContext vc, final String field, final String fieldType) { - throw new IllegalStateException("Found field " + field + " in the " + fieldType + " fields of VariantContext at " + - vc.getContig() + ":" + vc.getStart() + " from " + vc.getSource() + " but this hasn't been defined in the VCFHeader"); + private void buildSamplesData(final VariantContext vc) throws IOException { + fieldWriterManager.writeFormat(vc, genotypeKeys.get(vc)); } // -------------------------------------------------------------------------------- @@ -428,34 +417,20 @@ private void errorUnexpectedFieldToWrite(final VariantContext vc, final String f // // -------------------------------------------------------------------------------- - /** - * Write the data in the encoder to the outputstream as a length encoded - * block of data. After this call the encoder stream will be ready to - * start a new data block - * - * @throws IOException - */ - private void writeBlock(final byte[] infoBlock, final byte[] genotypesBlock) throws IOException { - BCF2Type.INT32.write(infoBlock.length, outputStream); - BCF2Type.INT32.write(genotypesBlock.length, outputStream); - outputStream.write(infoBlock); - outputStream.write(genotypesBlock); - } - - private BCF2Type encodeStringsByRef(final Collection strings) throws IOException { - final List offsets = new ArrayList(strings.size()); + private void encodeStringsByRef(final Collection strings) throws IOException { + final int[] offsets = new int[strings.size()]; + int i = 0; - // iterate over strings until we find one that needs 16 bits, and break - for ( final String string : strings ) { + // Map strings to their position in string dictionary + for (final String string : strings) { final Integer got = stringDictionaryMap.get(string); - if ( got == null ) throw new IllegalStateException("Format error: could not find string " + string + " in header as 
required by BCF"); - final int offset = got; - offsets.add(offset); + if (got == null) + throw new IllegalStateException("Format error: could not find string " + string + " in header as required by BCF"); + offsets[i] = got; + i++; } - final BCF2Type type = BCF2Utils.determineIntegerType(offsets); - encoder.encodeTyped(offsets, type); - return type; + encoder.encodeTypedVecInt(offsets); } /** @@ -465,7 +440,7 @@ private BCF2Type encodeStringsByRef(final Collection strings) throws IOE */ private void createContigDictionary(final Collection contigLines) { int offset = 0; - for ( VCFContigHeaderLine contig : contigLines ) + for (final VCFContigHeaderLine contig : contigLines) contigDictionary.put(contig.getID(), offset++); } } diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java b/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java index 215eaf996b..0dd3e9d77c 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java @@ -481,7 +481,8 @@ else if (STREAM_TYPES.contains(this.outType)) if ((refDict == null) && (options.contains(Options.INDEX_ON_THE_FLY))) throw new IllegalArgumentException("A reference dictionary is required for creating Tribble indices on the fly"); - writer = createBCFWriter(outPath, outStreamFromFile); + // BCFs are always bgzipped, but the compression level can be set to 0 to only apply trivial compression + writer = createBCFWriter(outPath, new BlockCompressedOutputStream(outStreamFromFile, outPath)); break; case VCF_STREAM: writer = createVCFWriter(null, outStreamFromFile); @@ -492,7 +493,7 @@ else if (STREAM_TYPES.contains(this.outType)) options.remove(Options.INDEX_ON_THE_FLY); } - writer = createBCFWriter(null, outStream); + writer = createBCFWriter(null, new BlockCompressedOutputStream(outStreamFromFile, outPath)); break; } diff --git 
a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java index cacff036b5..753a1c16f1 100644 --- a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java @@ -405,7 +405,12 @@ private Set parsePrimaryHeaderLine(final String headerLine) { } while ( col < columns.length ) { - sampleNames.add(columns[col++]); + // Sample names must be unique + if (sampleNames.contains(columns[col])) { + throw new TribbleException.InvalidHeader("duplicate sample name: " + columns[col]); + } else { + sampleNames.add(columns[col++]); + } } if ( sawFormatTag && sampleNames.isEmpty()) diff --git a/src/main/java/htsjdk/variant/vcf/VCFFileReader.java b/src/main/java/htsjdk/variant/vcf/VCFFileReader.java index 2ab29ddcb4..7deade9374 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFileReader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFileReader.java @@ -55,14 +55,21 @@ public class VCFFileReader implements VCFReader { * Returns true if the given file appears to be a BCF file. */ public static boolean isBCF(final File file) { - return isBCF(file.toPath()); + return isBCF(file.toString()); } /** * Returns true if the given path appears to be a BCF file. */ public static boolean isBCF(final Path path) { - return path.toUri().getRawPath().endsWith(FileExtensions.BCF); + return isBCF(path.toUri().getRawPath()); + } + + /** + * Returns true if the given path appears to be a BCF file. 
+ */ + public static boolean isBCF(final String path) { + return path.endsWith(FileExtensions.BCF) || path.endsWith(FileExtensions.COMPRESSED_BCF); } /** diff --git a/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java index 1b890db1b1..101ff304c6 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java @@ -87,11 +87,6 @@ private void validate() { } } - @Override - public boolean shouldBeAddedToDictionary() { - return true; - } - /** * get the "Description" field * @return the "Description" field diff --git a/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java index fc75ee5291..1e927b7d05 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java @@ -93,12 +93,8 @@ public static VCFFormatHeaderLine getMergedFormatHeaderLine( private void validate() { if (this.getType() == VCFHeaderLineType.Flag) { - throw new TribbleException(String.format("Flag is an unsupported type for format fields: ", this.toStringEncoding())); + throw new TribbleException("Flag is an unsupported type for format fields: " + this.toStringEncoding()); } } - @Override - public boolean shouldBeAddedToDictionary() { - return true; - } } \ No newline at end of file diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java index 9214f7095f..c1bec06d47 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java @@ -134,7 +134,7 @@ public Optional> getValidationFailure(final } /** - * Validate that the header line conforms to {@code vcfTargetVersion. + * Validate that the header line conforms to {@code vcfTargetVersion}. 
* @param vcfTargetVersion * @throws {@link TribbleException.VersionValidationFailure} if this header line fails to conform */ @@ -160,16 +160,6 @@ protected Optional validateKeyOrID(final String keyString) { } } - /** - * By default the header lines won't be added to the BCF dictionary, unless this method is overriden - * (for example in FORMAT, INFO or FILTER header lines). - * - * @return false - */ - public boolean shouldBeAddedToDictionary() { - return false; - } - public String toString() { return toStringEncoding(); } diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java index a22ecd2102..2397e28641 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java @@ -159,14 +159,21 @@ public Map parseLine(String valueLine, List expectedTagO throw new TribbleException.InvalidHeader("Unclosed quote in header line value " + valueLine); } + // Validate the order of all discovered tags against requiredTagOrder. All tags are treated as // "optional". Succeeding does not mean that all expected tags in the list were seen. Also, all // structured header lines can have "extra" tags, with no order specified, so additional tags // are tolerated. 
if ( expectedTagOrder != null ) { + // If there are N expected tags present in the parsed header, the first N tags must exactly + // match the order of the expected tags list, the remaining tags are considered optional + int numExpectedTagsPresent = 0; + for (final String expectedTag : expectedTagOrder) { + if (ret.containsKey(expectedTag)) numExpectedTagsPresent++; + } index = 0; - for (String str : ret.keySet()) { - if (index >= expectedTagOrder.size()) { + for (final String str : ret.keySet()) { + if (index == numExpectedTagsPresent) { break; // done - end of requiredTagOrder list } else if (!expectedTagOrder.get(index).equals(str)) { throw new TribbleException.InvalidHeader( diff --git a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java index 4a116e1381..410409ca12 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java @@ -110,9 +110,4 @@ protected Optional validateKeyOrID(final String id) { : super.validateKeyOrID(id); } - @Override - public boolean shouldBeAddedToDictionary() { - return true; - } - } diff --git a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java index 2c53899f1d..a5271114d4 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java @@ -233,10 +233,6 @@ private void validate() { throw new TribbleException( String.format("The required ID tag is missing or not the first attribute: key=%s", super.getKey())); } - final Optional validationFailure = validateKeyOrID(getGenericFieldValue(ID_ATTRIBUTE)); - if (validationFailure.isPresent()) { - throw new TribbleException.VersionValidationFailure(validationFailure.get()); - } } // Perform all text transformations required to encode an attribute value diff --git a/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java 
b/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java index 0d61cf35e4..6dd5f3906f 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java +++ b/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java @@ -207,7 +207,17 @@ public T repair(final T line) { + (badCount ? " -- counts disagree; header has " + line.getCount() + " but standard is " + standard.getCount() : "") + (badDesc ? " -- descriptions disagree; header has '" + line.getDescription() + "' but standard is '" + standard.getDescription() + "'": "")); } - return standard; + // Create a new set so we can modify it without mutating the standard line + final Set additionalFields = new HashSet<>(line.getGenericFields().keySet()); + additionalFields.removeAll(standard.getGenericFields().keySet()); + + if (additionalFields.isEmpty()) { + return standard; + } else { + // We need to handle the case where a line has nonstandard attributes, but also additional + // attributes of its own that would be lost if we simply returned the standard line + return mergeStandardLine(standard, line, additionalFields); + } } else { return line; } @@ -216,6 +226,26 @@ public T repair(final T line) { } } + private T mergeStandardLine(final T standard, final T line, final Set additionalFields) { + // Create a new line identical to the standard line + final VCFCompoundHeaderLine mergedLine; + if (standard instanceof VCFFormatHeaderLine) { + mergedLine = standard.isFixedCount() + ? new VCFFormatHeaderLine(standard.getID(), standard.getCount(), standard.getType(), standard.getDescription()) + : new VCFFormatHeaderLine(standard.getID(), standard.getCountType(), standard.getType(), standard.getDescription()); + } else { + mergedLine = standard.isFixedCount() + ? 
new VCFInfoHeaderLine(standard.getID(), standard.getCount(), standard.getType(), standard.getDescription()) + : new VCFInfoHeaderLine(standard.getID(), standard.getCountType(), standard.getType(), standard.getDescription()); + } + + final Map originalGenericFields = line.getGenericFields(); + for (final String field : additionalFields) { + mergedLine.updateGenericField(field, originalGenericFields.get(field)); + } + return (T) mergedLine; + } + public Set addToHeader(final Set headerLines, final Collection IDs, final boolean throwErrorForMissing) { final Set missing = new HashSet(); for ( final String ID : IDs ) { diff --git a/src/test/java/htsjdk/samtools/SamStreamsTest.java b/src/test/java/htsjdk/samtools/SamStreamsTest.java index d08a14dabf..7611c762f3 100644 --- a/src/test/java/htsjdk/samtools/SamStreamsTest.java +++ b/src/test/java/htsjdk/samtools/SamStreamsTest.java @@ -28,6 +28,7 @@ import htsjdk.samtools.seekablestream.SeekableFileStream; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.seekablestream.SeekableStreamFactory; +import htsjdk.samtools.util.IOUtil; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -44,7 +45,7 @@ public class SamStreamsTest extends HtsjdkTest { public void testDataFormat(final String inputFile, final boolean isGzippedSAMFile, final boolean isBAMFile, final boolean isCRAMFile) throws Exception { final File input = new File(TEST_DATA_DIR, inputFile); try(final InputStream fis = new BufferedInputStream(new FileInputStream(input))) { //must be buffered or the isGzippedSAMFile will blow up - Assert.assertEquals(SamStreams.isGzippedSAMFile(fis), isGzippedSAMFile, "isGzippedSAMFile:" + inputFile); + Assert.assertEquals(IOUtil.isGZIPInputStream(fis), isGzippedSAMFile, "isGzippedSAMFile:" + inputFile); Assert.assertEquals(SamStreams.isBAMFile(fis), isBAMFile, "isBAMFile:" + inputFile); Assert.assertEquals(SamStreams.isCRAMFile(fis), isCRAMFile, 
"isCRAMFile:" + inputFile); } diff --git a/src/test/java/htsjdk/utils/BCFToolsTestUtils.java b/src/test/java/htsjdk/utils/BCFToolsTestUtils.java new file mode 100644 index 0000000000..8193791e93 --- /dev/null +++ b/src/test/java/htsjdk/utils/BCFToolsTestUtils.java @@ -0,0 +1,136 @@ +package htsjdk.utils; + +import htsjdk.samtools.util.FileExtensions; +import htsjdk.samtools.util.ProcessExecutor; +import htsjdk.samtools.util.RuntimeIOException; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; + +public class BCFToolsTestUtils { + private static final String BCFTOOLS_BINARY_ENV_VARIABLE = "HTSJDK_BCFTOOLS_BIN"; + public static final String expectedBCFtoolsVersion = "1.13"; + + /** + * @return true if bcftools is available, otherwise false + */ + public static boolean isBCFToolsAvailable() { + final String binPath = getBCFToolsBin(); + final Path binFile = Paths.get(binPath); + return Files.exists(binFile); + } + + /** + * @throws RuntimeException if bcftools executable is not available + */ + public static void assertBCFToolsAvailable() { + if (!isBCFToolsAvailable()) { + throw new RuntimeException(String.format( + "No bcftools executable can be found." + + " The %s environment variable must be set to the name of the local bcftools executable.", + BCFTOOLS_BINARY_ENV_VARIABLE + )); + } + } + + /** + * @return the name and location of the local bcftools executable as specified by the environment + * variable HTSJDK_BCFTOOLS_BIN, or the default value of "/usr/local/bin/bcftools" if the environment + * variable is not set + */ + public static String getBCFToolsBin() { + final String bcftoolsPath = System.getenv(BCFTOOLS_BINARY_ENV_VARIABLE); + return bcftoolsPath == null ? "/usr/local/bin/bcftools" : bcftoolsPath; + } + + /** + * Execute a bcftools command line if a local bcftools executable is available see {@link #isBCFToolsAvailable()}. 
+ * + * @param commandLine bcftools command line string, excluding the "bcftools" prefix. For example: + * {@code "view my.vcf > my.bcf"} + * @return the {@link ProcessExecutor.ExitStatusAndOutput} resulting from the command execution, if + * the command succeeds + * @throws RuntimeException if the command fails, or if a local bcftools executable is not available. + */ + public static ProcessExecutor.ExitStatusAndOutput executeBCFToolsCommand(final String commandLine) { + assertBCFToolsAvailable(); + final String commandString = String.format("%s %s", getBCFToolsBin(), commandLine); + final ProcessExecutor.ExitStatusAndOutput processStatus = + ProcessExecutor.executeAndReturnInterleavedOutput(commandString); + if (processStatus.exitStatus != 0) { + // bcftools seems to write some errors to stdout + throw new RuntimeException( + String.format( + "Failure code %d returned from bcftools command %s\n (stderr: %.500s)\n (stdout: %.500s)\n", + processStatus.exitStatus, + commandString, + processStatus.stderr == null ? "" : processStatus.stderr, + processStatus.stdout == null ? "" : processStatus.stdout + ) + ); + } + return processStatus; + } + + /** + * Convert an input VCF file to a temporary BCF file using the bcftools "view" command. The temp + * file will be deleted when the process exits. Use {@link #isBCFToolsAvailable()} to determine if it's safe + * to use this method. + * + * @param inputVCF input file to convert + * @param commandLineOptions additional command line options (--input-fmt-option or --output-fmt-option) + * @return a temporary file containing the bcftools-generated results. + */ + public static File VCFtoBCF( + final File inputVCF, + final String commandLineOptions + ) { + assertBCFToolsAvailable(); + try { + final File tempBCFFile = File.createTempFile("bcftoolsTemporaryBCF", FileExtensions.BCF); + tempBCFFile.deleteOnExit(); + final String commandString = String.format( + "view %s %s -o %s", + commandLineOptions == null ? 
"" : commandLineOptions, + inputVCF.getAbsolutePath(), + tempBCFFile.getAbsolutePath() + ); + executeBCFToolsCommand(commandString); + return tempBCFFile; + } catch (final IOException e) { + throw new RuntimeIOException(e); + } + } + + /** + * Convert an input BCF file to a temporary VCF file using the bcftools "view" command. The temp + * file will be deleted when the process exits. Use {@link #isBCFToolsAvailable()} to determine if it's safe + * to use this method. + * + * @param inputBCF input file to convert + * @param commandLineOptions additional command line options (--input-fmt-option or --output-fmt-option) + * @return a temporary file containing the bcftools-generated results. + */ + public static File BCFToVCF( + final File inputBCF, + final String commandLineOptions + ) { + assertBCFToolsAvailable(); + try { + final File tempVCFFile = File.createTempFile("bcftoolsTemporaryVCF" + inputBCF, FileExtensions.VCF); + final String commandString = String.format( + "view %s %s -o %s", + commandLineOptions == null ? 
"" : commandLineOptions, + inputBCF.getAbsolutePath(), + tempVCFFile.getAbsolutePath() + ); + executeBCFToolsCommand(commandString); + return tempVCFFile; + } catch (final IOException e) { + throw new RuntimeIOException(e); + } + } +} diff --git a/src/test/java/htsjdk/utils/BCFToolsTestUtilsTest.java b/src/test/java/htsjdk/utils/BCFToolsTestUtilsTest.java new file mode 100644 index 0000000000..9fd5451f55 --- /dev/null +++ b/src/test/java/htsjdk/utils/BCFToolsTestUtilsTest.java @@ -0,0 +1,35 @@ +package htsjdk.utils; + +import htsjdk.HtsjdkTest; +import htsjdk.samtools.util.ProcessExecutor; +import org.testng.Assert; +import org.testng.SkipException; +import org.testng.annotations.Test; + +public class BCFToolsTestUtilsTest extends HtsjdkTest { + + @Test + public void testBCFToolsIsAvailable() { + Assert.assertTrue(BCFToolsTestUtils.isBCFToolsAvailable()); + } + + @Test + public void testBCFToolsVersion() { + if (!BCFToolsTestUtils.isBCFToolsAvailable()) { + throw new SkipException("bcftools not available on local device"); + } + // If this test runs, but fails because version validation fails, then the local bcftools version is + // not the one expected by the htsjdk tests + final ProcessExecutor.ExitStatusAndOutput processStatus = BCFToolsTestUtils.executeBCFToolsCommand("--version"); + Assert.assertTrue(processStatus.stdout.contains(BCFToolsTestUtils.expectedBCFtoolsVersion)); + } + + + @Test(expectedExceptions = RuntimeException.class) + public void testBCFToolsPresentButCommandFails() { + if (!BCFToolsTestUtils.isBCFToolsAvailable()) { + throw new SkipException("bcftools not available on local device"); + } + BCFToolsTestUtils.executeBCFToolsCommand("--notABcftoolsCommand"); + } +} diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java new file mode 100644 index 0000000000..9d5b09a0ec --- /dev/null +++ b/src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java @@ -0,0 +1,101 @@ 
+package htsjdk.variant.bcf2; + +import htsjdk.tribble.TribbleException; +import htsjdk.variant.VariantBaseTest; +import htsjdk.variant.vcf.VCFContigHeaderLine; +import htsjdk.variant.vcf.VCFFilterHeaderLine; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFHeaderVersion; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import htsjdk.variant.vcf.VCFSimpleHeaderLine; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.List; + +public class BCF2DictionaryTest extends VariantBaseTest { + + @DataProvider(name = "dictionaryProvider") + public Object[][] dictionaryProvider() { + final List cases = new ArrayList<>(); + + final List inputLines = new ArrayList<>(); + int counter = 0; + inputLines.add(new VCFHeaderLine(VCFHeader.DEFAULT_VCF_VERSION.getFormatString(), VCFHeader.DEFAULT_VCF_VERSION.getVersionString())); + inputLines.add(new VCFFilterHeaderLine("l" + counter++)); + inputLines.add(new VCFFilterHeaderLine("l" + counter++)); + inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); + inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); + inputLines.add(new VCFInfoHeaderLine("A" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFInfoHeaderLine("A" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFHeaderLine("x", "misc")); + inputLines.add(new VCFHeaderLine("y", "misc")); + inputLines.add(new VCFFilterHeaderLine("aFilter", "misc")); + inputLines.add(new VCFFormatHeaderLine("A" + counter++, 
VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFFormatHeaderLine("A" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet<>(inputLines)); + + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) { + final BCF2Dictionary dict = BCF2Dictionary.makeBCF2StringDictionary(inputHeader, version); + cases.add(new Object[]{dict}); + } + + return cases.toArray(new Object[0][]); + } + + @Test(dataProvider = "dictionaryProvider") + public void testCreateDictionary(final BCF2Dictionary dict) { + final int dict_size = dict.size(); + Assert.assertEquals(8, dict_size); + } + + /* + @DataProvider(name = "inconsistentIDXProvider") + public Object[][] inconsistentIDXProvider() { + final List cases = new ArrayList<>(); + + // TODO can't create FILTER/FORMAT/INFO lines with arbitrary attributes + // should probably be addressed as part of refactoring, would be simpler and more consistent + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) { + // String lines with inconsistent IDX + { + int counter = 0; + final List inputLines = new ArrayList<>(); + inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); + inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)).getGenericFieldValue()); + + new VCFSimpleHeaderLine() + + + final VCFHeader header = new VCFHeader(new LinkedHashSet<>(inputLines)); + final BCF2Dictionary dict = BCF2Dictionary.makeBCF2StringDictionary(header, version); + cases.add(new Object[]{dict}); + } + + // Contig lines with inconsistent IDX + { + + } + } + + return cases.toArray(new Object[0][]); + } + + @Test(expectedExceptions = {TribbleException.class}) + public void inconsistentIDX(final VCFHeader header, final BCFVersion version, final boolean string) { + if (string) { + BCF2Dictionary.makeBCF2StringDictionary(header, version); + } else { + BCF2Dictionary.makeBCF2ContigDictionary(header, version); 
+ } + } + */ +} diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2EncoderDecoderUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2EncoderDecoderUnitTest.java index d0d3a88fe2..050931444b 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2EncoderDecoderUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2EncoderDecoderUnitTest.java @@ -1,33 +1,33 @@ /* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ package htsjdk.variant.bcf2; // the imports for unit testing. 
+ import htsjdk.variant.VariantBaseTest; -import htsjdk.variant.variantcontext.writer.BCF2Encoder; import org.testng.Assert; import org.testng.annotations.BeforeSuite; import org.testng.annotations.DataProvider; @@ -45,9 +45,9 @@ public class BCF2EncoderDecoderUnitTest extends VariantBaseTest { private final double FLOAT_TOLERANCE = 1e-6; - final List primitives = new ArrayList(); - final List basicTypes = new ArrayList(); - final List forCombinations = new ArrayList(); + final List primitives = new ArrayList<>(); + final List basicTypes = new ArrayList<>(); + final List forCombinations = new ArrayList<>(); @BeforeSuite public void before() { @@ -63,23 +63,23 @@ public void before() { primitives.add(new BCF2TypedValue(-1, BCF2Type.INT8)); primitives.add(new BCF2TypedValue(100, BCF2Type.INT8)); primitives.add(new BCF2TypedValue(-100, BCF2Type.INT8)); - primitives.add(new BCF2TypedValue(-127, BCF2Type.INT8)); // last value in range - primitives.add(new BCF2TypedValue( 127, BCF2Type.INT8)); // last value in range + primitives.add(new BCF2TypedValue(-120, BCF2Type.INT8)); // last value in range + primitives.add(new BCF2TypedValue(127, BCF2Type.INT8)); // last value in range // medium ints primitives.add(new BCF2TypedValue(-1000, BCF2Type.INT16)); primitives.add(new BCF2TypedValue(1000, BCF2Type.INT16)); primitives.add(new BCF2TypedValue(-128, BCF2Type.INT16)); // first value in range - primitives.add(new BCF2TypedValue( 128, BCF2Type.INT16)); // first value in range - primitives.add(new BCF2TypedValue(-32767, BCF2Type.INT16)); // last value in range - primitives.add(new BCF2TypedValue( 32767, BCF2Type.INT16)); // last value in range + primitives.add(new BCF2TypedValue(128, BCF2Type.INT16)); // first value in range + primitives.add(new BCF2TypedValue(-32760, BCF2Type.INT16)); // last value in range + primitives.add(new BCF2TypedValue(32767, BCF2Type.INT16)); // last value in range // larger ints primitives.add(new BCF2TypedValue(-32768, BCF2Type.INT32)); // first value 
in range - primitives.add(new BCF2TypedValue( 32768, BCF2Type.INT32)); // first value in range + primitives.add(new BCF2TypedValue(32768, BCF2Type.INT32)); // first value in range primitives.add(new BCF2TypedValue(-100000, BCF2Type.INT32)); primitives.add(new BCF2TypedValue(100000, BCF2Type.INT32)); - primitives.add(new BCF2TypedValue(-2147483647, BCF2Type.INT32)); + primitives.add(new BCF2TypedValue(-2147483640, BCF2Type.INT32)); primitives.add(new BCF2TypedValue(2147483647, BCF2Type.INT32)); // floats @@ -116,7 +116,7 @@ public void before() { primitives.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ", BCF2Type.CHAR)); // missing values - for ( BCF2Type type : BCF2Type.values() ) { + for (final BCF2Type type : BCF2Type.values()) { primitives.add(new BCF2TypedValue(null, type)); } @@ -124,7 +124,7 @@ public void before() { forCombinations.add(new BCF2TypedValue(100, BCF2Type.INT8)); forCombinations.add(new BCF2TypedValue(-100, BCF2Type.INT8)); forCombinations.add(new BCF2TypedValue(-128, BCF2Type.INT16)); // first value in range - forCombinations.add(new BCF2TypedValue( 128, BCF2Type.INT16)); // first value in range + forCombinations.add(new BCF2TypedValue(128, BCF2Type.INT16)); // first value in range forCombinations.add(new BCF2TypedValue(-100000, BCF2Type.INT32)); forCombinations.add(new BCF2TypedValue(100000, BCF2Type.INT32)); forCombinations.add(new BCF2TypedValue(0.0, BCF2Type.FLOAT)); @@ -135,7 +135,7 @@ public void before() { forCombinations.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ", BCF2Type.CHAR)); // missing values - for ( BCF2Type type : BCF2Type.values() ) { + for (final BCF2Type type : BCF2Type.values()) { forCombinations.add(new BCF2TypedValue(null, type)); } } @@ -146,16 +146,16 @@ public void before() { // // -------------------------------------------------------------------------------- - private class BCF2TypedValue { + private 
static class BCF2TypedValue { final BCF2Type type; final Object value; private BCF2TypedValue(final int value, final BCF2Type type) { - this(new Integer(value), type); + this(Integer.valueOf(value), type); } private BCF2TypedValue(final double value, final BCF2Type type) { - this(new Double(value), type); + this(Double.valueOf(value), type); } private BCF2TypedValue(final Object value, final BCF2Type type) { @@ -163,7 +163,9 @@ private BCF2TypedValue(final Object value, final BCF2Type type) { this.value = value; } - public boolean isMissing() { return value == null; } + public boolean isMissing() { + return value == null; + } @Override public String toString() { @@ -179,68 +181,56 @@ public String toString() { @DataProvider(name = "BCF2EncodingTestProviderBasicTypes") public Object[][] BCF2EncodingTestProviderBasicTypes() { - List tests = new ArrayList(); - for ( BCF2TypedValue tv : basicTypes ) - tests.add(new Object[]{Arrays.asList(tv)}); + final List tests = new ArrayList<>(); + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) + for (final BCF2TypedValue tv : basicTypes) + tests.add(new Object[]{Collections.singletonList(tv), version}); return tests.toArray(new Object[][]{}); } private interface EncodeMe { - public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException; + void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException; } @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2BasicTypesWithStaticCalls(final List toEncode) throws IOException { - testBCF2BasicTypesWithEncodeMe(toEncode, - new EncodeMe() { - @Override - public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException { - switch ( tv.type ) { - case INT8: - case INT16: - case INT32: - encoder.encodeTypedInt((Integer)tv.value, tv.type); - break; - case FLOAT: - encoder.encodeTypedFloat((Double)tv.value); - break; - case CHAR: - encoder.encodeTypedString((String)tv.value); - 
break; - } - } - }); - } - - @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2BasicTypesWithObjectType(final List toEncode) throws IOException { - testBCF2BasicTypesWithEncodeMe(toEncode, - new EncodeMe() { - @Override - public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException { - encoder.encodeTyped(tv.value, tv.type); - } - }); + public void testBCF2BasicTypesWithStaticCalls(final List toEncode, final BCFVersion version) throws IOException { + testBCF2BasicTypesWithEncodeMe( + toEncode, + (encoder, tv) -> { + switch (tv.type) { + case INT8: + case INT16: + case INT32: + encoder.encodeTypedInt((Integer) tv.value, tv.type); + break; + case FLOAT: + encoder.encodeTypedFloat((Double) tv.value); + break; + case CHAR: + encoder.encodeTypedString((String) tv.value); + break; + } + }, + version + ); } @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2BasicTypesWithObjectNoType(final List toEncode) throws IOException { - testBCF2BasicTypesWithEncodeMe(toEncode, - new EncodeMe() { - @Override - public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException { - encoder.encode(tv.value); - } - }); + public void testBCF2BasicTypesWithObjectType(final List toEncode, final BCFVersion version) throws IOException { + testBCF2BasicTypesWithEncodeMe( + toEncode, + (encoder, tv) -> encoder.encodeTyped(tv.value, tv.type), + version + ); } - public void testBCF2BasicTypesWithEncodeMe(final List toEncode, final EncodeMe func) throws IOException { - for ( final BCF2TypedValue tv : toEncode ) { - BCF2Encoder encoder = new BCF2Encoder(); + public void testBCF2BasicTypesWithEncodeMe(final List toEncode, final EncodeMe func, final BCFVersion version) throws IOException { + for (final BCF2TypedValue tv : toEncode) { + final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); func.encode(encoder, tv); - BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); + 
final BCF2Decoder decoder = BCF2Decoder.getDecoder(version, encoder.getRecordBytes()); final Object decoded = decoder.decodeTypedValue(); Assert.assertNotNull(decoded); @@ -250,20 +240,20 @@ public void testBCF2BasicTypesWithEncodeMe(final List toEncode, } @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2EncodingVectors(final List toEncode) throws IOException { - for ( final BCF2TypedValue tv : toEncode ) { - for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) { - BCF2Encoder encoder = new BCF2Encoder(); - List expected = Collections.nCopies(length, tv.value); + public void testBCF2EncodingVectors(final List toEncode, final BCFVersion version) throws IOException { + for (final BCF2TypedValue tv : toEncode) { + for (final int length : Arrays.asList(2, 5, 10, 15, 20, 25)) { + final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); + final List expected = Collections.nCopies(length, tv.value); encoder.encodeTyped(expected, tv.type); - BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); + final BCF2Decoder decoder = BCF2Decoder.getDecoder(version, encoder.getRecordBytes()); final Object decoded = decoder.decodeTypedValue(); Assert.assertTrue(decoded instanceof List); - final List decodedList = (List)decoded; + final List decodedList = (List) decoded; Assert.assertEquals(decodedList.size(), expected.size()); - for ( Object decodedValue : decodedList ) + for (final Object decodedValue : decodedList) myAssertEquals(tv, decodedValue); } } @@ -271,16 +261,17 @@ public void testBCF2EncodingVectors(final List toEncode) throws @DataProvider(name = "BCF2EncodingTestProviderSingletons") public Object[][] BCF2EncodingTestProviderSingletons() { - List tests = new ArrayList(); - for ( BCF2TypedValue tv : primitives ) - tests.add(new Object[]{Arrays.asList(tv)}); + final List tests = new ArrayList<>(); + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) + for (final BCF2TypedValue tv : primitives) + 
tests.add(new Object[]{Collections.singletonList(tv), version}); return tests.toArray(new Object[][]{}); } @Test(dataProvider = "BCF2EncodingTestProviderSingletons") - public void testBCF2EncodingSingletons(final List toEncode) throws IOException { - final byte[] record = encodeRecord(toEncode); - decodeRecord(toEncode, record); + public void testBCF2EncodingSingletons(final List toEncode, final BCFVersion version) throws IOException { + final byte[] record = encodeRecord(toEncode, version); + decodeRecord(toEncode, record, version); } // ----------------------------------------------------------------- @@ -291,29 +282,30 @@ public void testBCF2EncodingSingletons(final List toEncode) thro @DataProvider(name = "BCF2EncodingTestProviderSequences") public Object[][] BCF2EncodingTestProviderSequences() { - List tests = new ArrayList(); - for ( BCF2TypedValue tv1 : forCombinations ) - for ( BCF2TypedValue tv2 : forCombinations ) - for ( BCF2TypedValue tv3 : forCombinations ) - tests.add(new Object[]{Arrays.asList(tv1, tv2, tv3)}); + final List tests = new ArrayList<>(); + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) + for (final BCF2TypedValue tv1 : forCombinations) + for (final BCF2TypedValue tv2 : forCombinations) + for (final BCF2TypedValue tv3 : forCombinations) + tests.add(new Object[]{Arrays.asList(tv1, tv2, tv3), version}); return tests.toArray(new Object[][]{}); } @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2EncodingVectorsWithMissing(final List toEncode) throws IOException { - for ( final BCF2TypedValue tv : toEncode ) { - if ( tv.type != BCF2Type.CHAR ) { - for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) { + public void testBCF2EncodingVectorsWithMissing(final List toEncode, final BCFVersion version) throws IOException { + for (final BCF2TypedValue tv : toEncode) { + if (tv.type != BCF2Type.CHAR) { + for (final int length : Arrays.asList(2, 5, 10, 15, 20, 25)) { final byte td = 
BCF2Utils.encodeTypeDescriptor(1, tv.type); - final BCF2Encoder encoder = new BCF2Encoder(); - for ( int i = 0; i < length; i++ ) { + final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); + for (int i = 0; i < length; i++) { encoder.encodeRawValue(i % 2 == 0 ? null : tv.value, tv.type); } - final BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); + final BCF2Decoder decoder = BCF2Decoder.getDecoder(version, encoder.getRecordBytes()); - for ( int i = 0; i < length; i++ ) { + for (int i = 0; i < length; i++) { final Object decoded = decoder.decodeTypedValue(td); myAssertEquals(i % 2 == 0 ? new BCF2TypedValue(null, tv.type) : tv, decoded); } @@ -323,9 +315,9 @@ public void testBCF2EncodingVectorsWithMissing(final List toEnco } @Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingSingletons") - public void testBCF2EncodingTestProviderSequences(final List toEncode) throws IOException { - final byte[] record = encodeRecord(toEncode); - decodeRecord(toEncode, record); + public void testBCF2EncodingTestProviderSequences(final List toEncode, final BCFVersion version) throws IOException { + final byte[] record = encodeRecord(toEncode, version); + decodeRecord(toEncode, record, version); } // ----------------------------------------------------------------- @@ -334,20 +326,58 @@ public void testBCF2EncodingTestProviderSequences(final List toE // // ----------------------------------------------------------------- + @DataProvider(name = "Strings") + public Object[][] stringsProvider() { + final List tests = new ArrayList<>(); + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) { + tests.add(new Object[]{"", version}); + tests.add(new Object[]{" ", version}); + tests.add(new Object[]{"s", version}); + tests.add(new Object[]{"sss", version}); + } + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "Strings") + public void testEncodingOfListOfString(final String s, final BCFVersion 
version) throws IOException { + final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); + encoder.encodeTypedString(s); + + final BCF2Decoder decoder = BCF2Decoder.getDecoder(version, encoder.getRecordBytes()); + final String decoded = decoder.decodeUnexplodedString(); + + Assert.assertEquals(s, decoded); + } + @DataProvider(name = "ListOfStrings") - public Object[][] listOfStringsProvider() { - List tests = new ArrayList(); - tests.add(new Object[]{Arrays.asList("s1", "s2"), ",s1,s2"}); - tests.add(new Object[]{Arrays.asList("s1", "s2", "s3"), ",s1,s2,s3"}); - tests.add(new Object[]{Arrays.asList("s1", "s2", "s3", "s4"), ",s1,s2,s3,s4"}); + public Object[][] listofStringsProvider() { + final List tests = new ArrayList<>(); + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) { + for (final int padding : Arrays.asList(0, 1, 5)) { + tests.add(new Object[]{Collections.emptyList(), padding, version}); + tests.add(new Object[]{Collections.singletonList("s"), padding, version}); + tests.add(new Object[]{Arrays.asList("s", ""), padding, version}); + tests.add(new Object[]{Arrays.asList("s", "ss", "sss"), padding, version}); + } + } return tests.toArray(new Object[][]{}); } @Test(dataProvider = "ListOfStrings") - public void testEncodingListOfString(List strings, String expected) throws IOException { - final String collapsed = BCF2Utils.collapseStringList(strings); - Assert.assertEquals(collapsed, expected); - Assert.assertEquals(BCF2Utils.explodeStringList(collapsed), strings); + public void testEncodingOfListOfString(final List strings, final int padding, final BCFVersion version) { + final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); + final byte[] bytes = encoder.compactStrings(strings); + final int paddedSize = bytes.length + padding; + encoder.encodeRawString(bytes, paddedSize); + + final BCF2Decoder decoder = BCF2Decoder.getDecoder(version, encoder.getRecordBytes()); + final List decodedStrings = decoder.decodeExplodedStrings(paddedSize, 
','); + + // Padding values not included + Assert.assertEquals(strings, decodedStrings); + + // The decoder should have drained all the remaining padding values from the stream + Assert.assertTrue(decoder.blockIsFullyDecoded()); } // ----------------------------------------------------------------- @@ -358,16 +388,16 @@ public void testEncodingListOfString(List strings, String expected) thro @DataProvider(name = "BestIntTypeTests") public Object[][] BestIntTypeTests() { - List tests = new ArrayList(); - tests.add(new Object[]{Arrays.asList(1), BCF2Type.INT8}); + final List tests = new ArrayList<>(); + tests.add(new Object[]{Collections.singletonList(1), BCF2Type.INT8}); tests.add(new Object[]{Arrays.asList(1, 10), BCF2Type.INT8}); tests.add(new Object[]{Arrays.asList(1, 10, 100), BCF2Type.INT8}); tests.add(new Object[]{Arrays.asList(1, -1), BCF2Type.INT8}); tests.add(new Object[]{Arrays.asList(1, 1000), BCF2Type.INT16}); tests.add(new Object[]{Arrays.asList(1, 1000, 10), BCF2Type.INT16}); tests.add(new Object[]{Arrays.asList(1, 1000, 100), BCF2Type.INT16}); - tests.add(new Object[]{Arrays.asList(1000), BCF2Type.INT16}); - tests.add(new Object[]{Arrays.asList(100000), BCF2Type.INT32}); + tests.add(new Object[]{Collections.singletonList(1000), BCF2Type.INT16}); + tests.add(new Object[]{Collections.singletonList(100000), BCF2Type.INT32}); tests.add(new Object[]{Arrays.asList(100000, 10), BCF2Type.INT32}); tests.add(new Object[]{Arrays.asList(100000, 100), BCF2Type.INT32}); tests.add(new Object[]{Arrays.asList(100000, 1, -10), BCF2Type.INT32}); @@ -376,22 +406,21 @@ public Object[][] BestIntTypeTests() { } @Test(dataProvider = "BestIntTypeTests") - public void determineBestEncoding(final List ints, final BCF2Type expectedType) throws IOException { + public void determineBestEncoding(final List ints, final BCF2Type expectedType) { Assert.assertEquals(BCF2Utils.determineIntegerType(ints), expectedType); 
Assert.assertEquals(BCF2Utils.determineIntegerType(toPrimitive(ints.toArray(new Integer[0]))), expectedType); } - private static int[] toPrimitive ( final Integer[] array ) { - if ( array == null ) { + private static int[] toPrimitive(final Integer[] array) { + if (array == null) { return null; - } - else if ( array.length == 0 ) { + } else if (array.length == 0) { return new int[0]; } final int[] result = new int[array.length]; for (int i = 0; i < array.length; i++) { - result[i] = array[i].intValue(); + result[i] = array[i]; } return result; } @@ -403,20 +432,20 @@ else if ( array.length == 0 ) { // ----------------------------------------------------------------- @Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingTestProviderSequences") - public void testReadAndSkipWithMultipleBlocks(final List block) throws IOException { - testReadAndSkipWithMultipleBlocks(block, forCombinations); - testReadAndSkipWithMultipleBlocks(forCombinations, block); + public void testReadAndSkipWithMultipleBlocks(final List block, final BCFVersion version) throws IOException { + testReadAndSkipWithMultipleBlocks(block, forCombinations, version); + testReadAndSkipWithMultipleBlocks(forCombinations, block, version); } - public void testReadAndSkipWithMultipleBlocks(final List block1, final List block2) throws IOException { - final byte[] record1 = encodeRecord(block1); - final byte[] record2 = encodeRecord(block2); + public void testReadAndSkipWithMultipleBlocks(final List block1, final List block2, final BCFVersion version) throws IOException { + final byte[] record1 = encodeRecord(block1, version); + final byte[] record2 = encodeRecord(block2, version); // each record is individually good - decodeRecord(block1, record1); - decodeRecord(block2, record2); + decodeRecord(block1, record1, version); + decodeRecord(block2, record2, version); - BCF2Decoder decoder = new BCF2Decoder(); + final BCF2Decoder decoder = BCF2Decoder.getDecoder(version); // 
test setting decoder.setRecordBytes(record1); @@ -426,7 +455,7 @@ public void testReadAndSkipWithMultipleBlocks(final List block1, // test combining the streams final byte[] combined = combineRecords(record1, record2); - final List combinedObjects = new ArrayList(block1); + final List combinedObjects = new ArrayList<>(block1); combinedObjects.addAll(block2); // the combined bytes is the same as the combined objects @@ -447,70 +476,60 @@ public void testReadAndSkipWithMultipleBlocks(final List block1, // // Test encoding / decoding arrays of ints // - // This checks that we can encode and decode correctly with the - // low-level decodeIntArray function arrays of values. This - // has to be pretty comprehensive as decodeIntArray is a highly optimized + // This checks that we can correctly encode and decode int[] with + // the low-level decodeIntArray function. This has to be + // pretty comprehensive as decodeIntArray is a highly optimized // piece of code with lots of edge cases. The values we are encoding // don't really matter -- just that the values come back as expected. // + // decodeIntArray is only meant to decode arrays that are guaranteed + // to not have internal missing values, but may be missing (or EOV) + // padded, so we are interested in whether the decoder correctly + // truncates padded arrays while draining the stream. 
// ----------------------------------------------------------------- - @DataProvider(name = "IntArrays") - public Object[][] makeIntArrays() { - List tests = new ArrayList(); + @DataProvider(name = "BCF2_2IntArrays") + public Object[][] IntArrays() { + final List tests = new ArrayList<>(); + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) { + for (final int nValues : Arrays.asList(0, 1, 2, 5, 10, 100)) { + for (final int nPad : Arrays.asList(0, 1, 2, 5, 10, 100)) { + final int nElements = nValues + nPad; - for ( int nValues : Arrays.asList(0, 1, 2, 5, 10, 100) ) { - for ( int nPad : Arrays.asList(0, 1, 2, 5, 10, 100) ) { - int nElements = nValues + nPad; + final int[] vs = new int[nValues]; - List values = new ArrayList(nElements); + // add nValues from 0 to nValues - 1 + for (int i = 0; i < nValues; i++) + vs[i] = i; - // add nValues from 0 to nValues - 1 - for ( int i = 0; i < nValues; i++ ) - values.add(i); - - // add nPad nulls - for ( int i = 0; i < nPad; i++ ) - values.add(null); - - tests.add(new Object[]{values}); + tests.add(new Object[]{vs, nElements, version}); + } } } return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "IntArrays") - public void testIntArrays(final List ints) throws IOException { - final BCF2Encoder encoder = new BCF2Encoder(); - encoder.encodeTyped(ints, BCF2Type.INT16); + @Test(dataProvider = "BCF2_2IntArrays") + public void testBCF2_2IntArrays(final int[] ints, final int paddedSize, final BCFVersion version) throws IOException { + final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); + encoder.encodeTypedVecInt(ints, paddedSize); - final BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); - - final byte typeDescriptor = decoder.readTypeDescriptor(); + final BCF2Decoder decoder = BCF2Decoder.getDecoder(version, encoder.getRecordBytes()); // read the int[] with the low-level version + final byte typeDescriptor = decoder.readTypeDescriptor(); final int size = 
decoder.decodeNumberOfElements(typeDescriptor); final int[] decoded = decoder.decodeIntArray(typeDescriptor, size); - if ( isMissing(ints) ) { - // we expect that the result is null in this case - Assert.assertNull(decoded, "Encoded all missing values -- expected null"); + if (ints.length == 0) { + Assert.assertNull(decoded); } else { - // we expect at least some values to come back - Assert.assertTrue(decoded.length > 0, "Must have at least 1 element for non-null encoded data"); - - // check corresponding values - for ( int i = 0; i < ints.size(); i++ ) { - final Integer expected = ints.get(i); - - if ( expected == null ) { - Assert.assertTrue(decoded.length <= i, "we expect decoded to be truncated for missing values"); - } else { - Assert.assertTrue(decoded.length > i, "we expected at least " + i + " values in decoded array"); - Assert.assertEquals(decoded[i], (int)expected); - } - } + // Padding values not included + Assert.assertEquals(ints.length, decoded.length); + + // The decoder should have drained all the remaining padding values from the stream + Assert.assertTrue(decoder.blockIsFullyDecoded()); } } @@ -520,24 +539,17 @@ public void testIntArrays(final List ints) throws IOException { // // ----------------------------------------------------------------- - private final byte[] combineRecords(final byte[] record1, final byte[] record2) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); + private byte[] combineRecords(final byte[] record1, final byte[] record2) throws IOException { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); baos.write(record1); baos.write(record2); return baos.toByteArray(); } - private final byte[] encodeRecord(final List toEncode) throws IOException { - BCF2Encoder encoder = new BCF2Encoder(); - - for ( final BCF2TypedValue tv : toEncode ) { - if ( tv.isMissing() ) - encoder.encodeTypedMissing(tv.type); - else { - final BCF2Type encodedType = encoder.encode(tv.value); - if ( tv.type != 
null ) // only if we have an expectation - Assert.assertEquals(encodedType, tv.type); - } + private byte[] encodeRecord(final List toEncode, final BCFVersion version) throws IOException { + final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); + for (final BCF2TypedValue tv : toEncode) { + encoder.encodeTyped(tv.value, tv.type); } // check output @@ -547,12 +559,12 @@ private final byte[] encodeRecord(final List toEncode) throws IO return record; } - private final void decodeRecord(final List toEncode, final byte[] record) throws IOException { - decodeRecord(toEncode, new BCF2Decoder(record)); + private void decodeRecord(final List toEncode, final byte[] record, final BCFVersion version) throws IOException { + decodeRecord(toEncode, BCF2Decoder.getDecoder(version, record)); } - private final void decodeRecord(final List toEncode, final BCF2Decoder decoder) throws IOException { - for ( final BCF2TypedValue tv : toEncode ) { + private void decodeRecord(final List toEncode, final BCF2Decoder decoder) throws IOException { + for (final BCF2TypedValue tv : toEncode) { Assert.assertFalse(decoder.blockIsFullyDecoded()); final Object decoded = decoder.decodeTypedValue(); @@ -562,25 +574,17 @@ private final void decodeRecord(final List toEncode, final BCF2D Assert.assertTrue(decoder.blockIsFullyDecoded()); } - private final void myAssertEquals(final BCF2TypedValue tv, final Object decoded) { - if ( tv.value == null ) { // special needs for instanceof double - Assert.assertEquals(decoded, tv.value); - } else if ( tv.type == BCF2Type.FLOAT ) { // need tolerance for floats, and they aren't null + private void myAssertEquals(final BCF2TypedValue tv, final Object decoded) { + if (tv.value == null) { // special needs for instanceof double + Assert.assertNull(decoded); + } else if (tv.type == BCF2Type.FLOAT) { // need tolerance for floats, and they aren't null Assert.assertTrue(decoded instanceof Double); - final double valueFloat = (Double)tv.value; - final double 
decodedFloat = (Double)decoded; + final double valueFloat = (Double) tv.value; + final double decodedFloat = (Double) decoded; VariantBaseTest.assertEqualsDoubleSmart(decodedFloat, valueFloat, FLOAT_TOLERANCE); } else Assert.assertEquals(decoded, tv.value); } - - private final boolean isMissing(final List values) { - if ( values != null ) - for ( Integer value : values ) - if ( value != null ) - return false; - return true; - } } \ No newline at end of file diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java new file mode 100644 index 0000000000..afb198286e --- /dev/null +++ b/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java @@ -0,0 +1,464 @@ +package htsjdk.variant.bcf2.BCF2FieldWriter; + +import htsjdk.variant.VariantBaseTest; +import htsjdk.variant.bcf2.BCF2Type; +import htsjdk.variant.bcf2.BCF2Utils; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.bcf2.BCF2Encoder; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +public class BCF2FieldEncoderTest extends VariantBaseTest { + + private static final BCF2Encoder.BCF2_2Encoder ENCODER = new BCF2Encoder.BCF2_2Encoder(); + private static final 
BCF2FieldEncoder.AtomicIntFieldEncoder ATOMIC_INT = new BCF2FieldEncoder.AtomicIntFieldEncoder(ENCODER); + private static final BCF2FieldEncoder.AtomicFloatFieldEncoder ATOMIC_FLOAT = new BCF2FieldEncoder.AtomicFloatFieldEncoder(ENCODER); + private static final BCF2FieldEncoder.CharFieldEncoder CHAR = new BCF2FieldEncoder.CharFieldEncoder(ENCODER); + private static final BCF2FieldEncoder.StringFieldEncoder STRING = new BCF2FieldEncoder.StringFieldEncoder(ENCODER); + private static final BCF2FieldEncoder.VecIntFieldEncoder VEC_INT = new BCF2FieldEncoder.VecIntFieldEncoder(ENCODER); + private static final BCF2FieldEncoder.VecFloatFieldEncoder VEC_FLOAT = new BCF2FieldEncoder.VecFloatFieldEncoder(ENCODER); + + + @DataProvider(name = "fieldEncoderCases") + public static Object[][] fieldEncoderCases() { + final List cases = new ArrayList<>(); + + // Integer encoding + { + for (final BCF2Type intType : BCF2Utils.INTEGER_TYPES_BY_SIZE) { + final int byteWidth = intType.getSizeInBytes(); + final List intsToEncode = Arrays.asList(1, -1, null, 1 << (byteWidth * 8 - 2)); + final ByteBuffer bytes = ByteBuffer.allocate(intsToEncode.size() * byteWidth); + for (final Object o : intsToEncode) { + final int i = o == null ? intType.getMissingBytes() : (Integer) o; + for (int shift = 0; shift < byteWidth; shift++) { + bytes.put((byte) (i >> (shift * 8))); + } + } + cases.add(new Object[]{ + ATOMIC_INT, + intsToEncode, + bytes.array(), + }); + } + } + + // Float encoding + { + final int byteWidth = BCF2Type.FLOAT.getSizeInBytes(); + final List floatsToEncode = Arrays.asList(1.0, -1.0, null, Double.NaN, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY); + final ByteBuffer bytes = ByteBuffer.allocate(floatsToEncode.size() * byteWidth); + for (final Object o : floatsToEncode) { + final int i = o == null ? 
BCF2Type.FLOAT.getMissingBytes() : Float.floatToRawIntBits((float) (double) (Double) o); + for (int shift = 0; shift < byteWidth; shift++) { + bytes.put((byte) (i >> (shift * 8))); + } + } + cases.add(new Object[]{ + ATOMIC_FLOAT, + floatsToEncode, + bytes.array(), + }); + } + + // Char encoding + { + final List stringsToEncode = Arrays.asList("str", null, "\0a\0"); + final int maxByteWidth = stringsToEncode + .stream() + .mapToInt(o -> o == null ? 0 : ((String) o).getBytes(StandardCharsets.UTF_8).length) + .max().getAsInt(); + final ByteBuffer bytes = ByteBuffer.allocate(stringsToEncode.size() * maxByteWidth); + for (final Object o : stringsToEncode) { + final byte[] b = o == null ? new byte[0] : ((String) o).getBytes(StandardCharsets.UTF_8); + bytes.put(b); + for (int i = maxByteWidth - b.length; i > 0; i--) bytes.put((byte) 0); + } + cases.add(new Object[]{ + CHAR, + stringsToEncode, + bytes.array(), + }); + } + + // String encoding + { + final List stringsToEncode = Arrays.asList("st", null, Arrays.asList("a", "b"), new String[]{"a", "b"}); + final byte[] bytes = new byte[]{ + 's', 't', '\0', // padding + '\0', '\0', '\0', // null values should be encoded as all NULL bytes + 'a', ',', 'b', // lists of strings joined with , + 'a', ',', 'b', // arrays of strings joined with , + }; + cases.add(new Object[]{ + STRING, + stringsToEncode, + bytes, + }); + } + + // Vector of integers encoding + { + for (final BCF2Type intType : BCF2Utils.INTEGER_TYPES_BY_SIZE) { + final int byteWidth = intType.getSizeInBytes(); + final List vecsToEncode = Arrays.asList( + Arrays.asList(null, 1), // Internal null should be missing bytes, not EOV + new int[]{1}, // Short vector should be EOV padded + null, // Entirely missing vector should be all EOV + 1 << (byteWidth * 8 - 2) // Atomic value should be treated as vector of size 1 + ); + final int nValues = 2; + final ByteBuffer bytes = ByteBuffer.allocate(nValues * vecsToEncode.size() * byteWidth); + final int[] ints = new int[]{ + 
intType.getMissingBytes(), 1, + 1, intType.getEOVBytes(), + intType.getEOVBytes(), intType.getEOVBytes(), + 1 << (byteWidth * 8 - 2), intType.getEOVBytes(), + }; + for (final int i : ints) { + for (int shift = 0; shift < byteWidth; shift++) { + bytes.put((byte) (i >> (shift * 8))); + } + } + cases.add(new Object[]{ + VEC_INT, + vecsToEncode, + bytes.array(), + }); + } + } + + // Vector of floats encoding + { + final int byteWidth = BCF2Type.FLOAT.getSizeInBytes(); + final List vecsToEncode = Arrays.asList( + Arrays.asList(null, 1.0), // Internal null should be missing bytes, not EOV + new double[]{1.0}, // Short vector should be EOV padded + null, // Entirely missing vector should be all EOV + Double.NaN // Atomic value should be treated as vector of size 1 + ); + final int nValues = 2; + final ByteBuffer bytes = ByteBuffer.allocate(nValues * vecsToEncode.size() * byteWidth); + final int[] ints = new int[]{ + BCF2Type.FLOAT.getMissingBytes(), Float.floatToRawIntBits(1.0f), + Float.floatToRawIntBits(1.0f), BCF2Type.FLOAT.getEOVBytes(), + BCF2Type.FLOAT.getEOVBytes(), BCF2Type.FLOAT.getEOVBytes(), + Float.floatToRawIntBits((float) Double.NaN), BCF2Type.FLOAT.getEOVBytes(), + }; + for (final int i : ints) { + for (int shift = 0; shift < byteWidth; shift++) { + bytes.put((byte) (i >> (shift * 8))); + } + } + cases.add(new Object[]{ + VEC_FLOAT, + vecsToEncode, + bytes.array(), + }); + } + + return cases.toArray(new Object[0][]); + } + + @Test(dataProvider = "fieldEncoderCases") + public static void testFieldEncoders( + final BCF2FieldEncoder encoder, + final List objects, + final byte[] expectedBytes + ) throws IOException { + for (final Object o : objects) { + encoder.load(o); + } + encoder.encode(); + Assert.assertEquals(expectedBytes, ENCODER.getRecordBytes()); + } + + + @DataProvider(name = "siteWriterCases") + public static Object[][] siteWriterCases() { + final List cases = new ArrayList<>(); + + // Generic encoder + { + final VCFInfoHeaderLine info = new 
VCFInfoHeaderLine("genericKey", 2, VCFHeaderLineType.Integer, "test"); + final BCF2FieldWriter.SiteAttributeWriter writer = new BCF2FieldWriter.SiteAttributeWriter(info, 1, ENCODER); + final VariantContext vc1 = new VariantContextBuilder() + .attribute("genericKey", 1) + .chr("dummy") + .alleles("A") + .make(); + final byte[] bytes1 = new byte[]{ + 0x21, // 2 8-bit ints + 1, (byte) BCF2Type.INT8.getEOVBytes() // Field writer should pad out array to 2 elements to match header count + }; + cases.add(new Object[]{ + writer, vc1, bytes1, + }); + + final VariantContext vc2 = new VariantContextBuilder() + .chr("dummy") + .alleles("A") + .make(); + final byte[] bytes2 = new byte[]{ + 0x01, // Field writer should directly write typed missing, ignoring header count + }; + cases.add(new Object[]{ + writer, vc2, bytes2, + }); + } + + // Flag writer + { + final VCFInfoHeaderLine info = new VCFInfoHeaderLine("genericKey", 0, VCFHeaderLineType.Flag, "test"); + final BCF2FieldWriter.SiteFlagWriter writer = new BCF2FieldWriter.SiteFlagWriter(info, 1, ENCODER); + final VariantContext vc = new VariantContextBuilder() + .attribute("genericKey", true) + .chr("dummy") + .alleles("A") + .make(); + final byte[] bytes = new byte[]{ + 0x00, // MISSING type just used as a filler value + }; + cases.add(new Object[]{ + writer, vc, bytes, + }); + } + return cases.toArray(new Object[0][]); + } + + @Test(dataProvider = "siteWriterCases") + public void testSiteWriters( + final BCF2FieldWriter.SiteWriter writer, + final VariantContext vc, + final byte[] expectedBytes + ) throws IOException { + // Skip starting so we don't get key in output + writer.encode(vc); + Assert.assertEquals(expectedBytes, ENCODER.getRecordBytes()); + } + + + @DataProvider(name = "genotypeWriterCases") + public static Object[][] genotypeWriterCases() { + final List cases = new ArrayList<>(); + + // Generic encoder + { + final VCFFormatHeaderLine info = new VCFFormatHeaderLine("genericKey", 2, VCFHeaderLineType.Integer, 
"test"); + final BCF2FieldWriter.GenotypeAttributeWriter writer = new BCF2FieldWriter.GenotypeAttributeWriter(info, 1, ENCODER); + final VariantContext vc = new VariantContextBuilder() + .attribute("genericKey", 1) + .chr("dummy") + .genotypes(new GenotypeBuilder() + .name("sample") + .attribute("genericKey", 1) + .make() + ) + .alleles("A") + .make(); + final byte[] bytes = new byte[]{ + 0x21, // 2 8-bit ints + 1, (byte) BCF2Type.INT8.getEOVBytes() // Field writer should pad out array to 2 elements to match header count + }; + cases.add(new Object[]{ + writer, vc, Collections.singletonList("sample"), bytes, + }); + } + + // FT encoder + { + final VCFFormatHeaderLine info = new VCFFormatHeaderLine("FT", 1, VCFHeaderLineType.String, "test"); + final BCF2FieldWriter writer = BCF2FieldWriter.createGenotypeWriter(info, 1, ENCODER); + final VariantContext vc = new VariantContextBuilder() + .chr("dummy") + .genotypes( + new GenotypeBuilder() + .name("hasFilter") + .filter("f") + .make(), + new GenotypeBuilder() + .name("noFilter") + .unfiltered() // should be encoded as PASS + .make() + ) + .alleles("A") + .make(); + final byte[] bytes = new byte[]{ + 0x47, // Strings of length 4 + 'f', 0, 0, 0, + 'P', 'A', 'S', 'S', + }; + cases.add(new Object[]{ + writer, vc, Arrays.asList("hasFilter", "noFilter"), bytes, + }); + } + + // GT encoder + { + final VCFFormatHeaderLine format = new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "test"); + final Allele ref = Allele.REF_A; + final Allele alt = Allele.ALT_T; + + final BCF2FieldWriter writer = BCF2FieldWriter.createGenotypeWriter(format, 1, ENCODER); + { + final VariantContext vc = new VariantContextBuilder() + .chr("dummy") + .alleles(Arrays.asList(ref, alt)) + .genotypes( + new GenotypeBuilder() + .name("refAlt") + .alleles(Arrays.asList(ref, alt)) + .make(), + new GenotypeBuilder() + .name("refAltPhased") + .alleles(Arrays.asList(ref, alt)) + .phased(true) + .make(), + new GenotypeBuilder() + .name("missingMissing") 
+ .alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)) + .make(), + new GenotypeBuilder() + .name("haploid") + .alleles(Collections.singletonList(ref)) + .make() + ) + .make(); + final byte[] bytes = new byte[]{ + 0x21, // 2 8-bit ints + 0x02, 0x04, + 0x02, 0x05, + 0x00, 0x00, + 0x02, (byte) 0x81, + }; + cases.add(new Object[]{ + writer, vc, + vc.getGenotypes().stream().map(Genotype::getSampleName).collect(Collectors.toList()), + bytes, + }); + } + + // TODO revisit this test once the correct behavior is determined + // Test encoding for a VC entirely missing genotype data + { + final VariantContext vcMissingGenotypes = new VariantContextBuilder() + .chr("dummy") + .alleles(Arrays.asList(ref, alt)) + .genotypes( + new GenotypeBuilder() + .name("refAlt") + .alleles(Arrays.asList(ref, alt)) + .make() + ) + .make(); + final byte[] bytes = new byte[]{ + 0x21, // 2 8-bit ints + (byte) BCF2Type.INT8.getMissingBytes(), (byte) BCF2Type.INT8.getMissingBytes(), + }; + cases.add(new Object[]{ + writer, vcMissingGenotypes, + Collections.singletonList("sampleNameNotPresentInGenotype"), + bytes, + }); + } + } + + // Inline integer encoder + { + final VCFFormatHeaderLine format = new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "test"); + final BCF2FieldWriter writer = BCF2FieldWriter.createGenotypeWriter(format, 1, ENCODER); + final VariantContext vc = new VariantContextBuilder() + .chr("dummy") + .genotypes( + new GenotypeBuilder() + .name("small") + .DP(2) + .make(), + new GenotypeBuilder() + .name("big") + .DP(256) + .make() + ) + .alleles("A") + .make(); + + final byte[] bytes = new byte[]{ + 0x12, // 1 16-bit int + 0x02, 0x00, + (byte) 256, 256 >> 8, + }; + + cases.add(new Object[]{ + writer, vc, + vc.getGenotypes().stream().map(Genotype::getSampleName).collect(Collectors.toList()), + bytes, + }); + } + + // Inline vector of integer encoder + { + final VCFFormatHeaderLine format = new VCFFormatHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, 
"test"); + final Allele ref = Allele.REF_A; + final Allele alt = Allele.ALT_T; + final BCF2FieldWriter writer = BCF2FieldWriter.createGenotypeWriter(format, 1, ENCODER); + final VariantContext vc = new VariantContextBuilder() + .chr("dummy") + .alleles(Arrays.asList(ref, alt)) + .genotypes( + new GenotypeBuilder() + .name("small") + .alleles(Arrays.asList(ref, alt)) + .PL(new int[]{1, 2}) + .make(), + new GenotypeBuilder() + .name("big") + .alleles(Arrays.asList(ref, alt)) + .PL(new int[]{256}) // should pad out + .make() + ) + .make(); + + final byte[] bytes = new byte[]{ + 0x32, // 3 16-bit ints + 0x01, 0x00, 0x02, 0x00, (byte) BCF2Type.INT16.getEOVBytes(), (byte) (BCF2Type.INT16.getEOVBytes() >> 8), + (byte) 256, 256 >> 8, (byte) BCF2Type.INT16.getEOVBytes(), (byte) (BCF2Type.INT16.getEOVBytes() >> 8), (byte) BCF2Type.INT16.getEOVBytes(), (byte) (BCF2Type.INT16.getEOVBytes() >> 8) + }; + + cases.add(new Object[]{ + writer, vc, + vc.getGenotypes().stream().map(Genotype::getSampleName).collect(Collectors.toList()), + bytes, + }); + } + return cases.toArray(new Object[0][]); + } + + @Test(dataProvider = "genotypeWriterCases") + public void testGenotypeWriters( + final BCF2FieldWriter.GenotypeWriter writer, + final VariantContext vc, + final List sampleNames, + final byte[] expectedBytes + ) throws IOException { + writer.encode(vc, sampleNames); + Assert.assertEquals(expectedBytes, ENCODER.getRecordBytes()); + } +} diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java index e18c0d9309..5f658bd69b 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java @@ -50,47 +50,6 @@ * Tests for BCF2Utils */ public final class BCF2UtilsUnitTest extends VariantBaseTest { - @DataProvider(name = "CollapseExpandTest") - public Object[][] makeCollapseExpandTest() { - List tests = new ArrayList(); - tests.add(new 
Object[]{Arrays.asList("A"), "A", false}); - tests.add(new Object[]{Arrays.asList("A", "B"), ",A,B", true}); - tests.add(new Object[]{Arrays.asList("AB"), "AB", false}); - tests.add(new Object[]{Arrays.asList("AB", "C"), ",AB,C", true}); - tests.add(new Object[]{Arrays.asList(), "", false}); - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "CollapseExpandTest") - public void testCollapseExpandTest(final List in, final String expectedCollapsed, final boolean isCollapsed) { - final String actualCollapsed = BCF2Utils.collapseStringList(in); - Assert.assertEquals(actualCollapsed, expectedCollapsed); - Assert.assertEquals(BCF2Utils.isCollapsedString(actualCollapsed), isCollapsed); - if ( isCollapsed ) - Assert.assertEquals(BCF2Utils.explodeStringList(actualCollapsed), in); - } - - @Test - public void testCreateDictionary() { - final List inputLines = new ArrayList(); - int counter = 0; - inputLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); - inputLines.add(new VCFFilterHeaderLine("l" + counter++)); - inputLines.add(new VCFFilterHeaderLine("l" + counter++)); - inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "l" + counter++), counter)); - inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "l" + counter++), counter)); - inputLines.add(new VCFInfoHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFInfoHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFHeaderLine("x", "misc")); - inputLines.add(new VCFHeaderLine("y", "misc")); - inputLines.add(new VCFFilterHeaderLine("aFilter", "misc")); - inputLines.add(new VCFFormatHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFFormatHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, 
VCFHeaderLineType.Integer, "x")); - final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet<>(inputLines)); - final ArrayList dict = BCF2Utils.makeDictionary(inputHeader); - final int dict_size = dict.size(); - Assert.assertEquals(8,dict_size); - } /** * Wrapper class for HeaderOrderTestProvider test cases to prevent TestNG from calling toString() @@ -101,7 +60,7 @@ private static class HeaderOrderTestCase { public final VCFHeader testHeader; public final boolean expectedConsistent; - public HeaderOrderTestCase( final VCFHeader inputHeader, final VCFHeader testHeader, final boolean expectedConsistent ) { + public HeaderOrderTestCase(final VCFHeader inputHeader, final VCFHeader testHeader, final boolean expectedConsistent) { this.inputHeader = inputHeader; this.testHeader = testHeader; this.expectedConsistent = expectedConsistent; @@ -110,8 +69,8 @@ public HeaderOrderTestCase( final VCFHeader inputHeader, final VCFHeader testHea @DataProvider(name = "HeaderOrderTestProvider") public Object[][] makeHeaderOrderTestProvider() { - final List inputLines = new ArrayList(); - final List extraLines = new ArrayList(); + final List inputLines = new ArrayList<>(); + final List extraLines = new ArrayList<>(); int counter = 0; inputLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); @@ -125,7 +84,7 @@ public Object[][] makeHeaderOrderTestProvider() { inputLines.add(new VCFFormatHeaderLine("l" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); inputLines.add(new VCFFormatHeaderLine("l" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); final int inputLineCounter = counter; - final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet(inputLines)); + final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet<>(inputLines)); extraLines.add(new VCFFilterHeaderLine("l" + counter++)); extraLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "l" + counter++), counter)); @@ -134,19 +93,20 
@@ public Object[][] makeHeaderOrderTestProvider() { extraLines.add(new VCFHeaderLine("x", "misc")); extraLines.add(new VCFHeaderLine("y", "misc")); - List tests = new ArrayList(); - for ( final int extrasToTake : Arrays.asList(0, 1, 2, 3) ) { + final List tests = new ArrayList<>(); + for (final int extrasToTake : Arrays.asList(0, 1, 2, 3)) { final List empty = Collections.emptyList(); final List> permutations = extrasToTake == 0 - ? Collections.singletonList(empty) - : GeneralUtils.makePermutations(extraLines, extrasToTake, false); - for ( final List permutation : permutations ) { - for ( int i = -1; i < inputLines.size(); i++ ) { - final List allLines = new ArrayList(inputLines); - if ( i >= 0 && !VCFHeaderVersion.isFormatString(allLines.get(i).getKey()) ) + ? Collections.singletonList(empty) + : GeneralUtils.makePermutations(extraLines, extrasToTake, false); + for (final List permutation : permutations) { + for (int i = -1; i < inputLines.size(); i++) { + final List allLines = new ArrayList<>(inputLines); + if (i >= 0) allLines.remove(i); allLines.addAll(permutation); - final VCFHeader testHeader = new VCFHeader(new LinkedHashSet(allLines)); + allLines.add(new VCFHeaderLine(VCFHeader.DEFAULT_VCF_VERSION.getFormatString(), VCFHeader.DEFAULT_VCF_VERSION.getVersionString())); + final VCFHeader testHeader = new VCFHeader(new LinkedHashSet<>(allLines)); final boolean expectedConsistent = expectedConsistent(testHeader, inputLineCounter); tests.add(new Object[]{new HeaderOrderTestCase(inputHeader, testHeader, expectedConsistent)}); } @@ -155,18 +115,18 @@ public Object[][] makeHeaderOrderTestProvider() { // sample name tests final List> sampleNameTests = Arrays.asList( - new ArrayList(), - Arrays.asList("A"), - Arrays.asList("A", "B"), - Arrays.asList("A", "B", "C")); - for ( final List inSamples : sampleNameTests ) { - for ( final List testSamples : sampleNameTests ) { + new ArrayList<>(), + Collections.singletonList("A"), + Arrays.asList("A", "B"), + 
Arrays.asList("A", "B", "C")); + for (final List inSamples : sampleNameTests) { + for (final List testSamples : sampleNameTests) { final VCFHeader inputHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), inSamples); final List> permutations = testSamples.isEmpty() - ? Collections.singletonList(testSamples) - : GeneralUtils.makePermutations(testSamples, testSamples.size(), false); - for ( final List testSamplesPermutation : permutations ) { + ? Collections.singletonList(testSamples) + : GeneralUtils.makePermutations(testSamples, testSamples.size(), false); + for (final List testSamplesPermutation : permutations) { final VCFHeader testHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), testSamplesPermutation); final boolean expectedConsistent = testSamples.equals(inSamples); tests.add(new Object[]{new HeaderOrderTestCase(inputHeaderWithSamples, testHeaderWithSamples, expectedConsistent)}); @@ -187,8 +147,8 @@ private static boolean expectedConsistent(final VCFHeader combinationHeader, fin } // as long as the start contains all of the ids up to minCounterForInputLines in order - for ( int i = 0; i < minCounterForInputLines; i++ ) - if ( i >= ids.size() || ids.get(i) != i ) + for (int i = 0; i < minCounterForInputLines; i++) + if (i >= ids.size() || ids.get(i) != i) return false; return true; @@ -199,32 +159,8 @@ private static boolean expectedConsistent(final VCFHeader combinationHeader, fin // even when the header file is slightly different // @Test(dataProvider = "HeaderOrderTestProvider") - public void testHeaderOrder( final HeaderOrderTestCase testCase ) { + public void testHeaderOrder(final HeaderOrderTestCase testCase) { final boolean actualOrderConsistency = BCF2Utils.headerLinesAreOrderedConsistently(testCase.testHeader, testCase.inputHeader); Assert.assertEquals(actualOrderConsistency, testCase.expectedConsistent); } - - - private void assertListsAreEquivalent(final List a, final List b) { - 
Assert.assertEquals(a.size(), b.size()); - for (int i=0; i tests = new ArrayList(); - tests.add(new Object[]{Object.class, null, Collections.emptyList()}); - tests.add(new Object[]{Integer.class, 1, Arrays.asList(1)}); - tests.add(new Object[]{Integer.class, new int[]{1, 2, 3}, Arrays.asList(1, 2, 3)}); - tests.add(new Object[]{String.class, Arrays.asList("X", "Y"), Arrays.asList("X", "Y")}); - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "toListTestProvider") - public void testToList(final Class cls, final Object input, final List expectedOutput) { - assertListsAreEquivalent(BCF2Utils.toList(cls, input), expectedOutput); - } - - } diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java index 17e2ae3257..0bc493bf15 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java @@ -1,34 +1,36 @@ /* -* Copyright (c) 2017 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ + * Copyright (c) 2017 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ package htsjdk.variant.bcf2; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.TestUtil; +import htsjdk.samtools.util.Tuple; import htsjdk.tribble.Tribble; import htsjdk.tribble.readers.PositionalBufferedStream; +import htsjdk.utils.BCFToolsTestUtils; import htsjdk.variant.VariantBaseTest; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; @@ -37,30 +39,40 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.VariantContextTestProvider; -import htsjdk.variant.variantcontext.writer.*; -import htsjdk.variant.vcf.*; +import htsjdk.variant.variantcontext.writer.Options; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; +import htsjdk.variant.vcf.VCFFileReader; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; import java.io.FileInputStream; import java.io.IOException; +import java.nio.file.Path; import java.util.ArrayList; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; +import java.util.zip.GZIPInputStream; /** * @author amila - *

    - * Class BCF2WriterUnitTest - *

    - * This class tests out the ability of the BCF writer to correctly write BCF files + *

    + * Class BCF2WriterUnitTest + *

    + * This class tests out the ability of the BCF writer to correctly write BCF files */ public class BCF2WriterUnitTest extends VariantBaseTest { @@ -72,7 +84,7 @@ public class BCF2WriterUnitTest extends VariantBaseTest { * @return a fake VCF header */ private static VCFHeader createFakeHeader() { - final SAMSequenceDictionary sequenceDict = createArtificialSequenceDictionary(); + final SAMSequenceDictionary sequenceDict = VariantBaseTest.createArtificialSequenceDictionary(); final Set metaData = new HashSet<>(); final Set additionalColumns = new HashSet<>(); metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); @@ -80,10 +92,10 @@ private static VCFHeader createFakeHeader() { additionalColumns.add("extra1"); additionalColumns.add("extra2"); final VCFHeader header = new VCFHeader(metaData, additionalColumns); - header.addMetaDataLine(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.String, "x")); + header.addMetaDataLine(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "x")); header.addMetaDataLine(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "x")); header.addMetaDataLine(new VCFFormatHeaderLine("BB", 1, VCFHeaderLineType.String, "x")); - header.addMetaDataLine(new VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.String, "x")); + header.addMetaDataLine(new VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "x")); header.setSequenceDictionary(sequenceDict); return header; } @@ -102,25 +114,23 @@ private void createTemporaryDirectory() { public void testWriteAndReadBCF() throws IOException { final File bcfOutputFile = File.createTempFile("testWriteAndReadVCF.", ".bcf", tempDir); bcfOutputFile.deleteOnExit(); - final VCFHeader header = createFakeHeader(); + final VCFHeader header = BCF2WriterUnitTest.createFakeHeader(); try (final VariantContextWriter writer = new VariantContextWriterBuilder() - .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) - 
.unsetOption(Options.INDEX_ON_THE_FLY) - .build()) { + .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build() + ) { writer.writeHeader(header); writer.add(createVC(header)); writer.add(createVC(header)); } - VariantContextTestProvider.VariantContextContainer container = VariantContextTestProvider - .readAllVCs(bcfOutputFile, new BCF2Codec()); + final VariantContextTestProvider.VariantContextContainer container = VariantContextTestProvider + .readAllVCs(bcfOutputFile, new BCF2Codec()); int counter = 0; - final Iterator it = container.getVCs().iterator(); - while (it.hasNext()) { - it.next(); + for (final VariantContext ignored : container.getVCs()) { counter++; } Assert.assertEquals(counter, 2); - } @@ -132,21 +142,20 @@ public void testWriteAndReadBCFWithIndex() throws IOException { final File bcfOutputFile = File.createTempFile("testWriteAndReadVCF.", ".bcf", tempDir); bcfOutputFile.deleteOnExit(); Tribble.indexFile(bcfOutputFile).deleteOnExit(); - final VCFHeader header = createFakeHeader(); + final VCFHeader header = BCF2WriterUnitTest.createFakeHeader(); try (final VariantContextWriter writer = new VariantContextWriterBuilder() - .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) - .setOptions(EnumSet.of(Options.INDEX_ON_THE_FLY)) - .build()) { + .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) + .setOptions(EnumSet.of(Options.INDEX_ON_THE_FLY)) + .build() + ) { writer.writeHeader(header); writer.add(createVC(header)); writer.add(createVC(header)); } - VariantContextTestProvider.VariantContextContainer container = VariantContextTestProvider - .readAllVCs(bcfOutputFile, new BCF2Codec()); + final VariantContextTestProvider.VariantContextContainer container = VariantContextTestProvider + .readAllVCs(bcfOutputFile, new BCF2Codec()); int counter = 0; - final Iterator it = container.getVCs().iterator(); - while 
(it.hasNext()) { - it.next(); + for (final VariantContext ignored : container.getVCs()) { counter++; } Assert.assertEquals(counter, 2); @@ -162,41 +171,43 @@ public void testWriteAndReadBCFHeaderless() throws IOException { final File bcfOutputHeaderlessFile = File.createTempFile("testWriteAndReadBCFHeaderless.", ".bcf", tempDir); bcfOutputHeaderlessFile.deleteOnExit(); - final VCFHeader header = createFakeHeader(); + final VCFHeader header = BCF2WriterUnitTest.createFakeHeader(); // we write two files, bcfOutputFile with the header, and bcfOutputHeaderlessFile with just the body try (final VariantContextWriter fakeBCFFileWriter = new VariantContextWriterBuilder() - .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) - .unsetOption(Options.INDEX_ON_THE_FLY) - .build()) { + .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build() + ) { fakeBCFFileWriter.writeHeader(header); // writes header } try (final VariantContextWriter fakeBCFBodyFileWriter = new VariantContextWriterBuilder() - .setOutputFile(bcfOutputHeaderlessFile).setReferenceDictionary(header.getSequenceDictionary()) - .unsetOption(Options.INDEX_ON_THE_FLY) - .build()) { + .setOutputFile(bcfOutputHeaderlessFile).setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build() + ) { fakeBCFBodyFileWriter.setHeader(header); // does not write header fakeBCFBodyFileWriter.add(createVC(header)); fakeBCFBodyFileWriter.add(createVC(header)); } - VariantContextTestProvider.VariantContextContainer container; - - try (final PositionalBufferedStream headerPbs = new PositionalBufferedStream(new FileInputStream(bcfOutputFile)); - final PositionalBufferedStream bodyPbs = new PositionalBufferedStream(new FileInputStream(bcfOutputHeaderlessFile))) { + try (final PositionalBufferedStream headerPbs = + new PositionalBufferedStream(new GZIPInputStream(new 
FileInputStream(bcfOutputFile))); + final PositionalBufferedStream bodyPbs = + new PositionalBufferedStream(new GZIPInputStream(new FileInputStream(bcfOutputHeaderlessFile))) + ) { - BCF2Codec codec = new BCF2Codec(); + final BCF2Codec codec = new BCF2Codec(); codec.readHeader(headerPbs); // we use the header information read from identical file with header+body to read just the body of second file int counter = 0; while (!bodyPbs.isDone()) { - VariantContext vc = codec.decode(bodyPbs); + codec.decode(bodyPbs); counter++; } Assert.assertEquals(counter, 2); } - } /** @@ -208,42 +219,45 @@ public void testReadAndWritePhasedBCF() throws IOException { final File bcfOutputFile = File.createTempFile("testWriteAndReadBCFHeaderless.", ".bcf", tempDir); bcfOutputFile.deleteOnExit(); - try ( VCFFileReader vcfFile = new VCFFileReader(vcfInputFile); - - VariantContextWriter bcfWriter = new VariantContextWriterBuilder().setOutputFile(bcfOutputFile).setReferenceDictionary(vcfFile.getFileHeader().getSequenceDictionary()).build(); - - ) { - bcfWriter.writeHeader(vcfFile.getFileHeader()); - - for (VariantContext vc : vcfFile.iterator().toList()) { - Assert.assertEquals(vc.getGenotypes().stream().filter(Genotype::isPhased).count(), 2); - bcfWriter.add(vc); + try (final VCFFileReader vcfFile = new VCFFileReader(vcfInputFile)) { + try (final VariantContextWriter bcfWriter = new VariantContextWriterBuilder() + .setOutputFile(bcfOutputFile) + .setReferenceDictionary(vcfFile.getFileHeader().getSequenceDictionary()) + .build() + ) { + bcfWriter.writeHeader(vcfFile.getFileHeader()); + for (final VariantContext vc : vcfFile.iterator().toList()) { + Assert.assertEquals(vc.getGenotypes().stream().filter(Genotype::isPhased).count(), 2); + bcfWriter.add(vc); + } } - bcfWriter.close(); // Reading the VCF and writing it to a BCF final File vcfOutputFile = File.createTempFile("testWriteAndReadBCFHeaderless.", ".vcf", tempDir); vcfOutputFile.deleteOnExit(); - try (final PositionalBufferedStream 
headerPbs = new PositionalBufferedStream(new FileInputStream(bcfOutputFile)); - VariantContextWriter vcfWriter = new VariantContextWriterBuilder().setOutputFile(vcfOutputFile).setReferenceDictionary(vcfFile.getFileHeader().getSequenceDictionary()).build(); - ) { + try (final PositionalBufferedStream headerPbs = + new PositionalBufferedStream(new GZIPInputStream(new FileInputStream(bcfOutputFile))); + final VariantContextWriter vcfWriter = new VariantContextWriterBuilder() + .setOutputFile(vcfOutputFile) + .setReferenceDictionary(vcfFile.getFileHeader().getSequenceDictionary()) + .build() + ) { vcfWriter.writeHeader(vcfFile.getFileHeader()); - BCF2Codec codec = new BCF2Codec(); + final BCF2Codec codec = new BCF2Codec(); codec.readHeader(headerPbs); // we use the header information read from identical file with header+body to read just the body of second file while (!headerPbs.isDone()) { - VariantContext vc = codec.decode(headerPbs); + final VariantContext vc = codec.decode(headerPbs); Assert.assertEquals(vc.getGenotypes().stream().filter(Genotype::isPhased).count(), 2); vcfWriter.add(vc); } - vcfWriter.close(); } - try (VCFFileReader vcfOutput = new VCFFileReader(vcfInputFile);) { - for (VariantContext vc : vcfOutput.iterator().toList()) { + try (final VCFFileReader vcfOutput = new VCFFileReader(vcfInputFile)) { + for (final VariantContext vc : vcfOutput.iterator().toList()) { Assert.assertEquals(vc.getGenotypes().stream().filter(Genotype::isPhased).count(), 2); } } @@ -255,12 +269,13 @@ public void testWriteHeaderTwice() throws IOException { final File bcfOutputFile = File.createTempFile("testWriteAndReadVCF.", ".bcf", tempDir); bcfOutputFile.deleteOnExit(); - final VCFHeader header = createFakeHeader(); + final VCFHeader header = BCF2WriterUnitTest.createFakeHeader(); // prevent writing header twice try (final VariantContextWriter writer = new VariantContextWriterBuilder() - .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) - 
.unsetOption(Options.INDEX_ON_THE_FLY) - .build()) { + .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build() + ) { writer.writeHeader(header); writer.writeHeader(header); } @@ -271,12 +286,13 @@ public void testChangeHeaderAfterWritingHeader() throws IOException { final File bcfOutputFile = File.createTempFile("testWriteAndReadVCF.", ".bcf", tempDir); bcfOutputFile.deleteOnExit(); - final VCFHeader header = createFakeHeader(); + final VCFHeader header = BCF2WriterUnitTest.createFakeHeader(); // prevent changing header if it's already written try (final VariantContextWriter writer = new VariantContextWriterBuilder() - .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) - .unsetOption(Options.INDEX_ON_THE_FLY) - .build()) { + .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build() + ) { writer.writeHeader(header); writer.setHeader(header); } @@ -287,12 +303,13 @@ public void testChangeHeaderAfterWritingBody() throws IOException { final File bcfOutputFile = File.createTempFile("testWriteAndReadVCF.", ".bcf", tempDir); bcfOutputFile.deleteOnExit(); - final VCFHeader header = createFakeHeader(); + final VCFHeader header = BCF2WriterUnitTest.createFakeHeader(); // prevent changing header if part of body is already written try (final VariantContextWriter writer = new VariantContextWriterBuilder() - .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) - .unsetOption(Options.INDEX_ON_THE_FLY) - .build()) { + .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build() + ) { writer.setHeader(header); writer.add(createVC(header)); writer.setHeader(header); @@ -305,7 +322,7 @@ public void testChangeHeaderAfterWritingBody() throws IOException { * @param header the VCF header * 
@return a VCFRecord */ - private VariantContext createVC(final VCFHeader header) { + private static VariantContext createVC(final VCFHeader header) { final List alleles = new ArrayList<>(); final Map attributes = new HashMap<>(); final GenotypesContext genotypes = GenotypesContext.create(header.getGenotypeSamples().size()); @@ -315,14 +332,104 @@ private VariantContext createVC(final VCFHeader header) { attributes.put("DP", "50"); for (final String name : header.getGenotypeSamples()) { - final Genotype gt = new GenotypeBuilder(name, alleles.subList(1, 2)).GQ(0).attribute("BB", "1").phased(true) - .make(); + final Genotype gt = new GenotypeBuilder(name, alleles.subList(1, 2)) + .GQ(0).attribute("BB", "1") + .phased(true) + .make(); genotypes.add(gt); } return new VariantContextBuilder("RANDOM", "1", 1, 1, alleles) - .genotypes(genotypes).attributes(attributes).make(); + .genotypes(genotypes).attributes(attributes).make(); + } + + @DataProvider + public Object[][] bcftoolsRoundTripProvider() { + return new Object[][]{ + {"phased.vcf"}, + {"test1.vcf"}, + {"NA12891.vcf"}, + {"NA12891.fp.vcf"}, + {"dbsnp_135.b37.1000.vcf"}, + {"structuralvariants.vcf"}, + // TODO the test testBCFToolsReadsHtsjdkOutput fails for the following two files + // due to what appears to be a bug in bcftools' VCF output where missing FORMAT + // values are sometimes encoded as an empty string and not '.' 
+ // This seems to have something to do with the affected keys being in trailing + // position in the original VCF (trailing missing values can be dropped), and + // htsjdk reordering FORMAT keys by sorting them alphabetically +// {"ex2.vcf"}, +// {"test.vcf.bgz"}, + }; } + @Test(dataProvider = "bcftoolsRoundTripProvider") + public void testBCFToolsReadsHtsjdkOutput(final String testFile) throws IOException { + // Take an input VCF and read it into memory as our expected output + // Take the same VCF and write it out as a BCF using htsjdk's BCF2Writer, use bcftools to convert from + // BCF back to VCF, and read the converted VCF into memory again as our actual output + final Path path = new File(VariantBaseTest.variantTestDataRoot + testFile).toPath(); + final Tuple> expectedVCF = readEntireVCFIntoMemory(path); + final VCFHeader header = expectedVCF.a; + final List expectedVariantContexts = expectedVCF.b; + + final File bcfOutputFile = File.createTempFile("testBCFToolsRoundTrip" + testFile, ".bcf", tempDir); + bcfOutputFile.deleteOnExit(); + + try (final VariantContextWriter writer = new VariantContextWriterBuilder() + .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build() + ) { + writer.writeHeader(header); + for (final VariantContext vc : expectedVariantContexts) { + writer.add(vc); + } + } + + final Path converted = BCFToolsTestUtils.BCFToVCF(bcfOutputFile, "").toPath(); + final Tuple> actualVCF = readEntireVCFIntoMemory(converted); + final List actualVariantContexts = actualVCF.b; + + // Don't compare the headers, since they might contain extraneous lines, and the BCF codec isn't responsible + // for headers + Assert.assertEquals(expectedVariantContexts.size(), actualVCF.b.size()); + final int length = expectedVariantContexts.size(); + for (int i = 0; i < length; i++) { + // Fully decode both variant contexts so that we're comparing actual objects and not their string + // 
representations, which can be different without affecting semantics, e.g. number of digits in a double + VariantBaseTest.assertVariantContextsAreEqual( + actualVariantContexts.get(i).fullyDecode(header, false), + expectedVariantContexts.get(i).fullyDecode(header, false) + ); + } + } + @Test(dataProvider = "bcftoolsRoundTripProvider") + public void testHtsjdkReadsBCFToolsOutput(final String testFile) { + // Take an input VCF and read it into memory as our expected output + // Take the same VCF and convert it to BCF using bcftools, then read the BCF into memory again as our actual output + final Path path = new File(VariantBaseTest.variantTestDataRoot + testFile).toPath(); + final Tuple> expectedVCF = readEntireVCFIntoMemory(path); + final VCFHeader header = expectedVCF.a; + final List expectedVariantContexts = expectedVCF.b; + + final File converted = BCFToolsTestUtils.VCFtoBCF(path.toFile(), ""); + final VCFFileReader reader = new VCFFileReader(converted, false); + + final List actualVariantContexts = reader.iterator().stream().collect(Collectors.toList()); + + // Don't compare the headers, since they might contain extraneous lines, and the BCF codec isn't responsible + // for headers + Assert.assertEquals(expectedVariantContexts.size(), actualVariantContexts.size()); + final int length = expectedVariantContexts.size(); + for (int i = 0; i < length; i++) { + // Fully decode both variant contexts so that we're comparing actual objects and not their string + // representations, which can be different without affecting semantics, e.g. 
number of digits in a double + VariantBaseTest.assertVariantContextsAreEqual( + actualVariantContexts.get(i).fullyDecode(header, false), + expectedVariantContexts.get(i).fullyDecode(header, false) + ); + } + } } diff --git a/src/test/java/htsjdk/variant/bcf2/BCFCodecTest.java b/src/test/java/htsjdk/variant/bcf2/BCFCodecTest.java index 39fce34b18..5c0af6c761 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCFCodecTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCFCodecTest.java @@ -13,24 +13,13 @@ import java.io.IOException; public class BCFCodecTest extends VariantBaseTest { - final String TEST_DATA_DIR = "src/test/resources/htsjdk/variant/"; - - // should reject bcf v2.2 on read, see issue https://github.com/samtools/htsjdk/issues/1323 - @Test(expectedExceptions = TribbleException.class) - private void testRejectBCFVersion22() throws IOException { - BCF2Codec bcfCodec = new BCF2Codec(); - try (final FileInputStream fis = new FileInputStream(new File(TEST_DATA_DIR, "BCFVersion22Uncompressed.bcf")); - final PositionalBufferedStream pbs = new PositionalBufferedStream(fis)) { - bcfCodec.readHeader(pbs); - } - } + private static final String TEST_DATA_DIR = "src/test/resources/htsjdk/variant/"; @Test - private void testBCFCustomVersionCompatibility() throws IOException { + public void testBCFCustomVersionCompatibility() throws IOException { final BCF2Codec bcfCodec = new BCF2Codec() { @Override protected void validateVersionCompatibility(final BCFVersion supportedVersion, final BCFVersion actualVersion) { - return; } }; diff --git a/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java b/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java index e04910eb0e..34adb5a4b0 100644 --- a/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java +++ b/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java @@ -27,11 +27,9 @@ import htsjdk.HtsjdkTest; import htsjdk.tribble.FeatureCodec; -import 
htsjdk.tribble.FeatureCodecHeader; import htsjdk.tribble.Tribble; import htsjdk.tribble.readers.LineIterator; import htsjdk.tribble.readers.LineIteratorImpl; -import htsjdk.tribble.readers.PositionalBufferedStream; import htsjdk.tribble.readers.SynchronousLineReader; import htsjdk.variant.VariantBaseTest; import htsjdk.variant.bcf2.BCF2Codec; @@ -48,7 +46,8 @@ import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; - +import htsjdk.variant.vcf.VCFIterator; +import htsjdk.variant.vcf.VCFIteratorBuilder; import org.testng.Assert; import java.io.BufferedInputStream; @@ -234,6 +233,7 @@ private static void createSyntheticHeader() { addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer); addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String); addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); + // TODO changed count type from UNBOUNDED to 1 to match VCF 4.3 spec, but might conflict with existing htsjdk code addHeaderLine(metaData, "FT", 1, VCFHeaderLineType.String); // prep the header @@ -674,6 +674,7 @@ public static void testReaderWriterWithMissingGenotypes(final VariantContextIOTe assertEquals(g, expected.getGenotype(g.getSampleName())); } else { // missing + // TODO this may not be correct Assert.assertTrue(g.isNoCall()); } } @@ -755,29 +756,9 @@ public void remove() { } } public static VariantContextContainer readAllVCs(final File input, final BCF2Codec codec) throws IOException { - PositionalBufferedStream headerPbs = new PositionalBufferedStream(new FileInputStream(input)); - FeatureCodecHeader header = codec.readHeader(headerPbs); - headerPbs.close(); - - final PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(input)); - pbs.skip(header.getHeaderEnd()); - - final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); - return new 
VariantContextTestProvider.VariantContextContainer(vcfHeader, new VariantContextTestProvider.VCIterable(codec, vcfHeader) { - @Override - public boolean hasNext() { - try { - return !pbs.isDone(); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - @Override - public Object nextSource() { - return pbs; - } - }); + final VCFIterator iterator = new VCFIteratorBuilder().open(input); + final VCFHeader vcfHeader = iterator.getHeader(); + return new VariantContextTestProvider.VariantContextContainer(vcfHeader, () -> iterator); } public static VariantContextContainer readAllVCs(final File input, final VCFCodec codec) throws FileNotFoundException { @@ -868,7 +849,7 @@ public static void assertEquals(final Genotype actual, final Genotype expected) // inline attributes Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp"); - Assert.assertTrue(Arrays.equals(actual.getAD(), expected.getAD())); + Assert.assertEquals(actual.getAD(), expected.getAD(), "Genotype ad"); Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq"); Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL"); Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD"); diff --git a/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java b/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java index ceac4f95a8..4931fd8b09 100644 --- a/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java @@ -29,12 +29,9 @@ import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.samtools.util.FileExtensions; import htsjdk.samtools.util.TestUtil; -import htsjdk.tribble.AbstractFeatureReader; -import htsjdk.tribble.FeatureReader; import htsjdk.tribble.Tribble; import htsjdk.tribble.readers.AsciiLineReader; import htsjdk.tribble.readers.AsciiLineReaderIterator; -import htsjdk.tribble.util.TabixUtils; import 
htsjdk.variant.VariantBaseTest; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; @@ -42,7 +39,19 @@ import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; -import htsjdk.variant.vcf.*; +import htsjdk.variant.vcf.VCFCodec; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFFileReader; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFHeaderVersion; +import htsjdk.variant.vcf.VCFStandardHeaderLines; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; import java.io.File; import java.io.FileInputStream; @@ -52,16 +61,10 @@ import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - /** * @author aaron *

    @@ -104,31 +107,23 @@ public void testBasicWriteAndRead(final String extension) throws IOException { writer.add(createVC(header)); writer.add(createVC(header)); writer.close(); - final VCFCodec codec = new VCFCodec(); - final FeatureReader reader = AbstractFeatureReader.getFeatureReader(fakeVCFFile.getAbsolutePath(), codec, false); - final VCFHeader headerFromFile = (VCFHeader)reader.getHeader(); + final VCFFileReader reader = new VCFFileReader(fakeVCFFile.toPath(), false, VCFVersionUpgradePolicy.DO_NOT_UPGRADE); + final VCFHeader headerFromFile = reader.getHeader(); int counter = 0; // validate what we're reading in validateHeader(headerFromFile, sequenceDict); - try { - final Iterator it = reader.iterator(); - while(it.hasNext()) { - it.next(); - counter++; - } - Assert.assertEquals(counter, 2); - } - catch (final IOException e ) { - throw new RuntimeException(e.getMessage()); + for (final VariantContext variantContext : reader) { + counter++; } + Assert.assertEquals(counter, 2); } /** test, using the writer and reader, that we can output and input a VCF body without problems */ - @Test(dataProvider = "vcfExtensionsDataProvider") + @Test(dataProvider = "vcfHeaderlessExtensionsDataProvider") public void testWriteAndReadVCFHeaderless(final String extension) throws IOException { final File fakeVCFFile = File.createTempFile("testWriteAndReadVCFHeaderless.", extension, tempDir); fakeVCFFile.deleteOnExit(); @@ -226,6 +221,12 @@ private static VCFHeader createFakeHeader(final Set metaData, fin final SAMSequenceDictionary sequenceDict) { metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString())); metaData.add(new VCFHeaderLine("two", "2")); + // Explicitly add GT, AD, and BB keys because the .bcf tests that use this fake header require that the header + // contain INFO/FORMAT lines for all the attributes written + metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY)); + 
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY)); + metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); + metaData.add(new VCFFormatHeaderLine("BB", 1, VCFHeaderLineType.Integer, "test key")); additionalColumns.add("extra1"); additionalColumns.add("extra2"); final VCFHeader ret = new VCFHeader(metaData, additionalColumns); @@ -326,14 +327,21 @@ public void TestWritingLargeVCF(final String extension) throws FileNotFoundExcep @DataProvider(name = "vcfExtensionsDataProvider") public Object[][]vcfExtensionsDataProvider() { return new Object[][] { - //TODO: fix this BCF problem! - // TODO: BCF doesn't work because header is not properly constructed. - // {".bcf"}, + {FileExtensions.BCF}, {FileExtensions.VCF}, {FileExtensions.COMPRESSED_VCF} }; } + // Testing writing headerless files does not make sense for .bcf because BCF's strong typing makes writing + // bodies without headers impossible, so we only test VCF and compressed VCF with headerless writing + @DataProvider(name = "vcfHeaderlessExtensionsDataProvider") + public Object[][]vcfHeaderlessExtensionsDataProvider() { + return new Object[][] { + {FileExtensions.VCF}, + {FileExtensions.COMPRESSED_VCF} + }; + } /** * A test to ensure that if we add a line to a VCFHeader it will persist through @@ -366,7 +374,7 @@ public void testModifyHeader() { * * A test to check that we can't write VCF with missing header. 
*/ - @Test(dataProvider = "vcfExtensionsDataProvider", expectedExceptions = IllegalStateException.class) + @Test(dataProvider = "vcfHeaderlessExtensionsDataProvider", expectedExceptions = IllegalStateException.class) public void testWriteWithEmptyHeader(final String extension) throws IOException { final File fakeVCFFile = File.createTempFile("testWriteAndReadVCFHeaderless.", extension, tempDir); metaData = new HashSet<>(); diff --git a/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java b/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java index 9709af8cc6..8bdc321b51 100644 --- a/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java +++ b/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java @@ -85,7 +85,7 @@ public Object[][] otherHeaderLines() { // technically, this is invalid due to the lack of an "ID" attribute, but it should still parse // into a VCFHeaderLine (just not a VCFSimpleHeaderLine) { "ID=", - new VCFHeaderLine("ID", "") }, + new VCFHeaderLine("ID", "") }, }; } diff --git a/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java index 2ba980fbb9..d0cc69d565 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java @@ -142,6 +142,49 @@ public void testRepairFlagTypeWithNegativeCount() { Assert.assertEquals(infoLine.getCount(), 0); } + @DataProvider(name = "validHeaderIDs") + public Object[][] validHeaderIDs() { + return new Object[][] { + // 1000 Genomes ID key requires special handling + {new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION)}, + // Test all characters allowed after first character + {new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION)}, + {new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION)}, + {new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION)}, + // ID can start with underscore _ + {new VCFInfoHeaderLine("", 
VCFHeader.DEFAULT_VCF_VERSION)}, + }; + } + + @Test(dataProvider = "validHeaderIDs") + public void testValidHeaderIDs(final VCFCompoundHeaderLine line) { + line.validateForVersion(VCFHeader.DEFAULT_VCF_VERSION); + } + + @DataProvider(name = "invalidHeaderIDs") + public Object[][] invalidHeaderIDs() { + return new Object[][] { + // 1000G key is only allowed for INFO lines, not FORMAT + {new VCFFormatHeaderLine("", VCFHeaderVersion.VCF4_2)}, + // Key with 1000G key as prefix should be rejected + {new VCFInfoHeaderLine("", VCFHeaderVersion.VCF4_2)}, + // Key cannot start with number + {new VCFInfoHeaderLine("", VCFHeaderVersion.VCF4_2)}, + // Key containing invalid character - + {new VCFInfoHeaderLine("", VCFHeaderVersion.VCF4_2)}, + }; + } + + @Test(dataProvider = "invalidHeaderIDs") + public void testPre43LenientHandling(final VCFCompoundHeaderLine line) { + line.validateForVersion(VCFHeaderVersion.VCF4_2); + } + + @Test(dataProvider = "invalidHeaderIDs", expectedExceptions = TribbleException.class) + public void testInvalidHeaderIDs(final VCFCompoundHeaderLine line) { + line.validateForVersion(VCFHeader.DEFAULT_VCF_VERSION); + } + @DataProvider (name = "equalsData") public Object[][] getEqualsData() { return new Object[][] { diff --git a/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java b/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java index 383d272a8d..b6835e1d25 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java @@ -75,7 +75,10 @@ Object[][] pathsData() { {TEST_DATA_DIR + "Vcf4.2WithSourceVersionInfoFields.vcf", null, false, true}, // // // should reject bcf v2.2 on read, see issue https://github.com/samtools/htsjdk/issues/1323 - {TEST_DATA_DIR + "BCFVersion22Uncompressed.bcf", null, false, false} + {TEST_DATA_DIR + "BCFVersion22Uncompressed.bcf", null, false, true}, + + // Test that gzipped BCFs can be read + {TEST_DATA_DIR + "bcfV22.bcf", null, false, true} }; } diff 
--git a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java index 94859c8717..0ddacd7ec7 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java @@ -103,8 +103,6 @@ private Object[][] getInvalidHeaderLines() { return new Object[][]{ // to parse, expected, recommended, error message {"", idDesc, none, "Unexpected tag or tag order for tag \"Description\""}, - {"", idDesc, none, "Unexpected tag or tag order for tag \"Desc\""}, - {"<>", idDesc, none, "Unexpected tag or tag order for tag \"\""}, {"", idDesc, sourceVersion, "Unexpected tag or tag order for tag \"Source\""}, diff --git a/src/test/java/htsjdk/variant/vcf/VCFIteratorTest.java b/src/test/java/htsjdk/variant/vcf/VCFIteratorTest.java index 02ccdb2a33..6664501df6 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFIteratorTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFIteratorTest.java @@ -50,21 +50,13 @@ public class VCFIteratorTest extends VariantBaseTest { @DataProvider(name = "VariantFiles") public Object[][] getVariantFiles() { - return new Object[][] { + return new Object[][] { new Object[] { "src/test/resources/htsjdk/tribble/tabix/testTabixIndex.vcf", 25 }, new Object[] { "src/test/resources/htsjdk/tribble/tabix/testTabixIndex.vcf.gz", 25 }, new Object[] { "src/test/resources/htsjdk/variant/serialization_test.bcf", 12 } }; } - @DataProvider(name = "VcfFiles") - public Object[][] getVcfFiles() { - return new Object[][] { - new Object[] { "src/test/resources/htsjdk/tribble/tabix/testTabixIndex.vcf", 25 }, - new Object[] { "src/test/resources/htsjdk/tribble/tabix/testTabixIndex.vcf.gz", 25 } - }; - } - private void assertExpectedNumberOfVariants(final VCFIterator r, final int expectVariants) { try { Assert.assertNotNull(r.getHeader()); @@ -91,32 +83,34 @@ public void testUsingFile(final String file, final int 
nVariants) throws IOExcep private void testUsingZippedInput(final String filepath, final int nVariants, final Function outputStreamProvider) throws IOException { File tmp = new File(filepath); + // TODO I don't understand what problem the comment below is referencing + // Does it mean the code paths for handling zipped/unzipped BCFs should be unified + // under VCFFileReader once VCFFileReader supports zipped BCF? + /* TODO fix this when VCFFileReader will support BCF see * https://github.com/samtools/htsjdk/pull/837#discussion_r139490218 * https://github.com/samtools/htsjdk/issues/946 */ - if( tmp.getName().endsWith(FileExtensions.VCF)) { + if(!tmp.getName().endsWith(FileExtensions.COMPRESSED_VCF)) { tmp = File.createTempFile("tmp",FileExtensions.COMPRESSED_VCF); tmp.deleteOnExit(); try( FileInputStream in = new FileInputStream(filepath); OutputStream out = outputStreamProvider.apply(tmp); ) { IOUtil.copyStream(in, out); out.flush(); - } catch(final IOException err) { - throw err; - } } + } try (final VCFIterator r = new VCFIteratorBuilder().open(tmp) ) { assertExpectedNumberOfVariants(r, nVariants); } } - @Test(dataProvider = "VcfFiles") + @Test(dataProvider = "VariantFiles") public void testUsingBGZippedInput(final String filepath, final int nVariants) throws IOException { testUsingZippedInput(filepath, nVariants, (F)-> new BlockCompressedOutputStream(F)); } - @Test(dataProvider = "VcfFiles") + @Test(dataProvider = "VariantFiles") public void testUsingGZippedInput(final String filepath, final int nVariants) throws IOException { testUsingZippedInput(filepath, nVariants, (F)-> { try { diff --git a/src/test/resources/htsjdk/variant/bcfV22.bcf b/src/test/resources/htsjdk/variant/bcfV22.bcf new file mode 100644 index 0000000000000000000000000000000000000000..8ded3b5103f67cfbbfbc8d852204e12b44e4658e GIT binary patch literal 613 zcmV-r0-F6FiwFb&00000{{{d;LjnLu0>zR+Z__Xo$Fpn#rQ)(v4}cV!1Bb0OZ6_g3 z1gT4x5~kd2P@o^Wikm41`Bz0?@B(cRS 
zjwrKNkTDe1j)U%7_sMhdG9)R-asINYRa?2*EUE2WZI#rMTy0Br8qPQ&37f1jy15dm zWhu=@>uAndf~R$H%_}c(9$7A78oBar*NF`-nRp4{0;5fqf*?Y0NXaz{IRi1|3&3ay zIAR=7#7IU%!~mf&m{I~GoHCAi#u1pKB$=W#LKNh=Kp^h7K7u@>$QS+0Jtk{NF-$p| zi1tpfL>gzF-)d!PNR}&+L3FMm>b2_wJNy5&t;1O(-c%g##c1%-8V<%*(6vVAzSZep zSg$WTeQUcFR`ALnR1Dkr_l4r@iSsp^dxl}=S`}7%)y|>uXove|Y@-Ek`ncDt_6`e% z-*txJ*3jAEeu#5W=Axeb%9hnVjV0Pz)^@o20@^Qgv!@!z2Rg@i>$uB3)Nyl_`?;^- zu*&_?ao1{dB&RfIp>0n4Kx3JxsXyqrrOJKQacPm0aqXOm%H1NGVn=fc~eX+oAO5YE4M0lN4H<^ zizQ0#u5Q0Je@pxgM#&zZWCj2LABzYC000000RIL6LPG)o8vp|U0000000000Jdz^x literal 0 HcmV?d00001 diff --git a/src/test/resources/htsjdk/variant/bcfV22.bcf.gz b/src/test/resources/htsjdk/variant/bcfV22.bcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..8ded3b5103f67cfbbfbc8d852204e12b44e4658e GIT binary patch literal 613 zcmV-r0-F6FiwFb&00000{{{d;LjnLu0>zR+Z__Xo$Fpn#rQ)(v4}cV!1Bb0OZ6_g3 z1gT4x5~kd2P@o^Wikm41`Bz0?@B(cRS zjwrKNkTDe1j)U%7_sMhdG9)R-asINYRa?2*EUE2WZI#rMTy0Br8qPQ&37f1jy15dm zWhu=@>uAndf~R$H%_}c(9$7A78oBar*NF`-nRp4{0;5fqf*?Y0NXaz{IRi1|3&3ay zIAR=7#7IU%!~mf&m{I~GoHCAi#u1pKB$=W#LKNh=Kp^h7K7u@>$QS+0Jtk{NF-$p| zi1tpfL>gzF-)d!PNR}&+L3FMm>b2_wJNy5&t;1O(-c%g##c1%-8V<%*(6vVAzSZep zSg$WTeQUcFR`ALnR1Dkr_l4r@iSsp^dxl}=S`}7%)y|>uXove|Y@-Ek`ncDt_6`e% z-*txJ*3jAEeu#5W=Axeb%9hnVjV0Pz)^@o20@^Qgv!@!z2Rg@i>$uB3)Nyl_`?;^- zu*&_?ao1{dB&RfIp>0n4Kx3JxsXyqrrOJKQacPm0aqXOm%H1NGVn=fc~eX+oAO5YE4M0lN4H<^ zizQ0#u5Q0Je@pxgM#&zZWCj2LABzYC000000RIL6LPG)o8vp|U0000000000Jdz^x literal 0 HcmV?d00001 diff --git a/src/test/resources/htsjdk/variant/structuralvariants.vcf b/src/test/resources/htsjdk/variant/structuralvariants.vcf index 5ffad2f94c..4de882ea49 100644 --- a/src/test/resources/htsjdk/variant/structuralvariants.vcf +++ b/src/test/resources/htsjdk/variant/structuralvariants.vcf @@ -7,7 +7,7 @@ ##INFO= ##INFO= ##INFO= -##INFO= +##INFO= ##ALT= ##ALT= ##ALT= diff --git a/src/test/resources/htsjdk/variant/test1.vcf b/src/test/resources/htsjdk/variant/test1.vcf index 39bed22e75..55566f3365 100644 --- 
a/src/test/resources/htsjdk/variant/test1.vcf +++ b/src/test/resources/htsjdk/variant/test1.vcf @@ -48,6 +48,6 @@ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 1 8216712 rs11121115 A G 1540.26 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=0.917;DB;DP=131;Dels=0.00;FS=11.67;HaplotypeScore=3.35;MLEAC=3;MLEAF=0.500;MQ=57.74;MQ0=1;MQRankSum=0.427;QD=11.76;ReadPosRankSum=-2.190e-01;SB=-9.390e+02;VQSLOD=5.53;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:23,28:51:99:681,0,668:127 0/1:16,18:34:99:338,0,244:127 0/1:24,22:46:99:560,0,323:127 1 17032814 rs2773183 T C 2828.26 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=-3.879e+00;DB;DP=322;Dels=0.00;FS=2.43;HaplotypeScore=15.45;MLEAC=3;MLEAF=0.500;MQ=56.86;MQ0=0;MQRankSum=2.92;QD=8.78;ReadPosRankSum=-1.245e+00;SB=-1.943e+03;VQSLOD=-1.421e+00;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:63,59:122:99:1434,0,1831:127 0/1:53,56:109:99:910,0,871:127 0/1:61,30:91:99:523,0,1257:127 -1 17032818 rs2773183 T C 2828.26 FILTER AC=3;AF=0.500;AN=6;BaseQRankSum=-3.879e+00;DB;DP=322;Dels=0.00;FS=2.43;HaplotypeScore=15.45;MLEAC=3;MLEAF=0.500;MQ=56.86;MQ0=0;MQRankSum=2.92;QD=8.78;ReadPosRankSum=-1.245e+00;SB=-1.943e+03;VQSLOD=-1.421e+00;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:63,59:122:99:1434,0,1831:127 0/1:53,56:109:99:910,0,871:127 0/1:61,30:91:99:523,0,1257:127 +1 17032818 rs2773183 T C 2828.26 LowQual AC=3;AF=0.500;AN=6;BaseQRankSum=-3.879e+00;DB;DP=322;Dels=0.00;FS=2.43;HaplotypeScore=15.45;MLEAC=3;MLEAF=0.500;MQ=56.86;MQ0=0;MQRankSum=2.92;QD=8.78;ReadPosRankSum=-1.245e+00;SB=-1.943e+03;VQSLOD=-1.421e+00;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:63,59:122:99:1434,0,1831:127 0/1:53,56:109:99:910,0,871:127 0/1:61,30:91:99:523,0,1257:127 2 1143476 rs4998209 C T 1483.26 PASS AC=2;AF=0.333;AN=6;BaseQRankSum=-4.814e+00;DB;DP=189;Dels=0.00;FS=5.61;HaplotypeScore=0.324;MLEAC=2;MLEAF=0.333;MQ=58.36;MQ0=0;MQRankSum=1.58;QD=12.06;ReadPosRankSum=0.326;SB=-9.320e+02;VQSLOD=6.81;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 
0|0:66,0:66:99:0,178,2264:127 0|1:33,38:71:99:844,0,1024:127 0|1:26,26:52:99:678,0,719:127 2 9240279 rs56249990 A G 3978.01 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=1.70;DB;DP=213;Dels=0.00;FS=7.83;HaplotypeScore=1.19;MLEAC=3;MLEAF=0.500;MQ=59.40;MQ0=0;MQRankSum=0.143;QD=27.25;ReadPosRankSum=-9.700e-02;SB=-1.991e+03;VQSLOD=9.14;culprit=FS GT:AD:DP:GQ:PL:TP 0|1:33,42:75:99:1400,0,1031:127 0|0:67,0:67:99:0,178,2277:127 1|1:0,71:71:99:2578,199,0:127 From 8c47db2b5873782ee53af08ec4298e8267d5fc27 Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Mon, 29 Nov 2021 16:40:30 -0500 Subject: [PATCH 08/22] Make scripts/install-bcftools.sh executable --- scripts/install-bcftools.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/install-bcftools.sh diff --git a/scripts/install-bcftools.sh b/scripts/install-bcftools.sh old mode 100644 new mode 100755 From eede351aca7e22d766f6ece473b72f3cc812dca6 Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Mon, 29 Nov 2021 16:44:40 -0500 Subject: [PATCH 09/22] Add installing bcftools step in github workflow --- .github/workflows/tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index cbc1966dd6..38ba6b23ab 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -36,6 +36,8 @@ jobs: run: ./gradlew compileJava - name: Install Samtools run: scripts/install-samtools.sh + - name: Install Bcftools + run: scripts/install-bcftools.sh - name: Start the htsget server run: scripts/htsget-scripts/start-htsget-test-server.sh - name: Run tests From ed64146fb4850abb4c274556d4c9eb7e6070a80d Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Tue, 30 Nov 2021 08:41:37 -0500 Subject: [PATCH 10/22] Update to bcftools 1.14, set env variable in github workflow --- .github/workflows/tests.yml | 1 + scripts/install-bcftools.sh | 6 +++--- src/test/java/htsjdk/utils/BCFToolsTestUtils.java | 2 +- 3 files changed, 5 insertions(+), 4 
deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 38ba6b23ab..74adfbcbf9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,6 +11,7 @@ jobs: test: env: HTSJDK_SAMTOOLS_BIN: /usr/bin/samtools + HTSJDK_BCFTOOLS_BIN: /usr/bin/bcftools runs-on: ubuntu-latest strategy: matrix: diff --git a/scripts/install-bcftools.sh b/scripts/install-bcftools.sh index fca5a62134..1694c85912 100755 --- a/scripts/install-bcftools.sh +++ b/scripts/install-bcftools.sh @@ -1,5 +1,5 @@ #!/bin/sh set -ex -wget https://github.com/samtools/bcftools/releases/download/1.13/bcftools-1.13.tar.bz2 -tar -xjvf bcftools-1.13.tar.bz2 -cd bcftools-1.13 && ./configure --prefix=/usr && make && sudo make install +wget https://github.com/samtools/bcftools/releases/download/1.14/bcftools-1.14.tar.bz2 +tar -xjvf bcftools-1.14.tar.bz2 +cd bcftools-1.14 && ./configure --prefix=/usr && make && sudo make install diff --git a/src/test/java/htsjdk/utils/BCFToolsTestUtils.java b/src/test/java/htsjdk/utils/BCFToolsTestUtils.java index 8193791e93..c6c4234f8e 100644 --- a/src/test/java/htsjdk/utils/BCFToolsTestUtils.java +++ b/src/test/java/htsjdk/utils/BCFToolsTestUtils.java @@ -12,7 +12,7 @@ public class BCFToolsTestUtils { private static final String BCFTOOLS_BINARY_ENV_VARIABLE = "HTSJDK_BCFTOOLS_BIN"; - public static final String expectedBCFtoolsVersion = "1.13"; + public static final String expectedBCFtoolsVersion = "1.14"; /** * @return true if bcftools is available, otherwise false From f6fcac6ab8b285cab43e0f9d3075eb968fc7272a Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Tue, 30 Nov 2021 14:21:48 -0500 Subject: [PATCH 11/22] Add tests for BCF2Dictionary, refactor BCF2WriterUnitTest --- .../htsjdk/variant/bcf2/BCF2Dictionary.java | 15 +- .../variant/bcf2/BCF2DictionaryTest.java | 164 ++++++++++++++---- .../variant/bcf2/BCF2WriterUnitTest.java | 34 ++-- src/test/resources/htsjdk/variant/test2.vcf | 2 +- 4 files changed, 165 insertions(+), 
50 deletions(-) diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java b/src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java index 7b30da8643..db2d342449 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java @@ -21,8 +21,7 @@ * Dictionary of strings or contigs for use with a BCF file. *

    * Provides an Integer -> String map interface, but determines during construction whether - * mapping can be stored as an array (if it can be stored as a dense array) or - * it must be stored using a map. + * the mapping can be stored as an array or it must be stored using a map. *

    * This class validates that IDX fields are used as required by the BCF 2.2 spec, namely * that either all lines of a given dictionary type (contig or FORMAT/INFO/FILTER) have @@ -126,11 +125,6 @@ private static BCF2Dictionary makeDictionary( for (final VCFSimpleHeaderLine line : headerLines) { final String id = line.getID(); final int IDX = Integer.parseUnsignedInt(line.getGenericFieldValue(BCF2Codec.IDXField)); - if (!seen.contains(id)) { - seen.add(id); - maxIDX = Math.max(maxIDX, IDX); - strings.put(IDX, line.getID()); - } // Have we seen this IDX before with a different string? if (strings.containsKey(IDX)) { @@ -142,6 +136,13 @@ private static BCF2Dictionary makeDictionary( )); } } + + if (!seen.contains(id)) { + seen.add(id); + maxIDX = Math.max(maxIDX, IDX); + strings.put(IDX, line.getID()); + } + } if (maxIDX == seen.size() - 1) { // By the pigeonhole principle, if we have N unique non-negative IDXs numbered starting from 0 diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java index 9d5b09a0ec..9fbf27842e 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java @@ -9,9 +9,7 @@ import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFHeaderVersion; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import htsjdk.variant.vcf.VCFSimpleHeaderLine; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -57,45 +55,149 @@ public void testCreateDictionary(final BCF2Dictionary dict) { Assert.assertEquals(8, dict_size); } - /* - @DataProvider(name = "inconsistentIDXProvider") - public Object[][] inconsistentIDXProvider() { - final List cases = new ArrayList<>(); - - // TODO can't create FILTER/FORMAT/INFO lines with arbitrary attributes - // should probably be 
addressed as part of refactoring, would be simpler and more consistent - for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) { - // String lines with inconsistent IDX - { - int counter = 0; - final List inputLines = new ArrayList<>(); - inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); - inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)).getGenericFieldValue()); - - new VCFSimpleHeaderLine() + @DataProvider(name = "invalidIDXProvider") + public Object[][] invalidIDXProvider() { + final List cases = new ArrayList<>(); + // String lines with inconsistent IDX + { + final LinkedHashSet lines = new LinkedHashSet<>(); + lines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + + final VCFHeader header = new VCFHeader(lines); + cases.add(new Object[]{header, BCFVersion.BCF2_2Version, true}); + } + { + final LinkedHashSet lines = new LinkedHashSet<>(); + lines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + + final VCFHeader header = new VCFHeader(lines); + cases.add(new Object[]{header, BCFVersion.BCF2_2Version, true}); + } + // Contig lines with inconsistent IDX + { + final LinkedHashSet lines = new LinkedHashSet<>(); + lines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + lines.add(new VCFContigHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION, + 3 + )); + lines.add(new VCFContigHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION, + 4 + )); + + final VCFHeader header = new VCFHeader(lines); + cases.add(new Object[]{header, BCFVersion.BCF2_2Version, false}); + } + { + final LinkedHashSet lines = new LinkedHashSet<>(); + 
lines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + lines.add(new VCFContigHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION, + 3 + )); + lines.add(new VCFContigHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION, + 4 + )); + + final VCFHeader header = new VCFHeader(lines); + cases.add(new Object[]{header, BCFVersion.BCF2_2Version, false}); + } - final VCFHeader header = new VCFHeader(new LinkedHashSet<>(inputLines)); - final BCF2Dictionary dict = BCF2Dictionary.makeBCF2StringDictionary(header, version); - cases.add(new Object[]{dict}); - } - - // Contig lines with inconsistent IDX - { - - } + // Headers with one IDX mapped to multiple strings/contigs + { + final LinkedHashSet lines = new LinkedHashSet<>(); + lines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + + final VCFHeader header = new VCFHeader(lines); + cases.add(new Object[]{header, BCFVersion.BCF2_2Version, true}); } return cases.toArray(new Object[0][]); } - @Test(expectedExceptions = {TribbleException.class}) - public void inconsistentIDX(final VCFHeader header, final BCFVersion version, final boolean string) { - if (string) { + @Test(dataProvider = "invalidIDXProvider", expectedExceptions = TribbleException.class) + public void invalidIDXUsage(final VCFHeader header, final BCFVersion version, final boolean isString) { + if (isString) { BCF2Dictionary.makeBCF2StringDictionary(header, version); } else { BCF2Dictionary.makeBCF2ContigDictionary(header, version); } } - */ + + @Test + public void testOutOfOrderAndMissingIDX() { + final LinkedHashSet lines = new LinkedHashSet<>(); + lines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + 
)); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + final VCFHeader header = new VCFHeader(lines); + + final BCF2Dictionary stringDict = BCF2Dictionary.makeBCF2StringDictionary(header, BCFVersion.BCF2_2Version); + Assert.assertEquals(stringDict.get(6), "FOO"); + Assert.assertEquals(stringDict.get(4), "BAR"); + Assert.assertEquals(stringDict.get(2), "BAZ"); + } + + @Test + public void testLinesWithDifferentKeySameIDShareIDX() { + final LinkedHashSet lines = new LinkedHashSet<>(); + lines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + lines.add(new VCFFormatHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + lines.add(new VCFFilterHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + final VCFHeader header = new VCFHeader(lines); + + final BCF2Dictionary stringDict = BCF2Dictionary.makeBCF2StringDictionary(header, BCFVersion.BCF2_2Version); + Assert.assertEquals(stringDict.get(2), "FOO"); + } } diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java index 0bc493bf15..83258c54ae 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java @@ -343,26 +343,22 @@ private static VariantContext createVC(final VCFHeader header) { } @DataProvider - public Object[][] bcftoolsRoundTripProvider() { + public Object[][] bcftoolsReadsHtsjdkOutputProvider() { return new Object[][]{ {"phased.vcf"}, {"test1.vcf"}, + {"test2.vcf"}, {"NA12891.vcf"}, {"NA12891.fp.vcf"}, - {"dbsnp_135.b37.1000.vcf"}, {"structuralvariants.vcf"}, - // TODO the test testBCFToolsReadsHtsjdkOutput fails for the following two files - // due to what appears to be a bug in bcftools' VCF output where missing FORMAT - // values are sometimes encoded as an empty string and not '.' 
- // This seems to have something to do with the affected keys being in trailing - // position in the original VCF (trailing missing values can be dropped), and - // htsjdk reordering FORMAT keys by sorting them alphabetically + // These two tests appear to fail because of a bcftools bug // {"ex2.vcf"}, // {"test.vcf.bgz"}, + {"vcf43/all43Features.utf8.vcf"} }; } - @Test(dataProvider = "bcftoolsRoundTripProvider") + @Test(dataProvider = "bcftoolsReadsHtsjdkOutputProvider") public void testBCFToolsReadsHtsjdkOutput(final String testFile) throws IOException { // Take an input VCF and read it into memory as our expected output // Take the same VCF and write it out as a BCF using htsjdk's BCF2Writer, use bcftools to convert from @@ -382,7 +378,7 @@ public void testBCFToolsReadsHtsjdkOutput(final String testFile) throws IOExcept ) { writer.writeHeader(header); for (final VariantContext vc : expectedVariantContexts) { - writer.add(vc); + writer.add(vc.fullyDecode(header, false)); } } @@ -404,7 +400,23 @@ public void testBCFToolsReadsHtsjdkOutput(final String testFile) throws IOExcept } } - @Test(dataProvider = "bcftoolsRoundTripProvider") + @DataProvider + public Object[][] htsjdkReadsBCFToolsOutputProvider() { + return new Object[][]{ + {"phased.vcf"}, + {"test1.vcf"}, + {"test2.vcf"}, + {"NA12891.vcf"}, + {"NA12891.fp.vcf"}, + {"structuralvariants.vcf"}, + {"ex2.vcf"}, + {"test.vcf.bgz"}, + // bcftools does not to decoding of percent encoded VCFs, so its BCF output contains the literal characters +// {"vcf43/all43Features.utf8.vcf"} + }; + } + + @Test(dataProvider = "htsjdkReadsBCFToolsOutputProvider") public void testHtsjdkReadsBCFToolsOutput(final String testFile) { // Take an input VCF and read it into memory as our expected output // Take the same VCF and convert it to BCF using bcftools, then read the BCF into memory again as our actual output diff --git a/src/test/resources/htsjdk/variant/test2.vcf b/src/test/resources/htsjdk/variant/test2.vcf index 
39bed22e75..55566f3365 100644 --- a/src/test/resources/htsjdk/variant/test2.vcf +++ b/src/test/resources/htsjdk/variant/test2.vcf @@ -48,6 +48,6 @@ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 1 8216712 rs11121115 A G 1540.26 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=0.917;DB;DP=131;Dels=0.00;FS=11.67;HaplotypeScore=3.35;MLEAC=3;MLEAF=0.500;MQ=57.74;MQ0=1;MQRankSum=0.427;QD=11.76;ReadPosRankSum=-2.190e-01;SB=-9.390e+02;VQSLOD=5.53;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:23,28:51:99:681,0,668:127 0/1:16,18:34:99:338,0,244:127 0/1:24,22:46:99:560,0,323:127 1 17032814 rs2773183 T C 2828.26 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=-3.879e+00;DB;DP=322;Dels=0.00;FS=2.43;HaplotypeScore=15.45;MLEAC=3;MLEAF=0.500;MQ=56.86;MQ0=0;MQRankSum=2.92;QD=8.78;ReadPosRankSum=-1.245e+00;SB=-1.943e+03;VQSLOD=-1.421e+00;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:63,59:122:99:1434,0,1831:127 0/1:53,56:109:99:910,0,871:127 0/1:61,30:91:99:523,0,1257:127 -1 17032818 rs2773183 T C 2828.26 FILTER AC=3;AF=0.500;AN=6;BaseQRankSum=-3.879e+00;DB;DP=322;Dels=0.00;FS=2.43;HaplotypeScore=15.45;MLEAC=3;MLEAF=0.500;MQ=56.86;MQ0=0;MQRankSum=2.92;QD=8.78;ReadPosRankSum=-1.245e+00;SB=-1.943e+03;VQSLOD=-1.421e+00;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:63,59:122:99:1434,0,1831:127 0/1:53,56:109:99:910,0,871:127 0/1:61,30:91:99:523,0,1257:127 +1 17032818 rs2773183 T C 2828.26 LowQual AC=3;AF=0.500;AN=6;BaseQRankSum=-3.879e+00;DB;DP=322;Dels=0.00;FS=2.43;HaplotypeScore=15.45;MLEAC=3;MLEAF=0.500;MQ=56.86;MQ0=0;MQRankSum=2.92;QD=8.78;ReadPosRankSum=-1.245e+00;SB=-1.943e+03;VQSLOD=-1.421e+00;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:63,59:122:99:1434,0,1831:127 0/1:53,56:109:99:910,0,871:127 0/1:61,30:91:99:523,0,1257:127 2 1143476 rs4998209 C T 1483.26 PASS AC=2;AF=0.333;AN=6;BaseQRankSum=-4.814e+00;DB;DP=189;Dels=0.00;FS=5.61;HaplotypeScore=0.324;MLEAC=2;MLEAF=0.333;MQ=58.36;MQ0=0;MQRankSum=1.58;QD=12.06;ReadPosRankSum=0.326;SB=-9.320e+02;VQSLOD=6.81;culprit=HaplotypeScore 
GT:AD:DP:GQ:PL:TP 0|0:66,0:66:99:0,178,2264:127 0|1:33,38:71:99:844,0,1024:127 0|1:26,26:52:99:678,0,719:127 2 9240279 rs56249990 A G 3978.01 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=1.70;DB;DP=213;Dels=0.00;FS=7.83;HaplotypeScore=1.19;MLEAC=3;MLEAF=0.500;MQ=59.40;MQ0=0;MQRankSum=0.143;QD=27.25;ReadPosRankSum=-9.700e-02;SB=-1.991e+03;VQSLOD=9.14;culprit=FS GT:AD:DP:GQ:PL:TP 0|1:33,42:75:99:1400,0,1031:127 0|0:67,0:67:99:0,178,2277:127 1|1:0,71:71:99:2578,199,0:127 From 9b2d77c1dd762dd4b71cbfa61adc21a8f508b54f Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Mon, 6 Dec 2021 16:53:54 -0500 Subject: [PATCH 12/22] Begin removing BCF 2.1, update tests files --- .../java/htsjdk/samtools/util/IOUtil.java | 3 ++- .../htsjdk/tribble/index/IndexFactory.java | 8 ++++++-- .../java/htsjdk/variant/bcf2/BCF2Codec.java | 2 ++ .../java/htsjdk/variant/bcf2/BCFVersion.java | 3 ++- .../tribble/index/IndexFactoryTest.java | 9 +++------ .../htsjdk/variant/vcf/VCFIteratorTest.java | 2 +- .../htsjdk/variant/VcfThatLacksAnIndex.bcf | Bin 7490 -> 3158 bytes .../htsjdk/variant/serialization_test.bcf | Bin 7490 -> 3158 bytes .../variant/serialization_test_unzipped.bcf | Bin 0 -> 7477 bytes 9 files changed, 16 insertions(+), 11 deletions(-) create mode 100644 src/test/resources/htsjdk/variant/serialization_test_unzipped.bcf diff --git a/src/main/java/htsjdk/samtools/util/IOUtil.java b/src/main/java/htsjdk/samtools/util/IOUtil.java index 81351e297a..2d97d2284c 100755 --- a/src/main/java/htsjdk/samtools/util/IOUtil.java +++ b/src/main/java/htsjdk/samtools/util/IOUtil.java @@ -1277,7 +1277,8 @@ public static List filesToPaths(Collection files){ */ public static boolean isGZIPInputStream(final InputStream stream) { if (!stream.markSupported()) { - throw new IllegalArgumentException("isGZIPInputStream() : Cannot test a stream that doesn't support marking."); + // BufferedInputStream supports mark + return isGZIPInputStream(new BufferedInputStream(stream)); } stream.mark(GZIP_HEADER_READ_LENGTH); diff 
--git a/src/main/java/htsjdk/tribble/index/IndexFactory.java b/src/main/java/htsjdk/tribble/index/IndexFactory.java index 1e26c33300..be21977a2c 100644 --- a/src/main/java/htsjdk/tribble/index/IndexFactory.java +++ b/src/main/java/htsjdk/tribble/index/IndexFactory.java @@ -595,7 +595,9 @@ public FeatureIterator(final Path inputPath, final FeatureCodec SUPPORTED_VERSIONS = new HashSet<>(Arrays.asList(BCF2_1Version, BCF2_2Version)); + public static final Set SUPPORTED_VERSIONS = new HashSet<>(Collections.singletonList(BCF2_2Version)); final int majorVersion; diff --git a/src/test/java/htsjdk/tribble/index/IndexFactoryTest.java b/src/test/java/htsjdk/tribble/index/IndexFactoryTest.java index e127fd4b2f..648f7080cd 100644 --- a/src/test/java/htsjdk/tribble/index/IndexFactoryTest.java +++ b/src/test/java/htsjdk/tribble/index/IndexFactoryTest.java @@ -196,12 +196,9 @@ public void testCreateTabixIndexFromVCF( @DataProvider(name = "bcfDataFactory") public Object[][] getBCFData(){ return new Object[][] { - //TODO: this needs more test cases, including block compressed and indexed, but bcftools can't - // generate indices for BCF2.1 files, which is all HTSJDK can read, and htsjdk also can't read/write - // block compressed BCFs (https://github.com/samtools/htsjdk/issues/946) - new Object[] { - new File("src/test/resources/htsjdk/variant/serialization_test.bcf") - } + {new File("src/test/resources/htsjdk/variant/serialization_test_unzipped.bcf")}, + // TODO: this needs more test cases, including block compressed and indexed +// {new File("src/test/resources/htsjdk/variant/serialization_test.bcf")}, }; } diff --git a/src/test/java/htsjdk/variant/vcf/VCFIteratorTest.java b/src/test/java/htsjdk/variant/vcf/VCFIteratorTest.java index 6664501df6..4030c180e2 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFIteratorTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFIteratorTest.java @@ -91,7 +91,7 @@ private void testUsingZippedInput(final String filepath, final int nVariants, * 
https://github.com/samtools/htsjdk/pull/837#discussion_r139490218 * https://github.com/samtools/htsjdk/issues/946 */ - if(!tmp.getName().endsWith(FileExtensions.COMPRESSED_VCF)) { + if(!(tmp.getName().endsWith(FileExtensions.COMPRESSED_VCF) || tmp.getName().endsWith(FileExtensions.BCF))) { tmp = File.createTempFile("tmp",FileExtensions.COMPRESSED_VCF); tmp.deleteOnExit(); try( FileInputStream in = new FileInputStream(filepath); diff --git a/src/test/resources/htsjdk/variant/VcfThatLacksAnIndex.bcf b/src/test/resources/htsjdk/variant/VcfThatLacksAnIndex.bcf index 8c84efb3993976d32a69eee8fee04b2379e86cc2..8f8630d8fb2cd790950cf1a9d76f8064f10ab4f2 100644 GIT binary patch literal 3158 zcmV-c45{-UiwFb&00000{{{d;LjnLf46T?AY#Zen$KQOlu^Sj?lDZT|-z4pll*HKQ z_^Yks_#;j-@@bnQLB}}T=lF_!=iZ(3)j=yk#n6C)CIQVDFpZC~P1Q66V;hu?!ZtCg zGEIenm_`JHF_?z7wVTjDd)_;reZFRmyOJvLCHLO*JiqsOp8xY+L*ZzB{Rs;~ot+sj z$7TdE&&d9r;pl9?&1vrJjE;^ahGPC5qY?j@FgIOfa=j5&N{L)S<^U-I4`8(cSgow9FGa-{p2_fjDh+zt^{F9Ospm zl_DW~;PkpZw9R98xX!`<=i`2x-g`MIrM*$3nT(AG6G}9}Nbf{3pJYXUtT(YxVEv=K z%+9c4btu7Hj?Hl?oMsDhR#H665QoTSFvF)Ylat{amar1$c;%KLrdbi^I7w#HMxT)> z&8OpRJ_`j=nCCzoh$AMgI>+7QTEcT2T#rxtX_pIpslO&+5m_!_Mr1LFi_D5_y2t2# zc)Aw%!z?eza2HSOwMJ(pVXzXHMUJ0Y*@*GssWI*0;j@G=L8LLqEwbreJe38MJ}JfI z*febGjgXG4Af(k?%#_>!xCGmj;doe7N(2ZimI`c&%Wx@%*fqwIs3{wYe=Ma0*jXW? 
zSk}Dal#owyykcA_heDQ-;5IkQk`SOU^{dt7fE|%d$x1Av6VXYv7Q!o1`nuiuYbkP_u53cJ`4A*xI>zuf#*2A_qZQz;?Njrc5rsUyhRdaddB`Ye z&W%_q<}oyuOuPr7+jmMf9E0vpl1!`9m@_($gfy3S%_U>#k93j<*f>j8Nm;I7Fi=xv zuB;Q=u_WZxKS&@73pmTnvN*{?NovAia28)R+H2q`DDVZX0U*5vl7_S}$E(-rfWau6 zlUCpNsK}(0<^j7gCE&2Y%M8bZp5Ss79 zg`85taZ-b;M*v$Sk%odMEa!uR5>O@qXii=re&+-jpfe&n1L_2^Wi=-T=c9a5WZ5(< z2@7l{v%2@n10Y231efAikR#}Fz*TB7t+r8>Q0jHNz>0lp(crl(R3(ULfgu{p%s}QP zRWwH!F}=`t4a@Nm&J64*2)SHQ@pAm*V?(ux=QsnM0=-;`L~j&#gQ1@CcxQA zAwy@xUQ27M)4^Z~U(1SuswPIG=R6OhtIvbk(mYTEIA03of6RqxqhPJzdM9Ltfrf%E z&v0o-8+)Y&mRtZS1ck3Q8k-7gR&8fHQk7>YbSX%;3}VFQA?yMRno;CAI8sQZU&*nn zTm_{n>gM=VuBh}q9DZRQKx93JNUpBoV3;`5Z#S8xlF=CK=ae9wbINKM;Z-RsPlhvv zIUZWwZCq9L(3uY>Vyli-E!VK|43@HjC|9SY+9JH1L1V%U7SbTJA`t}won%Sc z7GT*@)0NduhtKEqIM24)sjYVSymlHMv7fP8QV9H0 zwwuUqnbcP%(?rlg zdp%zD)8{CC(q{A+Ldb{IuiI|83Dv=8?CVD(7o&RAfY6%9MakiC(D3cD2Cc&sqAsKw zs4uo1dFagn>e9Ajr!pbx3vKXFfU>q_GCK#UOW=Mh`3s_`wp)(w4p1A_`|WMF2X8n+ znJl*;YJFSmnQcMpGOCWch+0E+w4tNBU!=CYUwxjUE^j;Cy(36%20uRF_$O32T^YDp8IsZyN;4M+zw&rPDbVtfss@IDfQ4*|Y1R=YcX* z2b7x*TsjDpNTYoF?<-O+FSZcM#5+$tZ=eipcX|Dn1Lb1=x-#Wb6GB~U?}ET*8rA{j z2aPJ_!#c{3f4i@)o?N`@i?mge>2i5#+UIh24JUNjJURb?pW4)Zv4z6c1OOlS@GCmt z)8AVPDZms2ni#B*{%$-O0Mge;J`WF5Riyh59iuuCwWVu10)Ts4uO9@!OlFNSjXPRA zYqzukU~Fgy0EV|(0B|$pCyY*ko2}?G_z90(Kv^u7AHuc01o-I%&U%u@!?&5jIzSI? zw|jkVw>8`q4weCb^PQmp0Is(JU=RS0zY@^{b~k8%*?Wq*RPJq^I0}H@Ab?7)FM~xY z;DHafzMue}naxKEHw6H;XV>2X2H4KEc7$61w)e`d0Nc}QqFNkL)YFQ3zKTr1Xfm0! 
zQYCCpij>-s*`G#qrP}ZFd1<@P8ir&M!g?Uw<9|7zlqyHSKDqh(0|1tI=g|QGtFM&m zms^$ypK`ywZQ+B*27vE5q?GEDRS87gilut&)F3I<10TLo%49=BLsLT&z=crzmo{z$ zxX^Haxl|GQG5US;7zB4QLmQjl8*AV&+bq@dmf%2(&>)N4WjGw*{gKte%?ZK zXX#L0m%opu?+;XW+*0Xb8PY(DwK(!4g^% zuo1NOtzm?Q0qv^ra2ahLx^QhH0sK?5#r!Mc_zbj<-nXb)^Y*MB%|Uw{c6YxI&=O^| zd;a<#g0|~mPKWl)^pb{l{FSe%X!SuuN!qq| zZQ0NZXxvz&jP}wRgqEAW4bk4w*aDdH3R!!4@+S(~t`lzkv~c!&V9?lUtEMk?fOh2{ z-W^b6Wrm=&)L%i0bJxKaNY?5qXea0IB*m%RZ-)`{7owwoCTL5CE~!?RcaCY&a{QIe z1Z{coEqy^Y0NOO#+&PH=ZF+c0SCD^1{&k;0DLx4iA4gMW746m%8qv(=yEiC^t#7|P zbdhdCxcj~RE?>DG*C>T3sM*?2vgJ45N08QNNE_7~Cde<=}U=rv>sNp&cL_^VB|Upv?;%dL+CxyQ{&Ts5K z?wqdc+<4z+kvH zy_VmM zd%EqN{8leN&2Qy5+uAxi?Dl2c#e=)pZ8qHL>i^Yk=zO=s-R7`6JI?C{|Emii9On7C zfEe*v;yPo)BNrwOHdsg)hpVmK)nPX;;%sxYcY4lwj|-Azp_`B>&dv_Eqs?uzyUyeM z^A}~I;~`1@bTby4v#~L5vpqEA3+pB33$zYrlH;P{ZEFqBq($#QN)snUrT8koL_$nR z5k4xWwYaKRhZYxkMU3%6D#{B94PRIys=Soq=_Of-iVB~QR85SQo*5W1&$KUfW;(6N z(-O>q4iciq;e0baR{1Fj^*c1`b+}wO%Xm7DMYLSy1x4gxE}E!_(U#KV`$m^@e4m(- zH3Wnw2g-R(K5%{El3m6$MM_N+$IaZN^a>+`<{$Sh5rJW{#)LF0MqBxjI8ZjL5g{Q) z5!PEFM_iT>2y&{4>|Ft_B1|zUg+*B*V642F79&zjiU{POrFRK0OAY;eUX93ztQ^xN zHc#ava&lZs>6`26kd6zg$WKUlwPM4{eoIULQqEJf>0dM9tbfS5>mI0Ws)TaEdX~T@{9D1 zDRPp(K}tmAX*tECFW$? 
z*ek6niE{67&cT;Tp)pNP^J$sJ9pfOXNeN!BObUtOO9yk2Sp3RGc9hMqbrXTS+D^IGh>X zp=2liMp2OsHz}^}q4Vzt!E^V6$!tH+2eo7tYLUJ$T54De#AXLlOh7@Q%41R#MJ`^G z4G}*+!v|;#EyfCs_{>->HOLyXu5K7o64tH3jKn0`L|TMrWKt4F3N`zb#x9z!^ob%{ z3(Zn0lE{!zU{MVH05>xR%$#bW>&=T~?Q@~QFnsp8&D2ZP(#_BgMr8H>(ZY))%XNmc zjd?8B#usk~KqOc1Im^|@!XYyniw*&%)(E`7t8rO@_Z5bxSZKTrAKbo-3>uUtcsYsz zSs}k9#m8Zz6F5Rs(5oi12{wtrBco^ig_3$fzpvkxxp~S_VUa z{-Hqey&{=JT9Gu9b}8-QmFVf1>GWPa6_+rvPGwDWN>f-wbSnL zINi>rRy)nrc2B3x;b?c-&RVVN1l|#PYtl0lM!Kp=38KoW>PCNmX#0?5Wc!$9AYcjg z1}%8|U>UvMH)t_Fi?UE{TY~y~Kg+Ps?&x%P8b3Yu>`zBEdy+Bc!596z>z3PC8GeSo zd?auwD`ypqt$b2Z?RL8ZU#@yzeILiT4RuFP|6m)()!lhyS2tH*7mMxK#$8o+rg^K6 zvvOtJWn2Y!1=hrRxXbIXTQ_$xezT@7^xU;R?z44=AAhThtECqh*HCx2@0P=y#d;^> zK4YxC5<6^szj&+9H&T`zO18_HMEf3B*b{>n?NYu9iV&bhvgtL7q+2v_5H zs+_S$%R|8U2eaX^zl^(xyW@^Kp1^q~XY##@GU9A`+16`vKBTA)x7%s&bk>LK{aht5 z!|y!vA~5gvO&dh>=KO3;A?Cq!qy%%_+!fn^**|~nwmfFb&c|N_W=~ljlP!vQ z@^3|$-CJCp-m8E)o4hH9d7zT9rd9Vrp^1vs!2Do^f%(k>%#VI^M;TsX?AM+^U0zO? ztJC4|xH_8p!n&S~i1-GmxRNxTpAByV?3wQz=mG4W+MBl(XgYW53nj3J`Z$6;G5vv; zKGJkwl3hzHZRbr&EWdo56cNXG#2SKsLE2BHs$duoS|0P!n?xTZdjSh1L*bEh`F zREXQS%EtH_;I>|~3Ain_7Out~WG%I<<%`S$N{hu}<}1-Xt#C$N#($ivOXO?2$J6Ps zdFuTLJeB3#sW(Z&&(|CvzVJKW?gHP@)7#8^Ek{`%OiRS~@$KL3LRprhEZ5CFO%m?* zzS^D7*O?EV=mO#kRF?YMD{;bNWyvJU*NTdY%8E*qWe;ok+~t>puBWd(m#>Wdko~S| z5c$ehJb|PVWrpJhq(u?DkrH;5@MZO|qX4 zM<4D6Cu47!oMb--_9shp+_~>#;(Rz|EwG=Dp8WX>;5<=aKj3T%j+u#i@?SbzCE$py zVUwS+KCoTq@5`}GqZU`KAi_UZS*w2mlC#-9eBWwV&dJF_HoL=Zw{^68z-DN;+gna- zBlp(70Ji6@sw8VY_S%;U*>>(rl(6mgRukI;fBAO-+x%=hRpQ5svsJ!Qn3Zi?nl`R& z1)DS&$g#b$lCk;9uR*taR@8t~TR^vGxBp0I+j+brm!8ng+3rTiu{q$UM%$R5eVg+0 z#Pq$CpF8)xM7kY&ZG*{H2HoaPO%Pj6`PG!4WysG1`!6H5YkvQ30o&on|4e@RCt}mr z>iJn{Fz=AT=s8&d-A37l#_bGjqkSVeHpc$IysJM&Ha-o$V{D|_&}}Z8ljNOM_pjA+ zvi9Vwy_aM?lv+hcduO|gWHZnh>-2%iTm29*_0L!LWIbi2o{O@f*IsIcE5U?~nOSTW z>rV}Z@Tx(P&pz9Aq)?^)4;0MBx>!FbTwVP+iicJ`h}|Bm`aZrlS)QStiy9tL%(v_{ zDAdaPNo>f7}EqkbNXjsFaI5BiT&{#Czm zl<>|pzukqAg}iZ3ZJcx}@J6JT`H@H~xr@bMXF~a&<+v 
z-|lcYI&8TyW#nErwY8(Cs|X7`gypS%(d^uWHb2``g0`pjK2q)GpPa}sHPk)(_fMXO xYHt^yp{X?ujhafa_i|-LXakJ5ZKTm;V8owO3iAewR{s-%e6`{+-~_Tt{RhExM=SsU diff --git a/src/test/resources/htsjdk/variant/serialization_test.bcf b/src/test/resources/htsjdk/variant/serialization_test.bcf index 8c84efb3993976d32a69eee8fee04b2379e86cc2..8f8630d8fb2cd790950cf1a9d76f8064f10ab4f2 100644 GIT binary patch literal 3158 zcmV-c45{-UiwFb&00000{{{d;LjnLf46T?AY#Zen$KQOlu^Sj?lDZT|-z4pll*HKQ z_^Yks_#;j-@@bnQLB}}T=lF_!=iZ(3)j=yk#n6C)CIQVDFpZC~P1Q66V;hu?!ZtCg zGEIenm_`JHF_?z7wVTjDd)_;reZFRmyOJvLCHLO*JiqsOp8xY+L*ZzB{Rs;~ot+sj z$7TdE&&d9r;pl9?&1vrJjE;^ahGPC5qY?j@FgIOfa=j5&N{L)S<^U-I4`8(cSgow9FGa-{p2_fjDh+zt^{F9Ospm zl_DW~;PkpZw9R98xX!`<=i`2x-g`MIrM*$3nT(AG6G}9}Nbf{3pJYXUtT(YxVEv=K z%+9c4btu7Hj?Hl?oMsDhR#H665QoTSFvF)Ylat{amar1$c;%KLrdbi^I7w#HMxT)> z&8OpRJ_`j=nCCzoh$AMgI>+7QTEcT2T#rxtX_pIpslO&+5m_!_Mr1LFi_D5_y2t2# zc)Aw%!z?eza2HSOwMJ(pVXzXHMUJ0Y*@*GssWI*0;j@G=L8LLqEwbreJe38MJ}JfI z*febGjgXG4Af(k?%#_>!xCGmj;doe7N(2ZimI`c&%Wx@%*fqwIs3{wYe=Ma0*jXW? 
zSk}Dal#owyykcA_heDQ-;5IkQk`SOU^{dt7fE|%d$x1Av6VXYv7Q!o1`nuiuYbkP_u53cJ`4A*xI>zuf#*2A_qZQz;?Njrc5rsUyhRdaddB`Ye z&W%_q<}oyuOuPr7+jmMf9E0vpl1!`9m@_($gfy3S%_U>#k93j<*f>j8Nm;I7Fi=xv zuB;Q=u_WZxKS&@73pmTnvN*{?NovAia28)R+H2q`DDVZX0U*5vl7_S}$E(-rfWau6 zlUCpNsK}(0<^j7gCE&2Y%M8bZp5Ss79 zg`85taZ-b;M*v$Sk%odMEa!uR5>O@qXii=re&+-jpfe&n1L_2^Wi=-T=c9a5WZ5(< z2@7l{v%2@n10Y231efAikR#}Fz*TB7t+r8>Q0jHNz>0lp(crl(R3(ULfgu{p%s}QP zRWwH!F}=`t4a@Nm&J64*2)SHQ@pAm*V?(ux=QsnM0=-;`L~j&#gQ1@CcxQA zAwy@xUQ27M)4^Z~U(1SuswPIG=R6OhtIvbk(mYTEIA03of6RqxqhPJzdM9Ltfrf%E z&v0o-8+)Y&mRtZS1ck3Q8k-7gR&8fHQk7>YbSX%;3}VFQA?yMRno;CAI8sQZU&*nn zTm_{n>gM=VuBh}q9DZRQKx93JNUpBoV3;`5Z#S8xlF=CK=ae9wbINKM;Z-RsPlhvv zIUZWwZCq9L(3uY>Vyli-E!VK|43@HjC|9SY+9JH1L1V%U7SbTJA`t}won%Sc z7GT*@)0NduhtKEqIM24)sjYVSymlHMv7fP8QV9H0 zwwuUqnbcP%(?rlg zdp%zD)8{CC(q{A+Ldb{IuiI|83Dv=8?CVD(7o&RAfY6%9MakiC(D3cD2Cc&sqAsKw zs4uo1dFagn>e9Ajr!pbx3vKXFfU>q_GCK#UOW=Mh`3s_`wp)(w4p1A_`|WMF2X8n+ znJl*;YJFSmnQcMpGOCWch+0E+w4tNBU!=CYUwxjUE^j;Cy(36%20uRF_$O32T^YDp8IsZyN;4M+zw&rPDbVtfss@IDfQ4*|Y1R=YcX* z2b7x*TsjDpNTYoF?<-O+FSZcM#5+$tZ=eipcX|Dn1Lb1=x-#Wb6GB~U?}ET*8rA{j z2aPJ_!#c{3f4i@)o?N`@i?mge>2i5#+UIh24JUNjJURb?pW4)Zv4z6c1OOlS@GCmt z)8AVPDZms2ni#B*{%$-O0Mge;J`WF5Riyh59iuuCwWVu10)Ts4uO9@!OlFNSjXPRA zYqzukU~Fgy0EV|(0B|$pCyY*ko2}?G_z90(Kv^u7AHuc01o-I%&U%u@!?&5jIzSI? zw|jkVw>8`q4weCb^PQmp0Is(JU=RS0zY@^{b~k8%*?Wq*RPJq^I0}H@Ab?7)FM~xY z;DHafzMue}naxKEHw6H;XV>2X2H4KEc7$61w)e`d0Nc}QqFNkL)YFQ3zKTr1Xfm0! 
zQYCCpij>-s*`G#qrP}ZFd1<@P8ir&M!g?Uw<9|7zlqyHSKDqh(0|1tI=g|QGtFM&m zms^$ypK`ywZQ+B*27vE5q?GEDRS87gilut&)F3I<10TLo%49=BLsLT&z=crzmo{z$ zxX^Haxl|GQG5US;7zB4QLmQjl8*AV&+bq@dmf%2(&>)N4WjGw*{gKte%?ZK zXX#L0m%opu?+;XW+*0Xb8PY(DwK(!4g^% zuo1NOtzm?Q0qv^ra2ahLx^QhH0sK?5#r!Mc_zbj<-nXb)^Y*MB%|Uw{c6YxI&=O^| zd;a<#g0|~mPKWl)^pb{l{FSe%X!SuuN!qq| zZQ0NZXxvz&jP}wRgqEAW4bk4w*aDdH3R!!4@+S(~t`lzkv~c!&V9?lUtEMk?fOh2{ z-W^b6Wrm=&)L%i0bJxKaNY?5qXea0IB*m%RZ-)`{7owwoCTL5CE~!?RcaCY&a{QIe z1Z{coEqy^Y0NOO#+&PH=ZF+c0SCD^1{&k;0DLx4iA4gMW746m%8qv(=yEiC^t#7|P zbdhdCxcj~RE?>DG*C>T3sM*?2vgJ45N08QNNE_7~Cde<=}U=rv>sNp&cL_^VB|Upv?;%dL+CxyQ{&Ts5K z?wqdc+<4z+kvH zy_VmM zd%EqN{8leN&2Qy5+uAxi?Dl2c#e=)pZ8qHL>i^Yk=zO=s-R7`6JI?C{|Emii9On7C zfEe*v;yPo)BNrwOHdsg)hpVmK)nPX;;%sxYcY4lwj|-Azp_`B>&dv_Eqs?uzyUyeM z^A}~I;~`1@bTby4v#~L5vpqEA3+pB33$zYrlH;P{ZEFqBq($#QN)snUrT8koL_$nR z5k4xWwYaKRhZYxkMU3%6D#{B94PRIys=Soq=_Of-iVB~QR85SQo*5W1&$KUfW;(6N z(-O>q4iciq;e0baR{1Fj^*c1`b+}wO%Xm7DMYLSy1x4gxE}E!_(U#KV`$m^@e4m(- zH3Wnw2g-R(K5%{El3m6$MM_N+$IaZN^a>+`<{$Sh5rJW{#)LF0MqBxjI8ZjL5g{Q) z5!PEFM_iT>2y&{4>|Ft_B1|zUg+*B*V642F79&zjiU{POrFRK0OAY;eUX93ztQ^xN zHc#ava&lZs>6`26kd6zg$WKUlwPM4{eoIULQqEJf>0dM9tbfS5>mI0Ws)TaEdX~T@{9D1 zDRPp(K}tmAX*tECFW$? 
z*ek6niE{67&cT;Tp)pNP^J$sJ9pfOXNeN!BObUtOO9yk2Sp3RGc9hMqbrXTS+D^IGh>X zp=2liMp2OsHz}^}q4Vzt!E^V6$!tH+2eo7tYLUJ$T54De#AXLlOh7@Q%41R#MJ`^G z4G}*+!v|;#EyfCs_{>->HOLyXu5K7o64tH3jKn0`L|TMrWKt4F3N`zb#x9z!^ob%{ z3(Zn0lE{!zU{MVH05>xR%$#bW>&=T~?Q@~QFnsp8&D2ZP(#_BgMr8H>(ZY))%XNmc zjd?8B#usk~KqOc1Im^|@!XYyniw*&%)(E`7t8rO@_Z5bxSZKTrAKbo-3>uUtcsYsz zSs}k9#m8Zz6F5Rs(5oi12{wtrBco^ig_3$fzpvkxxp~S_VUa z{-Hqey&{=JT9Gu9b}8-QmFVf1>GWPa6_+rvPGwDWN>f-wbSnL zINi>rRy)nrc2B3x;b?c-&RVVN1l|#PYtl0lM!Kp=38KoW>PCNmX#0?5Wc!$9AYcjg z1}%8|U>UvMH)t_Fi?UE{TY~y~Kg+Ps?&x%P8b3Yu>`zBEdy+Bc!596z>z3PC8GeSo zd?auwD`ypqt$b2Z?RL8ZU#@yzeILiT4RuFP|6m)()!lhyS2tH*7mMxK#$8o+rg^K6 zvvOtJWn2Y!1=hrRxXbIXTQ_$xezT@7^xU;R?z44=AAhThtECqh*HCx2@0P=y#d;^> zK4YxC5<6^szj&+9H&T`zO18_HMEf3B*b{>n?NYu9iV&bhvgtL7q+2v_5H zs+_S$%R|8U2eaX^zl^(xyW@^Kp1^q~XY##@GU9A`+16`vKBTA)x7%s&bk>LK{aht5 z!|y!vA~5gvO&dh>=KO3;A?Cq!qy%%_+!fn^**|~nwmfFb&c|N_W=~ljlP!vQ z@^3|$-CJCp-m8E)o4hH9d7zT9rd9Vrp^1vs!2Do^f%(k>%#VI^M;TsX?AM+^U0zO? ztJC4|xH_8p!n&S~i1-GmxRNxTpAByV?3wQz=mG4W+MBl(XgYW53nj3J`Z$6;G5vv; zKGJkwl3hzHZRbr&EWdo56cNXG#2SKsLE2BHs$duoS|0P!n?xTZdjSh1L*bEh`F zREXQS%EtH_;I>|~3Ain_7Out~WG%I<<%`S$N{hu}<}1-Xt#C$N#($ivOXO?2$J6Ps zdFuTLJeB3#sW(Z&&(|CvzVJKW?gHP@)7#8^Ek{`%OiRS~@$KL3LRprhEZ5CFO%m?* zzS^D7*O?EV=mO#kRF?YMD{;bNWyvJU*NTdY%8E*qWe;ok+~t>puBWd(m#>Wdko~S| z5c$ehJb|PVWrpJhq(u?DkrH;5@MZO|qX4 zM<4D6Cu47!oMb--_9shp+_~>#;(Rz|EwG=Dp8WX>;5<=aKj3T%j+u#i@?SbzCE$py zVUwS+KCoTq@5`}GqZU`KAi_UZS*w2mlC#-9eBWwV&dJF_HoL=Zw{^68z-DN;+gna- zBlp(70Ji6@sw8VY_S%;U*>>(rl(6mgRukI;fBAO-+x%=hRpQ5svsJ!Qn3Zi?nl`R& z1)DS&$g#b$lCk;9uR*taR@8t~TR^vGxBp0I+j+brm!8ng+3rTiu{q$UM%$R5eVg+0 z#Pq$CpF8)xM7kY&ZG*{H2HoaPO%Pj6`PG!4WysG1`!6H5YkvQ30o&on|4e@RCt}mr z>iJn{Fz=AT=s8&d-A37l#_bGjqkSVeHpc$IysJM&Ha-o$V{D|_&}}Z8ljNOM_pjA+ zvi9Vwy_aM?lv+hcduO|gWHZnh>-2%iTm29*_0L!LWIbi2o{O@f*IsIcE5U?~nOSTW z>rV}Z@Tx(P&pz9Aq)?^)4;0MBx>!FbTwVP+iicJ`h}|Bm`aZrlS)QStiy9tL%(v_{ zDAdaPNo>f7}EqkbNXjsFaI5BiT&{#Czm zl<>|pzukqAg}iZ3ZJcx}@J6JT`H@H~xr@bMXF~a&<+v 
z-|lcYI&8TyW#nErwY8(Cs|X7`gypS%(d^uWHb2``g0`pjK2q)GpPa}sHPk)(_fMXO xYHt^yp{X?ujhafa_i|-LXakJ5ZKTm;V8owO3iAewR{s-%e6`{+-~_Tt{RhExM=SsU diff --git a/src/test/resources/htsjdk/variant/serialization_test_unzipped.bcf b/src/test/resources/htsjdk/variant/serialization_test_unzipped.bcf new file mode 100644 index 0000000000000000000000000000000000000000..a19ea048f148aa1e0b7f7c26e02b2ec8727a33dd GIT binary patch literal 7477 zcmcJT3v65E70184+Sm<@Ym%lsM!zKOl9a^w8OO9v;@EMLk*8yd1RdkXzP4|%ecgMn z^Xj0LSj5mM1x*5)F<=^xu}#%91Y;YNj>0xEs&1MJ12L@;493bdw5{EQ2HH9IxPIF63Xm}(t5cX{w z4*Euf*~uK6>IiaDROGWVFJyep@JG4}{>sbq%{|5UZ=akBM?^LgO>)Cy!GRI4*J`uL z0z7za#fwD5oSdW*myvjxpMj;QtiF$?xEU@*Au6gYLDNDEUb8ZV#p#F;yOwsktagvR zn{M;bIl7H@+N|AOcKa$~@j@(in+;;R`m>4#_xCtFR)^i$bzTvCRsq0q==&Q7qMpwR z;Y^K9T$nVlKsjL?ZmY}PW!D37S{<%#?>XbRAX&;qf}%LPyF3o7$7Xk*$NuLpO1bSp zlKk%aS#)Y=qee3x9`#3*67vT;#&YRtPV|L4BJ)|!H=L2V1Sb}U;!mZx6d$EAE-NP` zr8?v!M@249*-VVGDH(p@7EYpkhTgKc!pG$O>hx#3z^f}ya*h_sm4L9NK5@kh>g1N|?$i>=Aw;!Bb&Gv&_Mv&n^C_0eg zHTi=ZjFhZ0C5wC}QG9OtBBdcrjOcG3T;T-9&>BDRLXKuKF&v3bkNdRBCAqE*`G>L0ZvwkD-!Fw(x{N0 z<}=FTN;+hdti;g-Kf_@zfOTR*=y$8)z&auqm6ceA$3o+3E(Y{d@vm?yN-lD+X)Yrp zq_J>^auysLO;Z0zL`%pH{Tw7Vt!E`1QbA(ow-1$UA4&MZ5sBc^p()0#C z6&2=$428aUE0uC-3WX)(?tZBD?UI#-q57v$X|)^koK(DduwUQOrpg&gu_g5pn??g0 z<GvFt5D$8b+Cqb}RysPAt z2gVEsUnzyAWFbql0**WCMp%C zGT>j8d5*HsQK6Re9i(tk+6D`?8|_`S8P#be+6MiQX1FLPNvz1HU}A;<6;}mneC`jl zU#J}-$0}U-f1!x$mc0s)(1nAP&KPCNK0G=_N7-x^6SHv0$g4VOE6K#N!-vb za-yKRNpW?Lo_{|Gox2~*X#GGL)Y2N%S^C0csbQ5lo9&QdEEE(}d7O_yk#pB*M?|OR zX%NTIVyy6lUq37B3d4gs>neu9r@^{qFe5GvZ6eEoXXG+Gj1-dg62~r_u9S&FtA#c` z6HVpNDBz|z^aF@Y9WV{m!q@A8X!codFpQkJZyWYfsdO=D2NQzy|7hVwk{O-hOkbgkjwSt=z35xlSPc#4IFkKrTZtH_`cAwh*049Ft- zB`Q7*HkyDfWD$DRjFwdPOv#if_$WDeooH85TJZ1 zP#jk@m&%H~tkbTfJ$NPbbeQRsUOb!RVPc)tOmtQd!TX^(;P)aJuQtXHmJOjpHFDL9 zq4p%`?XyYsrA~+|>mv^pawRjj#H}XeDIq6DIf|bpup-dy)6kHhTfr0$AN4TGfbuDT z)6`TOY!v1gKA{Zv_Kp;nNytf`!{v3jZCws~2P;W1`K9K4(@qb5lT&cFBY(Hk?)5r7 
z&Xw+V>UZ0{-8P5A<+Pn~x1N&PO$gtcu7DWCmJV}84%+taN+dhOby!%Utagm~c>f9|~HHc|nP;jbMI zUP>xS6(MV$6eYXe?tou+qrY*GA62%Keiz)B&&mGQFw(of4MWF1j0LqPvSM&iT(J4>-y)5POLLE{@-g)N5 z63X7K?rz^zK)H~<$)H@SA*6NfJ)lpbY8_C1P_0rvT0r^HZ}(KdB|?7V4L0P(bi2D9 z4zIhbbudz(=JB}?e9XqCODzm-i~#WB$uAWEpZ@MrzXHrOEH6a*@}$2To$dwF*HO=d zgG>?W-UCON7Q$?9oeYAEyX$Z61HgEEO{t9A>O5;VHvnLGU>g7iwpc*MO`uPJoPy}L zkWb-%Q2rvuVzK-Hp4;FTe?Ga$S&1?pywemY0Q5L)wr+1%S7V?x;5X9s*WVuK1;F)< z0O$w6qpt-E0o$r{z~p_o0x5Ubj~xcUuOlGt2~t*RQZAm{@{$60X3jpAy{#8u+jsn} zw?xgBwKhWQ0Jh_rEdbkIZ({1~A<|w?+P_Rpz-Tg=^i)N*r$t7s$mCCqiiA{kdA;2Z zo3}9lVqrm6LP75Gz1pj!Dvw|v-~PQGD9Fe=kN1FbmHC4FV%-w*Dc74C=0A9%2l!q< zzKx4d7YRuCvZ;FHR3E15;>kBPC9A5cYN~3WAp1$v7cRdX;Q9w$Mye9>Bl7#&5lB^1 z{RHIWu}U>nm*5XrkVyIZr*|ul?T^kG1qr_6?gG>GdTm9%^I*n;zO!^7U7+8GlMnV5 zSLD)xHw%2{qbGmy0$?6T%pLnbF2X$VZ@_E~O+k_p@)n})@>%?GD9Mu{pp;)oxJ%XxIGVT|`?v znLxC<%BwLsckF)&RjbG+=kd9Hn4HS>CTKB#K_C4yqAeY`tO#xIJ4bYDIr`crL|a~X zyD%ZEK($G-sb!o1+T`Gbfkwz5iErJfkcv+O-cd4PR#p3|mXPR~wfAmN60-ips{@y4 z-iaNe%hm02qi8DG8YK}0Rjs|)0Da~M5NVB$gq;sAm@Gd-2w0gEHUryrhCV~f=6--6 zq+8RcM)?5<;wC+02q4@&Lk7eHYww4(9lU>+gjjCht3r_DM(P1Xmj}My z_t+Z;!3WVhYCxVFV^^Se)E`v5}jt~WJ2^fEztK0z|U z($ZTFKwlo3d&Y|lJ%3ocKeL${#3TR>= z)`HgN?t$0n9Y}d_-8v-;eP(yRMU$%0sw5vaadgJnLmLkP-R}A%Dy2E2`Hdq;SKy5I z2aXjuWA*mi(fJ=zblbbXzktrXch~cJU7c=wy9bEk@#&w#Y+_j9)wSz4Y-k1?r*$jh zG&`WF5h9XztD7O{4=dmW{12ivH8mZ8=Q_^!r@yJ-Ja+F*Mp2>RyBrQjm(7?>5bNfj z9zP6N%L}aqSP)MM){#@S1z6Aiuto^U4IXXrZ+S~-ou(2odltL<`=PDPwew( j9n2sbCe*eW=aJ!wfFTqN86;-@7wGYgs)vCls0sC7=qN^9 literal 0 HcmV?d00001 From b6559a9c283fb862f9f79aa8be997403cd95c8f4 Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Tue, 7 Dec 2021 14:49:32 -0500 Subject: [PATCH 13/22] Cleanup various TODOs, remove/fix failing tests --- .../java/htsjdk/variant/bcf2/BCF2Codec.java | 18 +++++++++++++----- .../bcf2/BCF2FieldWriter/BCF2FieldEncoder.java | 11 +++++++++++ .../bcf2/BCF2FieldWriter/BCF2FieldWriter.java | 3 ++- .../BCF2FieldWriterManager.java | 14 +++++++++++--- .../variantcontext/writer/BCF2Writer.java | 1 - 
.../htsjdk/variant/vcf/VCFHeaderVersion.java | 14 -------------- .../variant/vcf/VCFStandardHeaderLines.java | 4 ++++ .../codecs/variants/vcf/HtsVCFCodecTest.java | 9 --------- .../htsjdk/tribble/index/IndexFactoryTest.java | 1 + .../variant/bcf2/BCF2DictionaryTest.java | 2 +- .../BCF2FieldWriter/BCF2FieldEncoderTest.java | 3 +-- .../htsjdk/variant/bcf2/BCF2UtilsUnitTest.java | 2 +- .../VariantContextTestProvider.java | 2 -- .../writer/VCFWriterUnitTest.java | 2 +- .../variant/vcf/VCFHeaderLineUnitTest.java | 6 ++---- .../vcf/VCFStandardHeaderLinesUnitTest.java | 2 +- 16 files changed, 49 insertions(+), 45 deletions(-) diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java b/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java index 62fcd3ede4..af18454db7 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java @@ -48,6 +48,7 @@ import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; @@ -219,9 +220,8 @@ public FeatureCodecHeader readHeader(final PositionalBufferedStream inputStream) } // TODO should follow up on hts-specs and clarify the relationship between ##dictionary and IDX fields - // Error on ##dictionary lines, we don't know what to do with them if (this.header.getMetaDataInInputOrder().stream().anyMatch(line -> line.getKey().equals("dictionary"))) { - throw new TribbleException("Use of the ##dictionary line is not supported"); + log.warn("Use of the ##dictionary line is not supported"); } // create the contig dictionary @@ -448,7 +448,7 @@ private void decodeInfo(final VariantContextBuilder builder, final int numInfoFi for (int i = 0; i < numInfoFields; i++) { final String key = getDictionaryString(); Object value = decoder.decodeTypedValue(); - final VCFCompoundHeaderLine metaData = 
VariantContextUtils.getMetaDataForField(header, key); + final VCFInfoHeaderLine metaData = header.getInfoHeaderLine(key); if (metaData.getType() == VCFHeaderLineType.Flag) { // Despite contradictory language in the spec, bcftools/htslib encode the "payload" of // FLAG as 0x00 (MISSING type) which we would normally decode as MISSING/null, @@ -509,7 +509,11 @@ private String getDictionaryString() throws IOException { } protected final String getDictionaryString(final int offset) { - return stringDictionary.get(offset); + final String s = stringDictionary.get(offset); + if (s == null) { + error("No entry in the string dictionary matching key: " + offset + " was found"); + } + return s; } private BCF2Dictionary makeStringDictionary(final BCFVersion bcfVersion) { @@ -530,7 +534,11 @@ private BCF2Dictionary makeStringDictionary(final BCFVersion bcfVersion) { * @return */ private String lookupContigName(final int contigOffset) { - return contigDictionary.get(contigOffset); + final String s = contigDictionary.get(contigOffset); + if (s == null) { + error("No entry in the contig dictionary matching key: " + contigOffset + " was found"); + } + return s; } private BCF2Dictionary makeContigDictionary(final BCFVersion bcfVersion) { diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java index 3a6aeae2cb..3d95f4ae5e 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java @@ -237,6 +237,17 @@ void load(final Object o) { void encode() throws IOException { for (final Object o : vs) { if (o == null) { + // TODO we encode an entirely missing vector as all EOV, or essentially a 0-length vector + // padded to the appropriate length with EOV, this encoding is allowed but not required + // by the spec[1], but bcftools currently does not appear to handle it properly[2], + // 
printing such empty vectors in VCF as an empty string and not '.' or '.,.' + // bcfools encodes empty vectors uniformly as [MISSING, EOV*] which we handle appropriately, + // and the distinction between partially missing [MISSING, EOV] and fully missing [EOV, EOV] + // vectors is apparently not required to be preserved by implementations + // We could either match our output to bcftools' codec or keep it as is, and wait for + // bcftools to resolve this issue + // [1] https://github.com/samtools/hts-specs/issues/593#issuecomment-910266633 + // [2] https://github.com/samtools/bcftools/issues/1622 encoder.encodePaddingValues(nValues, type); } else if (o instanceof List) { final List v = (List) o; diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java index 3b645bf981..dd140b3b3a 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java @@ -455,8 +455,9 @@ void encode(final VariantContext vc, final List sampleNames) throws IOEx encoder.encodePaddingValues(padding, type); } } else { - // TODO read the spec more closely, look at htslib, this may not be correct // Entirely missing genotype, which we encode as vector of no call + // These cannot be encoded as MISSING values, because the BCF 2.2 spec explicitly forbids + // any negative values in the GT array and MISSING values are negative for (int i = 0; i < nValues; i++) { encoder.encodeRawInt(0, type); } diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriterManager.java b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriterManager.java index b73a88036d..02fb3e4cdd 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriterManager.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriterManager.java @@ -5,6 +5,7 @@ import 
htsjdk.variant.bcf2.BCF2Encoder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFCompoundHeaderLine; +import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFFormatHeaderLine; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLineCount; @@ -37,7 +38,14 @@ public BCF2FieldWriterManager(final VCFHeader header, final Map formatWriters = new HashMap<>(header.getFormatHeaderLines().size()); for (final VCFFormatHeaderLine line : header.getFormatHeaderLines()) { final String field = line.getID(); - validateStandardHeader(line, VCFStandardHeaderLines.getFormatLine(field, false)); + // We skip validation for the FT key because its line count changed between VCF versions 4.2 and 4.3 + // from UNBOUNDED to 1, while VCFStandardHeaderLines keeps the 4.2 definition. + // This does not matter for our BCF writing code because the concrete BCF count encoded in the typing + // bytes for strings always has to be determined by inspecting the strings themselves, so this validation + // would only produce noisy but harmless warnings. 
+ if (!field.equals(VCFConstants.GENOTYPE_FILTER_KEY)) { + validateStandardHeader(line, VCFStandardHeaderLines.getFormatLine(field, false)); + } final int offset = dict.get(field); final BCF2FieldWriter.GenotypeWriter writer = BCF2FieldWriter.createGenotypeWriter(line, offset, encoder); formatWriters.put(field, writer); @@ -72,7 +80,7 @@ private static void validateStandardHeader( final VCFHeaderLineType actualType = actualLine.getType(); final VCFHeaderLineType expectedType = expectedLine.getType(); if (actualType != expectedType) { - log.error(String.format( + log.warn(String.format( "Header with standard key: `%s` has type: %s which does not match standard type: %s", actualLine.getID(), actualType, @@ -83,7 +91,7 @@ private static void validateStandardHeader( final VCFHeaderLineCount actualCountType = actualLine.getCountType(); final VCFHeaderLineCount expectedCountType = expectedLine.getCountType(); if (actualCountType != expectedCountType || actualLine.isFixedCount() && actualLine.getCount() != expectedLine.getCount()) { - log.error(String.format( + log.warn(String.format( "Header with standard key: `%s` has count: %s which does not match standard count: %s", actualLine.getID(), actualLine.isFixedCount() ? actualLine.getCount() : actualCountType, diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java index fd95161be2..ac095d6f83 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java @@ -236,7 +236,6 @@ public void setHeader(final VCFHeader header) { throw new IllegalStateException("The header cannot be modified after the header or variants have been written to the output stream."); } - // TODO we default to 2.2 here, is this alright? 
encoder = BCF2Encoder.getEncoder(BCF2Codec.ALLOWED_BCF_VERSION); // make sure the header is sorted correctly diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java index ce5ed1920a..454d567300 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java @@ -136,20 +136,6 @@ public boolean isAtLeastAsRecentAs(final VCFHeaderVersion target) { return this.ordinal() >= target.ordinal(); } - /** - * Determine if two header versions are compatible (header lines from these versions are interchangeable). - * For now, the only incompatibility is between V4.3 and any other version. All other version combinations - * are compatible. - * @param v1 first version to compare - * @param v2 scond version to compare - * @return true if the versions are compatible - */ - //TODO: this method can be removed once this is rebased on the vcf4.3 writing branch - public static boolean versionsAreCompatible(final VCFHeaderVersion v1, final VCFHeaderVersion v2) { - return v1.equals(v2) || - (!v1.isAtLeastAsRecentAs(VCF4_3) && !v2.isAtLeastAsRecentAs(VCF4_3)); - } - public String getVersionString() { return versionString; } diff --git a/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java b/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java index 6dd5f3906f..1032762f0d 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java +++ b/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java @@ -170,6 +170,10 @@ private static void registerStandard(final VCFFormatHeaderLine line) { registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as 
defined in the VCF specification")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); + // This line's count changed from UNBOUNDED in VCF 4.2 to 1 in VCF 4.3, but we keep it at UNBOUNDED + // because VCFStandardHeaderLines is now mainly a facility for upgrading headers from pre-4.2 versions + // to conform to the 4.2 spec. + // Version upgrading for other versions is more difficult, so we do not rely on VCFStandardHeaderLines registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter")); registerStandard(new VCFFormatHeaderLine(VCFConstants.PHASE_SET_KEY, 1, VCFHeaderLineType.Integer, "Phasing set (typically the position of the first variant in the set)")); registerStandard(new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); diff --git a/src/test/java/htsjdk/beta/codecs/variants/vcf/HtsVCFCodecTest.java b/src/test/java/htsjdk/beta/codecs/variants/vcf/HtsVCFCodecTest.java index 2ea873212e..fa5201b754 100644 --- a/src/test/java/htsjdk/beta/codecs/variants/vcf/HtsVCFCodecTest.java +++ b/src/test/java/htsjdk/beta/codecs/variants/vcf/HtsVCFCodecTest.java @@ -105,15 +105,6 @@ public void testRoundTripVCFThroughStream(final IOPath inputPath, final HtsVersi } } - @Test(expectedExceptions = IllegalArgumentException.class) - public void testRejectWritingV43HeaderAsV42() { - // read vcf v4.3 and try to write it to a vcf v4.2 (header is rejected) - final IOPath outputPath = IOUtils.createTempPath("rejectWrite43HeaderVCF", ".vcf"); - readWriteVCFToPath(new HtsPath(VARIANTS_TEST_DIR + "variant/vcf43/all43Features.vcf"), - outputPath, - VCFCodecV4_3.VCF_V43_VERSION); - } - @DataProvider(name="gzipSuffixTests") private Object[][] gzipSuffixTests() { return new Object[][] { diff 
--git a/src/test/java/htsjdk/tribble/index/IndexFactoryTest.java b/src/test/java/htsjdk/tribble/index/IndexFactoryTest.java index 648f7080cd..77a5902ea0 100644 --- a/src/test/java/htsjdk/tribble/index/IndexFactoryTest.java +++ b/src/test/java/htsjdk/tribble/index/IndexFactoryTest.java @@ -198,6 +198,7 @@ public Object[][] getBCFData(){ return new Object[][] { {new File("src/test/resources/htsjdk/variant/serialization_test_unzipped.bcf")}, // TODO: this needs more test cases, including block compressed and indexed + // The test below, with a bgzipped BCF, fails // {new File("src/test/resources/htsjdk/variant/serialization_test.bcf")}, }; } diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java index 9fbf27842e..43b8329993 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java @@ -27,7 +27,7 @@ public Object[][] dictionaryProvider() { final List inputLines = new ArrayList<>(); int counter = 0; - inputLines.add(new VCFHeaderLine(VCFHeader.DEFAULT_VCF_VERSION.getFormatString(), VCFHeader.DEFAULT_VCF_VERSION.getVersionString())); + inputLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); inputLines.add(new VCFFilterHeaderLine("l" + counter++)); inputLines.add(new VCFFilterHeaderLine("l" + counter++)); inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java index afb198286e..5946caa8a4 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java @@ -356,7 +356,6 @@ public static Object[][] genotypeWriterCases() { }); } - // TODO revisit this test once the correct behavior 
is determined // Test encoding for a VC entirely missing genotype data { final VariantContext vcMissingGenotypes = new VariantContextBuilder() @@ -371,7 +370,7 @@ public static Object[][] genotypeWriterCases() { .make(); final byte[] bytes = new byte[]{ 0x21, // 2 8-bit ints - (byte) BCF2Type.INT8.getMissingBytes(), (byte) BCF2Type.INT8.getMissingBytes(), + (byte) 0, (byte) 0 }; cases.add(new Object[]{ writer, vcMissingGenotypes, diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java index 5f658bd69b..e07d23cfd9 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java @@ -105,7 +105,7 @@ public Object[][] makeHeaderOrderTestProvider() { if (i >= 0) allLines.remove(i); allLines.addAll(permutation); - allLines.add(new VCFHeaderLine(VCFHeader.DEFAULT_VCF_VERSION.getFormatString(), VCFHeader.DEFAULT_VCF_VERSION.getVersionString())); + allLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); final VCFHeader testHeader = new VCFHeader(new LinkedHashSet<>(allLines)); final boolean expectedConsistent = expectedConsistent(testHeader, inputLineCounter); tests.add(new Object[]{new HeaderOrderTestCase(inputHeader, testHeader, expectedConsistent)}); diff --git a/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java b/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java index 34adb5a4b0..91efd8bcf0 100644 --- a/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java +++ b/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java @@ -233,7 +233,6 @@ private static void createSyntheticHeader() { addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer); addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String); addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); - // TODO 
changed count type from UNBOUNDED to 1 to match VCF 4.3 spec, but might conflict with existing htsjdk code addHeaderLine(metaData, "FT", 1, VCFHeaderLineType.String); // prep the header @@ -674,7 +673,6 @@ public static void testReaderWriterWithMissingGenotypes(final VariantContextIOTe assertEquals(g, expected.getGenotype(g.getSampleName())); } else { // missing - // TODO this may not be correct Assert.assertTrue(g.isNoCall()); } } diff --git a/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java b/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java index 4931fd8b09..83376588c3 100644 --- a/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java @@ -219,7 +219,7 @@ public void testChangeHeaderAfterWritingBody() { */ private static VCFHeader createFakeHeader(final Set metaData, final Set additionalColumns, final SAMSequenceDictionary sequenceDict) { - metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString())); + metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); metaData.add(new VCFHeaderLine("two", "2")); // Explicitly add GT, AD, and BB keys because the .bcf tests that use this fake header require that the header // contain INFO/FORMAT lines for all the attributes written diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java index 2cd81e7ef9..f02ccd0585 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java @@ -114,8 +114,7 @@ public Object[][] vcfVersions() { @Test(dataProvider = "vcfVersions") public void testValidateForVersion(final VCFHeaderVersion vcfVersion) { - VCFHeaderLine headerLine = new VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString()); - 
headerLine.validateForVersion(vcfVersion); + VCFHeader.makeHeaderVersionLine(vcfVersion).validateForVersion(vcfVersion); } @DataProvider(name = "incompatibleVersions") @@ -133,8 +132,7 @@ public Object[][] incompatibleVersionPairs() { @Test(dataProvider="incompatibleVersions", expectedExceptions= TribbleException.VersionValidationFailure.class) public void testValidateForVersionFails(final VCFHeaderVersion vcfVersion, final VCFHeaderVersion incompatibleVersion) { - VCFHeaderLine headerLine = new VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString()); - headerLine.validateForVersion(incompatibleVersion); + VCFHeader.makeHeaderVersionLine(vcfVersion).validateForVersion(incompatibleVersion); } @Test(expectedExceptions = { TribbleException.InvalidHeader.class }, expectedExceptionsMessageRegExp = ".*For fixed count, the count number must be 1 or higher.") diff --git a/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java index 38a8c983f7..c17360a770 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java @@ -192,7 +192,7 @@ public void testRepairHeaderTest(final RepairHeaderTest cfg) { final Set headerLines = new LinkedHashSet<>(); // The standard header line repair facility is not sufficiently powerful to fix broken lines // starting from version 4.3, so it is only used for versions <= 4.2, and we use version 4.2 for this test - headerLines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString())); + headerLines.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); headerLines.add(cfg.original); final VCFHeader toRepair = new VCFHeader(headerLines); From 44d3f34eeb87ac8e2269b2eceb541fef45c74146 Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Tue, 7 Dec 2021 14:54:35 -0500 Subject: [PATCH 14/22] Fully 
remove BCF 2.1 encoder and decoder --- .../java/htsjdk/variant/bcf2/BCF2Decoder.java | 10 ------ .../java/htsjdk/variant/bcf2/BCF2Encoder.java | 32 ------------------- 2 files changed, 42 deletions(-) diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java index 1544d9ed6c..db0814839c 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java @@ -47,8 +47,6 @@ private BCF2Decoder() { public static BCF2Decoder getDecoder(final BCFVersion version) { switch (version.getMinorVersion()) { - case 1: - return new BCF2Decoder.BCF2_1Decoder(); case 2: return new BCF2Decoder.BCF2_2Decoder(); default: @@ -466,14 +464,6 @@ public final byte readTypeDescriptor() throws IOException { public abstract int getPaddingValue(final BCF2Type type); - public static class BCF2_1Decoder extends BCF2Decoder { - - @Override - public int getPaddingValue(final BCF2Type type) { - return type.getMissingBytes(); - } - } - public static class BCF2_2Decoder extends BCF2Decoder { @Override diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java index ae99f8e7e1..55fe4a7a7e 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java @@ -49,8 +49,6 @@ public abstract class BCF2Encoder { public static BCF2Encoder getEncoder(final BCFVersion version) { switch (version.getMinorVersion()) { - case 1: - return new BCF2_1Encoder(); case 2: return new BCF2_2Encoder(); default: @@ -323,36 +321,6 @@ public final byte[] compactStrings(final String[] strings) { // // -------------------------------------------------------------------------------- - public static class BCF2_1Encoder extends BCF2Encoder { - - @Override - public void encodePaddingValue(final BCF2Type type) throws IOException { - type.write(type.getMissingBytes(), encodeStream); - } - - @Override - public 
byte[] compactStrings(final List strings) { - if (strings.isEmpty()) return new byte[0]; - - // 1 comma for each string, then add on individual string lengths - int size = strings.size(); - final byte[][] bytes = new byte[strings.size()][]; - int i = 0; - for (final String s : strings) { - final byte[] b = s.getBytes(StandardCharsets.UTF_8); - size += b.length; - bytes[i++] = b; - } - final ByteBuffer buff = ByteBuffer.allocate(size); - for (final byte[] bs : bytes) { - buff.put((byte) ','); - buff.put(bs); - } - - return buff.array(); - } - } - public static class BCF2_2Encoder extends BCF2Encoder { @Override From f79513396e1d7bb344c3957e40e8f5da2d65d5c4 Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Tue, 7 Dec 2021 15:10:32 -0500 Subject: [PATCH 15/22] Tag BCF lazy data with version, only use lazy data in BCF2Writer if versions match --- .../java/htsjdk/variant/bcf2/BCF2Codec.java | 12 +++++---- .../variantcontext/writer/BCF2Writer.java | 26 +++++++++++++++---- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java b/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java index af18454db7..acd5cc3090 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java @@ -481,7 +481,7 @@ private void createLazyGenotypesDecoder(final SitesInfoForDecoding siteInfo, final LazyGenotypesContext.LazyParser lazyParser = new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders); - final LazyData lazyData = new LazyData(header, siteInfo.nFormatFields, decoder.getRecordBytes()); + final LazyData lazyData = new LazyData(header, siteInfo.nFormatFields, decoder.getRecordBytes(), bcfVersion); final LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, lazyData, header.getNGenotypeSamples()); // did we resort the sample names? 
If so, we need to load the genotype data @@ -493,14 +493,16 @@ private void createLazyGenotypesDecoder(final SitesInfoForDecoding siteInfo, } public static class LazyData { - final public VCFHeader header; - final public int nGenotypeFields; - final public byte[] bytes; + public final VCFHeader header; + public final int nGenotypeFields; + public final byte[] bytes; + public final BCFVersion version; - public LazyData(final VCFHeader header, final int nGenotypeFields, final byte[] bytes) { + public LazyData(final VCFHeader header, final int nGenotypeFields, final byte[] bytes, final BCFVersion version) { this.header = header; this.nGenotypeFields = nGenotypeFields; this.bytes = bytes; + this.version = version; } } diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java index ac095d6f83..c81e9898e1 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java @@ -110,7 +110,7 @@ class BCF2Writer extends IndexingVariantContextWriter { private final Map> genotypeKeys = new HashMap<>(); private BCF2Encoder encoder; // initialized after the header arrives - + private BCFVersion version; private BCF2FieldWriterManager fieldWriterManager; /** @@ -194,8 +194,12 @@ public void add(VariantContext vc) { // Genotypes data final int genotypesLength; final BCF2Codec.LazyData lazyData = getLazyData(vc); // has critical side effects - if (lazyData != null) { - // we never decoded any data from this BCF file so we don't need to re-encode the samples data + final boolean lazyDataUsable = lazyData != null && lazyData.version == this.version; + if (lazyDataUsable) { + // We never decoded any data from this BCF file, and its contents were already encoded in the same BCF + // version as we are currently writing, so we don't need to re-encode the samples data. 
+ // Note that the version check is necessary so that we do not write contents encoded using an old + // version of BCF as if it were a newer version, as this can cause problems with e.g. MISSING values genotypesLength = lazyData.bytes.length; } else { // we have to do work to convert the VC into a BCF2 byte stream @@ -210,7 +214,7 @@ public void add(VariantContext vc) { // Write the encoder's buffer into the output stream // If there was no lazy data, this also contains the genotypes data encoder.write(outputStream); - if (lazyData != null) { + if (lazyDataUsable) { // The encoder only contained sites data, so we need to write the lazy data outputStream.write(lazyData.bytes); } @@ -236,7 +240,8 @@ public void setHeader(final VCFHeader header) { throw new IllegalStateException("The header cannot be modified after the header or variants have been written to the output stream."); } - encoder = BCF2Encoder.getEncoder(BCF2Codec.ALLOWED_BCF_VERSION); + version = getBCFVersionFromHeader(header); + encoder = BCF2Encoder.getEncoder(version); // make sure the header is sorted correctly this.header = doNotWriteGenotypes @@ -273,6 +278,17 @@ public void setHeader(final VCFHeader header) { fieldWriterManager = new BCF2FieldWriterManager(header, stringDictionaryMap, encoder); } + /** + * Determine the appropriate BCF version to use to encode a VCF with based on the version of its VCF header + * Note: currently htsjdk only supports one version of BCF (2.2), but this method is here for if/when + * new BCF versions are added. 
+ * @param header + * @return + */ + private static BCFVersion getBCFVersionFromHeader(final VCFHeader header) { + return BCF2Codec.ALLOWED_BCF_VERSION; + } + // -------------------------------------------------------------------------------- // // implicit block From b5b264995d18fc4e794c7c3cb4fba1cda44f37b9 Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Wed, 8 Dec 2021 10:21:37 -0500 Subject: [PATCH 16/22] Fix BCF lazy data version checking and genotype key computation --- .../java/htsjdk/variant/variantcontext/writer/BCF2Writer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java index c81e9898e1..38e012fb2a 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java @@ -194,7 +194,7 @@ public void add(VariantContext vc) { // Genotypes data final int genotypesLength; final BCF2Codec.LazyData lazyData = getLazyData(vc); // has critical side effects - final boolean lazyDataUsable = lazyData != null && lazyData.version == this.version; + final boolean lazyDataUsable = lazyData != null && lazyData.version.equals(this.version); if (lazyDataUsable) { // We never decoded any data from this BCF file, and its contents were already encoded in the same BCF // version as we are currently writing, so we don't need to re-encode the samples data. 
@@ -423,7 +423,7 @@ private void buildInfo(final VariantContext vc) throws IOException { } private void buildSamplesData(final VariantContext vc) throws IOException { - fieldWriterManager.writeFormat(vc, genotypeKeys.get(vc)); + fieldWriterManager.writeFormat(vc, genotypeKeys.computeIfAbsent(vc, v -> v.calcVCFGenotypeKeys(header))); } // -------------------------------------------------------------------------------- From 65cd7f77f81252b2a959f1e41a33e250014967fe Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Thu, 9 Dec 2021 17:11:19 -0500 Subject: [PATCH 17/22] Match bcftools behavior when writing empty vectors --- .../java/htsjdk/variant/bcf2/BCF2Encoder.java | 19 ++-- .../BCF2FieldWriter/BCF2FieldEncoder.java | 92 +++++++++++-------- .../java/htsjdk/variant/VariantBaseTest.java | 2 +- .../BCF2FieldWriter/BCF2FieldEncoderTest.java | 8 +- .../variant/bcf2/BCF2WriterUnitTest.java | 5 +- 5 files changed, 68 insertions(+), 58 deletions(-) diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java index 55fe4a7a7e..0eccbf3711 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java @@ -251,10 +251,14 @@ public final void encodeRawString(final byte[] s, final int paddedSize) { } public final void encodeRawVecInt(final int[] vs, final int paddedSize, final BCF2Type type) throws IOException { + encodeRawVecInt(vs, type); + encodePaddingValues(paddedSize - vs.length, type); + } + + public final void encodeRawVecInt(final int[] vs, final BCF2Type type) throws IOException { for (final int v : vs) { type.write(v, encodeStream); } - encodePaddingValues(paddedSize - vs.length, type); } public final void encodeRawVecInt(final List vs, final BCF2Type type) throws IOException { @@ -267,16 +271,10 @@ public final void encodeRawVecInt(final List vs, final BCF2Type type) t } } - public final void encodeRawVecInt(final List vs, final int paddedSize, final 
BCF2Type type) throws IOException { - encodeRawVecInt(vs, type); - encodePaddingValues(paddedSize - vs.size(), type); - } - - public final void encodeRawVecFloat(final double[] vs, final int paddedSize) throws IOException { + public final void encodeRawVecFloat(final double[] vs) throws IOException { for (final double v : vs) { encodeRawFloat(v); } - encodePaddingValues(paddedSize - vs.length, BCF2Type.FLOAT); } public final void encodeRawVecFloat(final List vs) throws IOException { @@ -289,11 +287,6 @@ public final void encodeRawVecFloat(final List vs) throws IOException { } } - public final void encodeRawVecFloat(final List vs, final int paddedSize) throws IOException { - encodeRawVecFloat(vs); - encodePaddingValues(paddedSize - vs.size(), BCF2Type.FLOAT); - } - public final void encodePaddingValues(final int size, final BCF2Type type) throws IOException { for (int i = 0; i < size; i++) { encodePaddingValue(type); diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java index 3d95f4ae5e..1c252d80ec 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java @@ -235,30 +235,33 @@ void load(final Object o) { @Override void encode() throws IOException { - for (final Object o : vs) { - if (o == null) { - // TODO we encode an entirely missing vector as all EOV, or essentially a 0-length vector - // padded to the appropriate length with EOV, this encoding is allowed but not required - // by the spec[1], but bcftools currently does not appear to handle it properly[2], - // printing such empty vectors in VCF as an empty string and not '.' or '.,.' 
- // bcfools encodes empty vectors uniformly as [MISSING, EOV*] which we handle appropriately, - // and the distinction between partially missing [MISSING, EOV] and fully missing [EOV, EOV] - // vectors is apparently not required to be preserved by implementations - // We could either match our output to bcftools' codec or keep it as is, and wait for - // bcftools to resolve this issue - // [1] https://github.com/samtools/hts-specs/issues/593#issuecomment-910266633 - // [2] https://github.com/samtools/bcftools/issues/1622 - encoder.encodePaddingValues(nValues, type); - } else if (o instanceof List) { - final List v = (List) o; - encoder.encodeRawVecInt(v, nValues, type); - } else if (o instanceof Integer) { - final Integer v = (Integer) o; - encoder.encodeRawInt(v, type); - encoder.encodePaddingValues(nValues - 1, type); - } else if (o instanceof int[]) { - final int[] v = (int[]) o; - encoder.encodeRawVecInt(v, nValues, type); + if (nValues > 0) { + for (final Object o : vs) { + final int valuesWritten; + if (o == null) { + valuesWritten = 0; + } else if (o instanceof List) { + final List v = (List) o; + encoder.encodeRawVecInt(v, type); + valuesWritten = v.size(); + } else if (o instanceof Integer) { + final Integer v = (Integer) o; + encoder.encodeRawInt(v, type); + valuesWritten = 1; + } else if (o instanceof int[]) { + final int[] v = (int[]) o; + encoder.encodeRawVecInt(v, type); + valuesWritten = v.length; + } else { + throw new TribbleException(""); + } + // In order to produce output that bcftools can interpret, we always write one MISSING + // value even if the input is entirely absent, which we would otherwise write as a vector of + // all EOV values + if (valuesWritten == 0) { + encoder.encodeRawMissingValue(type); + } + encoder.encodePaddingValues(nValues - Math.max(valuesWritten, 1), type); } } vs.clear(); @@ -298,19 +301,34 @@ void load(final Object o) { @Override void encode() throws IOException { - for (final Object o : vs) { - if (o == null) { - 
encoder.encodePaddingValues(nValues, type); - } else if (o instanceof List) { - final List v = (List) o; - encoder.encodeRawVecFloat(v, nValues); - } else if (o instanceof Double) { - final Double v = (Double) o; - encoder.encodeRawFloat(v); - encoder.encodePaddingValues(nValues - 1, BCF2Type.FLOAT); - } else if (o instanceof double[]) { - final double[] v = (double[]) o; - encoder.encodeRawVecFloat(v, nValues); + if (nValues > 0) { + for (final Object o : vs) { + final int valuesWritten; + if (o == null) { + valuesWritten = 0; + } else if (o instanceof List) { + final List v = (List) o; + encoder.encodeRawVecFloat(v); + valuesWritten = v.size(); + } else if (o instanceof Double) { + final Double v = (Double) o; + encoder.encodeRawFloat(v); + valuesWritten = 1; + } else if (o instanceof double[]) { + final double[] v = (double[]) o; + encoder.encodeRawVecFloat(v); + valuesWritten = v.length; + } else { + throw new TribbleException(""); + } + + // In order to produce output that bcftools can interpret, we always write one MISSING + // value even if the input is entirely absent, which we would otherwise write as a vector of + // all EOV values + if (valuesWritten == 0) { + encoder.encodeRawMissingValue(type); + } + encoder.encodePaddingValues(nValues - Math.max(valuesWritten, 1), BCF2Type.FLOAT); } } vs.clear(); diff --git a/src/test/java/htsjdk/variant/VariantBaseTest.java b/src/test/java/htsjdk/variant/VariantBaseTest.java index 749ffe69e9..58e6cef658 100644 --- a/src/test/java/htsjdk/variant/VariantBaseTest.java +++ b/src/test/java/htsjdk/variant/VariantBaseTest.java @@ -254,7 +254,7 @@ private static void assertAttributesEquals(final Map actual, Map } else { // it's ok to have a binding in x -> null that's absent in y - Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other"); + Assert.assertTrue(isMissingAttribute(actualValue), act.getKey() + " present in one but not in the other"); } expectedKeys.remove(act.getKey()); } diff --git 
a/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java index 5946caa8a4..2903fc4a63 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java @@ -122,7 +122,7 @@ public static Object[][] fieldEncoderCases() { final List vecsToEncode = Arrays.asList( Arrays.asList(null, 1), // Internal null should be missing bytes, not EOV new int[]{1}, // Short vector should be EOV padded - null, // Entirely missing vector should be all EOV + null, // Entirely missing vector should start with one MISSING, then be EOV padded 1 << (byteWidth * 8 - 2) // Atomic value should be treated as vector of size 1 ); final int nValues = 2; @@ -130,7 +130,7 @@ public static Object[][] fieldEncoderCases() { final int[] ints = new int[]{ intType.getMissingBytes(), 1, 1, intType.getEOVBytes(), - intType.getEOVBytes(), intType.getEOVBytes(), + intType.getMissingBytes(), intType.getEOVBytes(), 1 << (byteWidth * 8 - 2), intType.getEOVBytes(), }; for (final int i : ints) { @@ -152,7 +152,7 @@ public static Object[][] fieldEncoderCases() { final List vecsToEncode = Arrays.asList( Arrays.asList(null, 1.0), // Internal null should be missing bytes, not EOV new double[]{1.0}, // Short vector should be EOV padded - null, // Entirely missing vector should be all EOV + null, // Entirely missing vector should start with one MISSING, then be EOV padded Double.NaN // Atomic value should be treated as vector of size 1 ); final int nValues = 2; @@ -160,7 +160,7 @@ public static Object[][] fieldEncoderCases() { final int[] ints = new int[]{ BCF2Type.FLOAT.getMissingBytes(), Float.floatToRawIntBits(1.0f), Float.floatToRawIntBits(1.0f), BCF2Type.FLOAT.getEOVBytes(), - BCF2Type.FLOAT.getEOVBytes(), BCF2Type.FLOAT.getEOVBytes(), + BCF2Type.FLOAT.getMissingBytes(), BCF2Type.FLOAT.getEOVBytes(), 
Float.floatToRawIntBits((float) Double.NaN), BCF2Type.FLOAT.getEOVBytes(), }; for (final int i : ints) { diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java index 83258c54ae..95e5ce65dd 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java @@ -351,9 +351,8 @@ public Object[][] bcftoolsReadsHtsjdkOutputProvider() { {"NA12891.vcf"}, {"NA12891.fp.vcf"}, {"structuralvariants.vcf"}, - // These two tests appear to fail because of a bcftools bug -// {"ex2.vcf"}, -// {"test.vcf.bgz"}, + {"ex2.vcf"}, + {"test.vcf.bgz"}, {"vcf43/all43Features.utf8.vcf"} }; } From 56a07db63112df343e5377aaddede74a9f9b0ab3 Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Fri, 10 Dec 2021 13:29:49 -0500 Subject: [PATCH 18/22] Fix spotbugs warning --- src/main/java/htsjdk/variant/bcf2/BCF2Codec.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java b/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java index acd5cc3090..7409a48e77 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java @@ -69,7 +69,7 @@ public class BCF2Codec extends BinaryFeatureCodec { private static final Log log = Log.getInstance(BCF2Codec.class); - public static String IDXField = "IDX"; // BCF2.2 IDX field name + public static final String IDXField = "IDX"; // BCF2.2 IDX field name protected final static int ALLOWED_MAJOR_VERSION = 2; protected final static int ALLOWED_MINOR_VERSION = 2; From e917a3e841d04a7f695b4ba2b7c3493c25978f43 Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Fri, 10 Dec 2021 15:16:16 -0500 Subject: [PATCH 19/22] Clean up BCF2Encoder, better error in BCF2FieldEncoder --- src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java | 10 ---------- .../variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java | 4 ++-- 
.../variant/bcf2/BCF2EncoderDecoderUnitTest.java | 9 +++++++-- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java index 0eccbf3711..335fc2e7f2 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java @@ -153,16 +153,6 @@ public final void encodeTypedVecInt(final int[] vs, final int paddedSize) throws encodeRawVecInt(vs, paddedSize, type); } - // TODO only used in testing, should remove and update tests - public final void encodeTyped(final List v, final BCF2Type type) throws IOException { - if (type == BCF2Type.CHAR && !v.isEmpty()) { - encodeTypedString(compactStrings((List) v)); - } else { - encodeType(v.size(), type); - encodeRawValues(v, type); - } - } - // -------------------------------------------------------------------------------- // diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java index 1c252d80ec..c7bf8bbdba 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java @@ -253,7 +253,7 @@ void encode() throws IOException { encoder.encodeRawVecInt(v, type); valuesWritten = v.length; } else { - throw new TribbleException(""); + throw BCF2FieldEncoder.incompatibleType(o, type); } // In order to produce output that bcftools can interpret, we always write one MISSING // value even if the input is entirely absent, which we would otherwise write as a vector of @@ -319,7 +319,7 @@ void encode() throws IOException { encoder.encodeRawVecFloat(v); valuesWritten = v.length; } else { - throw new TribbleException(""); + throw BCF2FieldEncoder.incompatibleType(o, type); } // In order to produce output that bcftools can interpret, we always write one MISSING diff --git 
a/src/test/java/htsjdk/variant/bcf2/BCF2EncoderDecoderUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2EncoderDecoderUnitTest.java index 050931444b..5d888f76fd 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2EncoderDecoderUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2EncoderDecoderUnitTest.java @@ -244,8 +244,13 @@ public void testBCF2EncodingVectors(final List toEncode, final B for (final BCF2TypedValue tv : toEncode) { for (final int length : Arrays.asList(2, 5, 10, 15, 20, 25)) { final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); - final List expected = Collections.nCopies(length, tv.value); - encoder.encodeTyped(expected, tv.type); + final List expected = Collections.nCopies(length, tv.value); + if (tv.type == BCF2Type.CHAR && !expected.isEmpty()) { + encoder.encodeTypedString(encoder.compactStrings((List) expected)); + } else { + encoder.encodeType(expected.size(), tv.type); + encoder.encodeRawValues(expected, tv.type); + } final BCF2Decoder decoder = BCF2Decoder.getDecoder(version, encoder.getRecordBytes()); final Object decoded = decoder.decodeTypedValue(); From 4af39399433b25fedc4d02f11e806a9273c48307 Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Tue, 14 Dec 2021 14:39:14 -0500 Subject: [PATCH 20/22] Cleanup in BCF code --- .../java/htsjdk/variant/bcf2/BCF2Codec.java | 11 ++--------- .../java/htsjdk/variant/bcf2/BCF2Decoder.java | 18 ++++++++---------- .../bcf2/BCF2FieldWriter/BCF2FieldWriter.java | 1 - .../variantcontext/writer/BCF2Writer.java | 10 +++++----- .../BCF2FieldWriter/BCF2FieldEncoderTest.java | 2 +- 5 files changed, 16 insertions(+), 26 deletions(-) diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java b/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java index 7409a48e77..04adb8cbf1 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java @@ -25,7 +25,6 @@ package htsjdk.variant.bcf2; -import htsjdk.samtools.BAMIndexer; import 
htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import htsjdk.tribble.BinaryFeatureCodec; @@ -36,16 +35,12 @@ import htsjdk.tribble.readers.LineIteratorImpl; import htsjdk.tribble.readers.PositionalBufferedStream; import htsjdk.tribble.readers.SynchronousLineReader; -import htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.LazyGenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; -import htsjdk.variant.variantcontext.VariantContextUtils; import htsjdk.variant.vcf.VCFCodec; -import htsjdk.variant.vcf.VCFCompoundHeaderLine; -import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; @@ -54,10 +49,8 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; -import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -141,8 +134,8 @@ public VariantContext decode(final PositionalBufferedStream inputStream) { recordNo++; final VariantContextBuilder builder = new VariantContextBuilder(); - final int sitesBlockSize = decoder.readBlockSize(inputStream); - final int genotypeBlockSize = decoder.readBlockSize(inputStream); + final int sitesBlockSize = BCF2Decoder.readBlockSize(inputStream); + final int genotypeBlockSize = BCF2Decoder.readBlockSize(inputStream); decoder.readNextBlock(sitesBlockSize, inputStream); decodeSiteLoc(builder); diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java index db0814839c..e88db1e115 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java +++ 
b/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java @@ -375,7 +375,7 @@ public final int[] decodeIntArray(final byte typeDescriptor, final int size) thr return decodeIntArray(size, type, null); } - private double rawFloatToFloat(final int rawFloat) { + private static double rawFloatToFloat(final int rawFloat) { return Float.intBitsToFloat(rawFloat); } @@ -391,7 +391,7 @@ private double rawFloatToFloat(final int rawFloat) { * @param inputStream * @return */ - public final int readBlockSize(final InputStream inputStream) throws IOException { + public static int readBlockSize(final InputStream inputStream) throws IOException { return BCF2Type.INT32.read(inputStream); } @@ -410,21 +410,19 @@ private static byte[] readRecordBytes(final int blockSizeInBytes, final InputStr final byte[] record = new byte[blockSizeInBytes]; try { int bytesRead = 0; - final int nReadAttempts = 0; // keep track of how many times we've read + int nReadAttempts = 0; // keep track of how many times we've read // because we might not read enough bytes from the file in a single go, do it in a loop until we get EOF while (bytesRead < blockSizeInBytes) { final int read1 = inputStream.read(record, bytesRead, blockSizeInBytes - bytesRead); - if (read1 == -1) + nReadAttempts++; + if (read1 == -1) { validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes); + break; + } else bytesRead += read1; } - - if (GeneralUtils.DEBUG_MODE_ENABLED && nReadAttempts > 1) { // TODO -- remove me - System.err.println("Required multiple read attempts to actually get the entire BCF2 block, unexpected behavior"); - } - validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes); } catch (final IOException e) { throw new TribbleException("I/O error while reading BCF2 file", e); @@ -445,7 +443,7 @@ private static void validateReadBytes(final int actuallyRead, final int nReadAtt if (actuallyRead < expected) { throw new TribbleException( - String.format("Failed to read next complete record: expected %d bytes but read 
only %d after %d iterations", + String.format("Failed to read next complete record: expected %d bytes but read only %d after %d read attempts", expected, actuallyRead, nReadAttempts)); } } diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java index dd140b3b3a..d91b706681 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java @@ -179,7 +179,6 @@ void encode(final VariantContext vc) throws IOException { // TODO in the genotype writers, a missing genotype (one where variantContext.getGenotype(sampleName) == null) // is treated like one where all its attributes/inline fields are missing, this matches the behavior // of the old writer, which previously created a new empty Genotype object for each missing genotypes, is this right? - // For example, should the FT string of a missing genotype be PASS or a padded empty string /** * Class that writes one field specified by a {@link VCFFormatHeaderLine} diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java index 38e012fb2a..517d5eeb3d 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java @@ -27,8 +27,8 @@ import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.Log; import htsjdk.samtools.util.RuntimeIOException; -import htsjdk.tribble.TribbleException; import htsjdk.tribble.index.IndexCreator; import htsjdk.variant.bcf2.BCF2Codec; import htsjdk.variant.bcf2.BCF2Dictionary; @@ -95,6 +95,8 @@ * @since 06/12 */ class BCF2Writer extends IndexingVariantContextWriter { + private static final Log log = Log.getInstance(BCF2Writer.class); + public static final int MAJOR_VERSION = 2; 
public static final int MINOR_VERSION = 2; @@ -251,15 +253,13 @@ public void setHeader(final VCFHeader header) { // TODO should follow up on hts-specs and clarify the relationship between ##dictionary and IDX fields // Error on ##dictionary lines, we don't know what to do with them if (this.header.getMetaDataInInputOrder().stream().anyMatch(line -> line.getKey().equals("dictionary"))) { - throw new TribbleException("Use of the ##dictionary line is not supported"); + log.warn("Use of the ##dictionary line is not supported"); } // create the config offsets map if (this.header.getContigLines().isEmpty()) { if (ALLOW_MISSING_CONTIG_LINES) { - if (GeneralUtils.DEBUG_MODE_ENABLED) { - System.err.println("No contig dictionary found in header, falling back to reference sequence dictionary"); - } + log.debug("No contig dictionary found in header, falling back to reference sequence dictionary"); // The reference sequence dictionary should never contain IDX fields createContigDictionary(VCFUtils.makeContigHeaderLines(getRefDict(), null)); } else { diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java index 2903fc4a63..6b72308c54 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java @@ -250,7 +250,7 @@ public void testSiteWriters( final VariantContext vc, final byte[] expectedBytes ) throws IOException { - // Skip starting so we don't get key in output + // Skip writing key so that we don't get key in output writer.encode(vc); Assert.assertEquals(expectedBytes, ENCODER.getRecordBytes()); } From a991f7e77f54f79e1aca9dfe9528aaa21f76f2a9 Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Wed, 22 Dec 2021 12:37:06 -0500 Subject: [PATCH 21/22] Add disabled test for missing Character and String VCF types --- .../BCF2FieldWriter/BCF2FieldEncoder.java | 21 
+++++++++++++++++-- .../bcf2/BCF2GenotypeFieldDecoders.java | 8 +++++++ .../java/htsjdk/variant/bcf2/BCF2Type.java | 8 +++---- .../BCF2FieldWriter/BCF2FieldEncoderTest.java | 2 ++ .../variant/bcf2/BCF2WriterUnitTest.java | 8 ++++++- .../variant/missingStringAndCharacterTest.vcf | 17 +++++++++++++++ 6 files changed, 56 insertions(+), 8 deletions(-) create mode 100644 src/test/resources/htsjdk/variant/missingStringAndCharacterTest.vcf diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java index c7bf8bbdba..546fd3eff0 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java @@ -8,6 +8,7 @@ import htsjdk.variant.vcf.VCFCompoundHeaderLine; import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -109,6 +110,8 @@ void encode() throws IOException { static class CharFieldEncoder extends BCF2FieldEncoder { + // TODO see https://github.com/samtools/hts-specs/issues/618 + // private static final byte[] MISSING = new byte[] {(byte) BCF2Type.CHAR.getMissingBytes()}; private static final byte[] EMPTY = new byte[0]; private final List vs = new ArrayList<>(); @@ -127,6 +130,20 @@ void load(final Object o) { final byte[] b = ((String) o).getBytes(StandardCharsets.UTF_8); nValues = Math.max(nValues, b.length); vs.add(b); + } else if (o instanceof List) { + final List strings = (List) o; + nValues = Math.max(nValues, strings.size()); + final ByteBuffer buff = ByteBuffer.allocate(strings.size()); + for (final String s : strings) { + if (s == null) { + buff.put((byte) type.getMissingBytes()); + } else if (s.length() > 1) { + throw new TribbleException("Value of VCF type Character is a string with more than 1 character: " + s); + } else { + 
buff.put(s.getBytes(StandardCharsets.UTF_8)[0]); + } + } + vs.add(buff.array()); } else { throw BCF2FieldEncoder.incompatibleType(o, type); } @@ -337,7 +354,7 @@ void encode() throws IOException { } static TribbleException incompatibleType(final Object o, final BCF2Type type) { - final String error = "Could not write object: %s whose type is incompatible with declared header of type: %s"; - return new TribbleException(String.format(error, o, type)); + final String error = "Could not write object: %s whose type %s is incompatible with declared header of type: %s"; + return new TribbleException(String.format(error, o, o.getClass().getSimpleName(), type)); } } diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2GenotypeFieldDecoders.java b/src/main/java/htsjdk/variant/bcf2/BCF2GenotypeFieldDecoders.java index 173e095687..34d49546c1 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2GenotypeFieldDecoders.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2GenotypeFieldDecoders.java @@ -257,6 +257,14 @@ public void decode(final List siteAlleles, final String field, final BCF for (final GenotypeBuilder gb : gbs) { final Object value = decoder.decodeTypedValue(typeDescriptor, numElements); if (value == null) continue; + // TODO see https://github.com/samtools/hts-specs/issues/618 + // Although it seems like a very rare corner case, this decoder cannot distinguish between + // a vector of Character and a String, which are different VCF types but identical in BCF, + // which should be decoded differently as Java objects + // as List chars = Arrays.asList("a", "b", "c") vs String str = new String("abc") + // We would need the associated header line for each key to inspect its VCF type like we do in the + // BCF writer. 
This would require a rewrite of this class, which would be desirable either way + // so we can do stricter validation of the number and type of attributes being deserialized if (value instanceof List && ((List) value).size() == 1) { // TODO not sure what this refers to, htsjdk itself doesn't make any assumptions about // the concrete type of the data contained in the attributes map. diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Type.java b/src/main/java/htsjdk/variant/bcf2/BCF2Type.java index 89610c7569..ae6f6ed90f 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Type.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Type.java @@ -80,7 +80,6 @@ public int read(final InputStream in) throws IOException { @Override public void write(final int value, final OutputStream out) throws IOException { - // TODO -- optimization -- should we put this in a local buffer? out.write(value); out.write(value >> 8); } @@ -117,10 +116,9 @@ public void write(final int value, final OutputStream out) throws IOException { } }, - // CHAR isn't given a MISSING or EOV value in the spec, but for the purposes of - // padding strings (i.e. 
variable length vectors of chars), it is treated as if - // '\0' or NULL is both the MISSING and EOV value of CHAR - CHAR(7, 1, 0x00000000) { + // TODO uncertain as to the correct MISSING and EOV representations of Character/String + // see https://github.com/samtools/hts-specs/issues/618 + CHAR(7, 1, 0x07, 0x00, 0, 0) { @Override public int read(final InputStream in) throws IOException { return INT8.read(in); diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java index 6b72308c54..7c5583c99f 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java @@ -80,6 +80,7 @@ public static Object[][] fieldEncoderCases() { } // Char encoding + // TODO see https://github.com/samtools/hts-specs/issues/618 { final List stringsToEncode = Arrays.asList("str", null, "\0a\0"); final int maxByteWidth = stringsToEncode @@ -100,6 +101,7 @@ public static Object[][] fieldEncoderCases() { } // String encoding + // TODO see https://github.com/samtools/hts-specs/issues/618 { final List stringsToEncode = Arrays.asList("st", null, Arrays.asList("a", "b"), new String[]{"a", "b"}); final byte[] bytes = new byte[]{ diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java index 95e5ce65dd..7256c9c967 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java @@ -353,7 +353,9 @@ public Object[][] bcftoolsReadsHtsjdkOutputProvider() { {"structuralvariants.vcf"}, {"ex2.vcf"}, {"test.vcf.bgz"}, - {"vcf43/all43Features.utf8.vcf"} + {"vcf43/all43Features.utf8.vcf"}, + // This test fails because the BCF decoder cannot distinguish between a vector of Characters and a String +// {"missingStringAndCharacterTest.vcf"}, }; } @@ -410,6 
+412,10 @@ public Object[][] htsjdkReadsBCFToolsOutputProvider() { {"structuralvariants.vcf"}, {"ex2.vcf"}, {"test.vcf.bgz"}, + // TODO bcftools does not convert '.' into the MISSING value for Character (0x07), + // but writes it out as literal '.' which causes this test to fail when we compare '.' against null, + // see https://github.com/samtools/hts-specs/issues/618 +// {"missingStringAndCharacterTest.vcf"}, // bcftools does not to decoding of percent encoded VCFs, so its BCF output contains the literal characters // {"vcf43/all43Features.utf8.vcf"} }; diff --git a/src/test/resources/htsjdk/variant/missingStringAndCharacterTest.vcf b/src/test/resources/htsjdk/variant/missingStringAndCharacterTest.vcf new file mode 100644 index 0000000000..eb9b8d0b7e --- /dev/null +++ b/src/test/resources/htsjdk/variant/missingStringAndCharacterTest.vcf @@ -0,0 +1,17 @@ +##fileformat=VCFv4.3 +##contig= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +20 17330 . T A . PASS NS=3;DP=11;AF=0.017;CHAR=.,b,c;STR=. GT:GQ:DP:HQ:CHAR:STR 0|0:49:3:58,50:a,b,c:abc 0|1:3:5:65,3:.,.,c:c 0/0:41:3:4,5:.:. +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;CHAR=.;STR=. 
GT:GQ:DP:HQ:CHAR:STR 1|2:21:6:23,27:.:a 2|1:2:0:18,2:.:ab 2/2:35:4:10,20:.:abc From b54ae828328c7e6e31c4f133b92d986722525a02 Mon Sep 17 00:00:00 2001 From: Anders Leung Date: Thu, 23 Dec 2021 15:47:05 -0500 Subject: [PATCH 22/22] Change BCF2Dictionary interface to be immutable --- .../java/htsjdk/variant/bcf2/BCF2Codec.java | 8 +- .../htsjdk/variant/bcf2/BCF2Dictionary.java | 74 ++++++++----------- 2 files changed, 33 insertions(+), 49 deletions(-) diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java b/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java index 04adb8cbf1..b987ee9cdf 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Codec.java @@ -512,13 +512,7 @@ protected final String getDictionaryString(final int offset) { } private BCF2Dictionary makeStringDictionary(final BCFVersion bcfVersion) { - final BCF2Dictionary dict = BCF2Dictionary.makeBCF2StringDictionary(header, bcfVersion); - - // if we got here we never found a dictionary, or there are no elements in the dictionary - if (dict.isEmpty()) - error("Dictionary header element was absent or empty"); - - return dict; + return BCF2Dictionary.makeBCF2StringDictionary(header, bcfVersion); } /** diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java b/src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java index db2d342449..5a1d0ffd94 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java @@ -6,7 +6,6 @@ import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFSimpleHeaderLine; -import java.util.AbstractMap; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -32,7 +31,7 @@ * n-to-1 IDX-to-string mapping might result from tools that do not deduplicate IDXs, so * we accept them. 
*/ -public abstract class BCF2Dictionary extends AbstractMap { +public abstract class BCF2Dictionary { /** * Create and return a BCF string dictionary @@ -182,6 +181,23 @@ private static BCF2Dictionary makeDictionary( */ public abstract String get(final int i); + /** + * Performs the given action for each entry in the dictionary. + * @param action the action to be performed + */ + public abstract void forEach(final BiConsumer action); + + /** + * @return the number of elements in the dictionary + */ + public abstract int size(); + + /** + * @param i the BCF index to search for + * @return true if there is a string or contig mapped to the given index + */ + public abstract boolean containsIndex(final int i); + /** * BCF 2.2 dense sequence dictionary. Strings are assigned an index corresponding to its position in a 0-indexed * array. This dictionary is used if no IDX fields are present in the header, or they are present, but they @@ -196,44 +212,28 @@ private BCF2DenseDictionary(final List dictionary) { this.dictionary = dictionary; } - @Override - public Set> entrySet() { - final Set> set = new HashSet<>(dictionary.size()); - int i = 0; - for (final String s : dictionary) { - set.add(new AbstractMap.SimpleEntry<>(i, s)); - i++; - } - return set; - } - @Override public String get(final int i) { return i < 0 || i >= dictionary.size() ? 
null : dictionary.get(i); } @Override - public String get(final Object key) { - return dictionary.get((Integer) key); + public void forEach(final BiConsumer action) { + int i = 0; + for (final String s : dictionary) { + action.accept(i, s); + i++; + } } @Override public int size() { - return dictionary.size(); + return this.dictionary.size(); } @Override - public boolean isEmpty() { - return dictionary.isEmpty(); - } - - @Override - public void forEach(final BiConsumer action) { - int i = 0; - for (final String s : dictionary) { - action.accept(i, s); - i++; - } + public boolean containsIndex(final int i) { + return i < this.dictionary.size(); } } @@ -251,34 +251,24 @@ private BCF2SparseDictionary(final Map dictionary) { this.dictionary = dictionary; } - @Override - public Set> entrySet() { - return dictionary.entrySet(); - } - @Override public String get(final int i) { return dictionary.get(i); } @Override - public String get(final Object key) { - return dictionary.get(key); + public void forEach(final BiConsumer action) { + this.dictionary.forEach(action); } @Override public int size() { - return dictionary.size(); + return this.dictionary.size(); } @Override - public boolean isEmpty() { - return dictionary.isEmpty(); - } - - @Override - public void forEach(final BiConsumer action) { - dictionary.forEach(action); + public boolean containsIndex(final int i) { + return this.dictionary.containsKey(i); } } }