diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index cbc1966dd6..74adfbcbf9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,6 +11,7 @@ jobs: test: env: HTSJDK_SAMTOOLS_BIN: /usr/bin/samtools + HTSJDK_BCFTOOLS_BIN: /usr/bin/bcftools runs-on: ubuntu-latest strategy: matrix: @@ -36,6 +37,8 @@ jobs: run: ./gradlew compileJava - name: Install Samtools run: scripts/install-samtools.sh + - name: Install Bcftools + run: scripts/install-bcftools.sh - name: Start the htsget server run: scripts/htsget-scripts/start-htsget-test-server.sh - name: Run tests diff --git a/.travis.yml b/.travis.yml index f00fe8b27e..dab05066b8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,7 @@ cache: env: global: - HTSJDK_SAMTOOLS_BIN=/usr/bin/samtools + - HTSJDK_BCFTOOLS_BIN=/usr/bin/bcftools jdk: - oraclejdk8 - openjdk8 @@ -32,6 +33,7 @@ matrix: before_install: - scripts/install-samtools.sh + - scripts/install-bcftools.sh - scripts/htsget-scripts/start-htsget-test-server.sh script: diff --git a/scripts/install-bcftools.sh b/scripts/install-bcftools.sh new file mode 100755 index 0000000000..1694c85912 --- /dev/null +++ b/scripts/install-bcftools.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +wget https://github.com/samtools/bcftools/releases/download/1.14/bcftools-1.14.tar.bz2 +tar -xjvf bcftools-1.14.tar.bz2 +cd bcftools-1.14 && ./configure --prefix=/usr && make && sudo make install diff --git a/src/main/java/htsjdk/samtools/Defaults.java b/src/main/java/htsjdk/samtools/Defaults.java index e2ecf3d1f7..5aa3e9052e 100644 --- a/src/main/java/htsjdk/samtools/Defaults.java +++ b/src/main/java/htsjdk/samtools/Defaults.java @@ -1,6 +1,7 @@ package htsjdk.samtools; import htsjdk.samtools.util.Log; +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; import java.io.File; import java.util.Collections; @@ -110,6 +111,16 @@ public class Defaults { */ public static final boolean DISABLE_SNAPPY_COMPRESSOR; + /** + * Strict VCF 
version validation. Default = true. + */ + public static final boolean STRICT_VCF_VERSION_VALIDATION; + + /** + * How to treat files from VCF versions older than the current version. Default = UPGRADE_OR_FALLBACK + */ + public static final VCFVersionUpgradePolicy VCF_VERSION_TRANSITION_POLICY; + public static final String SAMJDK_PREFIX = "samjdk."; static { @@ -134,6 +145,11 @@ public class Defaults { SAM_FLAG_FIELD_FORMAT = SamFlagField.valueOf(getStringProperty("sam_flag_field_format", SamFlagField.DECIMAL.name())); SRA_LIBRARIES_DOWNLOAD = getBooleanProperty("sra_libraries_download", false); DISABLE_SNAPPY_COMPRESSOR = getBooleanProperty(DISABLE_SNAPPY_PROPERTY_NAME, false); + STRICT_VCF_VERSION_VALIDATION = getBooleanProperty("strict_version_validation", true); + VCF_VERSION_TRANSITION_POLICY = VCFVersionUpgradePolicy.valueOf(getStringProperty( + "vcf_version_transition_policy", + VCFVersionUpgradePolicy.UPGRADE_OR_FALLBACK.name() + )); } /** @@ -157,6 +173,7 @@ public static SortedMap allDefaults(){ result.put("CUSTOM_READER_FACTORY", CUSTOM_READER_FACTORY); result.put("SAM_FLAG_FIELD_FORMAT", SAM_FLAG_FIELD_FORMAT); result.put("DISABLE_SNAPPY_COMPRESSOR", DISABLE_SNAPPY_COMPRESSOR); + result.put("VCF_VERSION_TRANSITION_POLICY", VCF_VERSION_TRANSITION_POLICY); return Collections.unmodifiableSortedMap(result); } diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java index cf40fe6532..1e6cb764e0 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java @@ -53,6 +53,13 @@ public SAMSequenceDictionary(final List list) { setSequences(list); } + //TODO: this returns sequences in the internal list order instead of + // honoring each sequence's contigIndex + /** + * Get a list of sequences for this dictionary. 
+ * @return the list of sequences for this dictionary in internal order (the order in which the sequences + * were added to this dictionary) + */ public List getSequences() { return Collections.unmodifiableList(mSequences); } @@ -75,6 +82,14 @@ public void setSequences(final List list) { list.forEach(this::addSequence); } + /** + * Add a sequence to the dictionary. + * @param sequenceRecord the sequence record to add - note that this method mutates the contig + * index of the sequenceRecord to match the newly added record's relative + * order in the list + */ + //TODO: this method ignores (and actually mutates) the sequenceRecord's contig index to make it match + // the record's relative placement in the dictionary's internal list public void addSequence(final SAMSequenceRecord sequenceRecord) { if (mSequenceMap.containsKey(sequenceRecord.getSequenceName())) { throw new IllegalArgumentException("Cannot add sequence that already exists in SAMSequenceDictionary: " + diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java new file mode 100644 index 0000000000..0d5073a0ba --- /dev/null +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java @@ -0,0 +1,346 @@ +package htsjdk.samtools; + +import htsjdk.utils.ValidationUtils; + +import java.util.*; +import java.util.stream.Collectors; + +/** + * + * A series of utility functions that enable comparison of two sequence dictionaries -- from the reference, + * from BAMs, or from feature sources -- for consistency. The system supports two basic modes: get an enum state that + * describes at a high level the consistency between two dictionaries, or a validateDictionaries that will + * blow up with a UserException if the dicts are too incompatible. + * + * Dictionaries are tested for contig name overlaps, consistency in ordering in these overlap set, and length, + * if available. 
+ */ +public final class SAMSequenceDictionaryUtils { + + private SAMSequenceDictionaryUtils(){} + + /** + * Compares sequence records by their order + */ + private static final Comparator SEQUENCE_INDEX_ORDER = Comparator.comparing(SAMSequenceRecord::getSequenceIndex); + + // The following sets of contig records are used to perform the non-canonical human ordering check. + // This check ensures that the order is 1,2,3... instead of 1, 10, 11, 12...2, 20, 21... + + // hg18 + protected static final SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719); + protected static final SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149); + protected static final SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737); + + // hg19 + protected static final SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621); + protected static final SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373); + protected static final SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747); + + // b36 + protected static final SAMSequenceRecord CHR1_B36 = new SAMSequenceRecord("1", 247249719); + protected static final SAMSequenceRecord CHR2_B36 = new SAMSequenceRecord("2", 242951149); + protected static final SAMSequenceRecord CHR10_B36 = new SAMSequenceRecord("10", 135374737); + + // b37 + protected static final SAMSequenceRecord CHR1_B37 = new SAMSequenceRecord("1", 249250621); + protected static final SAMSequenceRecord CHR2_B37 = new SAMSequenceRecord("2", 243199373); + protected static final SAMSequenceRecord CHR10_B37 = new SAMSequenceRecord("10", 135534747); + + + public enum SequenceDictionaryCompatibility { + IDENTICAL, // the dictionaries are identical + COMMON_SUBSET, // there exists a common subset of equivalent contigs + SUPERSET, // the first dict's set of contigs supersets the second dict's set + NO_COMMON_CONTIGS, // no overlap between dictionaries + UNEQUAL_COMMON_CONTIGS, // 
common subset has contigs that have the same name but different lengths + NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for example) + OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different + // orders with respect to each other + DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same + // order with respect to each other, but one or more of them have different + // indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 } + } + + /** + * Workhorse routine that takes two dictionaries and returns their compatibility. + * + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @param checkContigOrdering if true, perform checks related to contig ordering: forbid lexicographically-sorted + * dictionaries, and require common contigs to be in the same relative order and at the + * same absolute indices + * @return A SequenceDictionaryCompatibility enum value describing the compatibility of the two dictionaries + */ + public static SequenceDictionaryCompatibility compareDictionaries( final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2, final boolean checkContigOrdering ) { + if ( checkContigOrdering && (nonCanonicalHumanContigOrder(dict1) || nonCanonicalHumanContigOrder(dict2)) ) { + return SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER; + } + + final Set commonContigs = getCommonContigsByName(dict1, dict2); + + if (commonContigs.isEmpty()) { + return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS; + } + else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) ) { + return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS; + } + + final boolean commonContigsAreInSameRelativeOrder = commonContigsAreInSameRelativeOrder(commonContigs, dict1, dict2); + + if ( checkContigOrdering && ! 
commonContigsAreInSameRelativeOrder ) { + return SequenceDictionaryCompatibility.OUT_OF_ORDER; + } + else if ( commonContigsAreInSameRelativeOrder && commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() ) { + return SequenceDictionaryCompatibility.IDENTICAL; + } + else if ( checkContigOrdering && ! commonContigsAreAtSameIndices(commonContigs, dict1, dict2) ) { + return SequenceDictionaryCompatibility.DIFFERENT_INDICES; + } + else if ( supersets(dict1, dict2) ) { + return SequenceDictionaryCompatibility.SUPERSET; + } + else { + return SequenceDictionaryCompatibility.COMMON_SUBSET; + } + } + + + /** + * Utility function that tests whether dict1's set of contigs is a superset of dict2's + * + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @return true if dict1's set of contigs supersets dict2's + */ + private static boolean supersets( SAMSequenceDictionary dict1, SAMSequenceDictionary dict2 ) { + // Cannot rely on SAMSequenceRecord.equals() as it's too strict (takes extended attributes into account). + for ( final SAMSequenceRecord dict2Record : dict2.getSequences() ) { + final SAMSequenceRecord dict1Record = dict1.getSequence(dict2Record.getSequenceName()); + if ( dict1Record == null || ! sequenceRecordsAreEquivalent(dict2Record, dict1Record) ) { + return false; + } + } + + return true; + } + + + + /** + * Utility function that tests whether the commonContigs in both dicts are equivalent. Equivalence means + * that the seq records have the same length, if both are non-zero. 
+ * + * @param commonContigs + * @param dict1 + * @param dict2 + * @return true if all of the common contigs are equivalent + */ + private static boolean commonContigsHaveSameLengths(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + return findDisequalCommonContigs(commonContigs, dict1, dict2) == null; + } + + /** + * Returns a List(x,y) that contains two disequal sequence records among the common contigs in both dicts. Returns + * null if all common contigs are equivalent + * + * @param commonContigs + * @param dict1 + * @param dict2 + * @return + */ + private static List findDisequalCommonContigs(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + for ( String name : commonContigs ) { + SAMSequenceRecord elt1 = dict1.getSequence(name); + SAMSequenceRecord elt2 = dict2.getSequence(name); + if ( ! sequenceRecordsAreEquivalent(elt1, elt2) ) + return Arrays.asList(elt1,elt2); + } + + return null; + } + + /** + * Helper routine that returns whether two sequence records are equivalent, defined as having the same name and + * lengths. + * + * NOTE: we allow the lengths to differ if one or both are UNKNOWN_SEQUENCE_LENGTH + * + * @param first first sequence record to compare + * @param second second sequence record to compare + * @return true if first and second have the same names and lengths, otherwise false + */ + public static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord first, final SAMSequenceRecord second) { + if ( first == second ) { + return true; + } + if ( first == null || second == null ) { + return false; + } + final int length1 = first.getSequenceLength(); + final int length2 = second.getSequenceLength(); + + if (length1 != length2 && length1 != SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH && length2 != SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH){ + return false; + } + if (! 
first.getSequenceName().equals(second.getSequenceName())){ + return false; + } + return true; + } + + /** + * A very simple (and naive) algorithm to determine (1) if the dict is a human reference (hg18, hg19, b36, or b37) and if it's + * lexicographically sorted. Works by matching lengths of the static chr1, chr10, and chr2, and then if these + * are all matched, requiring that the order be chr1, chr2, chr10. + * + * @param dict + * @return + */ + private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) { + SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null; + for ( SAMSequenceRecord elt : dict.getSequences() ) { + if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19, CHR1_B36, CHR1_B37) ) chr1 = elt; + if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19, CHR2_B36, CHR2_B37) ) chr2 = elt; + if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19, CHR10_B36, CHR10_B37) ) chr10 = elt; + } + if ( chr1 != null && chr2 != null && chr10 != null) { + return ! ( chr1.getSequenceIndex() < chr2.getSequenceIndex() && chr2.getSequenceIndex() < chr10.getSequenceIndex() ); + } + + return false; + } + + /** + * Trivial helper that returns true if elt has the same name and length as rec1 or rec2 + * @param elt record to test + * @param recs the list of records to check for name and length equivalence + * @return true if elt has the same name and length as any of the recs + */ + private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord... recs) { + for (SAMSequenceRecord rec : recs) { + if (elt.getSequenceLength() == rec.getSequenceLength() && elt.getSequenceName().equals(rec.getSequenceName())) { + return true; + } + } + return false; + } + + /** + * Returns true if the common contigs in dict1 and dict2 are in the same relative order, without regard to + * absolute index position. 
This is accomplished by getting the common contigs in both dictionaries, sorting + * these according to their indices, and then walking through the sorted list to ensure that each ordered contig + * is equivalent + * + * @param commonContigs names of the contigs common to both dictionaries + * @param dict1 first SAMSequenceDictionary + * @param dict2 second SAMSequenceDictionary + * @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false + */ + private static boolean commonContigsAreInSameRelativeOrder(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + final List list1 = getSequencesOfName(commonContigs, dict1); + final List list2 = getSequencesOfName(commonContigs, dict2); + list1.sort(SEQUENCE_INDEX_ORDER); + list2.sort(SEQUENCE_INDEX_ORDER); + + for ( int i = 0; i < list1.size(); i++ ) { + SAMSequenceRecord elt1 = list1.get(i); + SAMSequenceRecord elt2 = list2.get(i); + if ( ! elt1.getSequenceName().equals(elt2.getSequenceName()) ) + return false; + } + + return true; + } + + /** + * Gets the subset of SAMSequenceRecords in commonContigs in dict + * + * @param commonContigs + * @param dict + * @return + */ + private static List getSequencesOfName(Set commonContigs, SAMSequenceDictionary dict) { + List l = new ArrayList<>(commonContigs.size()); + for ( String name : commonContigs ) { + l.add(dict.getSequence(name) ); + } + + return l; + } + + /** + * Checks whether the common contigs in the given sequence dictionaries occur at the same indices + * in both dictionaries + * + * @param commonContigs Set of names of the contigs that occur in both dictionaries + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @return true if the contigs common to dict1 and dict2 occur at the same indices in both dictionaries, + * otherwise false + */ + private static boolean commonContigsAreAtSameIndices( final Set commonContigs, final SAMSequenceDictionary 
dict1, final SAMSequenceDictionary dict2 ) { + for ( String commonContig : commonContigs ) { + SAMSequenceRecord dict1Record = dict1.getSequence(commonContig); + SAMSequenceRecord dict2Record = dict2.getSequence(commonContig); + + // Each common contig must have the same index in both dictionaries + if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) { + return false; + } + } + + return true; + } + + /** + * Returns the set of contig names found in both dicts. + * @param dict1 + * @param dict2 + * @return + */ + public static Set getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + Set intersectingSequenceNames = getContigNames(dict1); + intersectingSequenceNames.retainAll(getContigNames(dict2)); + return intersectingSequenceNames; + } + + public static Set getContigNames(SAMSequenceDictionary dict) { + Set contigNames = new LinkedHashSet(dict.size()); + for (SAMSequenceRecord dictionaryEntry : dict.getSequences()) + contigNames.add(dictionaryEntry.getSequenceName()); + return contigNames; + } + + public static List getContigNamesList(final SAMSequenceDictionary refSeqDict) { + ValidationUtils.nonNull(refSeqDict, "provided reference sequence ditionary is null"); + return refSeqDict.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList()); + } + + /** + * Returns a compact String representation of the sequence dictionary it's passed + * + * The format of the returned String is: + * [ contig1Name(length: contig1Length) contig2Name(length: contig2Length) ... 
] + * + * @param dict a non-null SAMSequenceDictionary + * @return A String containing all of the contig names and lengths from the sequence dictionary it's passed + */ + public static String getDictionaryAsString( final SAMSequenceDictionary dict ) { + ValidationUtils.nonNull(dict, "Sequence dictionary must be non-null"); + + StringBuilder s = new StringBuilder("[ "); + + for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) { + s.append(dictionaryEntry.getSequenceName()); + s.append("(length:"); + s.append(dictionaryEntry.getSequenceLength()); + s.append(") "); + } + + s.append("]"); + + return s.toString(); + } + +} diff --git a/src/main/java/htsjdk/samtools/util/FileExtensions.java b/src/main/java/htsjdk/samtools/util/FileExtensions.java index fc2e37d6c6..dcb8c889f9 100755 --- a/src/main/java/htsjdk/samtools/util/FileExtensions.java +++ b/src/main/java/htsjdk/samtools/util/FileExtensions.java @@ -65,6 +65,9 @@ public final class FileExtensions { public static final String VCF = ".vcf"; public static final String VCF_INDEX = TRIBBLE_INDEX; public static final String BCF = ".bcf"; + // Note that .bcf on its own may be gzip compressed and usually is, + // but files with the extension .bcf.gz to seem to exist in the wild and should be supported + public static final String COMPRESSED_BCF = ".bcf.gz"; public static final String COMPRESSED_VCF = ".vcf.gz"; public static final String COMPRESSED_VCF_INDEX = ".tbi"; public static final List VCF_LIST = Collections.unmodifiableList(Arrays.asList(VCF, COMPRESSED_VCF, BCF)); diff --git a/src/main/java/htsjdk/samtools/util/IOUtil.java b/src/main/java/htsjdk/samtools/util/IOUtil.java index 81351e297a..2d97d2284c 100755 --- a/src/main/java/htsjdk/samtools/util/IOUtil.java +++ b/src/main/java/htsjdk/samtools/util/IOUtil.java @@ -1277,7 +1277,8 @@ public static List filesToPaths(Collection files){ */ public static boolean isGZIPInputStream(final InputStream stream) { if (!stream.markSupported()) { - throw new 
IllegalArgumentException("isGZIPInputStream() : Cannot test a stream that doesn't support marking."); + // BufferedInputStream supports mark + return isGZIPInputStream(new BufferedInputStream(stream)); } stream.mark(GZIP_HEADER_READ_LENGTH); diff --git a/src/main/java/htsjdk/samtools/util/ListByteBufferOutputStream.java b/src/main/java/htsjdk/samtools/util/ListByteBufferOutputStream.java new file mode 100644 index 0000000000..d183a3b90f --- /dev/null +++ b/src/main/java/htsjdk/samtools/util/ListByteBufferOutputStream.java @@ -0,0 +1,138 @@ +package htsjdk.samtools.util; + +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Growable byte buffer backed by a list of byte arrays, which can + * be used to buffer data without reallocating an underlying array. + * Once data is accumulated, it can either be retrieved by converting + * into a byte[] for interfaces that require a contiguous block of bytes, + * or written directly to an OutputStream to avoid array copies. 
+ */ +public class ListByteBufferOutputStream extends OutputStream { + + private final int blockSize; + private final ArrayList blocks; + private byte[] currentBlock; + private int nextBlockIndex; + private int nextBytePosition; + private int size; + + public ListByteBufferOutputStream(final int blockSize) { + this.blockSize = blockSize; + blocks = new ArrayList<>(); + nextBlockIndex = 0; + advanceBlock(); + size = 0; + } + + @Override + public void write(final int b) { + if (nextBytePosition == blockSize) { + advanceBlock(); + } + currentBlock[nextBytePosition++] = (byte) b; + size++; + } + + public void write(final byte b, final int nCopies) { + assert nCopies >= 0; + + int bytesRemaining = nCopies; + while (bytesRemaining > 0) { + if (nextBytePosition == blockSize) { + advanceBlock(); + } + final int toIndex = Math.min(nextBytePosition + bytesRemaining, blockSize); + Arrays.fill(currentBlock, nextBytePosition, toIndex, b); + bytesRemaining -= toIndex - nextBytePosition; + nextBytePosition = toIndex; + } + size += nCopies; + } + + @Override + public void write(final byte[] b) { + write(b, 0, b.length); + } + + @Override + public void write(final byte[] b, int off, final int len) { + assert b != null; + assert off >= 0; + assert len >= 0; + assert off + len <= b.length; + + int bytesRemaining = len; + while (bytesRemaining > 0) { + if (nextBytePosition == blockSize) { + advanceBlock(); + } + final int lengthToWrite = Math.min(bytesRemaining, blockSize - nextBytePosition); + System.arraycopy(b, off, currentBlock, nextBytePosition, lengthToWrite); + nextBytePosition += lengthToWrite; + off += lengthToWrite; + bytesRemaining -= lengthToWrite; + } + size += len; + } + + public int size() { + return size; + } + + public void writeTo(final OutputStream out) throws IOException { + for (final byte[] b : blocks) { + if (b == currentBlock) { + out.write(b, 0, nextBytePosition); + break; + } else { + out.write(b); + } + } + } + + public byte[] toByteArray() { + final byte[] 
bytes = new byte[size]; + final ByteBuffer buff = ByteBuffer.wrap(bytes); + for (final byte[] b : blocks) { + if (b == currentBlock) { + buff.put(b, 0, nextBytePosition); + break; + } else { + buff.put(b); + } + } + return bytes; + } + + public void reset() { + currentBlock = blocks.get(0); + nextBytePosition = 0; + nextBlockIndex = 1; + size = 0; + } + + public void clear() { + reset(); + // blocks always has at least 1 element + blocks.subList(1, blocks.size()).clear(); + } + + private void advanceBlock() { + if (nextBlockIndex == blocks.size()) { + // Need to add a new block + currentBlock = new byte[blockSize]; + blocks.add(currentBlock); + } else { + // Reuse old block + currentBlock = blocks.get(nextBlockIndex); + } + nextBytePosition = 0; + nextBlockIndex++; + } +} diff --git a/src/main/java/htsjdk/tribble/TribbleException.java b/src/main/java/htsjdk/tribble/TribbleException.java index abcbc25ca0..4e2651640b 100644 --- a/src/main/java/htsjdk/tribble/TribbleException.java +++ b/src/main/java/htsjdk/tribble/TribbleException.java @@ -86,6 +86,12 @@ public static class InternalCodecException extends TribbleException { public InternalCodecException(String message) { super (message); } } + public static class VersionValidationFailure extends TribbleException { + public VersionValidationFailure(final String message) { + super(String.format("Version validation failure: %s", message)); + } + } + // ////////////////////////////////////////////////////////////////////// // Index exceptions // ////////////////////////////////////////////////////////////////////// diff --git a/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java b/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java index 768c797ac0..7e2c10ebc0 100644 --- a/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java +++ b/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java @@ -33,6 +33,7 @@ import htsjdk.tribble.index.IndexFactory; import 
htsjdk.tribble.readers.PositionalBufferedStream; import htsjdk.tribble.util.ParsingUtils; +import htsjdk.variant.vcf.VCFFileReader; import java.io.BufferedInputStream; import java.io.IOException; @@ -252,7 +253,11 @@ private void readHeader() throws IOException { PositionalBufferedStream pbs = null; try { is = ParsingUtils.openInputStream(path, wrapper); - if (IOUtil.hasBlockCompressedExtension(new URI(URLEncoder.encode(path, "UTF-8")))) { + // BCFs are usually gzipped but do not have the .gz extension, + // so we explicitly check for the presence of a gzip header + if (IOUtil.hasBlockCompressedExtension(new URI(URLEncoder.encode(path, "UTF-8"))) + || (VCFFileReader.isBCF(path) && IOUtil.isGZIPInputStream(is)) + ) { // TODO: TEST/FIX THIS! https://github.com/samtools/htsjdk/issues/944 // TODO -- warning I don't think this can work, the buffered input stream screws up position is = new GZIPInputStream(new BufferedInputStream(is)); @@ -326,7 +331,8 @@ public WFIterator() throws IOException { final InputStream inputStream = ParsingUtils.openInputStream(path, wrapper); final PositionalBufferedStream pbs; - if (IOUtil.hasBlockCompressedExtension(path)) { + // BCFs can be gzipped but usually do not have a compressed extension, so an extra check is needed + if (IOUtil.hasBlockCompressedExtension(path) || (VCFFileReader.isBCF(path) && IOUtil.isGZIPInputStream(inputStream))) { // Gzipped -- we need to buffer the GZIPInputStream methods as this class makes read() calls, // and seekableStream does not support single byte reads final InputStream is = new GZIPInputStream(new BufferedInputStream(inputStream, 512000)); diff --git a/src/main/java/htsjdk/tribble/index/IndexFactory.java b/src/main/java/htsjdk/tribble/index/IndexFactory.java index 1e26c33300..be21977a2c 100644 --- a/src/main/java/htsjdk/tribble/index/IndexFactory.java +++ b/src/main/java/htsjdk/tribble/index/IndexFactory.java @@ -595,7 +595,9 @@ public FeatureIterator(final Path inputPath, final FeatureCodec { + 
private static final Log log = Log.getInstance(BCF2Codec.class); + + public static final String IDXField = "IDX"; // BCF2.2 IDX field name + protected final static int ALLOWED_MAJOR_VERSION = 2; - protected final static int ALLOWED_MINOR_VERSION = 1; + protected final static int ALLOWED_MINOR_VERSION = 2; public static final BCFVersion ALLOWED_BCF_VERSION = new BCFVersion(ALLOWED_MAJOR_VERSION, ALLOWED_MINOR_VERSION); - /** sizeof a BCF header (+ min/max version). Used when trying to detect when a streams starts with a bcf header */ - public static final int SIZEOF_BCF_HEADER = BCFVersion.MAGIC_HEADER_START.length + 2*Byte.BYTES; - + /** + * sizeof a BCF header (+ min/max version). Used when trying to detect when a streams starts with a bcf header + */ + public static final int SIZEOF_BCF_HEADER = BCFVersion.MAGIC_HEADER_START.length + 2 * Byte.BYTES; + private BCFVersion bcfVersion = null; private VCFHeader header = null; @@ -70,19 +80,19 @@ public class BCF2Codec extends BinaryFeatureCodec { /** * Maps offsets (encoded in BCF) into contig names (from header) for the CHROM field */ - private final ArrayList contigNames = new ArrayList(); + private BCF2Dictionary contigDictionary; /** * Maps header string names (encoded in VCF) into strings found in the BCF header - * + *

* Initialized when processing the header */ - private ArrayList dictionary; + private BCF2Dictionary stringDictionary; /** * Our decoder that reads low-level objects from the BCF2 records */ - private final BCF2Decoder decoder = new BCF2Decoder(); + private BCF2Decoder decoder; /** * Provides some sanity checking on the header @@ -96,7 +106,7 @@ public class BCF2Codec extends BinaryFeatureCodec { /** * A cached array of GenotypeBuilders for efficient genotype decoding. - * + *

* Caching it allows us to avoid recreating this intermediate data * structure each time we decode genotypes */ @@ -114,18 +124,18 @@ public class BCF2Codec extends BinaryFeatureCodec { // ---------------------------------------------------------------------- @Override - public Feature decodeLoc( final PositionalBufferedStream inputStream ) { + public Feature decodeLoc(final PositionalBufferedStream inputStream) { return decode(inputStream); } @Override - public VariantContext decode( final PositionalBufferedStream inputStream ) { + public VariantContext decode(final PositionalBufferedStream inputStream) { try { recordNo++; final VariantContextBuilder builder = new VariantContextBuilder(); - final int sitesBlockSize = decoder.readBlockSize(inputStream); - final int genotypeBlockSize = decoder.readBlockSize(inputStream); + final int sitesBlockSize = BCF2Decoder.readBlockSize(inputStream); + final int genotypeBlockSize = BCF2Decoder.readBlockSize(inputStream); decoder.readNextBlock(sitesBlockSize, inputStream); decodeSiteLoc(builder); @@ -134,7 +144,7 @@ public VariantContext decode( final PositionalBufferedStream inputStream ) { decoder.readNextBlock(genotypeBlockSize, inputStream); createLazyGenotypesDecoder(info, builder); return builder.fullyDecoded(true).make(); - } catch ( IOException e ) { + } catch (final IOException e) { throw new TribbleException("Failed to read BCF file", e); } } @@ -153,10 +163,13 @@ public Class getFeatureType() { * The default policy is to require an exact version match. 
* @param supportedVersion the current BCF implementation version * @param actualVersion the actual version - * @thows TribbleException if the version policy determines that {@code actualVersion} is not compatible + * @throws TribbleException if the version policy determines that {@code actualVersion} is not compatible * with {@code supportedVersion} */ - protected void validateVersionCompatibility(final BCFVersion supportedVersion, final BCFVersion actualVersion) { + protected void validateVersionCompatibility( + final BCFVersion supportedVersion, + final BCFVersion actualVersion + ) throws TribbleException { if ( actualVersion.getMajorVersion() != ALLOWED_MAJOR_VERSION ) { error("BCF2Codec can only process BCF2 files, this file has major version " + bcfVersion.getMajorVersion()); } @@ -168,26 +181,26 @@ protected void validateVersionCompatibility(final BCFVersion supportedVersion, f } @Override - public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) { + public FeatureCodecHeader readHeader(final PositionalBufferedStream inputStream) { try { // note that this reads the magic as well, and so does double duty bcfVersion = BCFVersion.readBCFVersion(inputStream); - if ( bcfVersion == null ) { + if (bcfVersion == null) { error("Input stream does not contain a BCF encoded file; BCF magic header info not found"); + } else if (!BCFVersion.SUPPORTED_VERSIONS.contains(bcfVersion)) { + error(bcfVersion + " is not supported by htsjdk"); } - validateVersionCompatibility(BCF2Codec.ALLOWED_BCF_VERSION, bcfVersion); - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Parsing data stream with BCF version " + bcfVersion); - } + decoder = BCF2Decoder.getDecoder(bcfVersion); + log.debug("Parsing data stream with BCF version " + bcfVersion); final int headerSizeInBytes = BCF2Type.INT32.read(inputStream); - if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 8 MB - error("BCF2 header has invalid length: " + 
headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE); + if (headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 8 MB + error("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < " + MAX_HEADER_SIZE); final byte[] headerBytes = new byte[headerSizeInBytes]; - if ( inputStream.read(headerBytes) != headerSizeInBytes ) + if (inputStream.read(headerBytes) != headerSizeInBytes) error("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes); final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes)); @@ -195,24 +208,20 @@ public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream final VCFCodec headerParser = new VCFCodec(); this.header = (VCFHeader) headerParser.readActualHeader(lineIterator); bps.close(); - } catch ( IOException e ) { + } catch (final IOException e) { throw new TribbleException("I/O error while reading BCF2 header"); } - // create the config offsets - if ( ! 
header.getContigLines().isEmpty() ) { - contigNames.clear(); - for ( final VCFContigHeaderLine contig : header.getContigLines()) { - if ( contig.getID() == null || contig.getID().equals("") ) - error("found a contig with an invalid ID " + contig); - contigNames.add(contig.getID()); - } - } else { - error("Didn't find any contig lines in BCF2 file header"); + // TODO should follow up on hts-specs and clarify the relationship between ##dictionary and IDX fields + if (this.header.getMetaDataInInputOrder().stream().anyMatch(line -> line.getKey().equals("dictionary"))) { + log.warn("Use of the ##dictionary line is not supported"); } + // create the contig dictionary + contigDictionary = makeContigDictionary(bcfVersion); + // create the string dictionary - dictionary = parseDictionary(header); + stringDictionary = makeStringDictionary(bcfVersion); // prepare the genotype field decoders gtFieldDecoders = new BCF2GenotypeFieldDecoders(header); @@ -220,7 +229,7 @@ public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream // create and initialize the genotype builder array final int nSamples = header.getNGenotypeSamples(); builders = new GenotypeBuilder[nSamples]; - for ( int i = 0; i < nSamples; i++ ) { + for (int i = 0; i < nSamples; i++) { builders[i] = new GenotypeBuilder(header.getGenotypeSamples().get(i)); } @@ -229,11 +238,20 @@ public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream } @Override - public boolean canDecode( final String path ) { - try (InputStream fis = Files.newInputStream(IOUtil.getPath(path)) ){ - final BCFVersion version = BCFVersion.readBCFVersion(fis); - return version != null && version.getMajorVersion() == ALLOWED_MAJOR_VERSION; - } catch ( final IOException e ) { + public boolean canDecode(final String path) { + try (final InputStream fis = Files.newInputStream(IOUtil.getPath(path))) { + final InputStream is = IOUtil.isGZIPInputStream(fis) ? 
new GZIPInputStream(fis) : fis; + final BCFVersion version = BCFVersion.readBCFVersion(is); + if (version == null) { + return false; + } else { + // Validation will throw a TribbleException for incompatible versions + // The default policy is to require an exact major and minor version match + // but subclasses can implement more permissive policies + validateVersionCompatibility(ALLOWED_BCF_VERSION, version); + return true; + } + } catch (final IOException | TribbleException e) { return false; } } @@ -264,8 +282,8 @@ private final void decodeSiteLoc(final VariantContextBuilder builder) throws IOE this.pos = decoder.decodeInt(BCF2Type.INT32) + 1; // GATK is one based, BCF2 is zero-based final int refLength = decoder.decodeInt(BCF2Type.INT32); - builder.start((long)pos); - builder.stop((long)(pos + refLength - 1)); // minus one because GATK has closed intervals but BCF2 is open + builder.start(pos); + builder.stop(pos + refLength - 1); // minus one because GATK has closed intervals but BCF2 is open } /** @@ -276,21 +294,22 @@ private final void decodeSiteLoc(final VariantContextBuilder builder) throws IOE */ private final SitesInfoForDecoding decodeSitesExtendedInfo(final VariantContextBuilder builder) throws IOException { final Object qual = decoder.decodeSingleValue(BCF2Type.FLOAT); - if ( qual != null ) { - builder.log10PError(((Double)qual) / -10.0); + if (qual != null) { + builder.log10PError(((Double) qual) / -10.0); } final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32); final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32); - final int nAlleles = nAlleleInfo >> 16; + // Use logical shift to not introduce leading 1s + final int nAlleles = nAlleleInfo >>> 16; final int nInfo = nAlleleInfo & 0x0000FFFF; - final int nFormatFields = nFormatSamples >> 24; + final int nFormatFields = nFormatSamples >>> 24; final int nSamples = nFormatSamples & 0x00FFFFF; - if ( header.getNGenotypeSamples() != nSamples ) + if (header.getNGenotypeSamples() != nSamples) 
error("Reading BCF2 files with different numbers of samples per record " + - "is not currently supported. Saw " + header.getNGenotypeSamples() + - " samples in header but have a record with " + nSamples + " samples"); + "is not currently supported. Saw " + header.getNGenotypeSamples() + + " samples in header but have a record with " + nSamples + " samples"); decodeID(builder); final List alleles = decodeAlleles(builder, pos, nAlleles); @@ -298,7 +317,7 @@ private final SitesInfoForDecoding decodeSitesExtendedInfo(final VariantContextB decodeInfo(builder, nInfo); final SitesInfoForDecoding info = new SitesInfoForDecoding(nFormatFields, nSamples, alleles); - if ( ! info.isValid() ) + if (!info.isValid()) error("Sites info is malformed: " + info); return info; } @@ -316,8 +335,8 @@ private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final public boolean isValid() { return nFormatFields >= 0 && - nSamples >= 0 && - alleles != null && ! alleles.isEmpty() && alleles.get(0).isReference(); + nSamples >= 0 && + alleles != null && !alleles.isEmpty() && alleles.get(0).isReference(); } @Override @@ -328,12 +347,13 @@ public String toString() { /** * Decode the id field in this BCF2 file and store it in the builder + * * @param builder */ - private void decodeID( final VariantContextBuilder builder ) throws IOException { - final String id = (String)decoder.decodeTypedValue(); + private void decodeID(final VariantContextBuilder builder) throws IOException { + final String id = decoder.decodeUnexplodedString(); - if ( id == null ) + if (id == null || id.isEmpty()) builder.noID(); else builder.id(id); @@ -341,54 +361,67 @@ private void decodeID( final VariantContextBuilder builder ) throws IOException /** * Decode the alleles from this BCF2 file and put the results in builder + * * @param builder * @param pos * @param nAlleles * @return the alleles */ - private List decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) throws 
IOException { - // TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes - List alleles = new ArrayList(nAlleles); - String ref = null; - - for ( int i = 0; i < nAlleles; i++ ) { - final String alleleBases = (String)decoder.decodeTypedValue(); + private List decodeAlleles(final VariantContextBuilder builder, final int pos, final int nAlleles) throws IOException { + final List alleles = new ArrayList<>(nAlleles); + byte[] ref = null; + + for (int i = 0; i < nAlleles; i++) { + // Some decoder functionality is inlined here to avoid conversion from bytes -> string -> bytes + final byte typeDescriptor = decoder.readTypeDescriptor(); + final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); + if (type != BCF2Type.CHAR) { + error("Expected to find vector of type CHAR while decoding Allele bases, found type " + type); + } + final int size = decoder.decodeNumberOfElements(typeDescriptor); + final byte[] alleleBases = decoder.decodeRawBytes(size); final boolean isRef = i == 0; + if (isRef) { + ref = alleleBases; + } + final Allele allele = Allele.create(alleleBases, isRef); - if ( isRef ) ref = alleleBases; alleles.add(allele); } + assert ref != null; + assert ref.length > 0; builder.alleles(alleles); - - assert !ref.isEmpty(); - return alleles; } /** * Decode the filter field of this BCF2 file and store the result in the builder + * * @param builder */ - private void decodeFilter( final VariantContextBuilder builder ) throws IOException { - final Object value = decoder.decodeTypedValue(); + private void decodeFilter(final VariantContextBuilder builder) throws IOException { + final byte typeDescriptor = decoder.readTypeDescriptor(); + final int size = decoder.decodeNumberOfElements(typeDescriptor); + final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); - if ( value == null ) + if (size == 0) { + // No filters builder.unfiltered(); - else { - if ( value instanceof Integer ) { - // fast path for single integer 
result - final String filterString = getDictionaryString((Integer)value); - if ( VCFConstants.PASSES_FILTERS_v4.equals(filterString)) - builder.passFilters(); - else - builder.filter(filterString); + } else if (size == 1) { + final int i = decoder.decodeInt(type); + if (i == 0) { + // PASS is always implicitly encoded as 0 + builder.passFilters(); } else { - for ( final int offset : (List)value ) - builder.filter(getDictionaryString(offset)); + builder.filter(getDictionaryString(i)); + } + } else { + for (final int offset : decoder.decodeIntArray(size, type, null)) { + builder.filter(getDictionaryString(offset)); } } } @@ -399,17 +432,23 @@ private void decodeFilter( final VariantContextBuilder builder ) throws IOExcept * @param builder * @param numInfoFields */ - private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) throws IOException { - if ( numInfoFields == 0 ) + private void decodeInfo(final VariantContextBuilder builder, final int numInfoFields) throws IOException { + if (numInfoFields == 0) // fast path, don't bother doing any work if there are no fields return; - final Map infoFieldEntries = new HashMap(numInfoFields); - for ( int i = 0; i < numInfoFields; i++ ) { + final Map infoFieldEntries = new HashMap<>(numInfoFields); + for (int i = 0; i < numInfoFields; i++) { final String key = getDictionaryString(); Object value = decoder.decodeTypedValue(); - final VCFCompoundHeaderLine metaData = VariantContextUtils.getMetaDataForField(header, key); - if ( metaData.getType() == VCFHeaderLineType.Flag ) value = true; // special case for flags + final VCFInfoHeaderLine metaData = header.getInfoHeaderLine(key); + if (metaData.getType() == VCFHeaderLineType.Flag) { + // Despite contradictory language in the spec, bcftools/htslib encode the "payload" of + // FLAG as 0x00 (MISSING type) which we would normally decode as MISSING/null, + // so we consider this value to be Boolean TRUE simply based on the presence of the key + // See 
https://github.com/samtools/hts-specs/issues/384 + value = Boolean.TRUE; // special case for flags + } infoFieldEntries.put(key, value); } @@ -429,17 +468,17 @@ private void decodeInfo( final VariantContextBuilder builder, final int numInfoF * @param siteInfo * @param builder */ - private void createLazyGenotypesDecoder( final SitesInfoForDecoding siteInfo, - final VariantContextBuilder builder ) { + private void createLazyGenotypesDecoder(final SitesInfoForDecoding siteInfo, + final VariantContextBuilder builder) { if (siteInfo.nSamples > 0) { final LazyGenotypesContext.LazyParser lazyParser = - new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders); + new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders); - final LazyData lazyData = new LazyData(header, siteInfo.nFormatFields, decoder.getRecordBytes()); + final LazyData lazyData = new LazyData(header, siteInfo.nFormatFields, decoder.getRecordBytes(), bcfVersion); final LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, lazyData, header.getNGenotypeSamples()); // did we resort the sample names? 
If so, we need to load the genotype data - if ( !header.samplesWereAlreadySorted() ) + if (!header.samplesWereAlreadySorted()) lazy.decode(); builder.genotypesNoValidation(lazy); @@ -447,23 +486,33 @@ private void createLazyGenotypesDecoder( final SitesInfoForDecoding siteInfo, } public static class LazyData { - final public VCFHeader header; - final public int nGenotypeFields; - final public byte[] bytes; + public final VCFHeader header; + public final int nGenotypeFields; + public final byte[] bytes; + public final BCFVersion version; - public LazyData(final VCFHeader header, final int nGenotypeFields, final byte[] bytes) { + public LazyData(final VCFHeader header, final int nGenotypeFields, final byte[] bytes, final BCFVersion version) { this.header = header; this.nGenotypeFields = nGenotypeFields; this.bytes = bytes; + this.version = version; } } - private final String getDictionaryString() throws IOException { + private String getDictionaryString() throws IOException { return getDictionaryString((Integer) decoder.decodeTypedValue()); } protected final String getDictionaryString(final int offset) { - return dictionary.get(offset); + final String s = stringDictionary.get(offset); + if (s == null) { + error("No entry in the string dictionary matching key: " + offset + " was found"); + } + return s; + } + + private BCF2Dictionary makeStringDictionary(final BCFVersion bcfVersion) { + return BCF2Dictionary.makeBCF2StringDictionary(header, bcfVersion); } /** @@ -473,18 +522,20 @@ protected final String getDictionaryString(final int offset) { * @param contigOffset * @return */ - private final String lookupContigName( final int contigOffset ) { - return contigNames.get(contigOffset); + private String lookupContigName(final int contigOffset) { + final String s = contigDictionary.get(contigOffset); + if (s == null) { + error("No entry in the contig dictionary matching key: " + contigOffset + " was found"); + } + return s; } - private final ArrayList parseDictionary(final 
VCFHeader header) { - final ArrayList dict = BCF2Utils.makeDictionary(header); - - // if we got here we never found a dictionary, or there are no elements in the dictionary - if ( dict.isEmpty() ) - error("Dictionary header element was absent or empty"); + private BCF2Dictionary makeContigDictionary(final BCFVersion bcfVersion) { + // create the config offsets + if (header.getContigLines().isEmpty()) + error("Didn't find any contig lines in BCF2 file header"); - return dict; + return BCF2Dictionary.makeBCF2ContigDictionary(header, bcfVersion); } /** @@ -501,8 +552,9 @@ protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String protected void error(final String message) throws RuntimeException { throw new TribbleException(String.format("%s, at record %d with position %d:", message, recordNo, pos)); } - - /** try to read a BCFVersion from an uncompressed BufferedInputStream. + + /** + * Try to read a BCFVersion from an uncompressed BufferedInputStream. * The buffer must be large enough to contain {@link #SIZEOF_BCF_HEADER} * * @param uncompressedBufferedInput the uncompressed input stream @@ -515,5 +567,8 @@ public static BCFVersion tryReadBCFVersion(final BufferedInputStream uncompresse uncompressedBufferedInput.reset(); return bcfVersion; } - + + public BCFVersion getBCFVersion() { + return bcfVersion; + } } diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java index 0dd166eef6..e88db1e115 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Decoder.java @@ -31,24 +31,33 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; +import java.util.List; -public final class BCF2Decoder { - byte[] recordBytes = null; - ByteArrayInputStream recordStream = null; +public 
abstract class BCF2Decoder { + protected byte[] recordBytes = null; + protected ByteArrayInputStream recordStream = null; - public BCF2Decoder() { + private BCF2Decoder() { // nothing to do } - /** - * Create a new decoder ready to read BCF2 data from the byte[] recordBytes, for testing purposes - * - * @param recordBytes - */ - protected BCF2Decoder(final byte[] recordBytes) { - setRecordBytes(recordBytes); + public static BCF2Decoder getDecoder(final BCFVersion version) { + switch (version.getMinorVersion()) { + case 2: + return new BCF2Decoder.BCF2_2Decoder(); + default: + throw new TribbleException("BCF2Codec can only process BCF2 files with minor version <= " + BCF2Codec.ALLOWED_MINOR_VERSION + " but this file has minor version " + version.getMinorVersion()); + } + } + + public static BCF2Decoder getDecoder(final BCFVersion version, final byte[] recordBytes) { + final BCF2Decoder decoder = BCF2Decoder.getDecoder(version); + decoder.setRecordBytes(recordBytes); + return decoder; } // ---------------------------------------------------------------------- @@ -63,7 +72,7 @@ protected BCF2Decoder(final byte[] recordBytes) { * @param stream */ public void readNextBlock(final int blockSizeInBytes, final InputStream stream) { - if ( blockSizeInBytes < 0 ) throw new TribbleException("Invalid block size " + blockSizeInBytes); + if (blockSizeInBytes < 0) throw new TribbleException("Invalid block size " + blockSizeInBytes); setRecordBytes(readRecordBytes(blockSizeInBytes, stream)); } @@ -74,9 +83,9 @@ public void readNextBlock(final int blockSizeInBytes, final InputStream stream) */ public void skipNextBlock(final int blockSizeInBytes, final InputStream stream) { try { - final int bytesRead = (int)stream.skip(blockSizeInBytes); + final int bytesRead = (int) stream.skip(blockSizeInBytes); validateReadBytes(bytesRead, 1, blockSizeInBytes); - } catch ( IOException e ) { + } catch (final IOException e) { throw new TribbleException("I/O error while reading BCF2 file", e); } 
this.recordBytes = null; @@ -85,6 +94,7 @@ public void skipNextBlock(final int blockSizeInBytes, final InputStream stream) /** * Returns the byte[] for the block of data we are currently decoding + * * @return */ public byte[] getRecordBytes() { @@ -131,41 +141,54 @@ public final Object decodeTypedValue(final byte typeDescriptor) throws IOExcepti } public final Object decodeTypedValue(final byte typeDescriptor, final int size) throws IOException { - if ( size == 0 ) { + if (size == 0) { // missing value => null in java return null; } else { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); - if ( type == BCF2Type.CHAR ) { // special case string decoding for efficiency - return decodeLiteralString(size); - } else if ( size == 1 ) { - return decodeSingleValue(type); + if (type == BCF2Type.CHAR) { // special case string decoding for efficiency + final List strings = decodeExplodedStrings(size, ','); + if (strings.isEmpty()) { + return null; + } else if (strings.size() == 1) { + return strings.get(0); + } else { + return strings; + } + } else if (size == 1) { + final Object o = decodeSingleValue(type); + return o == BCF2Type.EOVValue() ? null : o; } else { - final ArrayList ints = new ArrayList(size); - for ( int i = 0; i < size; i++ ) { + final ArrayList ints = new ArrayList<>(size); + for (int i = 0; i < size; i++) { final Object val = decodeSingleValue(type); - if ( val == null ) continue; // auto-pruning. We remove trailing nulls + if (val == BCF2Type.EOVValue()) continue; ints.add(val); } - return ints.isEmpty() ? null : ints; // return null when all of the values are null + return ints.isEmpty() ? 
null : ints; } } } public final Object decodeSingleValue(final BCF2Type type) throws IOException { - // TODO -- decodeTypedValue should integrate this routine final int value = decodeInt(type); - if ( value == type.getMissingBytes() ) + if (value == type.getMissingBytes()) { return null; - else { + } else if (value == type.getEOVBytes()) { + return BCF2Type.EOVValue(); + } else { switch (type) { case INT8: case INT16: - case INT32: return value; - case FLOAT: return rawFloatToFloat(value); - case CHAR: return value & 0xFF; // TODO -- I cannot imagine why we'd get here, as string needs to be special cased - default: throw new TribbleException("BCF2 codec doesn't know how to decode type " + type ); + case INT32: + return value; + case FLOAT: + return rawFloatToFloat(value); + case CHAR: + return value & 0xFF; // TODO -- I cannot imagine why we'd get here, as string needs to be special cased + default: + throw new TribbleException("BCF2 codec doesn't know how to decode type " + type); } } } @@ -176,31 +199,8 @@ public final Object decodeSingleValue(final BCF2Type type) throws IOException { // // ---------------------------------------------------------------------- - private final Object decodeLiteralString(final int size) { - assert size > 0; - - // TODO -- assumes size > 0 - final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array - try { - recordStream.read(bytes); - - int goodLength = 0; - for ( ; goodLength < bytes.length ; goodLength++ ) - if ( bytes[goodLength] == 0 ) break; - - if ( goodLength == 0 ) - return null; - else { - final String s = new String(bytes, 0, goodLength); - return BCF2Utils.isCollapsedString(s) ? 
BCF2Utils.explodeStringList(s) : s; - } - } catch ( IOException e ) { - throw new TribbleException("readByte failure", e); - } - } - public final int decodeNumberOfElements(final byte typeDescriptor) throws IOException { - if ( BCF2Utils.sizeIsOverflow(typeDescriptor) ) + if (BCF2Utils.sizeIsOverflow(typeDescriptor)) // -1 ensures we explode immediately with a bad size if the result is missing return decodeInt(readTypeDescriptor(), -1); else @@ -228,14 +228,22 @@ public final int decodeInt(final BCF2Type type) throws IOException { /** * Low-level reader for int[] - * + *

* Requires a typeDescriptor so the function knows how many elements to read, * and how they are encoded. - * + *

+ * Note that this method is only suitable for reading arrays which are known + * to not contain any internal MISSING values (e.g. filter or GT, + * in the case of GT in BCF 2.1, the vector may be MISSING padded if the + * sample ploidy is less than the maximum, but these missing values are + * not considered to be part of the array, and will not be returned). + * Parts of the decoder that require missing values to be preserved should + * use decodeTyped + *

* If size == 0 => result is null * If size > 0 => result depends on the actual values in the stream - * -- If the first element read is MISSING, result is null (all values are missing) - * -- Else result = int[N] where N is the first N non-missing values decoded + * -- If the first element read is MISSING, result is null (all values are missing) + * -- Else result = int[N] where N is the first N non-missing values decoded * * @param maybeDest if not null we'll not allocate space for the vector, but instead use * the externally allocated array of ints to store values. If the @@ -244,45 +252,131 @@ public final int decodeInt(final BCF2Type type) throws IOException { * int elements are still forced to do a fresh allocation as well. * @return see description */ - public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) throws IOException { - if ( size == 0 ) { + public int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) throws IOException { + if (size == 0) { return null; } else { - if ( maybeDest != null && maybeDest.length < size ) + if (maybeDest != null && maybeDest.length < size) maybeDest = null; // by nulling this out we ensure that we do fresh allocations as maybeDest is too small final int val1 = decodeInt(type); - if ( val1 == type.getMissingBytes() ) { - // fast path for first element being missing - for ( int i = 1; i < size; i++ ) decodeInt(type); + if (val1 == getPaddingValue(type)) { + // Fast path for first element being padding, meaning the whole array is empty + final int bytesToDrop = type.getSizeInBytes() * (size - 1); + // Skip the rest of the padding values + recordStream.skip(bytesToDrop); return null; } else { // we know we will have at least 1 element, so making the int[] is worth it final int[] ints = maybeDest == null ? 
new int[size] : maybeDest; - ints[0] = val1; // we already read the first one - for ( int i = 1; i < size; i++ ) { + ints[0] = val1; + for (int i = 1; i < size; i++) { ints[i] = decodeInt(type); - if ( ints[i] == type.getMissingBytes() ) { - // read the rest of the missing values, dropping them - for ( int j = i + 1; j < size; j++ ) decodeInt(type); + if (ints[i] == getPaddingValue(type)) { + final int bytesToDrop = type.getSizeInBytes() * (size - (i + 1)); + // Skip the rest of the padding values + recordStream.skip(bytesToDrop); // deal with auto-pruning by returning an int[] containing - // only the non-MISSING values. We do this by copying the first + // only the non-padding values. We do this by copying the first // i elements, as i itself is missing return Arrays.copyOf(ints, i); } } - return ints; // all of the elements were non-MISSING + return ints; // all of the elements were non-padding } } } + public byte[] decodeRawBytes(final int size) throws IOException { + final byte[] bytes = new byte[size]; + recordStream.read(bytes); + return bytes; + } + + /** + * Decode a single ASCII encoded string which may be padded with NULL bytes. + * Multiple strings which were encoded as a single comma separated string are + * returned unexploded. + *

+ * Reads directly from underlying byte buffer to avoid unnecessary array copies. + * + * @param size + * @return + */ + public String decodeUnexplodedString(final int size) { + // Get our current position in the buffer so we can index directly into it + final int currentBufferPosition = recordBytes.length - recordStream.available(); + + // Jump over all bytes, including NULL padding + recordStream.skip(size); + + // Scan for first NULL padding byte + int realLength = 0; + for (; realLength < size; realLength++) + if (recordBytes[currentBufferPosition + realLength] == '\0') break; + + // The BCF spec states that strings are ASCII encoded, but we use UTF-8 for future proofing + return new String(recordBytes, currentBufferPosition, realLength, StandardCharsets.UTF_8); + } + + public String decodeUnexplodedString() throws IOException { + final byte typeDescriptor = readTypeDescriptor(); + final int size = decodeNumberOfElements(typeDescriptor); + + return size > 0 ? decodeUnexplodedString(size) : ""; + } + + /** + * Decode a list of ASCII encoded strings. + * Multiple strings as a single separator delimited string are + * exploded. If only a single string was encoded with no separators, returns a + * list of length 1. + *

+ * Reads directly from underlying byte buffer to avoid unnecessary array copies. + * + * @param size + * @return + */ + public List decodeExplodedStrings(final int size, final char separator) { + // Get our current position in the buffer so we can index directly into it + final int currentBufferPosition = recordBytes.length - recordStream.available(); + + // Jump over all bytes + recordStream.skip(size); + + if (size == 0 || recordBytes[currentBufferPosition] == '\0') return Collections.emptyList(); + + int numStrings = 1; + // Start at offset 1 to avoid counting optional leading comma + // Real length may be shorter than provided one because of NULL padding + int realLength = 1; + for (; realLength < size; realLength++) { + final byte currentByte = recordBytes[currentBufferPosition + realLength]; + if (currentByte == separator) numStrings++; + else if (currentByte == '\0') break; + } + + final List strings = new ArrayList<>(numStrings); + int currentStringStart = recordBytes[currentBufferPosition] == separator ? 
1 : 0; + for (int i = 1; i < realLength; i++) { + if (recordBytes[currentBufferPosition + i] == separator) { + strings.add(new String(recordBytes, currentBufferPosition + currentStringStart, i - currentStringStart, StandardCharsets.UTF_8)); + currentStringStart = i + 1; + } + } + // Add final string + strings.add(new String(recordBytes, currentBufferPosition + currentStringStart, realLength - currentStringStart, StandardCharsets.UTF_8)); + + return strings; + } + public final int[] decodeIntArray(final byte typeDescriptor, final int size) throws IOException { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); return decodeIntArray(size, type, null); } - private double rawFloatToFloat(final int rawFloat) { - return (double)Float.intBitsToFloat(rawFloat); + private static double rawFloatToFloat(final int rawFloat) { + return Float.intBitsToFloat(rawFloat); } // ---------------------------------------------------------------------- @@ -297,17 +391,17 @@ private double rawFloatToFloat(final int rawFloat) { * @param inputStream * @return */ - public final int readBlockSize(final InputStream inputStream) throws IOException { + public static int readBlockSize(final InputStream inputStream) throws IOException { return BCF2Type.INT32.read(inputStream); } /** * Read all bytes for a BCF record block into a byte[], and return it - * + *

* Is smart about reading from the stream multiple times to fill the buffer, if necessary * * @param blockSizeInBytes number of bytes to read - * @param inputStream the stream to read from + * @param inputStream the stream to read from * @return a non-null byte[] containing exactly blockSizeInBytes bytes from the inputStream */ private static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) { @@ -319,20 +413,18 @@ private static byte[] readRecordBytes(final int blockSizeInBytes, final InputStr int nReadAttempts = 0; // keep track of how many times we've read // because we might not read enough bytes from the file in a single go, do it in a loop until we get EOF - while ( bytesRead < blockSizeInBytes ) { + while (bytesRead < blockSizeInBytes) { final int read1 = inputStream.read(record, bytesRead, blockSizeInBytes - bytesRead); - if ( read1 == -1 ) + nReadAttempts++; + if (read1 == -1) { validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes); + break; + } else bytesRead += read1; } - - if ( GeneralUtils.DEBUG_MODE_ENABLED && nReadAttempts > 1 ) { // TODO -- remove me - System.err.println("Required multiple read attempts to actually get the entire BCF2 block, unexpected behavior"); - } - validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes); - } catch ( IOException e ) { + } catch (final IOException e) { throw new TribbleException("I/O error while reading BCF2 file", e); } @@ -349,14 +441,32 @@ private static byte[] readRecordBytes(final int blockSizeInBytes, final InputStr private static void validateReadBytes(final int actuallyRead, final int nReadAttempts, final int expected) { assert expected >= 0; - if ( actuallyRead < expected ) { + if (actuallyRead < expected) { throw new TribbleException( - String.format("Failed to read next complete record: expected %d bytes but read only %d after %d iterations", - expected, actuallyRead, nReadAttempts)); + String.format("Failed to read next complete record: expected %d bytes 
but read only %d after %d read attempts", + expected, actuallyRead, nReadAttempts)); } } public final byte readTypeDescriptor() throws IOException { - return BCF2Utils.readByte(recordStream); + return (byte) recordStream.read(); + } + + + // ---------------------------------------------------------------------- + // + // Version specific behavior + // + // ---------------------------------------------------------------------- + + + public abstract int getPaddingValue(final BCF2Type type); + + public static class BCF2_2Decoder extends BCF2Decoder { + + @Override + public int getPaddingValue(final BCF2Type type) { + return type.getEOVBytes(); + } } } diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java b/src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java new file mode 100644 index 0000000000..5a1d0ffd94 --- /dev/null +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Dictionary.java @@ -0,0 +1,274 @@ +package htsjdk.variant.bcf2; + +import htsjdk.tribble.TribbleException; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFSimpleHeaderLine; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.BiConsumer; +import java.util.stream.Collectors; + +/** + * Dictionary of strings or contigs for use with a BCF file. + *

+ * Provides an Integer -> String map interface, but determines during construction whether + * the mapping can be stored as an array or it must be stored using a map. + *

+ * This class validates that IDX fields are used as required by the BCF 2.2 spec, namely + * that either all lines of a given dictionary type (contig or FORMAT/INFO/FILTER) have + * IDX fields or none do. + *

+ * The spec does not require a 1-to-1 IDX-to-string mapping, but logically a header with a + * 1-to-n IDX-to-string mapping would be unparsable, and we reject such headers, while an + * n-to-1 IDX-to-string mapping might result from tools that do not deduplicate IDXs, so + * we accept them. + */ +public abstract class BCF2Dictionary { + + /** + * Create and return a BCF string dictionary + * The dictionary is an ordered list of common VCF identifiers (FILTER, INFO, and FORMAT) fields. + *

+ * Note that it's critical that the list be dedupped and sorted in a consistent manner each time, + * as the BCF2 offsets are encoded relative to this dictionary, and if it isn't determined exactly + * the same way as in the header each time it's very bad + * + * @param vcfHeader VCFHeader containing the strings to be stored + * @param version BCF version for which the dictionary will be used + * @return BCF2Dictionary suitable for use with a BCF file + */ + public static BCF2Dictionary makeBCF2StringDictionary(final VCFHeader vcfHeader, final BCFVersion version) { + final List headerLines = vcfHeader.getMetaDataInInputOrder().stream() + .filter(BCF2Dictionary::isStringDictionaryDefining) + .map(l -> (VCFSimpleHeaderLine) l) + .collect(Collectors.toList()); + + return BCF2Dictionary.makeDictionary(headerLines, version, true); + } + + private static boolean isStringDictionaryDefining(final VCFHeaderLine line) { + switch (line.getKey()) { + case VCFConstants.INFO_HEADER_KEY: + case VCFConstants.FORMAT_HEADER_KEY: + case VCFConstants.FILTER_HEADER_KEY: + return true; + default: + return false; + } + } + + /** + * Create and return a BCF contig dictionary + * + * @param vcfHeader VCFHeader containing the contig header lines to be stored + * @param version BCF version for which the dictionary will be used + * @return BCF2Dictionary suitable for use with a BCF file + */ + public static BCF2Dictionary makeBCF2ContigDictionary(final VCFHeader vcfHeader, final BCFVersion version) { + return BCF2Dictionary.makeDictionary(vcfHeader.getContigLines(), version, false); + } + + private static BCF2Dictionary makeDictionary( + final List headerLines, + final BCFVersion version, + final boolean stringDictionary + ) { + if (headerLines.isEmpty()) { + return new BCF2DenseDictionary(Collections.emptyList()); + } + + // Note that we count FILTER/FORMAT/INFO header lines with the same ID but different key + // (e.g. 
a FORMAT line and an INFO line both with ID "A") to define the same string + // for the purposes of building the dictionary + // c.f. https://github.com/samtools/hts-specs/issues/591#issuecomment-904487133 + final Set seen = new HashSet<>(headerLines.size() + 1); + + if (stringDictionary) { + // Special case the special PASS field which may not show up in the FILTER field definitions + seen.add(VCFConstants.PASSES_FILTERS_v4); + } + + // Check version and possibly peek at first value to see if lines should contain IDX fields or not + final boolean shouldHaveIDX = version.getMinorVersion() > 1 && + headerLines.get(0).getGenericFieldValue(BCF2Codec.IDXField) != null; + + // Validate + for (final VCFSimpleHeaderLine headerLine : headerLines) { + final String idxString = headerLine.getGenericFieldValue(BCF2Codec.IDXField); + if ((idxString == null) == shouldHaveIDX) { + // If any line had an IDX then they all should + throw new TribbleException.InvalidHeader(String.format( + "Inconsistent IDX field usage in BCF file %s header line %s, %s", + headerLine.getKey(), + headerLine.getID(), + shouldHaveIDX ? "did not find expected IDX field" : "unexpected IDX field" + )); + } + } + + if (shouldHaveIDX) { + final HashMap strings = new HashMap<>(headerLines.size() + 1); + int maxIDX = 0; + if (stringDictionary) { + strings.put(0, VCFConstants.PASSES_FILTERS_v4); + } + + for (final VCFSimpleHeaderLine line : headerLines) { + final String id = line.getID(); + final int IDX = Integer.parseUnsignedInt(line.getGenericFieldValue(BCF2Codec.IDXField)); + + // Have we seen this IDX before with a different string? 
+ if (strings.containsKey(IDX)) { + final String oldString = strings.get(IDX); + if (!oldString.equals(id)) { + throw new TribbleException.InvalidHeader(String.format( + "IDX %d associated with multiple dictionary defining strings: %s and %s", + IDX, oldString, id + )); + } + } + + if (!seen.contains(id)) { + seen.add(id); + maxIDX = Math.max(maxIDX, IDX); + strings.put(IDX, line.getID()); + } + + } + if (maxIDX == seen.size() - 1) { + // By the pigeonhole principle, if we have N unique non-negative IDXs numbered starting from 0 + // (possibly including 0 -> PASS implicitly) and (N - 1) is the highest IDX we have seen, + // we have all the IDXs in [0, N), which we can represent as a length N dense array. + // This check is useful because bcftools will always add IDX fields to headers even when not + // strictly necessary, so we can avoid the cost of the hash map in many cases. + final ArrayList stringsList = new ArrayList<>(seen.size()); + strings.forEach(stringsList::add); + return new BCF2DenseDictionary(stringsList); + } else { + return new BCF2SparseDictionary(strings); + } + } else { + final ArrayList strings = new ArrayList<>(headerLines.size() + 1); + if (stringDictionary) { + strings.add(VCFConstants.PASSES_FILTERS_v4); + } + + for (final VCFSimpleHeaderLine line : headerLines) { + final String id = line.getID(); + if (!seen.contains(id)) { + strings.add(line.getID()); + seen.add(id); + } + } + return new BCF2DenseDictionary(strings); + } + } + + /** + * Additional method in interface to avoid boxing when indexing into a + * dictionary backed by a List + * + * @param i index + * @return the string associated with the index or null + */ + public abstract String get(final int i); + + /** + * Performs the given action for each entry in the dictionary. 
+ * @param action the action to be performed + */ + public abstract void forEach(final BiConsumer action); + + /** + * @return the number of elements in the dictionary + */ + public abstract int size(); + + /** + * @param i the BCF index to search for + * @return true if there is a string or contig mapped to the given index + */ + public abstract boolean containsIndex(final int i); + + /** + * BCF 2.2 dense sequence dictionary. Strings are assigned an index corresponding to its position in a 0-indexed + * array. This dictionary is used if no IDX fields are present in the header, or they are present, but they + * represent a set of indices that are of the form 0, 1, ..., n, that is, the set has no gaps and is numbered + * starting at 0. + */ + private static class BCF2DenseDictionary extends BCF2Dictionary { + + private final List dictionary; + + private BCF2DenseDictionary(final List dictionary) { + this.dictionary = dictionary; + } + + @Override + public String get(final int i) { + return i < 0 || i >= dictionary.size() ? null : dictionary.get(i); + } + + @Override + public void forEach(final BiConsumer action) { + int i = 0; + for (final String s : dictionary) { + action.accept(i, s); + i++; + } + } + + @Override + public int size() { + return this.dictionary.size(); + } + + @Override + public boolean containsIndex(final int i) { + return i < this.dictionary.size(); + } + } + + /** + * BCF 2.2 sparse dictionary. Strings are assigned an index corresponding to its line's IDX field. + * This dictionary is used if IDX fields are present in the header, and they represent a set of + * indices that is not of the form 0, 1, ..., n, that is, the set has gaps or is not numbered starting + * at 0. 
+ */ + private static class BCF2SparseDictionary extends BCF2Dictionary { + + private final Map dictionary; + + private BCF2SparseDictionary(final Map dictionary) { + this.dictionary = dictionary; + } + + @Override + public String get(final int i) { + return dictionary.get(i); + } + + @Override + public void forEach(final BiConsumer action) { + this.dictionary.forEach(action); + } + + @Override + public int size() { + return this.dictionary.size(); + } + + @Override + public boolean containsIndex(final int i) { + return this.dictionary.containsKey(i); + } + } +} diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java new file mode 100644 index 0000000000..335fc2e7f2 --- /dev/null +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Encoder.java @@ -0,0 +1,337 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package htsjdk.variant.bcf2; + +import htsjdk.samtools.util.ListByteBufferOutputStream; +import htsjdk.tribble.TribbleException; + +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +/** + * See #BCFWriter for documentation on this classes role in encoding BCF2 files + * + * @author Mark DePristo + * @since 06/12 + */ +public abstract class BCF2Encoder { + // TODO -- increase default size? + public static final int WRITE_BUFFER_INITIAL_SIZE = 16_384; + protected final ListByteBufferOutputStream encodeStream = new ListByteBufferOutputStream(WRITE_BUFFER_INITIAL_SIZE); + + public static BCF2Encoder getEncoder(final BCFVersion version) { + switch (version.getMinorVersion()) { + case 2: + return new BCF2_2Encoder(); + default: + throw new TribbleException("BCF2Codec can only process BCF2 files with minor version <= " + 2 + " but this file has minor version " + version.getMinorVersion()); + } + } + + + // -------------------------------------------------------------------------------- + // + // Functions to return the data being encoded here + // + // -------------------------------------------------------------------------------- + + /** + * This allocates a new array and copies the stream's contents over so it + * should not be used in the actual encoder, but may be useful for testing + */ + public byte[] getRecordBytes() { + final byte[] bytes = encodeStream.toByteArray(); + encodeStream.reset(); + return bytes; + } + + public final int getSize() { + return encodeStream.size(); + } + + public final void write(final OutputStream out) throws IOException { + encodeStream.writeTo(out); + encodeStream.reset(); + } + + + // -------------------------------------------------------------------------------- + // + // Writing typed values (writes out typing byte(s) first) + // + // 
-------------------------------------------------------------------------------- + + public final void encodeTypedMissing(final BCF2Type type) throws IOException { + encodeType(0, type); + } + + public final void encodeTyped(final Object value, final BCF2Type type) throws IOException { + if (value == null) + encodeTypedMissing(type); + else { + switch (type) { + case INT8: + case INT16: + case INT32: + encodeTypedInt((Integer) value, type); + break; + case FLOAT: + encodeTypedFloat((Double) value); + break; + case CHAR: + encodeTypedString((String) value); + break; + default: + throw new IllegalArgumentException("Illegal type encountered " + type); + } + } + } + + public final void encodeTypedInt(final int v) throws IOException { + final BCF2Type type = BCF2Utils.determineIntegerType(v); + encodeTypedInt(v, type); + } + + public final void encodeTypedInt(final int v, final BCF2Type type) throws IOException { + encodeType(1, type); + encodeRawInt(v, type); + } + + public final void encodeTypedFloat(final double v) throws IOException { + encodeType(1, BCF2Type.FLOAT); + encodeRawFloat(v); + } + + public final void encodeTypedString(final String s) throws IOException { + encodeTypedString(s.getBytes(StandardCharsets.UTF_8)); + } + + public final void encodeTypedString(final byte[] s) throws IOException { + encodeType(s.length, BCF2Type.CHAR); + encodeStream.write(s); + } + + public final void encodeTypedVecInt(final int[] vs) throws IOException { + final int size = vs.length; + final BCF2Type type = BCF2Utils.determineIntegerType(vs); + encodeType(size, type); + encodeRawVecInt(vs, size, type); + } + + + public final void encodeTypedVecInt(final int[] vs, final int paddedSize) throws IOException { + final BCF2Type type = BCF2Utils.determineIntegerType(vs); + encodeType(paddedSize, type); + encodeRawVecInt(vs, paddedSize, type); + } + + + // -------------------------------------------------------------------------------- + // + // Writing raw values (does not write out 
typing byte(s)) + // + // -------------------------------------------------------------------------------- + + public final void encodeRawValues(final Collection v, final BCF2Type type) throws IOException { + for (final T v1 : v) { + encodeRawValue(v1, type); + } + } + + public final void encodeRawValue(final T value, final BCF2Type type) throws IOException { + try { + if (value == type.getMissingJavaValue()) + encodeRawMissingValue(type); + else { + switch (type) { + case INT8: + case INT16: + case INT32: + encodeRawBytes((Integer) value, type); + break; + case FLOAT: + encodeRawFloat((Double) value); + break; + case CHAR: + encodeRawChar((Byte) value); + break; + default: + throw new IllegalArgumentException("Illegal type encountered " + type); + } + } + } catch (final ClassCastException e) { + throw new ClassCastException("BUG: invalid type cast to " + type + " from " + value); + } + } + + public final void encodeRawMissingValue(final BCF2Type type) throws IOException { + encodeRawBytes(type.getMissingBytes(), type); + } + + + // -------------------------------------------------------------------------------- + // + // Low-level encoders + // + // -------------------------------------------------------------------------------- + + public final void encodeType(final int size, final BCF2Type type) throws IOException { + if (size <= BCF2Utils.MAX_INLINE_ELEMENTS) { + final int typeByte = BCF2Utils.encodeTypeDescriptor(size, type); + encodeStream.write(typeByte); + } else { + final int typeByte = BCF2Utils.encodeTypeDescriptor(BCF2Utils.OVERFLOW_ELEMENT_MARKER, type); + encodeStream.write(typeByte); + // write in the overflow size + encodeTypedInt(size); + } + } + + public final void encodeRawBytes(final int v, final BCF2Type type) throws IOException { + type.write(v, encodeStream); + } + + public final void encodeRawInt(final int v, final BCF2Type type) throws IOException { + type.write(v, encodeStream); + } + + public final void encodeRawFloat(final double v) 
throws IOException { + encodeRawBytes(Float.floatToIntBits((float) v), BCF2Type.FLOAT); + } + + public final void encodeRawChar(final byte c) { + encodeStream.write(c); + } + + public final void encodeRawString(final byte[] s, final int paddedSize) { + encodeStream.write(s); + final int padding = paddedSize - s.length; + if (padding > 0) { + // Pad with zeros, see https://github.com/samtools/hts-specs/issues/232 + encodeStream.write((byte) 0, padding); + } + } + + public final void encodeRawVecInt(final int[] vs, final int paddedSize, final BCF2Type type) throws IOException { + encodeRawVecInt(vs, type); + encodePaddingValues(paddedSize - vs.length, type); + } + + public final void encodeRawVecInt(final int[] vs, final BCF2Type type) throws IOException { + for (final int v : vs) { + type.write(v, encodeStream); + } + } + + public final void encodeRawVecInt(final List vs, final BCF2Type type) throws IOException { + for (final Integer v : vs) { + if (v == null) { + type.write(type.getMissingBytes(), encodeStream); + } else { + type.write(v, encodeStream); + } + } + } + + public final void encodeRawVecFloat(final double[] vs) throws IOException { + for (final double v : vs) { + encodeRawFloat(v); + } + } + + public final void encodeRawVecFloat(final List vs) throws IOException { + for (final Double v : vs) { + if (v == null) { + encodeRawMissingValue(BCF2Type.FLOAT); + } else { + encodeRawFloat(v); + } + } + } + + public final void encodePaddingValues(final int size, final BCF2Type type) throws IOException { + for (int i = 0; i < size; i++) { + encodePaddingValue(type); + } + } + + public abstract void encodePaddingValue(final BCF2Type type) throws IOException; + + // -------------------------------------------------------------------------------- + // + // Utility Functions + // + // -------------------------------------------------------------------------------- + + public final byte[] compactStrings(final String[] strings) { + return 
compactStrings(Arrays.asList(strings)); + } + + public abstract byte[] compactStrings(final List strings); + + + // -------------------------------------------------------------------------------- + // + // Version specific behavior + // + // -------------------------------------------------------------------------------- + + public static class BCF2_2Encoder extends BCF2Encoder { + + @Override + public void encodePaddingValue(final BCF2Type type) throws IOException { + type.write(type.getEOVBytes(), encodeStream); + } + + @Override + public byte[] compactStrings(final List strings) { + if (strings.isEmpty()) return new byte[0]; + + // 1 comma for each string except the first, then add on individual string lengths + int size = strings.size() - 1; + final byte[][] bytes = new byte[strings.size()][]; + int i = 0; + for (final String s : strings) { + final byte[] b = s.getBytes(StandardCharsets.UTF_8); + size += b.length; + bytes[i++] = b; + } + final ByteBuffer buff = ByteBuffer.allocate(size); + buff.put(bytes[0]); + for (int j = 1; j < strings.size(); j++) { + buff.put((byte) ','); + buff.put(bytes[j]); + } + + return buff.array(); + } + } +} diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java new file mode 100644 index 0000000000..546fd3eff0 --- /dev/null +++ b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoder.java @@ -0,0 +1,360 @@ +package htsjdk.variant.bcf2.BCF2FieldWriter; + +import htsjdk.tribble.TribbleException; +import htsjdk.variant.bcf2.BCF2Encoder; +import htsjdk.variant.bcf2.BCF2Type; +import htsjdk.variant.bcf2.BCF2Utils; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFCompoundHeaderLine; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +abstract class BCF2FieldEncoder { + + final 
BCF2Encoder encoder; + + BCF2Type type; + + /* + The number of VCF values this encoder has seen, taking the maximum over all objects loaded. + This value is not identical to either the number of Java objects loaded or the BCF2 typing byte length + but is primarily useful for checking that the number of VCF values matches the header's declared count. + + For example, for a writer of type Character having loaded the String "abc", nValues is 3 matching its typing byte, + while for a writer of type String having loaded the String "abc", nValues is 1, but its typing byte length is 3. + */ + int nValues; + + BCF2FieldEncoder(final BCF2Encoder encoder) { + this.encoder = encoder; + } + + abstract void load(final Object o); + + void encodeType() throws IOException { + encoder.encodeType(nValues, type); + } + + void checkNValues(final VCFCompoundHeaderLine headerLine, final VariantContext vc) { + final int expectedValues = headerLine.getCount(vc); + if (nValues > expectedValues) + throw BCF2FieldWriter.tooManyValues(nValues, expectedValues, headerLine.getKey(), vc); + nValues = expectedValues; + } + + abstract void encode() throws IOException; + + + static class AtomicIntFieldEncoder extends BCF2FieldEncoder { + + private final List vs = new ArrayList<>(); + + AtomicIntFieldEncoder(final BCF2Encoder encoder) { + super(encoder); + type = BCF2Type.INT8; + nValues = 1; + } + + @Override + void load(final Object o) { + if (o == null) { + vs.add(null); + } else if (o instanceof Integer) { + final Integer v = (Integer) o; + type = BCF2Utils.maxIntegerType(type, BCF2Utils.determineIntegerType(v)); + vs.add(v); + } else { + throw BCF2FieldEncoder.incompatibleType(o, type); + } + } + + @Override + void encode() throws IOException { + encoder.encodeRawVecInt(vs, type); + vs.clear(); + type = BCF2Type.INT8; + } + } + + static class AtomicFloatFieldEncoder extends BCF2FieldEncoder { + + private final List vs = new ArrayList<>(); + + AtomicFloatFieldEncoder(final BCF2Encoder encoder) { 
+ super(encoder); + type = BCF2Type.FLOAT; + nValues = 1; + } + + @Override + void load(final Object o) { + if (o == null) { + vs.add(null); + } else if (o instanceof Double) { + vs.add((Double) o); + } else { + throw BCF2FieldEncoder.incompatibleType(o, type); + } + } + + @Override + void encode() throws IOException { + encoder.encodeRawVecFloat(vs); + vs.clear(); + } + } + + static class CharFieldEncoder extends BCF2FieldEncoder { + + // TODO see https://github.com/samtools/hts-specs/issues/618 + // private static final byte[] MISSING = new byte[] {(byte) BCF2Type.CHAR.getMissingBytes()}; + private static final byte[] EMPTY = new byte[0]; + + private final List vs = new ArrayList<>(); + + CharFieldEncoder(final BCF2Encoder encoder) { + super(encoder); + type = BCF2Type.CHAR; + nValues = 0; + } + + @Override + void load(final Object o) { + if (o == null) { + vs.add(EMPTY); + } else if (o instanceof String) { + final byte[] b = ((String) o).getBytes(StandardCharsets.UTF_8); + nValues = Math.max(nValues, b.length); + vs.add(b); + } else if (o instanceof List) { + final List strings = (List) o; + nValues = Math.max(nValues, strings.size()); + final ByteBuffer buff = ByteBuffer.allocate(strings.size()); + for (final String s : strings) { + if (s == null) { + buff.put((byte) type.getMissingBytes()); + } else if (s.length() > 1) { + throw new TribbleException("Value of VCF type Character is a string with more than 1 character: " + s); + } else { + buff.put(s.getBytes(StandardCharsets.UTF_8)[0]); + } + } + vs.add(buff.array()); + } else { + throw BCF2FieldEncoder.incompatibleType(o, type); + } + } + + @Override + void encode() { + for (final byte[] v : vs) { + encoder.encodeRawString(v, nValues); + } + vs.clear(); + nValues = 0; + } + } + + static class StringFieldEncoder extends BCF2FieldEncoder { + + private static final byte[] EMPTY = new byte[0]; + + private final List vs = new ArrayList<>(); + private int charLength; + + StringFieldEncoder(final BCF2Encoder encoder) 
{ + super(encoder); + type = BCF2Type.CHAR; + nValues = 0; + charLength = 0; + } + + @Override + void load(final Object o) { + if (o == null) { + vs.add(EMPTY); + } else { + final byte[] v; + final int stringsSeen; + if (o instanceof String) { + v = ((String) o).getBytes(StandardCharsets.UTF_8); + stringsSeen = 1; + } else if (o instanceof List) { + final List strings = (List) o; + v = encoder.compactStrings(strings); + stringsSeen = strings.size(); + } else if (o instanceof String[]) { + final String[] strings = (String[]) o; + v = encoder.compactStrings(strings); + stringsSeen = strings.length; + } else { + throw BCF2FieldEncoder.incompatibleType(o, type); + } + + vs.add(v); + nValues = Math.max(nValues, stringsSeen); + charLength = Math.max(charLength, v.length); + } + } + + @Override + void encodeType() throws IOException { + encoder.encodeType(charLength, type); + } + + @Override + void encode() { + for (final byte[] v : vs) { + encoder.encodeRawString(v, charLength); + } + vs.clear(); + nValues = 0; + charLength = 0; + } + } + + static class VecIntFieldEncoder extends BCF2FieldEncoder { + + private final List vs = new ArrayList<>(); + + VecIntFieldEncoder(final BCF2Encoder encoder) { + super(encoder); + type = BCF2Type.INT8; + nValues = 0; + } + + @Override + void load(final Object o) { + if (o != null) { + if (o instanceof List) { + final List v = (List) o; + type = BCF2Utils.maxIntegerType(type, BCF2Utils.determineIntegerType(v)); + nValues = Math.max(nValues, v.size()); + } else if (o instanceof Integer) { + final Integer v = (Integer) o; + type = BCF2Utils.maxIntegerType(type, BCF2Utils.determineIntegerType(v)); + nValues = Math.max(nValues, 1); + } else if (o instanceof int[]) { + final int[] v = (int[]) o; + type = BCF2Utils.maxIntegerType(type, BCF2Utils.determineIntegerType(v)); + nValues = Math.max(nValues, v.length); + } else { + // TODO do we need to support Integer[] ? 
+ throw BCF2FieldEncoder.incompatibleType(o, type); + } + } + vs.add(o); + } + + @Override + void encode() throws IOException { + if (nValues > 0) { + for (final Object o : vs) { + final int valuesWritten; + if (o == null) { + valuesWritten = 0; + } else if (o instanceof List) { + final List v = (List) o; + encoder.encodeRawVecInt(v, type); + valuesWritten = v.size(); + } else if (o instanceof Integer) { + final Integer v = (Integer) o; + encoder.encodeRawInt(v, type); + valuesWritten = 1; + } else if (o instanceof int[]) { + final int[] v = (int[]) o; + encoder.encodeRawVecInt(v, type); + valuesWritten = v.length; + } else { + throw BCF2FieldEncoder.incompatibleType(o, type); + } + // In order to produce output that bcftools can interpret, we always write one MISSING + // value even if the input is entirely absent, which we would otherwise write as a vector of + // all EOV values + if (valuesWritten == 0) { + encoder.encodeRawMissingValue(type); + } + encoder.encodePaddingValues(nValues - Math.max(valuesWritten, 1), type); + } + } + vs.clear(); + type = BCF2Type.INT8; + nValues = 0; + } + } + + static class VecFloatFieldEncoder extends BCF2FieldEncoder { + + private final List vs = new ArrayList<>(); + + VecFloatFieldEncoder(final BCF2Encoder encoder) { + super(encoder); + type = BCF2Type.FLOAT; + nValues = 0; + } + + @Override + void load(final Object o) { + if (o != null) { + if (o instanceof List) { + final List v = (List) o; + nValues = Math.max(nValues, v.size()); + } else if (o instanceof Double) { + nValues = Math.max(nValues, 1); + } else if (o instanceof double[]) { + final double[] v = (double[]) o; + nValues = Math.max(nValues, v.length); + } else { + // TODO do we need to support Double[] ? 
+ throw BCF2FieldEncoder.incompatibleType(o, type); + } + } + vs.add(o); + } + + @Override + void encode() throws IOException { + if (nValues > 0) { + for (final Object o : vs) { + final int valuesWritten; + if (o == null) { + valuesWritten = 0; + } else if (o instanceof List) { + final List v = (List) o; + encoder.encodeRawVecFloat(v); + valuesWritten = v.size(); + } else if (o instanceof Double) { + final Double v = (Double) o; + encoder.encodeRawFloat(v); + valuesWritten = 1; + } else if (o instanceof double[]) { + final double[] v = (double[]) o; + encoder.encodeRawVecFloat(v); + valuesWritten = v.length; + } else { + throw BCF2FieldEncoder.incompatibleType(o, type); + } + + // In order to produce output that bcftools can interpret, we always write one MISSING + // value even if the input is entirely absent, which we would otherwise write as a vector of + // all EOV values + if (valuesWritten == 0) { + encoder.encodeRawMissingValue(type); + } + encoder.encodePaddingValues(nValues - Math.max(valuesWritten, 1), BCF2Type.FLOAT); + } + } + vs.clear(); + nValues = 0; + } + } + + static TribbleException incompatibleType(final Object o, final BCF2Type type) { + final String error = "Could not write object: %s whose type %s is incompatible with declared header of type: %s"; + return new TribbleException(String.format(error, o, o.getClass().getSimpleName(), type)); + } +} diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java new file mode 100644 index 0000000000..d91b706681 --- /dev/null +++ b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriter.java @@ -0,0 +1,515 @@ +package htsjdk.variant.bcf2.BCF2FieldWriter; + +import htsjdk.tribble.TribbleException; +import htsjdk.variant.bcf2.BCF2Type; +import htsjdk.variant.bcf2.BCF2Utils; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import 
htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.bcf2.BCF2Encoder; +import htsjdk.variant.vcf.VCFCompoundHeaderLine; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +/** + * INFO and FORMAT writers + */ +class BCF2FieldWriter { + final VCFCompoundHeaderLine headerLine; + final int dictionaryOffset; + final BCF2Type dictionaryOffsetType; + final String key; + final BCF2Encoder encoder; + + BCF2FieldWriter(final VCFCompoundHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + this.headerLine = headerLine; + this.dictionaryOffset = dictionaryOffset; + this.dictionaryOffsetType = BCF2Utils.determineIntegerType(dictionaryOffset); + this.key = headerLine.getID(); + this.encoder = encoder; + } + + /** + * This should be called before encoding every VariantContext in both INFO and FORMAT writers + */ + void encodeKey() throws IOException { + encoder.encodeTypedInt(dictionaryOffset, dictionaryOffsetType); + } + + + ////////////////////////////////////////////////// + // Factory Methods // + ////////////////////////////////////////////////// + static SiteWriter createSiteWriter( + final VCFInfoHeaderLine line, + final int offset, + final BCF2Encoder encoder + ) { + return line.getType() == VCFHeaderLineType.Flag + ? 
new SiteFlagWriter(line, offset, encoder) + : new SiteAttributeWriter(line, offset, encoder); + } + + static GenotypeWriter createGenotypeWriter( + final VCFFormatHeaderLine line, + final int offset, + final BCF2Encoder encoder + ) { + // Specialized writers for fields stored inline in the Genotype and not in its attributes map + switch (line.getID()) { + case VCFConstants.GENOTYPE_KEY: + return new GTWriter(line, offset, encoder); + case VCFConstants.GENOTYPE_FILTER_KEY: + return new FTWriter(line, offset, encoder); + case VCFConstants.DEPTH_KEY: + return new DPWriter(line, offset, encoder); + case VCFConstants.GENOTYPE_QUALITY_KEY: + return new GQWriter(line, offset, encoder); + case VCFConstants.GENOTYPE_ALLELE_DEPTHS: + return new ADWriter(line, offset, encoder); + case VCFConstants.GENOTYPE_PL_KEY: + return new PLWriter(line, offset, encoder); + } + + if (line.getType() == VCFHeaderLineType.Flag) { + throw new TribbleException("Format lines cannot have type Flag"); + } else { + return new GenotypeAttributeWriter(line, offset, encoder); + } + } + + private static BCF2FieldEncoder getEncoder(final VCFCompoundHeaderLine line, final BCF2Encoder encoder) { + switch (line.getType()) { + case Integer: + return line.isFixedCount() && line.getCount() == 1 + ? new BCF2FieldEncoder.AtomicIntFieldEncoder(encoder) + : new BCF2FieldEncoder.VecIntFieldEncoder(encoder); + case Float: + return line.isFixedCount() && line.getCount() == 1 + ? 
new BCF2FieldEncoder.AtomicFloatFieldEncoder(encoder) + : new BCF2FieldEncoder.VecFloatFieldEncoder(encoder); + case String: + return new BCF2FieldEncoder.StringFieldEncoder(encoder); + case Character: + return new BCF2FieldEncoder.CharFieldEncoder(encoder); + default: + throw new TribbleException("Unrecognized line type: " + line.getType()); + } + } + + + /** + * Class that writes one field specified by a {@link VCFInfoHeaderLine} + * contained in the attributes map of a {@link VariantContext} + */ + abstract static class SiteWriter extends BCF2FieldWriter { + + SiteWriter(final VCFInfoHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + abstract void encode(final VariantContext vc) throws IOException; + } + + /** + * INFO writer that accesses variant context fields stored in the VC's attributes map + */ + static class SiteAttributeWriter extends SiteWriter { + + private final BCF2FieldEncoder siteEncoder; + private final boolean boundedNonAtomic; + + SiteAttributeWriter(final VCFInfoHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + this.siteEncoder = BCF2FieldWriter.getEncoder(headerLine, encoder); + + // If this line's count is unbounded, or the inner encoder is one of the atomic specializations, + // the inner encoder can always figure out the correct number of BCF2 values to write out by itself. 
+ // Otherwise we need to inspect the context to determine the number of values to encode + // and possibly error if too many values were provided + this.boundedNonAtomic = headerLine.getCountType() != VCFHeaderLineCount.UNBOUNDED && !( + siteEncoder instanceof BCF2FieldEncoder.AtomicIntFieldEncoder || siteEncoder instanceof BCF2FieldEncoder.AtomicFloatFieldEncoder + ); + } + + @Override + void encode(final VariantContext vc) throws IOException { + final Object o = vc.getAttribute(key); + if (o == null) { + encoder.encodeTypedMissing(siteEncoder.type); + } else { + siteEncoder.load(o); + if (boundedNonAtomic) { + siteEncoder.checkNValues(headerLine, vc); + } + + siteEncoder.encodeType(); + siteEncoder.encode(); + } + } + } + + /** + * INFO writer that accesses Flags stored in the VariantContext's attributes map + */ + static class SiteFlagWriter extends SiteWriter { + + SiteFlagWriter(final VCFInfoHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + void encode(final VariantContext vc) throws IOException { + // This seems unintuitive, but it matches the behavior of htslib/bcftools + // See https://github.com/samtools/hts-specs/issues/384 + encoder.encodeRawBytes(0, BCF2Type.INT8); + } + } + + + // TODO in the genotype writers, a missing genotype (one where variantContext.getGenotype(sampleName) == null) + // is treated like one where all its attributes/inline fields are missing, this matches the behavior + // of the old writer, which previously created a new empty Genotype object for each missing genotypes, is this right? + + /** + * Class that writes one field specified by a {@link VCFFormatHeaderLine} + * from all Genotypes contained inside a {@link VariantContext}, iterating through each Genotype in order. + *

+ * Writing occurs in two passes: first all the attribute objects are loaded into the lower level + * {@link BCF2FieldEncoder} then the attributes are written out. This is necessary as some aspects of the BCF + * encoding such as type and sometimes count can only be determined by inspecting all elements to be written. + */ + abstract static class GenotypeWriter extends BCF2FieldWriter { + + GenotypeWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + abstract void encode(final VariantContext vc, final List sampleNames) throws IOException; + } + + /** + * FORMAT writer that accesses genotype fields stored in the Genotype object's attributes map + */ + static class GenotypeAttributeWriter extends GenotypeWriter { + + private final BCF2FieldEncoder siteEncoder; + private final boolean boundedNonAtomic; + + GenotypeAttributeWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + this.siteEncoder = BCF2FieldWriter.getEncoder(headerLine, encoder); + + // If this line's count is unbounded, or the inner encoder is one of the atomic specializations, + // the inner encoder can always figure out the correct number of BCF2 values to write out by itself. + // Otherwise we need to inspect the context to determine the number of values to encode + // and possibly error if too many values were provided + this.boundedNonAtomic = headerLine.getCountType() != VCFHeaderLineCount.UNBOUNDED && !( + siteEncoder instanceof BCF2FieldEncoder.AtomicIntFieldEncoder || siteEncoder instanceof BCF2FieldEncoder.AtomicFloatFieldEncoder + ); + } + + @Override + void encode(final VariantContext vc, final List sampleNames) throws IOException { + for (final String s : sampleNames) { + final Genotype g = vc.getGenotype(s); + siteEncoder.load(g == null ? 
null : g.getExtendedAttribute(key)); + } + + if (boundedNonAtomic) { + siteEncoder.checkNValues(headerLine, vc); + } + + siteEncoder.encodeType(); + siteEncoder.encode(); + } + } + + /** + * Base class for FORMAT writers that access genotype fields stored directly + * as int fields in the Genotype object and not inside the attributes map. + */ + abstract static class GenotypeInlineAtomicIntWriter extends GenotypeWriter { + + // Used to store values to write out to avoid boxing + private int[] vs; + + GenotypeInlineAtomicIntWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + void encode(final VariantContext vc, final List sampleNames) throws IOException { + if (vs == null || vs.length < sampleNames.size()) { + vs = new int[sampleNames.size()]; + } + + BCF2Type type = BCF2Type.INT8; + int i = 0; + + for (final String s : sampleNames) { + final Genotype g = vc.getGenotype(s); + final int v = g == null ? 
-1 : get(g); + if (v != -1) { + type = BCF2Utils.maxIntegerType(type, BCF2Utils.determineIntegerType(v)); + } + vs[i++] = v; + } + + encoder.encodeType(1, type); + + for (int j = 0; j < i; j++) { + final int v = vs[j]; + if (v == -1) { + encoder.encodeRawMissingValue(type); + } else { + encoder.encodeRawInt(v, type); + } + } + } + + abstract int get(final Genotype g); + } + + static class DPWriter extends GenotypeInlineAtomicIntWriter { + + DPWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + int get(final Genotype g) { + return g.getDP(); + } + } + + static class GQWriter extends GenotypeInlineAtomicIntWriter { + + GQWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + int get(final Genotype g) { + return g.getGQ(); + } + } + + /** + * Base class for FORMAT writers that access genotype fields stored directly + * as int[] fields in the Genotype object and not inside the attributes map. + */ + abstract static class GenotypeInlineVecIntWriter extends GenotypeWriter { + + private final List vs = new ArrayList<>(); + + GenotypeInlineVecIntWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + void encode(final VariantContext vc, final List sampleNames) throws IOException { + BCF2Type type = BCF2Type.INT8; + + // For both vector of int types represented as inline fields by htsjdk (AD and PL), + // the count type can be determined by inspecting the header + final int nValues = headerLine.getCount(vc); + + // Find narrowest integer type that fits all values + for (final String s : sampleNames) { + final Genotype g = vc.getGenotype(s); + final int[] v = g == null ? 
null : get(g); + vs.add(v); + + if (v == null) continue; + if (v.length > nValues) + throw BCF2FieldWriter.tooManyValues(v.length, nValues, key, vc); + + type = BCF2Utils.maxIntegerType(type, BCF2Utils.determineIntegerType(v)); + } + + encoder.encodeType(nValues, type); + + for (final int[] vs : vs) { + if (vs == null) { + encoder.encodePaddingValues(nValues, type); + } else { + encoder.encodeRawVecInt(vs, nValues, type); + } + } + vs.clear(); + } + + abstract int[] get(final Genotype g); + } + + static class ADWriter extends GenotypeInlineVecIntWriter { + + ADWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + int[] get(final Genotype g) { + return g.getAD(); + } + } + + static class PLWriter extends GenotypeInlineVecIntWriter { + + PLWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + int[] get(final Genotype g) { + return g.getPL(); + } + } + + /** + * Writer for the FT or filter field. This is a special case of the String writer + * where the type of the value is known to be String (and not List) + * and null values must be specially handled by encoding them as PASS. 
+ */ + static class FTWriter extends GenotypeWriter { + + private static final byte[] PASS = "PASS".getBytes(StandardCharsets.US_ASCII); + + private final List vs = new ArrayList<>(); + + FTWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + void encode(final VariantContext vc, final List sampleNames) throws IOException { + int nValues = 0; + + for (final String s : sampleNames) { + final Genotype g = vc.getGenotype(s); + final String f; + final byte[] v; + if (g == null || (f = g.getFilters()) == null) { + v = FTWriter.PASS; + } else { + v = f.getBytes(StandardCharsets.UTF_8); + } + nValues = Math.max(nValues, v.length); + vs.add(v); + } + + encoder.encodeType(nValues, BCF2Type.CHAR); + for (final byte[] v : vs) { + encoder.encodeRawString(v, nValues); + } + vs.clear(); + } + } + + /** + * Specialized writer for GT field. + */ + static class GTWriter extends GenotypeWriter { + + private final HashMap alleleMapForTriPlus = new HashMap<>(5); + private Allele ref, alt1; + + GTWriter(final VCFFormatHeaderLine headerLine, final int dictionaryOffset, final BCF2Encoder encoder) { + super(headerLine, dictionaryOffset, encoder); + } + + @Override + void encode(final VariantContext vc, final List sampleNames) throws IOException { + buildAlleleMap(vc); + final int nValues = vc.getMaxPloidy(2); + // Offsets should always fit into a signed 8-bit integer but do this check anyway for spec compliance + final BCF2Type type = BCF2Utils.determineIntegerType(vc.getNAlleles() << 1); + + encoder.encodeType(nValues, type); + + for (final String s : sampleNames) { + final Genotype g = vc.getGenotype(s); + if (g != null) { + boolean notFirst = false; + for (final Allele a : g.getAlleles()) { + // TODO Genotype and Allele classes can't properly store phasing information for ploidy > 2 + // Currently all non ref alleles are assumed to have the same phasing + final int 
encoded = encodeAlleleWithoutPhasing(a) | ((g.isPhased() && notFirst) ? 0x01 : 0x00); + encoder.encodeRawInt(encoded, type); + notFirst = true; + } + // Pad with missing values if sample ploidy is less than maximum + final int padding = nValues - g.getPloidy(); + if (padding > 0) { + encoder.encodePaddingValues(padding, type); + } + } else { + // Entirely missing genotype, which we encode as vector of no call + // These cannot be encoded as MISSING values, because the BCF 2.2 spec explicitly forbids + // any negative values in the GT array and MISSING values are negative + for (int i = 0; i < nValues; i++) { + encoder.encodeRawInt(0, type); + } + } + } + } + + /** + * Fast path code to encode an allele without phasing information. + * Inline tests for == against ref (most common, first test) + * == alt1 (second most common, second test) + * == NO_CALL (third) + * and finally in the map from allele => offset for all alt 2+ alleles + * + * @param a the allele we want to encode + * @return the encoded allele without phasing information + */ + private int encodeAlleleWithoutPhasing(final Allele a) { + if (a == ref) return 2; // ( 0 + 1) << 1 + else if (a == alt1) return 4; // ( 1 + 1) << 1 + else if (a == Allele.NO_CALL) return 0; // (-1 + 1) << 1 + else { + final Integer i = alleleMapForTriPlus.get(a); + if (i == null) throw new IllegalStateException("BUG: Couldn't find allele offset for allele " + a); + return i; + } + } + + private void buildAlleleMap(final VariantContext vc) { + // ref and alt1 are handled by a fast path when determining the offset + // so they do not need to be placed in the map + final int nAlleles = vc.getNAlleles(); + ref = vc.getReference(); + alt1 = nAlleles > 1 ? 
vc.getAlternateAllele(0) : null; + + if (nAlleles > 2) { + // for multi-allelics we need to clear the map, and add additional lookups + alleleMapForTriPlus.clear(); + final List alleles = vc.getAlleles(); + for (int i = 2; i < alleles.size(); i++) { + // Perform encoding here so we only do it once instead of after every lookup + alleleMapForTriPlus.put(alleles.get(i), (i + 1) << 1); + } + } + } + + + ////////////////////////////////////////////////// + // Exception utilities // + ////////////////////////////////////////////////// + static TribbleException tooManyValues(final int observed, final int expected, final String key, final VariantContext vc) { + final String error = "Observed number of values: %d exceeds expected number: %d for attribute: %s in VariantContext: %s"; + return new TribbleException(String.format(error, observed, expected, key, vc)); + } +} diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriterManager.java b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriterManager.java new file mode 100644 index 0000000000..02fb3e4cdd --- /dev/null +++ b/src/main/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldWriterManager.java @@ -0,0 +1,114 @@ +package htsjdk.variant.bcf2.BCF2FieldWriter; + +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; +import htsjdk.variant.bcf2.BCF2Encoder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFCompoundHeaderLine; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import htsjdk.variant.vcf.VCFStandardHeaderLines; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class BCF2FieldWriterManager { + private static final Log log = 
Log.getInstance(BCF2FieldWriterManager.class); + + private final Map infoWriters; + private final Map formatWriters; + private final List sampleNames; + + public BCF2FieldWriterManager(final VCFHeader header, final Map dict, final BCF2Encoder encoder) { + infoWriters = new HashMap<>(header.getInfoHeaderLines().size()); + for (final VCFInfoHeaderLine line : header.getInfoHeaderLines()) { + final String field = line.getID(); + validateStandardHeader(line, VCFStandardHeaderLines.getInfoLine(field, false)); + final int offset = dict.get(field); + final BCF2FieldWriter.SiteWriter writer = BCF2FieldWriter.createSiteWriter(line, offset, encoder); + infoWriters.put(field, writer); + } + + formatWriters = new HashMap<>(header.getFormatHeaderLines().size()); + for (final VCFFormatHeaderLine line : header.getFormatHeaderLines()) { + final String field = line.getID(); + // We skip validation for the FT key because its line count changed between VCF versions 4.2 and 4.3 + // from UNBOUNDED to 1, while VCFStandardHeaderLines keeps the 4.2 definition. + // This does not matter for our BCF writing code because the concrete BCF count encoded in the typing + // bytes for strings always has to be determined by inspecting the strings themselves, so this validation + // would only produce noisy but harmless warnings. 
+ if (!field.equals(VCFConstants.GENOTYPE_FILTER_KEY)) { + validateStandardHeader(line, VCFStandardHeaderLines.getFormatLine(field, false)); + } + final int offset = dict.get(field); + final BCF2FieldWriter.GenotypeWriter writer = BCF2FieldWriter.createGenotypeWriter(line, offset, encoder); + formatWriters.put(field, writer); + } + + sampleNames = header.getGenotypeSamples(); + } + + public void writeInfo(final VariantContext vc) throws IOException { + for (final String field : vc.getAttributes().keySet()) { + final BCF2FieldWriter.SiteWriter writer = infoWriters.get(field); + if (writer == null) errorUnexpectedFieldToWrite(vc, field, "INFO"); + writer.encodeKey(); + writer.encode(vc); + } + } + + public void writeFormat(final VariantContext vc, final List genotypeFields) throws IOException { + for (final String field : genotypeFields) { + final BCF2FieldWriter.GenotypeWriter writer = formatWriters.get(field); + if (writer == null) errorUnexpectedFieldToWrite(vc, field, "FORMAT"); + writer.encodeKey(); + writer.encode(vc, sampleNames); + } + } + + private static void validateStandardHeader( + final T actualLine, + final T expectedLine + ) { + if (expectedLine == null) return; + final VCFHeaderLineType actualType = actualLine.getType(); + final VCFHeaderLineType expectedType = expectedLine.getType(); + if (actualType != expectedType) { + log.warn(String.format( + "Header with standard key: `%s` has type: %s which does not match standard type: %s", + actualLine.getID(), + actualType, + expectedType + )); + } + + final VCFHeaderLineCount actualCountType = actualLine.getCountType(); + final VCFHeaderLineCount expectedCountType = expectedLine.getCountType(); + if (actualCountType != expectedCountType || actualLine.isFixedCount() && actualLine.getCount() != expectedLine.getCount()) { + log.warn(String.format( + "Header with standard key: `%s` has count: %s which does not match standard count: %s", + actualLine.getID(), + actualLine.isFixedCount() ? 
actualLine.getCount() : actualCountType, + expectedLine.isFixedCount() ? expectedLine.getCount() : expectedCountType + )); + } + } + + private static void errorUnexpectedFieldToWrite( + final VariantContext vc, + final String field, + final String fieldType + ) { + throw new TribbleException(String.format( + "Found %s field %s of VariantContext at %s:%d from %s that has not been defined in the VCFHeader", + fieldType, field, + vc.getContig(), vc.getStart(), vc.getSource() + )); + } +} diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2GenotypeFieldDecoders.java b/src/main/java/htsjdk/variant/bcf2/BCF2GenotypeFieldDecoders.java index c406b6602d..34d49546c1 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2GenotypeFieldDecoders.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2GenotypeFieldDecoders.java @@ -33,6 +33,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -50,14 +51,13 @@ public class BCF2GenotypeFieldDecoders { private final static int MIN_SAMPLES_FOR_FASTPATH_GENOTYPES = 0; // TODO -- update to reasonable number // initialized once per writer to allow parallel writers to work - private final HashMap genotypeFieldDecoder = new HashMap(); + private final HashMap genotypeFieldDecoder = new HashMap<>(); private final Decoder defaultDecoder = new GenericDecoder(); public BCF2GenotypeFieldDecoders(final VCFHeader header) { // TODO -- fill in appropriate decoders for each FORMAT field in the header genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder()); - // currently the generic decoder handles FILTER values properly, in so far as we don't tolerate multiple filter field values per genotype genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FTDecoder()); genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder()); genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder()); @@ -74,41 +74,41 @@ 
public BCF2GenotypeFieldDecoders(final VCFHeader header) { /** * Return decoder appropriate for field, or the generic decoder if no * specialized one is bound + * * @param field the GT field to decode * @return a non-null decoder */ public Decoder getDecoder(final String field) { - final Decoder d = genotypeFieldDecoder.get(field); - return d == null ? defaultDecoder : d; + return genotypeFieldDecoder.getOrDefault(field, defaultDecoder); } /** * Decoder a field (implicit from creation) encoded as * typeDescriptor in the decoder object in the GenotypeBuilders * one for each sample in order. - * + *

* The way this works is that this decode method * iterates over the builders, decoding a genotype field * in BCF2 for each sample from decoder. - * + *

* This system allows us to easily use specialized * decoders for specific genotype field values. For example, * we use a special decoder to directly read the BCF2 data for * the PL field into a int[] rather than the generic List of Integer */ public interface Decoder { - public void decode(final List siteAlleles, - final String field, - final BCF2Decoder decoder, - final byte typeDescriptor, - final int numElements, - final GenotypeBuilder[] gbs) throws IOException; + void decode(final List siteAlleles, + final String field, + final BCF2Decoder decoder, + final byte typeDescriptor, + final int numElements, + final GenotypeBuilder[] gbs) throws IOException; } - private class GTDecoder implements Decoder { + private static class GTDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && numElements == 2 && gbs.length >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES ) + if (ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && numElements == 2 && gbs.length >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES) fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs); else { generalDecode(siteAlleles, numElements, decoder, typeDescriptor, gbs); @@ -117,44 +117,47 @@ public void decode(final List siteAlleles, final String field, final BCF /** * fast path for many samples with diploid genotypes - * + *

* The way this would work is simple. Create a List diploidGenotypes[] object * After decoding the offset, if that sample is diploid compute the * offset into the alleles vector which is simply offset = allele0 * nAlleles + allele1 * if there's a value at diploidGenotypes[offset], use it, otherwise create the genotype * cache it and use that - * + *

* Some notes. If there are nAlleles at the site, there are implicitly actually - * n + 1 options including + * n + 1 options including ref */ @SuppressWarnings({"unchecked"}) - private final void fastBiallelicDiploidDecode(final List siteAlleles, - final BCF2Decoder decoder, - final byte typeDescriptor, - final GenotypeBuilder[] gbs) throws IOException { + private void fastBiallelicDiploidDecode(final List siteAlleles, + final BCF2Decoder decoder, + final byte typeDescriptor, + final GenotypeBuilder[] gbs) throws IOException { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); final int nPossibleGenotypes = 3 * 3; - final Object allGenotypes[] = new Object[nPossibleGenotypes]; + final Object[] allGenotypes = new Object[nPossibleGenotypes]; - for ( final GenotypeBuilder gb : gbs ) { + for (final GenotypeBuilder gb : gbs) { final int a1 = decoder.decodeInt(type); final int a2 = decoder.decodeInt(type); - if ( a1 == type.getMissingBytes() ) { - assert a2 == type.getMissingBytes(); + final boolean phased; + if (a1 == decoder.getPaddingValue(type)) { + assert a2 == decoder.getPaddingValue(type); // no called sample GT = . gb.alleles(null); - } else if ( a2 == type.getMissingBytes() ) { - gb.alleles(Arrays.asList(getAlleleFromEncoded(siteAlleles, a1))); + phased = false; + } else if (a2 == decoder.getPaddingValue(type)) { + gb.alleles(Collections.singletonList(getAlleleFromEncoded(siteAlleles, a1))); + phased = (a1 & 0x01) == 1; } else { // downshift to remove phase final int offset = (a1 >> 1) * 3 + (a2 >> 1); assert offset < allGenotypes.length; // TODO -- how can I get rid of this cast? 
- List gt = (List)allGenotypes[offset]; - if ( gt == null ) { + List gt = (List) allGenotypes[offset]; + if (gt == null) { final Allele allele1 = getAlleleFromEncoded(siteAlleles, a1); final Allele allele2 = getAlleleFromEncoded(siteAlleles, a2); gt = Arrays.asList(allele1, allele2); @@ -162,116 +165,128 @@ private final void fastBiallelicDiploidDecode(final List siteAlleles, } gb.alleles(gt); + phased = (a2 & 0x01) == 1; } - final boolean phased = (a2 & 0x01) == 1; gb.phased(phased); } } - private final void generalDecode(final List siteAlleles, - final int ploidy, - final BCF2Decoder decoder, - final byte typeDescriptor, - final GenotypeBuilder[] gbs) throws IOException { + private void generalDecode(final List siteAlleles, + final int ploidy, + final BCF2Decoder decoder, + final byte typeDescriptor, + final GenotypeBuilder[] gbs) throws IOException { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); // a single cache for the encoded genotypes, since we don't actually need this vector final int[] tmp = new int[ploidy]; - for ( final GenotypeBuilder gb : gbs ) { + for (final GenotypeBuilder gb : gbs) { final int[] encoded = decoder.decodeIntArray(ploidy, type, tmp); - if ( encoded == null ) + if (encoded == null) // no called sample GT = . gb.alleles(null); else { assert encoded.length > 0; // we have at least some alleles to decode - final List gt = new ArrayList(encoded.length); + final List gt = new ArrayList<>(encoded.length); // note that the auto-pruning of fields magically handles different // ploidy per sample at a site - for ( final int encode : encoded ) + for (final int encode : encoded) gt.add(getAlleleFromEncoded(siteAlleles, encode)); gb.alleles(gt); + // TODO htsjdk's Genotype class cannot properly encode phasing for ploidy > 2 + // See https://github.com/samtools/htsjdk/issues/1044 final boolean phased = ((encoded.length > 1 ? 
encoded[1] : encoded[0]) & 0x01) == 1; gb.phased(phased); } } } - private final Allele getAlleleFromEncoded(final List siteAlleles, final int encode) { - final int offset = encode >> 1; + private Allele getAlleleFromEncoded(final List siteAlleles, final int encode) { + final int offset = encode >>> 1; return offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1); } } - private class DPDecoder implements Decoder { + private static class DPDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { + for (final GenotypeBuilder gb : gbs) { // the -1 is for missing gb.DP(decoder.decodeInt(typeDescriptor, -1)); } } } - private class GQDecoder implements Decoder { + private static class GQDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { + for (final GenotypeBuilder gb : gbs) { // the -1 is for missing gb.GQ(decoder.decodeInt(typeDescriptor, -1)); } } } - private class ADDecoder implements Decoder { + private static class ADDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { + for (final GenotypeBuilder gb : gbs) { gb.AD(decoder.decodeIntArray(typeDescriptor, numElements)); } } } - private class PLDecoder implements Decoder { + private static class PLDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, 
final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { + for (final GenotypeBuilder gb : gbs) { gb.PL(decoder.decodeIntArray(typeDescriptor, numElements)); } } } - private class GenericDecoder implements Decoder { + private static class GenericDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { - Object value = decoder.decodeTypedValue(typeDescriptor, numElements); - if ( value != null ) { // don't add missing values - if ( value instanceof List && ((List)value).size() == 1) { - // todo -- I really hate this, and it suggests that the code isn't completely right - // the reason it's here is that it's possible to prune down a vector to a singleton - // value and there we have the contract that the value comes back as an atomic value - // not a vector of size 1 - value = ((List)value).get(0); - } + for (final GenotypeBuilder gb : gbs) { + final Object value = decoder.decodeTypedValue(typeDescriptor, numElements); + if (value == null) continue; + // TODO see https://github.com/samtools/hts-specs/issues/618 + // Although it seems like a very rare corner case, this decoder cannot distinguish between + // a vector of Character and a String, which are different VCF types but identical in BCF, + // which should be decoded differently as Java objects + // as List chars = Arrays.asList("a", "b", "c") vs String str = new String("abc") + // We would need the associated header line for each key to inspect its VCF type like we do in the + // BCF writer. 
This would require a rewrite of this class, which would be desirable either way + // so we can do stricter validation of the number and type of attributes being deserialized + if (value instanceof List && ((List) value).size() == 1) { + // TODO not sure what this refers to, htsjdk itself doesn't make any assumptions about + // the concrete type of the data contained in the attributes map. + // Maybe there are upstream consumers who have this contract. + + // todo -- I really hate this, and it suggests that the code isn't completely right + // the reason it's here is that it's possible to prune down a vector to a singleton + // value and there we have the contract that the value comes back as an atomic value + // not a vector of size 1 + gb.attribute(field, ((List) value).get(0)); + } else { gb.attribute(field, value); } } } } - private class FTDecoder implements Decoder { + private static class FTDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { - Object value = decoder.decodeTypedValue(typeDescriptor, numElements); - assert value == null || value instanceof String; - gb.filter((String)value); + for (final GenotypeBuilder gb : gbs) { + gb.filters(decoder.decodeExplodedStrings(numElements, ';')); } } } diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2LazyGenotypesDecoder.java b/src/main/java/htsjdk/variant/bcf2/BCF2LazyGenotypesDecoder.java index aadea53dfb..a23c74c091 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2LazyGenotypesDecoder.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2LazyGenotypesDecoder.java @@ -65,7 +65,8 @@ public LazyGenotypesContext.LazyData parse(final Object data) { try { // load our byte[] data into the decoder - final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes); + final BCF2Decoder decoder 
= BCF2Decoder.getDecoder(codec.getBCFVersion()); + decoder.setRecordBytes(((BCF2Codec.LazyData)data).bytes); for ( int i = 0; i < nSamples; i++ ) builders[i].reset(true); diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Type.java b/src/main/java/htsjdk/variant/bcf2/BCF2Type.java index 11c8edf6c5..ae6f6ed90f 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Type.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Type.java @@ -1,27 +1,27 @@ /* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ package htsjdk.variant.bcf2; @@ -39,62 +39,72 @@ public enum BCF2Type { // the actual values themselves MISSING(0, 0, 0x00) { - @Override public int read(final InputStream in) throws IOException { + @Override + public int read(final InputStream in) throws IOException { throw new IllegalArgumentException("Cannot read MISSING type"); } - @Override public void write(final int value, final OutputStream out) throws IOException { + + @Override + public void write(final int value, final OutputStream out) throws IOException { throw new IllegalArgumentException("Cannot write MISSING type"); } }, - INT8 (1, 1, 0xFFFFFF80, -127, 127) { + INT8(1, 1, 0xFFFFFF80, 0xFFFFFF81, -120, 127) { @Override public int read(final InputStream in) throws IOException { - return BCF2Utils.readByte(in); + // This cast to byte then implicit cast back to int is needed so that negative + // integers are sign extended to their proper 32 bit representation. + // The integer read from the stream before truncating to byte is an 32-bit integer + // with the 3 high bytes 0, and the widening conversion performs sign extension, + // the same applies for the read method of INT16. + return (byte) in.read(); } @Override public void write(final int value, final OutputStream out) throws IOException { - out.write(0xFF & value); // TODO -- do we need this operation? + // Do not need to mask off higher bytes because Java's OutputStream contract is to + // only write the bottom byte of the passed in int, the same applies to the write + // methods of the larger int sizes below. 
+ out.write(value); } }, - INT16(2, 2, 0xFFFF8000, -32767, 32767) { + INT16(2, 2, 0xFFFF8000, 0xFFFF8001, -32760, 32767) { @Override public int read(final InputStream in) throws IOException { - final int b2 = BCF2Utils.readByte(in) & 0xFF; - final int b1 = BCF2Utils.readByte(in) & 0xFF; - return (short)((b1 << 8) | b2); + final int b2 = in.read(); + final int b1 = in.read(); + return (short) ((b1 << 8) | b2); } @Override public void write(final int value, final OutputStream out) throws IOException { - // TODO -- optimization -- should we put this in a local buffer? - out.write((0x00FF & value)); - out.write((0xFF00 & value) >> 8); + out.write(value); + out.write(value >> 8); } }, - INT32(3, 4, 0x80000000, -2147483647, 2147483647) { + INT32(3, 4, 0x80000000, 0x80000001, -2147483640, 2147483647) { @Override public int read(final InputStream in) throws IOException { - final int b4 = BCF2Utils.readByte(in) & 0xFF; - final int b3 = BCF2Utils.readByte(in) & 0xFF; - final int b2 = BCF2Utils.readByte(in) & 0xFF; - final int b1 = BCF2Utils.readByte(in) & 0xFF; - return (int)(b1 << 24 | b2 << 16 | b3 << 8 | b4); + final int b4 = in.read(); + final int b3 = in.read(); + final int b2 = in.read(); + final int b1 = in.read(); + return b1 << 24 | b2 << 16 | b3 << 8 | b4; } @Override public void write(final int value, final OutputStream out) throws IOException { - out.write((0x000000FF & value)); - out.write((0x0000FF00 & value) >> 8); - out.write((0x00FF0000 & value) >> 16); - out.write((0xFF000000 & value) >> 24); + out.write(value); + out.write(value >> 8); + out.write(value >> 16); + out.write(value >> 24); } }, - FLOAT(5, 4, 0x7F800001) { + FLOAT(5, 4, 0x7F800001, 0x7F800002, 0, 0) { @Override public int read(final InputStream in) throws IOException { return INT32.read(in); @@ -106,7 +116,9 @@ public void write(final int value, final OutputStream out) throws IOException { } }, - CHAR (7, 1, 0x00000000) { + // TODO uncertain as to the correct MISSING and EOV representations of 
Character/String + // see https://github.com/samtools/hts-specs/issues/618 + CHAR(7, 1, 0x07, 0x00, 0, 0) { @Override public int read(final InputStream in) throws IOException { return INT8.read(in); @@ -120,25 +132,40 @@ public void write(final int value, final OutputStream out) throws IOException { private final int id; private final Object missingJavaValue; + + /* + Note that the values for these fields for INT8 and IN16 differ from those given in the spec + The values given here are as if they have been sign-extended to 32 bits from their native + integer width (meaning they have all bits above that width set, as the missing and EOV + values all have their highest bit set in their native width) + + This is so that they compare equal to the values returned by the various + integer types' read methods, which must also sign-extend their return values so + we can return a uniformly sized 32-bit int + */ private final int missingBytes; + private final int EOVBytes; private final int sizeInBytes; + private final long minValue, maxValue; BCF2Type(final int id, final int sizeInBytes, final int missingBytes) { - this(id, sizeInBytes, missingBytes, 0, 0); + this(id, sizeInBytes, missingBytes, 0, 0, 0); } - BCF2Type(final int id, final int sizeInBytes, final int missingBytes, final long minValue, final long maxValue) { + BCF2Type(final int id, final int sizeInBytes, final int missingBytes, final int EOVBytes, final long minValue, final long maxValue) { this.id = id; this.sizeInBytes = sizeInBytes; this.missingJavaValue = null; this.missingBytes = missingBytes; + this.EOVBytes = EOVBytes; this.minValue = minValue; this.maxValue = maxValue; } /** * How many bytes are used to represent this type on disk? 
+ * * @return */ public int getSizeInBytes() { @@ -147,19 +174,24 @@ public int getSizeInBytes() { /** * The ID according to the BCF2 specification + * * @return */ - public int getID() { return id; } + public int getID() { + return id; + } /** * Can we encode value v in this type, according to its declared range. - * + *

* Only makes sense for integer values * * @param v * @return */ - public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; } + public final boolean withinRange(final long v) { + return v <= maxValue && v >= minValue; + } /** * Return the java object (aka null) that is used to represent a missing value for this @@ -167,7 +199,9 @@ public int getSizeInBytes() { * * @return */ - public Object getMissingJavaValue() { return missingJavaValue; } + public Object getMissingJavaValue() { + return missingJavaValue; + } /** * The bytes (encoded as an int) that are used to represent a missing value @@ -175,7 +209,19 @@ public int getSizeInBytes() { * * @return */ - public int getMissingBytes() { return missingBytes; } + public int getMissingBytes() { + return missingBytes; + } + + /** + * The bytes (encoded as an int) that are used to represent an end of vector value + * for this type in BCF2 + * + * @return + */ + public int getEOVBytes() { + return EOVBytes; + } /** * An enum set of the types that might represent Integer values @@ -195,7 +241,7 @@ public boolean isIntegerType() { /** * Read a value from in stream of this BCF2 type as an int [32 bit] collection of bits - * + *

* For intX and char values this is just the int / byte value of the underlying data represented as a 32 bit int * For a char the result must be converted to a char by (char)(byte)(0x0F & value) * For doubles it's necessary to convert subsequently this value to a double via Double.bitsToDouble() @@ -211,4 +257,16 @@ public int read(final InputStream in) throws IOException { public void write(final int value, final OutputStream out) throws IOException { throw new IllegalArgumentException("Not implemented"); } + + private enum Special { + MISSING, + EOV, + } + + /** + * @return a unique End Of Vector object used by the low level decoder + */ + public static Object EOVValue() { + return Special.EOV; + } } diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java b/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java index 39478bf069..f64f49a9b6 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java @@ -27,32 +27,27 @@ import htsjdk.samtools.util.FileExtensions; import htsjdk.tribble.TribbleException; -import htsjdk.variant.vcf.*; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFIDHeaderLine; +import htsjdk.variant.vcf.VCFSimpleHeaderLine; import java.io.File; -import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; -import java.io.InputStream; -import java.lang.reflect.Array; -import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; -import java.util.HashSet; import java.util.Iterator; import java.util.List; -import java.util.Set; /** * Common utilities for working with BCF2 files - * + *

* Includes convenience methods for encoding, decoding BCF2 type descriptors (size + type) * * @author depristo * @since 5/12 */ public final class BCF2Utils { - public static final int MAX_ALLELES_IN_GENOTYPES = 127; public static final int OVERFLOW_ELEMENT_MARKER = 15; public static final int MAX_INLINE_ELEMENTS = 14; @@ -62,50 +57,16 @@ public final class BCF2Utils { static { int maxID = -1; - for ( BCF2Type v : BCF2Type.values() ) maxID = Math.max(v.getID(), maxID); - ID_TO_ENUM = new BCF2Type[maxID+1]; - for ( BCF2Type v : BCF2Type.values() ) ID_TO_ENUM[v.getID()] = v; + for (final BCF2Type v : BCF2Type.values()) maxID = Math.max(v.getID(), maxID); + ID_TO_ENUM = new BCF2Type[maxID + 1]; + for (final BCF2Type v : BCF2Type.values()) ID_TO_ENUM[v.getID()] = v; } - private BCF2Utils() {} - - /** - * Create a strings dictionary from the VCF header - * - * The dictionary is an ordered list of common VCF identifers (FILTER, INFO, and FORMAT) - * fields. - * - * Note that its critical that the list be dedupped and sorted in a consistent manner each time, - * as the BCF2 offsets are encoded relative to this dictionary, and if it isn't determined exactly - * the same way as in the header each time it's very bad - * - * @param header the VCFHeader from which to build the dictionary - * @return a non-null dictionary of elements, may be empty - */ - public static ArrayList makeDictionary(final VCFHeader header) { - final Set seen = new HashSet(); - final ArrayList dict = new ArrayList(); - - // special case the special PASS field which doesn't show up in the FILTER field definitions - seen.add(VCFConstants.PASSES_FILTERS_v4); - dict.add(VCFConstants.PASSES_FILTERS_v4); - - // set up the strings dictionary - for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) { - if ( line.shouldBeAddedToDictionary() ) { - final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line; - if ( ! 
seen.contains(idLine.getID())) { - dict.add(idLine.getID()); - seen.add(idLine.getID()); - } - } - } - - return dict; + private BCF2Utils() { } - public static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) { - return (byte)((0x0F & nElements) << 4 | (type.getID() & 0x0F)); + public static byte encodeTypeDescriptor(final int nElements, final BCF2Type type) { + return (byte) ((0x0F & nElements) << 4 | (type.getID() & 0x0F)); } public static int decodeSize(final byte typeDescriptor) { @@ -124,58 +85,12 @@ public static boolean sizeIsOverflow(final byte typeDescriptor) { return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER; } - public static byte readByte(final InputStream stream) throws IOException { - return (byte)(stream.read() & 0xFF); - } - - /** - * Collapse multiple strings into a comma separated list - * - * ["s1", "s2", "s3"] => ",s1,s2,s3" - * - * @param strings size > 1 list of strings - * @return - */ - public static String collapseStringList(final List strings) { - if ( strings.isEmpty() ) return ""; - else if ( strings.size() == 1 ) return strings.get(0); - else { - final StringBuilder b = new StringBuilder(); - for ( final String s : strings ) { - if ( s != null ) { - assert s.indexOf(",") == -1; // no commas in individual strings - b.append(',').append(s); - } - } - return b.toString(); - } - } - - /** - * Inverse operation of collapseStringList. - * - * ",s1,s2,s3" => ["s1", "s2", "s3"] - * - * - * @param collapsed - * @return - */ - public static List explodeStringList(final String collapsed) { - assert isCollapsedString(collapsed); - final String[] exploded = collapsed.substring(1).split(","); - return Arrays.asList(exploded); - } - - public static boolean isCollapsedString(final String s) { - return !s.isEmpty() && s.charAt(0) == ','; - } - /** * Returns a good name for a shadow BCF file for vcfFile. - * + *

* foo.vcf => foo.bcf * foo.xxx => foo.xxx.bcf - * + *

* If the resulting BCF file cannot be written, return null. Happens * when vcfFile = /dev/null for example * @@ -184,11 +99,11 @@ public static boolean isCollapsedString(final String s) { */ public static final File shadowBCF(final File vcfFile) { final String path = vcfFile.getAbsolutePath(); - if ( path.contains(FileExtensions.VCF) ) + if (path.contains(FileExtensions.VCF)) return new File(path.replace(FileExtensions.VCF, FileExtensions.BCF)); else { - final File bcf = new File( path + FileExtensions.BCF ); - if ( bcf.canRead() ) + final File bcf = new File(path + FileExtensions.BCF); + if (bcf.canRead()) return bcf; else { try { @@ -197,9 +112,7 @@ public static final File shadowBCF(final File vcfFile) { o.close(); bcf.delete(); return bcf; - } catch ( FileNotFoundException e ) { - return null; - } catch ( IOException e ) { + } catch (final IOException e) { return null; } } @@ -207,8 +120,8 @@ public static final File shadowBCF(final File vcfFile) { } public static BCF2Type determineIntegerType(final int value) { - for ( final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) { - if ( potentialType.withinRange(value) ) + for (final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) { + if (potentialType.withinRange(value)) return potentialType; } @@ -218,9 +131,9 @@ public static BCF2Type determineIntegerType(final int value) { public static BCF2Type determineIntegerType(final int[] values) { // find the min and max values in the array int max = 0, min = 0; - for ( final int v : values ) { - if ( v > max ) max = v; - if ( v < min ) min = v; + for (final int v : values) { + if (v > max) max = v; + if (v < min) min = v; } final BCF2Type maxType = determineIntegerType(max); @@ -232,7 +145,7 @@ public static BCF2Type determineIntegerType(final int[] values) { /** * Returns the maximum BCF2 integer size of t1 and t2 - * + *

* For example, if t1 == INT8 and t2 == INT16 returns INT16 * * @param t1 @@ -240,64 +153,49 @@ public static BCF2Type determineIntegerType(final int[] values) { * @return */ public static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) { - switch ( t1 ) { - case INT8: return t2; - case INT16: return t2 == BCF2Type.INT32 ? t2 : t1; - case INT32: return t1; - default: throw new TribbleException("BUG: unexpected BCF2Type " + t1); + switch (t1) { + case INT8: + return t2; + case INT16: + return t2 == BCF2Type.INT32 ? t2 : t1; + case INT32: + return t1; + default: + throw new TribbleException("BUG: unexpected BCF2Type " + t1); } } public static BCF2Type determineIntegerType(final List values) { BCF2Type maxType = BCF2Type.INT8; - for ( final int value : values ) { + for (final Integer value : values) { + if (value == null) continue; final BCF2Type type1 = determineIntegerType(value); - switch ( type1 ) { - case INT8: break; - case INT16: maxType = BCF2Type.INT16; break; - case INT32: return BCF2Type.INT32; // fast path for largest possible value - default: throw new TribbleException("Unexpected integer type " + type1 ); + switch (type1) { + case INT8: + break; + case INT16: + maxType = BCF2Type.INT16; + break; + case INT32: + return BCF2Type.INT32; // fast path for largest possible value + default: + throw new TribbleException("Unexpected integer type " + type1); } } return maxType; } - /** - * Helper function that takes an object and returns a list representation - * of it: - * - * o == null => [] - * o is a list => o - * else => [o] - * - * @param c the class of the object - * @param o the object to convert to a Java List - * @return - */ - public static List toList(final Class c, final Object o) { - if ( o == null ) return Collections.emptyList(); - else if ( o instanceof List ) return (List)o; - else if ( o.getClass().isArray() ) { - final int arraySize = Array.getLength(o); - final List list = new ArrayList(arraySize); - for (int i=0; i + * If the 
order of INFO, FILTER, or contig elements in the output header is different than * in the input header we must decode the blocks using the input header and then recode them * based on the new output order. - * + *

* If they are consistent, we can simply pass through the raw genotypes block bytes, which is * a *huge* performance win for large blocks. - * + *

* Many common operations on BCF2 files (merging them for -nt, selecting a subset of records, etc) * don't modify the ordering of the header fields and so can safely pass through the genotypes * undecoded. Some operations -- those at add filters or info fields -- can change the ordering @@ -305,28 +203,25 @@ else if ( o.getClass().isArray() ) { */ public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHeader, final VCFHeader genotypesBlockHeader) { // first, we have to have the same samples in the same order - if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) ) + if (!nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder()))) return false; - final Iterator outputLinesIt = outputHeader.getIDHeaderLines().iterator(); - final Iterator inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator(); + final Iterator outputLinesIt = outputHeader.getIDHeaderLines().iterator(); - while ( inputLinesIt.hasNext() ) { - if ( ! outputLinesIt.hasNext() ) // missing lines in output + for (final VCFHeaderLine headerLine : genotypesBlockHeader.getIDHeaderLines()) { + if (!outputLinesIt.hasNext()) // missing lines in output return false; - final VCFIDHeaderLine outputLine = outputLinesIt.next(); - final VCFIDHeaderLine inputLine = inputLinesIt.next(); - - if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! 
inputLine.getID().equals(outputLine.getID()) ) + final VCFHeaderLine outputLine = outputLinesIt.next(); + if (!headerLine.getClass().equals(outputLine.getClass()) || !headerLine.getID().equals(outputLine.getID())) return false; } return true; } - private static List nullAsEmpty(List l) { - if ( l == null ) + private static List nullAsEmpty(final List l) { + if (l == null) return Collections.emptyList(); else return l; diff --git a/src/main/java/htsjdk/variant/bcf2/BCFVersion.java b/src/main/java/htsjdk/variant/bcf2/BCFVersion.java index b18b83e4aa..788cb60e88 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCFVersion.java +++ b/src/main/java/htsjdk/variant/bcf2/BCFVersion.java @@ -29,6 +29,9 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; /** * Simple holder for BCF version information @@ -44,6 +47,12 @@ public final class BCFVersion { */ public static final byte[] MAGIC_HEADER_START = "BCF".getBytes(); + public static final BCFVersion BCF2_1Version = new BCFVersion(2, 1); + public static final BCFVersion BCF2_2Version = new BCFVersion(2, 2); + + public static final Set SUPPORTED_VERSIONS = new HashSet<>(Collections.singletonList(BCF2_2Version)); + + final int majorVersion; final int minorVersion; diff --git a/src/main/java/htsjdk/variant/variantcontext/GenotypeBuilder.java b/src/main/java/htsjdk/variant/variantcontext/GenotypeBuilder.java index 483e1c617d..fd6bdd1fe6 100644 --- a/src/main/java/htsjdk/variant/variantcontext/GenotypeBuilder.java +++ b/src/main/java/htsjdk/variant/variantcontext/GenotypeBuilder.java @@ -25,15 +25,18 @@ package htsjdk.variant.variantcontext; -import htsjdk.tribble.util.ParsingUtils; +import htsjdk.tribble.TribbleException; import htsjdk.variant.vcf.VCFConstants; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; 
import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; /** * A builder class for genotypes @@ -71,7 +74,7 @@ public final class GenotypeBuilder { private int[] AD = null; private int[] PL = null; private Map extendedAttributes = null; - private String filters = null; + private Set filters; private int initialAttributeMapSize = 5; private final static Map NO_ATTRIBUTES = @@ -199,7 +202,7 @@ public final void reset(final boolean keepSampleName) { */ public Genotype make() { final Map ea = (extendedAttributes == null) ? NO_ATTRIBUTES : extendedAttributes; - return new FastGenotype(sampleName, alleles, isPhased, GQ, DP, AD, PL, filters, ea); + return new FastGenotype(sampleName, alleles, isPhased, GQ, DP, AD, PL, buildFilterString(), ea); } /** @@ -216,7 +219,7 @@ public Genotype makeWithShallowCopy() { final List al = new ArrayList<>(alleles); final int[] copyAD = (AD == null) ? null : Arrays.copyOf(AD, AD.length); final int[] copyPL = (PL == null) ? null : Arrays.copyOf(PL, PL.length); - return new FastGenotype(sampleName, al, isPhased, GQ, DP, copyAD, copyPL, filters, ea); + return new FastGenotype(sampleName, al, isPhased, GQ, DP, copyAD, copyPL, buildFilterString(), ea); } /** @@ -373,12 +376,32 @@ public GenotypeBuilder attribute(final String key, final Object value) { * @return this builder */ public GenotypeBuilder filters(final List filters) { - if ( filters.isEmpty() ) - return filter(null); - else if ( filters.size() == 1 ) - return filter(filters.get(0)); - else - return filter(ParsingUtils.join(";", ParsingUtils.sortList(filters))); + for (final String filter : filters) { + if (!VariantContext.VALID_FILTER.matcher(filter).matches()) { + throw new TribbleException("Filter '" + filter + + "' contains an illegal character. 
It must conform to the regex ;'" + VariantContext.VALID_FILTER); + } else if (filter.equals("0")) { + throw new TribbleException("Filter cannot use reserved string '0'"); + } + } + // Filters must be unique + final Set uniqueFilters = new HashSet<>(filters.size()); + for (final String filter : filters) { + if (uniqueFilters.contains(filter)) { + throw new TribbleException("BUG: Attempting to add duplicate filter " + filter + " at " + this); + } else { + uniqueFilters.add(filter); + } + } + + final boolean hasUnfilteredString = uniqueFilters.contains(VCFConstants.UNFILTERED); + final boolean hasPassesString = uniqueFilters.contains(VCFConstants.PASSES_FILTERS_v4); + if ((hasUnfilteredString || hasPassesString) && uniqueFilters.size() > 1) { + throw new TribbleException("Filters cannot contain missing value '.' or passing value 'PASS' in addition to filters"); + } + + this.filters = hasPassesString ? null : uniqueFilters; + return this; } /** @@ -397,10 +420,27 @@ public GenotypeBuilder filters(final String ... filters) { * @return */ public GenotypeBuilder filter(final String filter) { - this.filters = VCFConstants.PASSES_FILTERS_v4.equals(filter) ? null : filter; + // TODO should this split the string on semicolon, or should it be in the function's contract + // that only one filter and no semicolons can be included in the passed in string + if (filter == null || filter.isEmpty() || VCFConstants.PASSES_FILTERS_v4.equals(filter)) { + this.filters = null; + } else { + // Internal adjacent separators such as a;;b produce an empty string in the split array, which is + // handled by the valid filter regular expression, which rejects empty filter strings + if (filter.startsWith(";") || filter.endsWith(";")) { + throw new TribbleException("Filter string cannot start or end with filter separator ';'"); + } + filters(filter.split(";")); + } return this; } + private String buildFilterString() { + return this.filters == null || this.filters.isEmpty() + ? 
null + : this.filters.stream().sorted().collect(Collectors.joining(";")); + } + /** * This genotype is unfiltered * diff --git a/src/main/java/htsjdk/variant/variantcontext/VariantContext.java b/src/main/java/htsjdk/variant/variantcontext/VariantContext.java index 2d50955bd1..a2c47f9e7a 100644 --- a/src/main/java/htsjdk/variant/variantcontext/VariantContext.java +++ b/src/main/java/htsjdk/variant/variantcontext/VariantContext.java @@ -1,6 +1,6 @@ /* * Copyright (c) 2012 The Broad Institute -* +* * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without @@ -9,10 +9,10 @@ * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: -* +* * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. -* +* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND @@ -26,6 +26,7 @@ package htsjdk.variant.variantcontext; import htsjdk.beta.plugin.HtsRecord; +import htsjdk.samtools.util.QualityUtil; import htsjdk.tribble.Feature; import htsjdk.tribble.TribbleException; import htsjdk.tribble.util.ParsingUtils; @@ -47,7 +48,7 @@ import java.util.stream.Collectors; /** - * + * *

High-level overview

* * The VariantContext object is a single general class system for representing genetic variation data composed of: @@ -89,7 +90,7 @@ *

* A [ref] / T at 10 *

- *
 
+ *
  * GenomeLoc snpLoc = GenomeLocParser.createGenomeLoc("chr1", 10, 10);
  *
*

@@ -208,7 +209,7 @@ * * * - *

Fully decoding.

+ *

Fully decoding.

* Currently VariantContexts support some fields, particularly those * stored as generic attributes, to be of any type. For example, a field AB might * be naturally a floating point number, 0.51, but when it's read into a VC its @@ -266,11 +267,12 @@ public class VariantContext implements HtsRecord, Feature, Serializable { /* cached monomorphic value: null -> not yet computed, False, True */ private Boolean monomorphic = null; + private final VCFHeaderVersion version; + /* * Determine which genotype fields are in use in the genotypes in VC * @return an ordered list of genotype fields in use in VC. If vc has genotypes this will always include GT first */ - public List calcVCFGenotypeKeys(final VCFHeader header) { final Set keys = new HashSet<>(); @@ -296,21 +298,23 @@ public List calcVCFGenotypeKeys(final VCFHeader header) { if ( sawPL ) keys.add(VCFConstants.GENOTYPE_PL_KEY); if ( sawGenotypeFilter ) keys.add(VCFConstants.GENOTYPE_FILTER_KEY); - List sortedList = ParsingUtils.sortList(new ArrayList<>(keys)); - - // make sure the GT is first + final List list = new ArrayList<>(6 + keys.size()); + // Make sure the GT is first if present if (sawGoodGT) { - final List newList = new ArrayList<>(sortedList.size() + 1); - newList.add(VCFConstants.GENOTYPE_KEY); - newList.addAll(sortedList); - sortedList = newList; + list.add(VCFConstants.GENOTYPE_KEY); + list.addAll(keys); + // Sort, skipping GT which will be at the first position of the list + Collections.sort(list.subList(1, list.size())); + } else { + list.addAll(keys); + Collections.sort(list); } - if (sortedList.isEmpty() && header.hasGenotypingData()) { + if (list.isEmpty() && header.hasGenotypingData()) { // this needs to be done in case all samples are no-calls return Collections.singletonList(VCFConstants.GENOTYPE_KEY); } else { - return sortedList; + return list; } } @@ -321,7 +325,7 @@ public List calcVCFGenotypeKeys(final VCFHeader header) { // // 
--------------------------------------------------------------------------------------------------------- - //no controls and white-spaces characters, no semicolon. + // No controls and white-spaces characters, no semicolon, filter string cannot be empty public static final Pattern VALID_FILTER = Pattern.compile("^[!-:<-~]+$"); public enum Validation { @@ -396,13 +400,15 @@ private static void validateFilters(final VariantContext variantContext) { return; } - for (String filter : filters) { + for (final String filter : filters) { if ( filter == null) { - throw new IllegalStateException("'null' is not a valid filter string."); + throw new TribbleException("'null' is not a valid filter string."); } if (!VALID_FILTER.matcher(filter).matches()) { - throw new IllegalStateException("Filter '" + filter + + throw new TribbleException("Filter '" + filter + "' contains an illegal character. It must conform to the regex ;'" + VALID_FILTER); + } else if (filter.equals("0")) { + throw new TribbleException("Filter cannot use reserved string '0'"); } } } @@ -421,12 +427,14 @@ private static void validateFilters(final VariantContext variantContext) { * * @param other the VariantContext to copy */ - protected VariantContext(VariantContext other) { + protected VariantContext(final VariantContext other) { this(other.getSource(), other.getID(), other.getContig(), other.getStart(), other.getEnd(), other.getAlleles(), other.getGenotypes(), other.getLog10PError(), other.getFiltersMaybeNull(), other.getAttributes(), - other.fullyDecoded, NO_VALIDATION); + other.fullyDecoded, + other.version, + NO_VALIDATION); } /** @@ -454,14 +462,17 @@ protected VariantContext(final String source, final Set filters, final Map attributes, final boolean fullyDecoded, + final VCFHeaderVersion version, final EnumSet validationToPerform ) { - if ( contig == null ) { throw new IllegalArgumentException("Contig cannot be null"); } + if ( contig == null || contig.isEmpty() ) { throw new 
IllegalArgumentException("Contig cannot be null or the empty string"); } this.contig = contig; this.start = start; this.stop = stop; // intern for efficiency. equals calls will generate NPE if ID is inappropriately passed in as null - if ( ID == null || ID.equals("") ) throw new IllegalArgumentException("ID field cannot be the null or the empty string"); + if ( ID == null || ID.equals("") ) { + throw new IllegalArgumentException("ID field cannot be the null or the empty string"); + } this.ID = ID.equals(VCFConstants.EMPTY_ID_FIELD) ? VCFConstants.EMPTY_ID_FIELD : ID; this.commonInfo = new CommonInfo(source, log10PError, filters, attributes); @@ -488,6 +499,7 @@ protected VariantContext(final String source, } this.fullyDecoded = fullyDecoded; + this.version = version; if ( ! validationToPerform.isEmpty() ) { validate(validationToPerform); @@ -617,7 +629,7 @@ private final Set allelesOfGenotypes(Collection genotypes) { *
  • Mixed
  • *
  • Mix of other classes
  • * - * + * * Also supports NO_VARIATION type, used to indicate that the site isn't polymorphic in the population * * @@ -814,6 +826,10 @@ public String getID() { return ID; } + public VCFHeaderVersion getVersion() { + return version; + } + // --------------------------------------------------------------------------------------------------------- // @@ -1654,7 +1670,7 @@ private final Map fullyDecodeAttributes(final Map fullyDecodeAttributes(final Map values = new ArrayList<>(splits.length); - for ( int i = 0; i < splits.length; i++ ) - values.add(decodeOne(field, splits[i], format)); + for (final String split : splits) + values.add(decodeOne(field, split, format, percentDecode)); return values; } else { - return decodeOne(field, string, format); + return decodeOne(field, string, format, percentDecode); } - } else if ( value instanceof List && (((List) value).get(0)) instanceof String ) { + } else if ( value instanceof List && (((List) value).get(0)) instanceof String ) { final List asList = (List)value; final List values = new ArrayList<>(asList.size()); for ( final String s : asList ) - values.add(decodeOne(field, s, format)); + values.add(decodeOne(field, s, format, percentDecode)); return values; } else { return value; @@ -1703,7 +1729,7 @@ private final Object decodeValue(final String field, final Object value, final V // allowMissingValuesComparedToHeader } - private final Object decodeOne(final String field, final String string, final VCFCompoundHeaderLine format) { + private Object decodeOne(final String field, final String string, final VCFCompoundHeaderLine format, final boolean percentDecode) { try { if ( string.equals(VCFConstants.MISSING_VALUE_v4) ) return null; @@ -1711,12 +1737,12 @@ private final Object decodeOne(final String field, final String string, final VC switch ( format.getType() ) { case Character: return string; case Flag: - final boolean b = Boolean.valueOf(string) || string.equals("1"); - if ( b == false ) + final boolean b = 
Boolean.parseBoolean(string) || string.equals("1"); + if (!b) throw new TribbleException("VariantContext FLAG fields " + field + " cannot contain false values" + " as seen at " + getContig() + ":" + getStart()); - return b; - case String: return string; + return true; + case String: return percentDecode ? VCFPercentEncodedTextTransformer.percentDecode(string) : string; case Integer: return Integer.valueOf(string); case Float: return VCFUtils.parseVcfDouble(string); default: throw new TribbleException("Unexpected type for field" + field); @@ -1727,7 +1753,36 @@ private final Object decodeOne(final String field, final String string, final VC } } - private final void fullyDecodeGenotypes(final VariantContextBuilder builder, final VCFHeader header) { + private static List decodeGPKey(final String value, final VCFHeaderVersion version) { + final String[] splits = value.split(","); + // We need to special-case GP because there is a discrepancy in the scale used to record + // its values between pre-4.3 and 4.3+ VCF. Pre-4.3 GP is phred scale encoded while + // 4.3+ GP is a linear probability, bringing it in line with other standard keys that + // use the P suffix (c.f. VCF 4.3 spec section 7.2). + + // Some tools in the wild apparently already use linear scaled GP, so we have to + // be careful about converting inputs. We check whether GP values are already linear + // scaled by seeing if the values' sum is approximately equal to 1, like we + // would expect if the values were linear scale probabilities. + // c.f. 
https://sourceforge.net/p/vcftools/mailman/vcftools-spec/thread/CEBCD558.FA29%25browning%40u.washington.edu/ + double sum = 0; + + final List rawGPValues = new ArrayList<>(splits.length); + for (final String s : splits) { + final double GP = VCFUtils.parseVcfDouble(s); + rawGPValues.add(GP); + sum += GP; + } + + final boolean wasLinearScale = GeneralUtils.compareDoubles(sum, 1, VCFConstants.VCF_ENCODING_EPSILON) == 0; + if (!wasLinearScale && version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + rawGPValues.replaceAll(GP -> QualityUtil.getErrorProbabilityFromPhredScore((int) Math.round(GP))); + } + return rawGPValues; + + } + + private void fullyDecodeGenotypes(final VariantContextBuilder builder, final VCFHeader header) { final GenotypesContext gc = new GenotypesContext(); for ( final Genotype g : getGenotypes() ) { gc.add(fullyDecodeGenotypes(g, header)); @@ -1862,9 +1917,9 @@ public int[] getGLIndicesOfAlternateAllele(Allele targetAllele) { return GenotypeLikelihoods.getPLIndicesOfAlleles(0, index); } - /** - * Search for the INFO=SVTYPE and return the type of Structural Variant - * @return the StructuralVariantType of null if there is no property SVTYPE + /** + * Search for the INFO=SVTYPE and return the type of Structural Variant + * @return the StructuralVariantType of null if there is no property SVTYPE * */ public StructuralVariantType getStructuralVariantType() { final String svType = this.getAttributeAsString(VCFConstants.SVTYPE, null); diff --git a/src/main/java/htsjdk/variant/variantcontext/VariantContextBuilder.java b/src/main/java/htsjdk/variant/variantcontext/VariantContextBuilder.java index fae8d81514..b52ed0a936 100644 --- a/src/main/java/htsjdk/variant/variantcontext/VariantContextBuilder.java +++ b/src/main/java/htsjdk/variant/variantcontext/VariantContextBuilder.java @@ -25,7 +25,10 @@ package htsjdk.variant.variantcontext; +import htsjdk.variant.vcf.VCFCodec; import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFHeader; 
+import htsjdk.variant.vcf.VCFHeaderVersion; import java.io.Serializable; import java.util.ArrayList; @@ -80,6 +83,7 @@ public class VariantContextBuilder { private Map attributes = null; private boolean attributesCanBeModified = false; private boolean filtersCanBeModified = false; + private VCFHeaderVersion version = VCFHeader.DEFAULT_VCF_VERSION; /** enum of what must be validated */ final private EnumSet toValidate = EnumSet.noneOf(VariantContext.Validation.class); @@ -161,6 +165,10 @@ public Map getAttributes() { return attributes; } + public VCFHeaderVersion getVersion() { + return version; + } + /** * Returns a new builder based on parent -- the new VC will have all fields initialized * to their corresponding values in parent. This is the best way to create a derived VariantContext @@ -181,6 +189,7 @@ public VariantContextBuilder(final VariantContext parent) { this.start = parent.getStart(); this.stop = parent.getEnd(); this.fullyDecoded = parent.isFullyDecoded(); + this.version = parent.getVersion(); this.attributes(parent.getAttributes()); if (parent.filtersWereApplied()) { @@ -205,6 +214,7 @@ public VariantContextBuilder(final VariantContextBuilder parent) { this.start = parent.start; this.stop = parent.stop; this.fullyDecoded = parent.fullyDecoded; + this.version = parent.version; this.attributes(parent.attributes); this.filters(parent.filters); @@ -214,6 +224,17 @@ public VariantContextBuilder copy() { return new VariantContextBuilder(this); } + /** + * Tells this builder to create a VariantContext conforming to this version of VCF + * + * @param version the version of VCF to which the VariantContext produced by this builder conforms + * @return this builder + */ + public VariantContextBuilder version(final VCFHeaderVersion version) { + this.version = version; + return this; + } + /** * Tells this builder to use this collection of alleles for the resulting VariantContext * @@ -646,6 +667,6 @@ public VariantContext make(final boolean leaveModifyableAsIs) 
{ return new VariantContext(source, ID, contig, start, stop, alleles, genotypes, log10PError, filters, attributes, - fullyDecoded, toValidate); + fullyDecoded, version, toValidate); } } diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Encoder.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Encoder.java deleted file mode 100644 index 495cd93ec9..0000000000 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Encoder.java +++ /dev/null @@ -1,261 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package htsjdk.variant.variantcontext.writer; - -import htsjdk.variant.bcf2.BCF2Type; -import htsjdk.variant.bcf2.BCF2Utils; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; - -/** - * See #BCFWriter for documentation on this classes role in encoding BCF2 files - * - * @author Mark DePristo - * @since 06/12 - */ -public final class BCF2Encoder { - // TODO -- increase default size? - public static final int WRITE_BUFFER_INITIAL_SIZE = 16384; - private ByteArrayOutputStream encodeStream = new ByteArrayOutputStream(WRITE_BUFFER_INITIAL_SIZE); - - // -------------------------------------------------------------------------------- - // - // Functions to return the data being encoded here - // - // -------------------------------------------------------------------------------- - - public byte[] getRecordBytes() { - byte[] bytes = encodeStream.toByteArray(); - encodeStream.reset(); - return bytes; - } - - // -------------------------------------------------------------------------------- - // - // Writing typed values (have type byte) - // - // -------------------------------------------------------------------------------- - - public final void encodeTypedMissing(final BCF2Type type) throws IOException { - encodeType(0, type); - } - - public final void encodeTyped(final Object value, final BCF2Type type) throws IOException { - if ( value == null ) - encodeTypedMissing(type); - else { - switch ( type ) { - case INT8: - case INT16: - case INT32: encodeTypedInt((Integer)value, type); break; - case FLOAT: encodeTypedFloat((Double) value); break; - case CHAR: encodeTypedString((String) value); break; - default: throw new IllegalArgumentException("Illegal type encountered " + type); - } - } - } - - public final void encodeTypedInt(final int v) throws IOException { - final BCF2Type type = BCF2Utils.determineIntegerType(v); - 
encodeTypedInt(v, type); - } - - public final void encodeTypedInt(final int v, final BCF2Type type) throws IOException { - encodeType(1, type); - encodeRawInt(v, type); - } - - public final void encodeTypedString(final String s) throws IOException { - encodeTypedString(s.getBytes()); - } - - public final void encodeTypedString(final byte[] s) throws IOException { - if ( s == null ) - encodeType(0, BCF2Type.CHAR); - else { - encodeType(s.length, BCF2Type.CHAR); - for ( int i = 0; i < s.length; i++ ) { - encodeRawChar(s[i]); - } - } - } - - public final void encodeTypedFloat(final double d) throws IOException { - encodeType(1, BCF2Type.FLOAT); - encodeRawFloat(d); - } - - public final void encodeTyped(List v, final BCF2Type type) throws IOException { - if ( type == BCF2Type.CHAR && !v.isEmpty()) { - final String s = BCF2Utils.collapseStringList((List) v); - v = stringToBytes(s); - } - - encodeType(v.size(), type); - encodeRawValues(v, type); - } - - // -------------------------------------------------------------------------------- - // - // Writing raw values (don't have a type byte) - // - // -------------------------------------------------------------------------------- - - public final void encodeRawValues(final Collection v, final BCF2Type type) throws IOException { - for ( final T v1 : v ) { - encodeRawValue(v1, type); - } - } - - public final void encodeRawValue(final T value, final BCF2Type type) throws IOException { - try { - if ( value == type.getMissingJavaValue() ) - encodeRawMissingValue(type); - else { - switch (type) { - case INT8: - case INT16: - case INT32: encodeRawBytes((Integer) value, type); break; - case FLOAT: encodeRawFloat((Double) value); break; - case CHAR: encodeRawChar((Byte) value); break; - default: throw new IllegalArgumentException("Illegal type encountered " + type); - } - } - } catch ( ClassCastException e ) { - throw new ClassCastException("BUG: invalid type cast to " + type + " from " + value); - } - } - - public final void 
encodeRawMissingValue(final BCF2Type type) throws IOException { - encodeRawBytes(type.getMissingBytes(), type); - } - - public final void encodeRawMissingValues(final int size, final BCF2Type type) throws IOException { - for ( int i = 0; i < size; i++ ) - encodeRawMissingValue(type); - } - - // -------------------------------------------------------------------------------- - // - // low-level encoders - // - // -------------------------------------------------------------------------------- - - public final void encodeRawChar(final byte c) throws IOException { - encodeStream.write(c); - } - - public final void encodeRawFloat(final double value) throws IOException { - encodeRawBytes(Float.floatToIntBits((float) value), BCF2Type.FLOAT); - } - - public final void encodeType(final int size, final BCF2Type type) throws IOException { - if ( size <= BCF2Utils.MAX_INLINE_ELEMENTS ) { - final int typeByte = BCF2Utils.encodeTypeDescriptor(size, type); - encodeStream.write(typeByte); - } else { - final int typeByte = BCF2Utils.encodeTypeDescriptor(BCF2Utils.OVERFLOW_ELEMENT_MARKER, type); - encodeStream.write(typeByte); - // write in the overflow size - encodeTypedInt(size); - } - } - - public final void encodeRawInt(final int value, final BCF2Type type) throws IOException { - type.write(value, encodeStream); - } - - public final void encodeRawBytes(final int value, final BCF2Type type) throws IOException { - type.write(value, encodeStream); - } - - // -------------------------------------------------------------------------------- - // - // utility functions - // - // -------------------------------------------------------------------------------- - - public void encodeRawString(final String s, final int sizeToWrite) throws IOException { - final byte[] bytes = s.getBytes(); - for ( int i = 0; i < sizeToWrite; i++ ) - if ( i < bytes.length ) - encodeRawChar(bytes[i]); - else - encodeRawMissingValue(BCF2Type.CHAR); - } - - /** - * Totally generic encoder that examines o, 
determines the best way to encode it, and encodes it - * - * This method is incredibly slow, but it's only used for UnitTests so it doesn't matter - * - * @param o - * @return - */ - public final BCF2Type encode(final Object o) throws IOException { - if ( o == null ) throw new IllegalArgumentException("Generic encode cannot deal with null values"); - - if ( o instanceof List ) { - final BCF2Type type = determineBCFType(((List) o).get(0)); - encodeTyped((List) o, type); - return type; - } else { - final BCF2Type type = determineBCFType(o); - encodeTyped(o, type); - return type; - } - } - - private final BCF2Type determineBCFType(final Object arg) { - final Object toType = arg instanceof List ? ((List)arg).get(0) : arg; - - if ( toType instanceof Integer ) - return BCF2Utils.determineIntegerType((Integer) toType); - else if ( toType instanceof String ) - return BCF2Type.CHAR; - else if ( toType instanceof Double ) - return BCF2Type.FLOAT; - else - throw new IllegalArgumentException("No native encoding for Object of type " + arg.getClass().getSimpleName()); - } - - private final List stringToBytes(final String v) throws IOException { - if ( v == null || v.equals("") ) - return Collections.emptyList(); - else { - // TODO -- this needs to be optimized away for efficiency - final byte[] bytes = v.getBytes(); - final List l = new ArrayList(bytes.length); - for ( int i = 0; i < bytes.length; i++) l.add(bytes[i]); - return l; - } - } -} \ No newline at end of file diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldEncoder.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldEncoder.java deleted file mode 100644 index 7d1f0de43d..0000000000 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldEncoder.java +++ /dev/null @@ -1,455 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the 
"Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package htsjdk.variant.variantcontext.writer; - -import htsjdk.variant.bcf2.BCF2Type; -import htsjdk.variant.bcf2.BCF2Utils; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFCompoundHeaderLine; -import htsjdk.variant.vcf.VCFHeaderLineCount; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; - -/** - * See #BCFWriter for documentation on this classes role in encoding BCF2 files - * - * @author Mark DePristo - * @since 06/12 - */ -public abstract class BCF2FieldEncoder { - /** - * The header line describing the field we will encode values of - */ - final VCFCompoundHeaderLine headerLine; - - /** - * The BCF2 type we'll use to encoder this field, if it can be determined statically. - * If not, this variable must be null - */ - final BCF2Type staticType; - - /** - * The integer offset into the strings map of the BCF2 file corresponding to this - * field. 
- */ - final int dictionaryOffset; - - /** - * The integer type we use to encode our dictionary offset in the BCF2 file - */ - final BCF2Type dictionaryOffsetType; - - // ---------------------------------------------------------------------- - // - // Constructor - // - // ---------------------------------------------------------------------- - - private BCF2FieldEncoder(final VCFCompoundHeaderLine headerLine, final Map dict, final BCF2Type staticType) { - this.headerLine = headerLine; - this.staticType = staticType; - - final Integer offset = dict.get(getField()); - if ( offset == null ) throw new IllegalStateException("Format error: could not find string " + getField() + " in header as required by BCF"); - this.dictionaryOffset = offset; - dictionaryOffsetType = BCF2Utils.determineIntegerType(offset); - } - - // ---------------------------------------------------------------------- - // - // Basic accessors - // - // ---------------------------------------------------------------------- - - public final String getField() { return headerLine.getID(); } - - /** - * Write the field key (dictionary offset and type) into the BCF2Encoder stream - * - * @param encoder where we write our dictionary offset - * @throws IOException - */ - public final void writeFieldKey(final BCF2Encoder encoder) throws IOException { - encoder.encodeTypedInt(dictionaryOffset, dictionaryOffsetType); - } - - @Override - public String toString() { - return "BCF2FieldEncoder for " + getField() + " with count " + getCountType() + " encoded with " + getClass().getSimpleName(); - } - - // ---------------------------------------------------------------------- - // - // methods to determine the number of encoded elements - // - // ---------------------------------------------------------------------- - - protected final VCFHeaderLineCount getCountType() { - return headerLine.getCountType(); - } - - /** - * @return True if this field has a constant, fixed number of elements (such as 1 for an atomic 
integer) - */ - public boolean hasConstantNumElements() { - return getCountType() == VCFHeaderLineCount.INTEGER; - } - - /** - * @return True if the only way to determine how many elements this field contains is by - * inspecting the actual value directly, such as when the number of elements - * is a variable length list per site or per genotype. - */ - public boolean hasValueDeterminedNumElements() { - return getCountType() == VCFHeaderLineCount.UNBOUNDED; - } - - /** - * @return True if this field has a non-fixed number of elements that depends only on the properties - * of the current VariantContext, such as one value per Allele or per genotype configuration. - */ - public boolean hasContextDeterminedNumElements() { - return ! hasConstantNumElements() && ! hasValueDeterminedNumElements(); - } - - /** - * @return the number of elements, assuming this field has a constant number of elements. - */ - public int numElements() { - return headerLine.getCount(); - } - - /** - * @return the number of elements by looking at the actual value provided - */ - public int numElements(final Object value) { - return numElementsFromValue(value); - } - - /** - * @return the number of elements, assuming this field has context-determined number of elements. - */ - public int numElements(final VariantContext vc) { - return headerLine.getCount(vc); - } - - /** - * A convenience access for the number of elements. - * @param vc - * @param value - * @return the number of encoded elements, either from the fixed number - * it has, from the VC, or from the value itself. - */ - public final int numElements(final VariantContext vc, final Object value) { - if ( hasConstantNumElements() ) return numElements(); - else if ( hasContextDeterminedNumElements() ) return numElements(vc); - else return numElements(value); - } - - /** - * Given a value, return the number of elements we will encode for it. 
- * - * Assumes the value is encoded as a List - * - * @param value - * @return the number of elements we will encode for {@param value}. - */ - protected int numElementsFromValue(final Object value) { - if ( value == null ) return 0; - else if ( value instanceof List ) return ((List) value).size(); - else return 1; - } - - // ---------------------------------------------------------------------- - // - // methods to determine the BCF2 type of the encoded values - // - // ---------------------------------------------------------------------- - - /** - * Is the BCF2 type of this field static, or does it have to be determine from - * the actual field value itself? - * @return true if the field is static - */ - public final boolean isStaticallyTyped() { return ! isDynamicallyTyped(); } - - /** - * Is the BCF2 type of this field static, or does it have to be determine from - * the actual field value itself? - * @return true if the field is not static - */ - public final boolean isDynamicallyTyped() { return staticType == null; } - - /** - * Get the BCF2 type for this field, either from the static type of the - * field itself or by inspecting the value itself. - * - * @return the BCF2 type for this field - */ - public final BCF2Type getType(final Object value) { - return isDynamicallyTyped() ? getDynamicType(value) : getStaticType(); - } - - public final BCF2Type getStaticType() { - return staticType; - } - - public BCF2Type getDynamicType(final Object value) { - throw new IllegalStateException("BUG: cannot get dynamic type for statically typed BCF2 field " + getField()); - } - - // ---------------------------------------------------------------------- - // - // methods to encode values, including the key abstract method - // - // ---------------------------------------------------------------------- - - /** - * Key abstract method that should encode a value of the given type into the encoder. - * - * Value will be of a type appropriate to the underlying encoder. 
If the genotype field is represented as - * an int[], this will be value, and the encoder needs to handle encoding all of the values in the int[]. - * - * The argument should be used, not the getType() method in the superclass as an outer loop might have - * decided a more general type (int16) to use, even through this encoder could have been done with int8. - * - * If minValues > 0, then encodeValue must write in at least minValues items from value. If value is atomic, - * this means that minValues - 1 MISSING values should be added to the encoder. If minValues is a collection - * type (int[]) then minValues - values.length should be added. This argument is intended to handle padding - * of values in genotype fields. - * - * @param encoder - * @param value - * @param type - * @param minValues - * @throws IOException - */ - public abstract void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException; - - // ---------------------------------------------------------------------- - // - // Subclass to encode Strings - // - // ---------------------------------------------------------------------- - - public static class StringOrCharacter extends BCF2FieldEncoder { - public StringOrCharacter(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, BCF2Type.CHAR); - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - final String s = javaStringToBCF2String(value); - encoder.encodeRawString(s, Math.max(s.length(), minValues)); - } - - // - // Regardless of what the header says, BCF2 strings and characters are always encoded - // as arrays of CHAR type, which has a variable number of elements depending on the - // exact string being encoded - // - @Override public boolean hasConstantNumElements() { return false; } - @Override public boolean hasContextDeterminedNumElements() { return 
false; } - @Override public boolean hasValueDeterminedNumElements() { return true; } - @Override protected int numElementsFromValue(final Object value) { - return value == null ? 0 : javaStringToBCF2String(value).length(); - } - - /** - * Recode the incoming object to a String, compacting it into a - * BCF2 string if the value is a list. - * - * @param value a String or List to encode, or null - * @return a non-null string to encode - */ - private String javaStringToBCF2String(final Object value) { - if ( value == null ) - return ""; - else if (value instanceof List) { - final List l = (List)value; - return BCF2Utils.collapseStringList(l); - } else if ( value.getClass().isArray() ) { - final List l = new ArrayList(); - Collections.addAll(l, (String[])value); - return BCF2Utils.collapseStringList(l); - } else - return (String)value; - } - } - - // ---------------------------------------------------------------------- - // - // Subclass to encode FLAG - // - // ---------------------------------------------------------------------- - - public static class Flag extends BCF2FieldEncoder { - public Flag(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, BCF2Type.INT8); - if ( ! 
headerLine.isFixedCount() || headerLine.getCount() != 0 ) - throw new IllegalStateException("Flag encoder only supports atomic flags for field " + getField()); - } - - @Override - public int numElements() { - return 1; // the header says 0 but we will write 1 value - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - encoder.encodeRawBytes(1, getStaticType()); - } - } - - // ---------------------------------------------------------------------- - // - // Subclass to encode FLOAT - // - // ---------------------------------------------------------------------- - - public static class Float extends BCF2FieldEncoder { - final boolean isAtomic; - - public Float(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, BCF2Type.FLOAT); - isAtomic = hasConstantNumElements() && numElements() == 1; - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - int count = 0; - // TODO -- can be restructured to avoid toList operation - if ( isAtomic ) { - // fast path for fields with 1 fixed float value - if ( value != null ) { - encoder.encodeRawFloat((Double)value); - count++; - } - } else { - // handle generic case - final List doubles = BCF2Utils.toList(Double.class, value); - for ( final Double d : doubles ) { - if ( d != null ) { // necessary because .,. 
=> [null, null] in VC - encoder.encodeRawFloat(d); - count++; - } - } - } - for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); - } - } - - // ---------------------------------------------------------------------- - // - // Subclass to encode int[] - // - // ---------------------------------------------------------------------- - - public static class IntArray extends BCF2FieldEncoder { - public IntArray(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, null); - } - - @Override - protected int numElementsFromValue(final Object value) { - return value == null ? 0 : ((int[])value).length; - } - - @Override - public BCF2Type getDynamicType(final Object value) { - return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType((int[])value); - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - int count = 0; - if ( value != null ) { - for ( final int i : (int[])value ) { - encoder.encodeRawInt(i, type); - count++; - } - } - for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); - } - } - - // ---------------------------------------------------------------------- - // - // Subclass to encode List - // - // ---------------------------------------------------------------------- - - /** - * Specialized int encoder for atomic (non-list) integers - */ - public static class AtomicInt extends BCF2FieldEncoder { - public AtomicInt(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, null); - } - - @Override - public BCF2Type getDynamicType(final Object value) { - return value == null ? 
BCF2Type.INT8 : BCF2Utils.determineIntegerType((Integer)value); - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - int count = 0; - if ( value != null ) { - encoder.encodeRawInt((Integer)value, type); - count++; - } - for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); - } - } - - public static class GenericInts extends BCF2FieldEncoder { - public GenericInts(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, null); - } - - @Override - public BCF2Type getDynamicType(final Object value) { - return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType(BCF2Utils.toList(Integer.class, value)); - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - int count = 0; - for ( final Integer i : BCF2Utils.toList(Integer.class, value) ) { - if ( i != null ) { // necessary because .,. 
=> [null, null] in VC - encoder.encodeRawInt(i, type); - count++; - } - } - for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); - } - } -} diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldWriter.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldWriter.java deleted file mode 100644 index 857cedfe3a..0000000000 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldWriter.java +++ /dev/null @@ -1,324 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package htsjdk.variant.variantcontext.writer; - -import htsjdk.variant.bcf2.BCF2Type; -import htsjdk.variant.bcf2.BCF2Utils; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.Genotype; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFHeader; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * See #BCFWriter for documentation on this classes role in encoding BCF2 files - * - * @author Mark DePristo - * @since 06/12 - */ -public abstract class BCF2FieldWriter { - private final VCFHeader header; - private final BCF2FieldEncoder fieldEncoder; - - protected BCF2FieldWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - this.header = header; - this.fieldEncoder = fieldEncoder; - } - - protected VCFHeader getHeader() { return header; } - protected BCF2FieldEncoder getFieldEncoder() { - return fieldEncoder; - } - protected String getField() { return getFieldEncoder().getField(); } - - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - fieldEncoder.writeFieldKey(encoder); - } - - public void done(final BCF2Encoder encoder, final VariantContext vc) throws IOException { } // TODO -- overload done so that we null out values and test for correctness - - @Override - public String toString() { - return "BCF2FieldWriter " + getClass().getSimpleName() + " with encoder " + getFieldEncoder(); - } - - // -------------------------------------------------------------------------------- - // - // Sites writers - // - // -------------------------------------------------------------------------------- - - public static abstract class SiteWriter extends BCF2FieldWriter { - protected SiteWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - public abstract void site(final BCF2Encoder encoder, final VariantContext 
vc) throws IOException; - } - - public static class GenericSiteWriter extends SiteWriter { - public GenericSiteWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - @Override - public void site(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - final Object rawValue = vc.getAttribute(getField(), null); - final BCF2Type type = getFieldEncoder().getType(rawValue); - if ( rawValue == null ) { - // the value is missing, just write in null - encoder.encodeType(0, type); - } else { - final int valueCount = getFieldEncoder().numElements(vc, rawValue); - encoder.encodeType(valueCount, type); - getFieldEncoder().encodeValue(encoder, rawValue, type, valueCount); - } - } - } - - // -------------------------------------------------------------------------------- - // - // Genotypes writers - // - // -------------------------------------------------------------------------------- - - public static abstract class GenotypesWriter extends BCF2FieldWriter { - int nValuesPerGenotype = -1; - BCF2Type encodingType = null; - - protected GenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - - if ( fieldEncoder.hasConstantNumElements() ) { - nValuesPerGenotype = getFieldEncoder().numElements(); - } - } - - @Override - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - // writes the key information - super.start(encoder, vc); - - // only update if we need to - if ( ! 
getFieldEncoder().hasConstantNumElements() ) { - if ( getFieldEncoder().hasContextDeterminedNumElements() ) - // we are cheap -- just depends on genotype of allele counts - nValuesPerGenotype = getFieldEncoder().numElements(vc); - else - // we have to go fishing through the values themselves (expensive) - nValuesPerGenotype = computeMaxSizeOfGenotypeFieldFromValues(vc); - } - - encoder.encodeType(nValuesPerGenotype, encodingType); - } - - public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { - final Object fieldValue = g.getExtendedAttribute(getField(), null); - getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype); - } - - protected int numElements(final VariantContext vc, final Genotype g) { - return getFieldEncoder().numElements(vc, g.getExtendedAttribute(getField())); - } - - private final int computeMaxSizeOfGenotypeFieldFromValues(final VariantContext vc) { - int size = -1; - - for ( final Genotype g : vc.getGenotypes() ) { - size = Math.max(size, numElements(vc, g)); - } - - return size; - } - } - - public static class StaticallyTypeGenotypesWriter extends GenotypesWriter { - public StaticallyTypeGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - encodingType = getFieldEncoder().getStaticType(); - } - } - - public static class IntegerTypeGenotypesWriter extends GenotypesWriter { - public IntegerTypeGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - @Override - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - // the only value that is dynamic are integers - final List values = new ArrayList(vc.getNSamples()); - for ( final Genotype g : vc.getGenotypes() ) { - for ( final Integer i : BCF2Utils.toList(Integer.class, g.getExtendedAttribute(getField(), null)) ) { - if ( i != null ) values.add(i); - } - } 
- - encodingType = BCF2Utils.determineIntegerType(values); - super.start(encoder, vc); - } - } - - public static class IGFGenotypesWriter extends GenotypesWriter { - final IntGenotypeFieldAccessors.Accessor ige; - - public IGFGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder, final IntGenotypeFieldAccessors.Accessor ige) { - super(header, fieldEncoder); - this.ige = ige; - - if ( ! (fieldEncoder instanceof BCF2FieldEncoder.IntArray) ) - throw new IllegalArgumentException("BUG: IntGenotypesWriter requires IntArray encoder for field " + getField()); - } - - @Override - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - // TODO - // TODO this piece of code consumes like 10% of the runtime alone because fo the vc.getGenotypes() iteration - // TODO - encodingType = BCF2Type.INT8; - for ( final Genotype g : vc.getGenotypes() ) { - final int[] pls = ige.getValues(g); - final BCF2Type plsType = getFieldEncoder().getType(pls); - encodingType = BCF2Utils.maxIntegerType(encodingType, plsType); - if ( encodingType == BCF2Type.INT32 ) - break; // stop early - } - - super.start(encoder, vc); - } - - @Override - public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { - getFieldEncoder().encodeValue(encoder, ige.getValues(g), encodingType, nValuesPerGenotype); - } - - @Override - protected int numElements(final VariantContext vc, final Genotype g) { - return ige.getSize(g); - } - } - - public static class FTGenotypesWriter extends StaticallyTypeGenotypesWriter { - public FTGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - @Override - public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { - final String fieldValue = g.getFilters(); - getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype); - } - - 
@Override - protected int numElements(final VariantContext vc, final Genotype g) { - return getFieldEncoder().numElements(vc, g.getFilters()); - } - } - - public static class GTWriter extends GenotypesWriter { - final Map alleleMapForTriPlus = new HashMap(5); - Allele ref, alt1; - - public GTWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - @Override - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES ) - throw new IllegalStateException("Current BCF2 encoder cannot handle sites " + - "with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have " - + vc.getNAlleles() + " at " + vc.getContig() + ":" + vc.getStart()); - - encodingType = BCF2Type.INT8; - buildAlleleMap(vc); - nValuesPerGenotype = vc.getMaxPloidy(2); - - super.start(encoder, vc); - } - - @Override - public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { - final int samplePloidy = g.getPloidy(); - for ( int i = 0; i < nValuesPerGenotype; i++ ) { - if ( i < samplePloidy ) { - // we encode the actual allele - final Allele a = g.getAllele(i); - final int offset = getAlleleOffset(a); - final int encoded = ((offset+1) << 1) | ((g.isPhased() && i!=0) ? 0x01 : 0x00); - encoder.encodeRawBytes(encoded, encodingType); - } else { - // we need to pad with missing as we have ploidy < max for this sample - encoder.encodeRawBytes(encodingType.getMissingBytes(), encodingType); - } - } - } - - /** - * Fast path code to determine the offset. 
- * - * Inline tests for == against ref (most common, first test) - * == alt1 (second most common, second test) - * == NO_CALL (third) - * and finally in the map from allele => offset for all alt 2+ alleles - * - * @param a the allele whose offset we wish to determine - * @return the offset (from 0) of the allele in the list of variant context alleles (-1 means NO_CALL) - */ - private final int getAlleleOffset(final Allele a) { - if ( a == ref ) return 0; - else if ( a == alt1 ) return 1; - else if ( a == Allele.NO_CALL ) return -1; - else { - final Integer o = alleleMapForTriPlus.get(a); - if ( o == null ) throw new IllegalStateException("BUG: Couldn't find allele offset for allele " + a); - return o; - } - } - - private final void buildAlleleMap(final VariantContext vc) { - // these are fast path options to determine the offsets for - final int nAlleles = vc.getNAlleles(); - ref = vc.getReference(); - alt1 = nAlleles > 1 ? vc.getAlternateAllele(0) : null; - - if ( nAlleles > 2 ) { - // for multi-allelics we need to clear the map, and add additional looks - alleleMapForTriPlus.clear(); - final List alleles = vc.getAlleles(); - for ( int i = 2; i < alleles.size(); i++ ) { - alleleMapForTriPlus.put(alleles.get(i), i); - } - } - } - } -} - diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldWriterManager.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldWriterManager.java deleted file mode 100644 index 20f9ce6aa4..0000000000 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2FieldWriterManager.java +++ /dev/null @@ -1,180 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, 
and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package htsjdk.variant.variantcontext.writer; - -import htsjdk.variant.utils.GeneralUtils; -import htsjdk.variant.vcf.VCFCompoundHeaderLine; -import htsjdk.variant.vcf.VCFConstants; -import htsjdk.variant.vcf.VCFFormatHeaderLine; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFInfoHeaderLine; - -import java.util.HashMap; -import java.util.Map; - -/** - * See #BCFWriter for documentation on this classes role in encoding BCF2 files - * - * @author Mark DePristo - * @since 06/12 - */ -public class BCF2FieldWriterManager { - final Map siteWriters = new HashMap(); - final Map genotypesWriters = new HashMap(); - final IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors(); - - public BCF2FieldWriterManager() { } - - /** - * Setup the FieldWriters appropriate to each INFO and FORMAT in the VCF header - * - * Must be called before any of the getter methods will work - * - * @param header a VCFHeader containing description for every INFO and FORMAT field we'll attempt to write out to BCF - * @param encoder the encoder we are going to use to write out the BCF2 data - * @param stringDictionary a map from VCFHeader strings to their offsets for encoding - 
*/ - public void setup(final VCFHeader header, final BCF2Encoder encoder, final Map stringDictionary) { - for (final VCFInfoHeaderLine line : header.getInfoHeaderLines()) { - final String field = line.getID(); - final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, line, encoder, stringDictionary); - add(siteWriters, field, writer); - } - - for (final VCFFormatHeaderLine line : header.getFormatHeaderLines()) { - final String field = line.getID(); - final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, line, encoder, stringDictionary); - add(genotypesWriters, field, writer); - } - } - - private final void add(final Map map, final String field, final T writer) { - if ( map.containsKey(field) ) - throw new IllegalStateException("BUG: field " + field + " already seen in VCFHeader while building BCF2 field encoders"); - map.put(field, writer); - } - - // ----------------------------------------------------------------- - // - // Master routine to look at the header, a specific line, and - // build an appropriate SiteWriter for that header element - // - // ----------------------------------------------------------------- - - private BCF2FieldWriter.SiteWriter createInfoWriter(final VCFHeader header, - final VCFInfoHeaderLine line, - final BCF2Encoder encoder, - final Map dict) { - return new BCF2FieldWriter.GenericSiteWriter(header, createFieldEncoder(line, encoder, dict, false)); - } - - private BCF2FieldEncoder createFieldEncoder(final VCFCompoundHeaderLine line, - final BCF2Encoder encoder, - final Map dict, - final boolean createGenotypesEncoders ) { - - if ( createGenotypesEncoders && intGenotypeFieldAccessors.getAccessor(line.getID()) != null ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED && line.getType() != VCFHeaderLineType.Integer ) - System.err.println("Warning: field " + line.getID() + " expected to encode an integer but saw " + line.getType() + " for record " + line); - return new BCF2FieldEncoder.IntArray(line, dict); - } 
else if ( createGenotypesEncoders && line.getID().equals(VCFConstants.GENOTYPE_KEY) ) { - return new BCF2FieldEncoder.GenericInts(line, dict); - } else { - switch ( line.getType() ) { - case Character: - case String: - return new BCF2FieldEncoder.StringOrCharacter(line, dict); - case Flag: - return new BCF2FieldEncoder.Flag(line, dict); - case Float: - return new BCF2FieldEncoder.Float(line, dict); - case Integer: - if ( line.isFixedCount() && line.getCount() == 1 ) - return new BCF2FieldEncoder.AtomicInt(line, dict); - else - return new BCF2FieldEncoder.GenericInts(line, dict); - default: - throw new IllegalArgumentException("Unexpected type for field " + line.getID()); - } - } - } - - // ----------------------------------------------------------------- - // - // Master routine to look at the header, a specific line, and - // build an appropriate Genotypes for that header element - // - // ----------------------------------------------------------------- - - private BCF2FieldWriter.GenotypesWriter createGenotypesWriter(final VCFHeader header, - final VCFFormatHeaderLine line, - final BCF2Encoder encoder, - final Map dict) { - final String field = line.getID(); - final BCF2FieldEncoder fieldEncoder = createFieldEncoder(line, encoder, dict, true); - - if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { - return new BCF2FieldWriter.GTWriter(header, fieldEncoder); - } else if ( line.getID().equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { - return new BCF2FieldWriter.FTGenotypesWriter(header, fieldEncoder); - } else if ( intGenotypeFieldAccessors.getAccessor(field) != null ) { - return new BCF2FieldWriter.IGFGenotypesWriter(header, fieldEncoder, intGenotypeFieldAccessors.getAccessor(field)); - } else if ( line.getType() == VCFHeaderLineType.Integer ) { - return new BCF2FieldWriter.IntegerTypeGenotypesWriter(header, fieldEncoder); - } else { - return new BCF2FieldWriter.StaticallyTypeGenotypesWriter(header, fieldEncoder); - } - } - - // 
----------------------------------------------------------------- - // - // Accessors to get site / genotype writers - // - // ----------------------------------------------------------------- - - /** - * Get a site writer specialized to encode values for site info field - * @param field key found in the VCF header INFO records - * @return non-null writer if one can be found, or null if none exists for field - */ - public BCF2FieldWriter.SiteWriter getSiteFieldWriter(final String field) { - return getWriter(field, siteWriters); - } - - /** - * Get a genotypes writer specialized to encode values for genotypes field - * @param field key found in the VCF header FORMAT records - * @return non-null writer if one can be found, or null if none exists for field - */ - public BCF2FieldWriter.GenotypesWriter getGenotypeFieldWriter(final String field) { - return getWriter(field, genotypesWriters); - } - - public T getWriter(final String key, final Map map) { - return map.get(key); - } -} diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java index 07b2d0d41e..517d5eeb3d 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/BCF2Writer.java @@ -27,20 +27,21 @@ import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.Log; import htsjdk.samtools.util.RuntimeIOException; import htsjdk.tribble.index.IndexCreator; import htsjdk.variant.bcf2.BCF2Codec; +import htsjdk.variant.bcf2.BCF2Dictionary; +import htsjdk.variant.bcf2.BCF2Encoder; import htsjdk.variant.bcf2.BCF2Type; import htsjdk.variant.bcf2.BCF2Utils; import htsjdk.variant.bcf2.BCFVersion; import htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.Genotype; -import htsjdk.variant.variantcontext.GenotypeBuilder; import 
htsjdk.variant.variantcontext.LazyGenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; -import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.bcf2.BCF2FieldWriter.BCF2FieldWriterManager; import htsjdk.variant.vcf.VCFContigHeaderLine; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFUtils; @@ -51,19 +52,16 @@ import java.io.OutputStream; import java.io.OutputStreamWriter; import java.nio.file.Path; -import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; import java.util.HashMap; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; /** * VariantContextWriter that emits BCF2 binary encoding - * + *

    * Overall structure of this writer is complex for efficiency reasons - * + *

    * -- The BCF2Writer manages the low-level BCF2 encoder, the mappings * from contigs and strings to offsets, the VCF header, and holds the * lower-level encoders that map from VC and Genotype fields to their @@ -71,29 +69,23 @@ * like POS, contig, the size of info and genotype data, QUAL, etc. It * has loops over the INFO and GENOTYPES to encode each individual datum * with the generic field encoders, but the actual encoding work is - * done with by the FieldWriters classes themselves - * + * done with by the FieldWriters classes themselves. The piece of code + * that determines which FieldWriters to associate with each SITE and + * GENOTYPE field is the BCF2FieldWriterManager. + *

    * -- BCF2FieldWriter are specialized classes for writing out SITE and * genotype information for specific SITE/GENOTYPE fields (like AC for * sites and GQ for genotypes). These are objects in themselves because - * the manage all of the complexity of relating the types in the VCF header + * they manage all of the complexity of relating the types in the VCF header * with the proper encoding in BCF as well as the type representing this * in java. Relating all three of these pieces of information together - * is the main complexity challenge in the encoder. The piece of code - * that determines which FieldWriters to associate with each SITE and - * GENOTYPE field is the BCF2FieldWriterManager. These FieldWriters - * are specialized for specific combinations of encoders (see below) - * and contexts (genotypes) for efficiency, so they smartly manage - * the writing of PLs (encoded as int[]) directly into the lowest - * level BCFEncoder. - * - * -- At the third level is the BCF2FieldEncoder, relatively simple - * pieces of code that handle the task of determining the right - * BCF2 type for specific field values, as well as reporting back - * information such as the number of elements used to encode it - * (simple for atomic values like Integer but complex for PLs - * or lists of strings) - * + * is the main complexity challenge in the encoder. These classes are + * responsible for extracting the necessary data from the VariantContext + * or Genotype, determining its BCF type and size, and writing it out. + * These FieldWriters are specialized for specific combinations of VCF type + * and contexts for efficiency, so they smartly manage the writing of PLs + * (encoded as int[]) directly into the lowest level BCFEncoder. + *

    * -- At the lowest level is the BCF2Encoder itself. This provides * just the limited encoding methods specified by the BCF2 specification. This encoder * doesn't do anything but make it possible to conveniently write out valid low-level @@ -103,20 +95,25 @@ * @since 06/12 */ class BCF2Writer extends IndexingVariantContextWriter { + private static final Log log = Log.getInstance(BCF2Writer.class); + public static final int MAJOR_VERSION = 2; - public static final int MINOR_VERSION = 1; + public static final int MINOR_VERSION = 2; + + public static final BCFVersion VERSION = new BCFVersion(MAJOR_VERSION, MINOR_VERSION); final private static boolean ALLOW_MISSING_CONTIG_LINES = false; private final OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support private VCFHeader header; - private final Map contigDictionary = new HashMap(); - private final Map stringDictionaryMap = new LinkedHashMap(); + private final Map contigDictionary = new HashMap<>(); + private final Map stringDictionaryMap = new HashMap<>(); private final boolean doNotWriteGenotypes; - private String[] sampleNames = null; + private final Map> genotypeKeys = new HashMap<>(); - private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives - final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager(); + private BCF2Encoder encoder; // initialized after the header arrives + private BCFVersion version; + private BCF2FieldWriterManager fieldWriterManager; /** * cached results for whether we can write out raw genotypes data. 
@@ -134,15 +131,15 @@ public BCF2Writer(final File location, final OutputStream output, final SAMSeque } public BCF2Writer(final Path location, final OutputStream output, final SAMSequenceDictionary refDict, - final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) { + final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) { super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing); this.outputStream = getOutputStream(); this.doNotWriteGenotypes = doNotWriteGenotypes; } public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict, - final IndexCreator indexCreator, - final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) { + final IndexCreator indexCreator, + final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) { this(IOUtil.toPath(location), output, refDict, indexCreator, enableOnTheFlyIndexing, doNotWriteGenotypes); } @@ -162,45 +159,70 @@ public BCF2Writer(final Path location, final OutputStream output, final SAMSeque // -------------------------------------------------------------------------------- @Override - public void writeHeader(VCFHeader header) { + public void writeHeader(final VCFHeader header) { setHeader(header); try { // write out the header into a byte stream, get its length, and write everything to the file final ByteArrayOutputStream capture = new ByteArrayOutputStream(); final OutputStreamWriter writer = new OutputStreamWriter(capture); - this.header = VCFWriter.writeHeader(this.header, writer, VCFWriter.getVersionLine(), "BCF2 stream"); + VCFWriter.writeHeader(this.header, writer, "BCF2 stream"); writer.append('\0'); // the header is null terminated by a byte writer.close(); final byte[] headerBytes = capture.toByteArray(); - new BCFVersion(MAJOR_VERSION, MINOR_VERSION).write(outputStream); + BCF2Writer.VERSION.write(outputStream); BCF2Type.INT32.write(headerBytes.length, outputStream); 
outputStream.write(headerBytes); outputHasBeenWritten = true; - } catch (IOException e) { + } catch (final IOException e) { throw new RuntimeIOException("BCF2 stream: Got IOException while trying to write BCF2 header", e); } } @Override - public void add( VariantContext vc ) { - if ( doNotWriteGenotypes ) + public void add(VariantContext vc) { + if (doNotWriteGenotypes) vc = new VariantContextBuilder(vc).noGenotypes().make(); vc = vc.fullyDecode(header, false); super.add(vc); // allow on the fly indexing try { - final byte[] infoBlock = buildSitesData(vc); - final byte[] genotypesBlock = buildSamplesData(vc); + // Sites data + buildSitesData(vc); + final int sitesLength = encoder.getSize(); + + // Genotypes data + final int genotypesLength; + final BCF2Codec.LazyData lazyData = getLazyData(vc); // has critical side effects + final boolean lazyDataUsable = lazyData != null && lazyData.version.equals(this.version); + if (lazyDataUsable) { + // We never decoded any data from this BCF file, and its contents were already encoded in the same BCF + // version as we are currently writing, so we don't need to re-encode the samples data. + // Note that the version check is necessary so that we do not write contents encoded using an old + // version of BCF as if it were a newer version, as this can cause problems with e.g. 
MISSING values + genotypesLength = lazyData.bytes.length; + } else { + // we have to do work to convert the VC into a BCF2 byte stream + buildSamplesData(vc); + genotypesLength = encoder.getSize() - sitesLength; + } + + // Write lengths + BCF2Type.INT32.write(sitesLength, outputStream); + BCF2Type.INT32.write(genotypesLength, outputStream); - // write the two blocks to disk - writeBlock(infoBlock, genotypesBlock); + // Write the encoder's buffer into the output stream + // If there was no lazy data, this also contains the genotypes data + encoder.write(outputStream); + if (lazyDataUsable) { + // The encoder only contained sites data, so we need to write the lazy data + outputStream.write(lazyData.bytes); + } outputHasBeenWritten = true; - } - catch ( IOException e ) { - throw new RuntimeIOException("Error writing record to BCF2 file: " + vc.toString(), e); + } catch (final IOException e) { + throw new RuntimeIOException("Error writing record to BCF2 file: " + vc, e); } } @@ -208,8 +230,7 @@ public void add( VariantContext vc ) { public void close() { try { outputStream.flush(); - } - catch ( IOException e ) { + } catch (final IOException e) { throw new RuntimeIOException("Failed to flush BCF2 file"); } super.close(); @@ -220,39 +241,59 @@ public void setHeader(final VCFHeader header) { if (outputHasBeenWritten) { throw new IllegalStateException("The header cannot be modified after the header or variants have been written to the output stream."); } + + version = getBCFVersionFromHeader(header); + encoder = BCF2Encoder.getEncoder(version); + // make sure the header is sorted correctly - this.header = doNotWriteGenotypes ? new VCFHeader(header.getMetaDataInSortedOrder()) : new VCFHeader( - header.getMetaDataInSortedOrder(), header.getGenotypeSamples()); + this.header = doNotWriteGenotypes + ? 
new VCFHeader(header.getMetaDataInSortedOrder()) + : new VCFHeader(header.getMetaDataInSortedOrder(), header.getGenotypeSamples()); + + // TODO should follow up on hts-specs and clarify the relationship between ##dictionary and IDX fields + // Error on ##dictionary lines, we don't know what to do with them + if (this.header.getMetaDataInInputOrder().stream().anyMatch(line -> line.getKey().equals("dictionary"))) { + log.warn("Use of the ##dictionary line is not supported"); + } + // create the config offsets map - if ( this.header.getContigLines().isEmpty() ) { - if ( ALLOW_MISSING_CONTIG_LINES ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("No contig dictionary found in header, falling back to reference sequence dictionary"); - } + if (this.header.getContigLines().isEmpty()) { + if (ALLOW_MISSING_CONTIG_LINES) { + log.debug("No contig dictionary found in header, falling back to reference sequence dictionary"); + // The reference sequence dictionary should never contain IDX fields createContigDictionary(VCFUtils.makeContigHeaderLines(getRefDict(), null)); } else { throw new IllegalStateException("Cannot write BCF2 file with missing contig lines"); } } else { - createContigDictionary(this.header.getContigLines()); - } - // set up the map from dictionary string values -> offset - final ArrayList dict = BCF2Utils.makeDictionary(this.header); - for ( int i = 0; i < dict.size(); i++ ) { - stringDictionaryMap.put(dict.get(i), i); + final BCF2Dictionary dict = BCF2Dictionary.makeBCF2ContigDictionary(header, BCF2Writer.VERSION); + dict.forEach((offset, string) -> contigDictionary.put(string, offset)); } - sampleNames = this.header.getGenotypeSamples().toArray(new String[this.header.getNGenotypeSamples()]); - // setup the field encodings - fieldManager.setup(this.header, encoder, stringDictionaryMap); + // Create offset -> string map then turn inside-out + final BCF2Dictionary dict = BCF2Dictionary.makeBCF2StringDictionary(this.header, 
BCF2Writer.VERSION); + dict.forEach((offset, string) -> stringDictionaryMap.put(string, offset)); + // Set up the field encodings + fieldWriterManager = new BCF2FieldWriterManager(header, stringDictionaryMap, encoder); + } + + /** + * Determine the appropriate BCF version to use to encode a VCF with based on the version of its VCF header + * Note: currently htsjdk only supports one version of BCF (2.2), but this method is here for if/when + * new BCF versions are added. + * @param header + * @return + */ + private static BCFVersion getBCFVersionFromHeader(final VCFHeader header) { + return BCF2Codec.ALLOWED_BCF_VERSION; } // -------------------------------------------------------------------------------- // // implicit block // - // The first four records of BCF are inline untype encoded data of: + // The first four records of BCF are inline untyped encoded data of: // // 4 byte integer chrom offset // 4 byte integer start @@ -260,23 +301,23 @@ public void setHeader(final VCFHeader header) { // 4 byte float qual // // -------------------------------------------------------------------------------- - private byte[] buildSitesData( VariantContext vc ) throws IOException { + private void buildSitesData(final VariantContext vc) throws IOException { final int contigIndex = contigDictionary.get(vc.getContig()); - if ( contigIndex == -1 ) + if (contigIndex == -1) throw new IllegalStateException(String.format("Contig %s not found in sequence dictionary from reference", vc.getContig())); - // note use of encodeRawValue to not insert the typing byte - encoder.encodeRawValue(contigIndex, BCF2Type.INT32); + // note use of encodeRawInt to not insert the typing byte + encoder.encodeRawInt(contigIndex, BCF2Type.INT32); // pos. GATK is 1 based, BCF2 is 0 based - encoder.encodeRawValue(vc.getStart() - 1, BCF2Type.INT32); + encoder.encodeRawInt(vc.getStart() - 1, BCF2Type.INT32); // ref length. 
GATK is closed, but BCF2 is open so the ref length is GATK end - GATK start + 1 // for example, a SNP is in GATK at 1:10-10, which has ref length 10 - 10 + 1 = 1 - encoder.encodeRawValue(vc.getEnd() - vc.getStart() + 1, BCF2Type.INT32); + encoder.encodeRawInt(vc.getEnd() - vc.getStart() + 1, BCF2Type.INT32); // qual - if ( vc.hasLog10PError() ) + if (vc.hasLog10PError()) encoder.encodeRawFloat((float) vc.getPhredScaledQual()); else encoder.encodeRawMissingValue(BCF2Type.FLOAT); @@ -294,14 +335,12 @@ private byte[] buildSitesData( VariantContext vc ) throws IOException { buildAlleles(vc); buildFilter(vc); buildInfo(vc); - - return encoder.getRecordBytes(); } /** * Can we safely write on the raw (undecoded) genotypes of an input VC? - * + *

    * The cache depends on the undecoded lazy data header == lastVCFHeaderOfUnparsedGenotypes, in * which case we return the previous result. If it's not cached, we use the BCF2Util to * compare the VC header with our header (expensive) and cache it. @@ -310,9 +349,9 @@ private byte[] buildSitesData( VariantContext vc ) throws IOException { * @return */ private boolean canSafelyWriteRawGenotypesBytes(final BCF2Codec.LazyData lazyData) { - if ( lazyData.header != lastVCFHeaderOfUnparsedGenotypes ) { + if (lazyData.header != lastVCFHeaderOfUnparsedGenotypes) { // result is already cached - canPassOnUnparsedGenotypeDataForLastVCFHeader = BCF2Utils.headerLinesAreOrderedConsistently(this.header,lazyData.header); + canPassOnUnparsedGenotypeDataForLastVCFHeader = BCF2Utils.headerLinesAreOrderedConsistently(this.header, lazyData.header); lastVCFHeaderOfUnparsedGenotypes = lazyData.header; } @@ -320,12 +359,12 @@ private boolean canSafelyWriteRawGenotypesBytes(final BCF2Codec.LazyData lazyDat } private BCF2Codec.LazyData getLazyData(final VariantContext vc) { - if ( vc.getGenotypes().isLazyWithData() ) { - final LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes(); + if (vc.getGenotypes().isLazyWithData()) { + final LazyGenotypesContext lgc = (LazyGenotypesContext) vc.getGenotypes(); - if ( lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData && - canSafelyWriteRawGenotypesBytes((BCF2Codec.LazyData) lgc.getUnparsedGenotypeData())) { - return (BCF2Codec.LazyData)lgc.getUnparsedGenotypeData(); + if (lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData && + canSafelyWriteRawGenotypesBytes((BCF2Codec.LazyData) lgc.getUnparsedGenotypeData())) { + return (BCF2Codec.LazyData) lgc.getUnparsedGenotypeData(); } else { lgc.decode(); // WARNING -- required to avoid keeping around bad lazy data for too long } @@ -336,7 +375,7 @@ private BCF2Codec.LazyData getLazyData(final VariantContext vc) { /** * Try to get the nGenotypeFields as efficiently as possible. 
- * + *

    * If this is a lazy BCF2 object just grab the field count from there, * otherwise do the whole counting by types test in the actual data * @@ -345,80 +384,46 @@ private BCF2Codec.LazyData getLazyData(final VariantContext vc) { */ private int getNGenotypeFormatFields(final VariantContext vc) { final BCF2Codec.LazyData lazyData = getLazyData(vc); - return lazyData != null ? lazyData.nGenotypeFields : vc.calcVCFGenotypeKeys(header).size(); + if (lazyData == null) { + // Calculate genotype keys of a VariantContext and cache result + // This computation can be expensive as it needs to inspect every genotype in the VC, + // so we cache the result as it will be needed again when writing the genotype information + return genotypeKeys.computeIfAbsent(vc, v -> v.calcVCFGenotypeKeys(header)).size(); + } else { + return lazyData.nGenotypeFields; + } } - private void buildID( VariantContext vc ) throws IOException { + private void buildID(final VariantContext vc) throws IOException { encoder.encodeTypedString(vc.getID()); } - private void buildAlleles( VariantContext vc ) throws IOException { - for ( Allele allele : vc.getAlleles() ) { + private void buildAlleles(final VariantContext vc) throws IOException { + for (final Allele allele : vc.getAlleles()) { final byte[] s = allele.getDisplayBases(); - if ( s == null ) + if (s == null) throw new IllegalStateException("BUG: BCF2Writer encountered null padded allele" + allele); encoder.encodeTypedString(s); } } - private void buildFilter( VariantContext vc ) throws IOException { - if ( vc.isFiltered() ) { + private void buildFilter(final VariantContext vc) throws IOException { + if (vc.isFiltered()) { encodeStringsByRef(vc.getFilters()); - } else if ( vc.filtersWereApplied() ) { - encodeStringsByRef(Collections.singleton(VCFConstants.PASSES_FILTERS_v4)); + } else if (vc.filtersWereApplied()) { + // PASS is always implicitly encoded as 0 + encoder.encodeTypedInt(0, BCF2Type.INT8); } else { 
encoder.encodeTypedMissing(BCF2Type.INT8); } } - private void buildInfo( VariantContext vc ) throws IOException { - for ( Map.Entry infoFieldEntry : vc.getAttributes().entrySet() ) { - final String field = infoFieldEntry.getKey(); - final BCF2FieldWriter.SiteWriter writer = fieldManager.getSiteFieldWriter(field); - if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "INFO"); - writer.start(encoder, vc); - writer.site(encoder, vc); - writer.done(encoder, vc); - } + private void buildInfo(final VariantContext vc) throws IOException { + fieldWriterManager.writeInfo(vc); } - private byte[] buildSamplesData(final VariantContext vc) throws IOException { - final BCF2Codec.LazyData lazyData = getLazyData(vc); // has critical side effects - if ( lazyData != null ) { - // we never decoded any data from this BCF file, so just pass it back - return lazyData.bytes; - } - - // we have to do work to convert the VC into a BCF2 byte stream - final List genotypeFields = vc.calcVCFGenotypeKeys(header); - for ( final String field : genotypeFields ) { - final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field); - if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT"); - - assert writer != null; - - writer.start(encoder, vc); - for ( final String name : sampleNames ) { - Genotype g = vc.getGenotype(name); - if ( g == null ) g = GenotypeBuilder.createMissing(name, writer.nValuesPerGenotype); - writer.addGenotype(encoder, vc, g); - } - writer.done(encoder, vc); - } - return encoder.getRecordBytes(); - } - - /** - * Throws a meaningful error message when a field (INFO or FORMAT) is found when writing out a file - * but there's no header line for it. 
- * - * @param vc - * @param field - * @param fieldType - */ - private void errorUnexpectedFieldToWrite(final VariantContext vc, final String field, final String fieldType) { - throw new IllegalStateException("Found field " + field + " in the " + fieldType + " fields of VariantContext at " + - vc.getContig() + ":" + vc.getStart() + " from " + vc.getSource() + " but this hasn't been defined in the VCFHeader"); + private void buildSamplesData(final VariantContext vc) throws IOException { + fieldWriterManager.writeFormat(vc, genotypeKeys.computeIfAbsent(vc, v -> v.calcVCFGenotypeKeys(header))); } // -------------------------------------------------------------------------------- @@ -427,34 +432,20 @@ private void errorUnexpectedFieldToWrite(final VariantContext vc, final String f // // -------------------------------------------------------------------------------- - /** - * Write the data in the encoder to the outputstream as a length encoded - * block of data. After this call the encoder stream will be ready to - * start a new data block - * - * @throws IOException - */ - private void writeBlock(final byte[] infoBlock, final byte[] genotypesBlock) throws IOException { - BCF2Type.INT32.write(infoBlock.length, outputStream); - BCF2Type.INT32.write(genotypesBlock.length, outputStream); - outputStream.write(infoBlock); - outputStream.write(genotypesBlock); - } - - private BCF2Type encodeStringsByRef(final Collection strings) throws IOException { - final List offsets = new ArrayList(strings.size()); + private void encodeStringsByRef(final Collection strings) throws IOException { + final int[] offsets = new int[strings.size()]; + int i = 0; - // iterate over strings until we find one that needs 16 bits, and break - for ( final String string : strings ) { + // Map strings to their position in string dictionary + for (final String string : strings) { final Integer got = stringDictionaryMap.get(string); - if ( got == null ) throw new IllegalStateException("Format error: 
could not find string " + string + " in header as required by BCF"); - final int offset = got; - offsets.add(offset); + if (got == null) + throw new IllegalStateException("Format error: could not find string " + string + " in header as required by BCF"); + offsets[i] = got; + i++; } - final BCF2Type type = BCF2Utils.determineIntegerType(offsets); - encoder.encodeTyped(offsets, type); - return type; + encoder.encodeTypedVecInt(offsets); } /** @@ -464,7 +455,7 @@ private BCF2Type encodeStringsByRef(final Collection strings) throws IOE */ private void createContigDictionary(final Collection contigLines) { int offset = 0; - for ( VCFContigHeaderLine contig : contigLines ) + for (final VCFContigHeaderLine contig : contigLines) contigDictionary.put(contig.getID(), offset++); } } diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/VCFVersionUpgradePolicy.java b/src/main/java/htsjdk/variant/variantcontext/writer/VCFVersionUpgradePolicy.java new file mode 100644 index 0000000000..1d43b1c486 --- /dev/null +++ b/src/main/java/htsjdk/variant/variantcontext/writer/VCFVersionUpgradePolicy.java @@ -0,0 +1,40 @@ +package htsjdk.variant.variantcontext.writer; + +/** + * The policy {@link VCFWriter} will use to determine the version of VCF to write from a given VCF file. + *

    + * htsjdk's behavior to this point has been to stamp the most recent version of VCF onto all VCF files + written by VCFWriter regardless of the input VCF's original version. This had been possible as new versions + of VCF were backwards compatible and version upgrading was infallible. VCF 4.3 is stricter than previous versions, + meaning that some previously valid files are invalid in 4.3 and upgrading from pre-4.3 to 4.3+ can sometimes fail. + *

    + * This class is a temporary workaround to allow opt-in 4.3 writing support in a way that does not break + * workflows that may process pre-4.3 files that are invalid 4.3, but should be removed once proper versioning + * support for VCF is incorporated into htsjdk. + */ +public enum VCFVersionUpgradePolicy { + /** + * Interpret VCF files with exactly the version that they have on read. The VCF is assumed to be valid + * for its version and no version validation will be performed. The written VCF will have the same version + * as the one which was read. + */ + DO_NOT_UPGRADE, + + /** + * Write pre-4.3 files as 4.2, to which automatic upgrading should always be possible, and + * write 4.3+ files as 4.3. + */ + ONLY_INFALLIBLE_UPGRADE, + + /** + * Inspect the headers of pre-4.3 files to determine if they can be automatically upgraded to 4.3, + * and if automatic upgrade is possible write them as 4.3, or else write them as 4.2. + */ + UPGRADE_OR_FALLBACK, + + /** + * Inspect the headers of pre 4.3 files to determine if they can be automatically upgraded to 4.3, + * and abort with an error if automatic upgrade is not possible + */ + UPGRADE_OR_FAIL, +} diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java b/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java index 21f1453fbb..d9977a66d8 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java @@ -27,6 +27,7 @@ import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.Log; import htsjdk.samtools.util.RuntimeIOException; import htsjdk.tribble.index.IndexCreator; import htsjdk.variant.variantcontext.VariantContext; @@ -45,14 +46,13 @@ import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.file.Path; +import java.util.stream.Collectors; /** * this class writes VCF files */ class VCFWriter extends 
IndexingVariantContextWriter { - - private static final String VERSION_LINE = - VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_2.getFormatString() + "=" + VCFHeaderVersion.VCF4_2.getVersionString(); + protected final static Log logger = Log.getInstance(VCFWriter.class); // Initialized when the header is written to the output stream private VCFEncoder vcfEncoder = null; @@ -150,12 +150,11 @@ private void writeAndResetBuffer() throws IOException { @Override public void writeHeader(final VCFHeader header) { - - // note we need to update the mHeader object after this call because they header + // note we need to update the mHeader object after this call because the header // may have genotypes trimmed out of it, if doNotWriteGenotypes is true setHeader(header); try { - writeHeader(this.mHeader, writer, getVersionLine(), getStreamName()); + writeHeader(this.mHeader, writer, getStreamName()); writeAndResetBuffer(); outputHasBeenWritten = true; } catch ( IOException e ) { @@ -163,24 +162,32 @@ public void writeHeader(final VCFHeader header) { } } - public static String getVersionLine() { - return VERSION_LINE; - } - - public static VCFHeader writeHeader(VCFHeader header, + @Deprecated // starting after version 2.24.1 + public static VCFHeader writeHeader(final VCFHeader header, final Writer writer, final String versionLine, final String streamNameForError) { + // Determine requested version from versionLine + final VCFHeaderVersion requestedVersion = VCFHeaderVersion.fromHeaderVersionLine(versionLine); + final VCFHeaderLine requestedVersionLine = VCFHeader.makeHeaderVersionLine(requestedVersion); + // Set version inside header and validate lines + header.addMetaDataLine(requestedVersionLine); + return writeHeader(header, writer, streamNameForError); + } + public static VCFHeader writeHeader(final VCFHeader header, + final Writer writer, + final String streamNameForError) { try { - rejectVCFV43Headers(header); - - // the file format field needs to be written first - 
writer.write(versionLine + "\n"); + // The file format field needs to be written first; below any file format lines + // embedded in the header will be removed + writer.write(header.getVCFHeaderVersion().toHeaderVersionLine() + "\n"); for (final VCFHeaderLine line : header.getMetaDataInSortedOrder() ) { - if ( VCFHeaderVersion.isFormatString(line.getKey()) ) + // Remove the fileformat header lines + if ( VCFHeaderVersion.isFormatString(line.getKey()) ) { continue; + } writer.write(VCFHeader.METADATA_INDICATOR); writer.write(line.toString()); @@ -189,14 +196,9 @@ public static VCFHeader writeHeader(VCFHeader header, // write out the column line writer.write(VCFHeader.HEADER_INDICATOR); - boolean isFirst = true; - for (final VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) { - if ( isFirst ) - isFirst = false; // don't write out a field separator - else - writer.write(VCFConstants.FIELD_SEPARATOR); - writer.write(field.toString()); - } + writer.write(header.getHeaderFields().stream() + .map(Enum::name) + .collect(Collectors.joining(VCFConstants.FIELD_SEPARATOR))); if ( header.hasGenotypingData() ) { writer.write(VCFConstants.FIELD_SEPARATOR); @@ -260,20 +262,10 @@ public void add(final VariantContext context) { @Override public void setHeader(final VCFHeader header) { - rejectVCFV43Headers(header); - if (outputHasBeenWritten) { throw new IllegalStateException("The header cannot be modified after the header or variants have been written to the output stream."); } this.mHeader = doNotWriteGenotypes ? 
new VCFHeader(header.getMetaDataInSortedOrder()) : header; this.vcfEncoder = new VCFEncoder(this.mHeader, this.allowMissingFieldsInHeader, this.writeFullFormatField); } - - // writing vcf v4.3 is not implemented - private static void rejectVCFV43Headers(final VCFHeader targetHeader) { - if (targetHeader.getVCFHeaderVersion() != null && targetHeader.getVCFHeaderVersion().isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - throw new IllegalArgumentException(String.format("Writing VCF version %s is not implemented", targetHeader.getVCFHeaderVersion())); - } - - } } diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java b/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java index 67656fbe03..0dd3e9d77c 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java @@ -481,7 +481,8 @@ else if (STREAM_TYPES.contains(this.outType)) if ((refDict == null) && (options.contains(Options.INDEX_ON_THE_FLY))) throw new IllegalArgumentException("A reference dictionary is required for creating Tribble indices on the fly"); - writer = createBCFWriter(outPath, outStreamFromFile); + // BCFs are always bgzipped, but the compression level can be set to 0 to only apply trivial compression + writer = createBCFWriter(outPath, new BlockCompressedOutputStream(outStreamFromFile, outPath)); break; case VCF_STREAM: writer = createVCFWriter(null, outStreamFromFile); @@ -492,7 +493,7 @@ else if (STREAM_TYPES.contains(this.outType)) options.remove(Options.INDEX_ON_THE_FLY); } - writer = createBCFWriter(null, outStream); + writer = createBCFWriter(null, new BlockCompressedOutputStream(outStreamFromFile, outPath)); break; } @@ -571,17 +572,19 @@ private static boolean isCompressedVCF(final Path outPath) { private VariantContextWriter createVCFWriter(final Path writerPath, final OutputStream writerStream) { 
if (idxCreator == null) { return new VCFWriter(writerPath, writerStream, refDict, - options.contains(Options.INDEX_ON_THE_FLY), - options.contains(Options.DO_NOT_WRITE_GENOTYPES), - options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER), - options.contains(Options.WRITE_FULL_FORMAT_FIELD)); - } - else { - return new VCFWriter(writerPath, writerStream, refDict, idxCreator, - options.contains(Options.INDEX_ON_THE_FLY), - options.contains(Options.DO_NOT_WRITE_GENOTYPES), - options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER), - options.contains(Options.WRITE_FULL_FORMAT_FIELD)); + options.contains(Options.INDEX_ON_THE_FLY), + options.contains(Options.DO_NOT_WRITE_GENOTYPES), + options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER), + options.contains(Options.WRITE_FULL_FORMAT_FIELD) + ); + } else { + return new VCFWriter( + writerPath, writerStream, refDict, idxCreator, + options.contains(Options.INDEX_ON_THE_FLY), + options.contains(Options.DO_NOT_WRITE_GENOTYPES), + options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER), + options.contains(Options.WRITE_FULL_FORMAT_FIELD) + ); } } diff --git a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java index bfa718453e..753a1c16f1 100644 --- a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java @@ -25,17 +25,22 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.Defaults; import htsjdk.samtools.util.BlockCompressedInputStream; +import htsjdk.samtools.util.Log; import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.QualityUtil; import htsjdk.tribble.AsciiFeatureCodec; import htsjdk.tribble.Feature; import htsjdk.tribble.NameAwareCodec; import htsjdk.tribble.TribbleException; import htsjdk.tribble.index.tabix.TabixFormat; +import htsjdk.tribble.readers.LineIterator; import htsjdk.tribble.util.ParsingUtils; import htsjdk.utils.ValidationUtils; import 
htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.*; +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; import java.io.FileNotFoundException; import java.io.IOException; @@ -43,9 +48,12 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.*; +import java.util.stream.Collectors; import java.util.zip.GZIPInputStream; public abstract class AbstractVCFCodec extends AsciiFeatureCodec implements NameAwareCodec { + protected final static Log logger = Log.getInstance(AbstractVCFCodec.class); + public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20); protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th @@ -54,39 +62,30 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec protected VCFHeader header = null; protected VCFHeaderVersion version = null; - private final static VCFTextTransformer percentEncodingTextTransformer = new VCFPercentEncodedTextTransformer(); - private final static VCFTextTransformer passThruTextTransformer = new VCFPassThruTextTransformer(); - //by default, we use the passThruTextTransformer (assume pre v4.3) - private VCFTextTransformer vcfTextTransformer = passThruTextTransformer; - // a mapping of the allele - protected Map> alleleMap = new HashMap>(3); - - // for performance testing purposes - public static boolean validate = true; + protected final Map> alleleMap = new HashMap<>(3); // a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over // todo: make this thread safe? 
protected String[] parts = null; protected String[] genotypeParts = null; - protected final String[] locParts = new String[6]; // for performance we cache the hashmap of filter encodings for quick lookup - protected HashMap> filterHash = new HashMap>(); + protected final HashMap> filterHash = new HashMap<>(); // we store a name to give to each of the variant contexts we emit protected String name = "Unknown"; protected int lineNo = 0; - protected Map stringCache = new HashMap(); + protected final Map stringCache = new HashMap<>(); protected boolean warnedAboutNoEqualsForNonFlag = false; /** * If true, then we'll magically fix up VCF headers on the fly when we read them in */ - protected boolean doOnTheFlyModifications = true; + protected VCFVersionUpgradePolicy policy = Defaults.VCF_VERSION_TRANSITION_POLICY; /** * If non-null, we will replace the sample name read from the VCF header with this sample name. This feature works @@ -117,17 +116,72 @@ class LazyVCFGenotypesParser implements LazyGenotypesContext.LazyParser { @Override public LazyGenotypesContext.LazyData parse(final Object data) { - //System.out.printf("Loading genotypes... %s:%d%n", contig, start); return createGenotypeMap((String) data, alleles, contig, start); } } /** - * parse the filter string, first checking to see if we already have parsed it in a previous attempt - * @param filterString the string to parse - * @return a set of the filters applied + * Return true if this codec can decode files with the target version + * @param targetVersion the target version to consider + * @return true if this codec can handle targetVersion */ - protected abstract List parseFilters(String filterString); + public abstract boolean canDecodeVersion(final VCFHeaderVersion targetVersion); + + /** + * Reads all of the header from the provided iterator, but reads no further. 
+ * @param lineIterator the line reader to take header lines from + * @return The parsed header + */ + @Override + public Object readActualHeader(final LineIterator lineIterator) { + final List headerStrings = new ArrayList<>(); + + // Extract one line and retrieve the file format and version, which must be the first line, + // and then add it back into the headerLines. + final VCFHeaderVersion fileFormatVersion = readFormatVersionLine(lineIterator); + headerStrings.add(fileFormatVersion.toHeaderVersionLine()); + + // collect metadata lines until we hit the required header line, or a non-metadata line, + // in which case throw since there was no header line + while (lineIterator.hasNext()) { + final String line = lineIterator.peek(); + if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { + lineNo++; + headerStrings.add(lineIterator.next()); + } else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { + lineNo++; + headerStrings.add(lineIterator.next()); + this.header = parseHeaderFromLines(headerStrings, fileFormatVersion); + return this.header; + } + } + throw new TribbleException.InvalidHeader( + "The required header line (starting with one #) is missing in the input VCF file"); + } + + /** + * Read ahead one line to obtain and return the vcf header version for this file + * + * @param headerLineIterator + * @return VCFHeaderVersion for this file + * @throws TribbleException if no file format header line is found in the first line or, the version can't + * be handled by this codec + */ + protected VCFHeaderVersion readFormatVersionLine(final LineIterator headerLineIterator) { + if (headerLineIterator.hasNext()) { + final String headerVersionLine = headerLineIterator.next(); + if (headerVersionLine.startsWith(VCFHeader.METADATA_INDICATOR)) { + final VCFHeaderVersion vcfFileVersion = VCFHeaderVersion.fromHeaderVersionLine(headerVersionLine); + if (!canDecodeVersion(vcfFileVersion)) { + throw new TribbleException.InvalidHeader( + String.format("The \"(%s)\" codec 
does not support VCF version: %s", getName(), vcfFileVersion)); + } else { + return vcfFileVersion; + } + } + } + throw new TribbleException.InvalidHeader("The VCF version header line is missing"); + } /** * create a VCF header from a set of header record lines @@ -135,231 +189,353 @@ public LazyGenotypesContext.LazyData parse(final Object data) { * @param headerStrings a list of strings that represent all the ## and # entries * @return a VCFHeader object */ - protected VCFHeader parseHeaderFromLines( final List headerStrings, final VCFHeaderVersion version ) { - this.version = version; - - Set metaData = new LinkedHashSet(); - Set sampleNames = new LinkedHashSet(); + protected VCFHeader parseHeaderFromLines( final List headerStrings, final VCFHeaderVersion sourceVersion ) { + final Set metaData = new LinkedHashSet<>(); + Set sampleNames = new LinkedHashSet<>(); int contigCounter = 0; - // iterate over all the passed in strings - for ( String str : headerStrings ) { - if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) { - String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR); - if ( strings.length < VCFHeader.HEADER_FIELDS.values().length ) - throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str); - - int arrayIndex = 0; - for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { - try { - if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex])) - throw new TribbleException.InvalidHeader("we were expecting column name '" + field + "' but we saw '" + strings[arrayIndex] + "'"); - } catch (IllegalArgumentException e) { - throw new TribbleException.InvalidHeader("unknown column name '" + strings[arrayIndex] + "'; it does not match a legal column header name."); - } - arrayIndex++; - } - - boolean sawFormatTag = false; - if ( arrayIndex < strings.length ) { - if ( !strings[arrayIndex].equals("FORMAT") ) - throw new TribbleException.InvalidHeader("we were expecting column 
name 'FORMAT' but we saw '" + strings[arrayIndex] + "'"); - sawFormatTag = true; - arrayIndex++; - } - - while ( arrayIndex < strings.length ) - sampleNames.add(strings[arrayIndex++]); - - if ( sawFormatTag && sampleNames.isEmpty()) - throw new TribbleException.InvalidHeader("The FORMAT field was provided but there is no genotype/sample data"); - - // If we're performing sample name remapping and there is exactly one sample specified in the header, replace - // it with the remappedSampleName. Throw an error if there are 0 or multiple samples and remapping was requested - // for this file. - if ( remappedSampleName != null ) { - // We currently only support on-the-fly sample name remapping for single-sample VCFs - if ( sampleNames.isEmpty() || sampleNames.size() > 1 ) { - throw new TribbleException(String.format("Cannot remap sample name to %s because %s samples are specified in the VCF header, and on-the-fly sample name remapping is only supported for single-sample VCFs", - remappedSampleName, sampleNames.isEmpty() ? 
"no" : "multiple")); - } - - sampleNames.clear(); - sampleNames.add(remappedSampleName); - } + for ( String headerLine : headerStrings ) { + if ( !headerLine.startsWith(VCFHeader.METADATA_INDICATOR) ) { + sampleNames = parsePrimaryHeaderLine(headerLine); } else { - if ( str.startsWith(VCFConstants.INFO_HEADER_START) ) { - final VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7), version); - metaData.add(info); - } else if ( str.startsWith(VCFConstants.FILTER_HEADER_START) ) { - final VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9), version); - metaData.add(filter); - } else if ( str.startsWith(VCFConstants.FORMAT_HEADER_START) ) { - final VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9), version); - metaData.add(format); - } else if ( str.startsWith(VCFConstants.CONTIG_HEADER_START) ) { - final VCFContigHeaderLine contig = new VCFContigHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), contigCounter++); - metaData.add(contig); - } else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) { - metaData.add(getAltHeaderLine(str.substring(VCFConstants.ALT_HEADER_OFFSET), version)); - } else if ( str.startsWith(VCFConstants.PEDIGREE_HEADER_START) && version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - // only model pedigree header lines as structured header lines starting with v4.3 - metaData.add(getPedigreeHeaderLine(str.substring(VCFConstants.PEDIGREE_HEADER_OFFSET), version)); - } else if ( str.startsWith(VCFConstants.META_HEADER_START) ) { - metaData.add(getMetaHeaderLine(str.substring(VCFConstants.META_HEADER_OFFSET), version)); - } else if ( str.startsWith(VCFConstants.SAMPLE_HEADER_START) ) { - metaData.add(getSampleHeaderLine(str.substring(VCFConstants.SAMPLE_HEADER_OFFSET), version)); + if ( headerLine.startsWith(VCFConstants.INFO_HEADER_START) ) { + metaData.add(getInfoHeaderLine(headerLine.substring(VCFConstants.INFO_HEADER_OFFSET), sourceVersion)); + } else if ( 
headerLine.startsWith(VCFConstants.FILTER_HEADER_START) ) { + metaData.add(getFilterHeaderLine(headerLine.substring(VCFConstants.FILTER_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.FORMAT_HEADER_START) ) { + metaData.add(getFormatHeaderLine(headerLine.substring(VCFConstants.FORMAT_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.CONTIG_HEADER_START) ) { + metaData.add(getContigHeaderLine(headerLine.substring(VCFConstants.CONTIG_HEADER_OFFSET), sourceVersion, contigCounter++)); + } else if ( headerLine.startsWith(VCFConstants.ALT_HEADER_START) ) { + metaData.add(getAltHeaderLine(headerLine.substring(VCFConstants.ALT_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.PEDIGREE_HEADER_START) ) { + metaData.add(getPedigreeHeaderLine(headerLine.substring(VCFConstants.PEDIGREE_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.META_HEADER_START) ) { + metaData.add(getMetaHeaderLine(headerLine.substring(VCFConstants.META_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.SAMPLE_HEADER_START) ) { + metaData.add(getSampleHeaderLine(headerLine.substring(VCFConstants.SAMPLE_HEADER_OFFSET), sourceVersion)); } else { - int equals = str.indexOf('='); - if ( equals != -1 ) - metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1))); + final VCFHeaderLine otherHeaderLine = getOtherHeaderLine( + headerLine.substring(VCFHeader.METADATA_INDICATOR.length()), + sourceVersion); + if (otherHeaderLine != null) + metaData.add(otherHeaderLine); } } } - - setVCFHeader(new VCFHeader(version, metaData, sampleNames), version); - return this.header; + // return the header that is returned by setVCFHeader, since it may be different than the + // one we create here since setVCFHeader calls + // {@link VCFStandardHeaderLines#repairStandardHeaderLines(VCFHeader)}, which can create an + // entirely new "repaired" 
header. + final VCFHeader vcfHeader = new VCFHeader(metaData, sampleNames); + return setVCFHeader(vcfHeader); } /** - * @return the header that was either explicitly set on this codec, or read from the file. May be null. - * The returned value should not be modified. + * Create and return a VCFInfoHeader object from a header line string that conforms to the {@code sourceVersion} + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header + * line object should be validate for this header version. + * @return a VCFInfoHeaderLine object */ - public VCFHeader getHeader() { - return header; + protected VCFInfoHeaderLine getInfoHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + return new VCFInfoHeaderLine(headerLineString, sourceVersion); } /** - * @return the version number that was either explicitly set on this codec, or read from the file. May be null. + * Create and return a VCFFormatHeader object from a header line string that conforms to the {@code sourceVersion} + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header + * line object should be validate for this header version. + * @return a VCFFormatHeaderLine object */ - public VCFHeaderVersion getVersion() { - return version; + protected VCFFormatHeaderLine getFormatHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + return new VCFFormatHeaderLine(headerLineString, sourceVersion); } /** - * Explicitly set the VCFHeader on this codec. This will overwrite the header read from the file - * and the version state stored in this instance; conversely, reading the header from a file will - * overwrite whatever is set here. 
- * - * @param newHeader - * @param newVersion - * @return the actual header for this codec. The returned header may not be identical to the header - * argument since the header lines may be "repaired" (i.e., rewritten) if doOnTheFlyModifications is set. - * @throws TribbleException if the requested header version is not compatible with the existing version + * Create and return a VCFFilterHeaderLine object from a header line string that conforms to the {@code sourceVersion} + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header + * line object should be validate for this header version. + * @return a VCFFilterHeaderLine object */ - public VCFHeader setVCFHeader(final VCFHeader newHeader, final VCFHeaderVersion newVersion) { - validateHeaderVersionTransition(newHeader, newVersion); - if (this.doOnTheFlyModifications) { - final VCFHeader repairedHeader = VCFStandardHeaderLines.repairStandardHeaderLines(newHeader); - // validate the new header after repair to ensure the resulting header version is - // still compatible with the current version - validateHeaderVersionTransition(repairedHeader, newVersion); - this.header = repairedHeader; - } else { - this.header = newHeader; - } - - this.version = newVersion; - this.vcfTextTransformer = getTextTransformerForVCFVersion(newVersion); + protected VCFFilterHeaderLine getFilterHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + return new VCFFilterHeaderLine(headerLineString, sourceVersion); + } - return this.header; + /** + * Create and return a VCFContigHeaderLine object from a header line string that conforms to the {@code sourceVersion} + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion the VCF header version derived from which the source was retrieved. 
The resulting header + * line object should be valid for this header version. + * @return a VCFContigHeaderLine object + */ + protected VCFContigHeaderLine getContigHeaderLine( + final String headerLineString, + final VCFHeaderVersion sourceVersion, + final int contigIndex) { + return new VCFContigHeaderLine(headerLineString, sourceVersion, contigIndex); } /** * Create and return a VCFAltHeaderLine object from a header line string that conforms to the {@code sourceVersion} - * @param headerLineString VCF header line being parsed without the leading "##ALT=" + * @param headerLineString VCF header line being parsed without the leading "##" * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header * line object should be validate for this header version. * @return a VCFAltHeaderLine object */ - public VCFAltHeaderLine getAltHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + protected VCFAltHeaderLine getAltHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { return new VCFAltHeaderLine(headerLineString, sourceVersion); } /** * Create and return a VCFPedigreeHeaderLine object from a header line string that conforms to the {@code sourceVersion} - * @param headerLineString VCF header line being parsed without the leading "##PEDIGREE=" + * @param headerLineString VCF header line being parsed without the leading "##" * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header * line object should be validate for this header version. 
* @return a VCFPedigreeHeaderLine object + * + * NOTE:this can't return a VCFPedigreeHeaderLine since for pre-v4.3 PEDIGREE lines must be modeled as + * VCFHeaderLine due to the lack of a requirement for an ID field */ - public VCFPedigreeHeaderLine getPedigreeHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { - return new VCFPedigreeHeaderLine(headerLineString, sourceVersion); + protected VCFHeaderLine getPedigreeHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + if (sourceVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + return new VCFPedigreeHeaderLine(headerLineString, sourceVersion); + } else { + return new VCFHeaderLine(VCFConstants.PEDIGREE_HEADER_KEY, headerLineString); + } } /** * Create and return a VCFMetaHeaderLine object from a header line string that conforms to the {@code sourceVersion} - * @param headerLineString VCF header line being parsed without the leading "##META=" + * @param headerLineString VCF header line being parsed without the leading "##" * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header * line object should be validate for this header version. * @return a VCFMetaHeaderLine object */ - public VCFMetaHeaderLine getMetaHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + protected VCFMetaHeaderLine getMetaHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { return new VCFMetaHeaderLine(headerLineString, sourceVersion); } /** * Create and return a VCFSampleHeaderLine object from a header line string that conforms to the {@code sourceVersion} - * @param headerLineString VCF header line being parsed without the leading "##SAMPLE=" + * @param headerLineString VCF header line being parsed without the leading "##" * @param sourceVersion the VCF header version derived from which the source was retrieved. 
The resulting header * line object should be validate for this header version. * @return a VCFSampleHeaderLine object */ - public VCFSampleHeaderLine getSampleHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + protected VCFSampleHeaderLine getSampleHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { return new VCFSampleHeaderLine(headerLineString, sourceVersion); } /** - * the fast decode function - * @param line the line of text for the record - * @return a feature, (not guaranteed complete) that has the correct start and stop + * Create and return a header line that is not modeled by a specific VCFHeaderLine subclass, ie., its not + * a info/format/contig/alt/pedigree/meta/sample VCFHeaderLine. This may return either a VCFSimpleHeaderLine + * or a VCFHeaderLine. + * + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion VCFHeaderVersion being parsed + * @return a VCFHeaderLine */ - public Feature decodeLoc(String line) { - return decodeLine(line, false); + protected VCFHeaderLine getOtherHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + final int indexOfEquals = headerLineString.indexOf('='); + if (indexOfEquals < 1) { // must at least have "?=" + if (VCFUtils.isStrictVCFVersionValidation()) { + throw new TribbleException.InvalidHeader("Unrecognized metadata line type: " + headerLineString); + } + logger.warn("Dropping unrecognized VCFHeader metadata line type: " + headerLineString); + return null; + } + final String headerLineValue = headerLineString.substring(indexOfEquals + 1).trim(); + if (headerLineValue.startsWith("<") && headerLineValue.endsWith(">")) { + if (sourceVersion.isAtLeastAsRecentAs((VCFHeaderVersion.VCF4_3)) || headerLineString.contains(""), + // but which do not contain an ID attribute, i.e., GATK Funcotator uses v4.1 ClinVar test + // files with lines like that look like this: + // + // "ID=" + // 
+ // where the key is "ID", and no ID attribute is present + return new VCFHeaderLine(headerLineString.substring(0, indexOfEquals), headerLineString.substring(indexOfEquals + 1)); + } + } else { + return new VCFHeaderLine(headerLineString.substring(0, indexOfEquals), headerLineString.substring(indexOfEquals + 1)); + } + } + + // Parse the primary header line of the form: + // + // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ... + // + // The string passed in is the first non-metadata line we've seen, so it should conform. + // + private Set parsePrimaryHeaderLine(final String headerLine) { + final Set sampleNames = new LinkedHashSet<>(); + + final String[] columns = headerLine.substring(1).split(VCFConstants.FIELD_SEPARATOR); + if ( columns.length < VCFHeader.HEADER_FIELDS.values().length ) { + throw new TribbleException.InvalidHeader("not enough columns present in header line: " + headerLine); + } + + int col = 0; + for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { + try { + if (field != VCFHeader.HEADER_FIELDS.valueOf(columns[col])) { + throw new TribbleException.InvalidHeader("expected column headerLineID '" + field + "' but saw '" + columns[col] + "'"); + } + } catch (IllegalArgumentException e) { + throw new TribbleException.InvalidHeader("column headerLineID '" + columns[col] + "' is not a legal column header headerLineID."); + } + col++; + } + + boolean sawFormatTag = false; + if ( col < columns.length ) { + if ( !columns[col].equals("FORMAT") ) + throw new TribbleException.InvalidHeader("expected column headerLineID 'FORMAT' but saw '" + columns[col] + "'"); + sawFormatTag = true; + col++; + } + + while ( col < columns.length ) { + // Sample names must be unique + if (sampleNames.contains(columns[col])) { + throw new TribbleException.InvalidHeader("duplicate sample name: " + columns[col]); + } else { + sampleNames.add(columns[col++]); + } + } + + if ( sawFormatTag && sampleNames.isEmpty()) + throw new 
TribbleException.InvalidHeader("The FORMAT field was provided but there is no genotype/sample data"); + + // If we're performing sample name remapping and there is exactly one sample specified in the header, replace + // it with the remappedSampleName. Throw an error if there are 0 or multiple samples and remapping was requested + // for this file. + if ( remappedSampleName != null ) { + // We currently only support on-the-fly sample name remapping for single-sample VCFs + if ( sampleNames.isEmpty() || sampleNames.size() > 1 ) { + throw new TribbleException( + String.format("Cannot remap sample headerLineID to %s because %s samples are specified in the VCF header, " + + "and on-the-fly sample headerLineID remapping is only supported for single-sample VCFs", + remappedSampleName, sampleNames.isEmpty() ? "no" : "multiple")); + } + + sampleNames.clear(); + sampleNames.add(remappedSampleName); + } + + return sampleNames; } /** - * decode the line into a feature (VariantContext) - * @param line the line - * @return a VariantContext + * @return the header that was either explicitly set on this codec, or read from the file. May be null. + * The returned value should not be modified. */ - @Override - public VariantContext decode(String line) { - return decodeLine(line, true); + public VCFHeader getHeader() { + return header; } /** - * Throw if new a version/header are not compatible with the existing version/header. Generally, any version - * before v4.2 can be up-converted to v4.2, but not to v4.3. Once a header is established as v4.3, it cannot - * can not be up or down converted, and it must remain at v4.3. - * @param newHeader - * @param newVersion - * @throws TribbleException if the header conversion is not valid + * @return the version number that was either explicitly set on this codec, or read from the file. May be null. 
*/ - private void validateHeaderVersionTransition(final VCFHeader newHeader, final VCFHeaderVersion newVersion) { + public VCFHeaderVersion getVersion() { + return version; + } + + @Deprecated // starting after version 2.24.1 + //Note: this is currently used by Disq + public VCFHeader setVCFHeader(final VCFHeader newHeader, final VCFHeaderVersion newVersion) { ValidationUtils.nonNull(newHeader); ValidationUtils.nonNull(newVersion); + ValidationUtils.validateArg( + newHeader.getVCFHeaderVersion().equals(newVersion), + "new version must equal the newHeader's version"); + return setVCFHeader(newHeader); + } - VCFHeader.validateVersionTransition(version, newVersion); - - // If this codec currently has no header (this happens when the header is being established for - // the first time during file parsing), establish an initial header and version, and bypass - // validation. - if (header != null && newHeader.getVCFHeaderVersion() != null) { - VCFHeader.validateVersionTransition(header.getVCFHeaderVersion(), newHeader.getVCFHeaderVersion()); + /** + * Set the VCFHeader for this codec. The final header may be a complete replacement for the + * provided input header, since header lines may be "repaired" (upgraded to vcf v4.2) if + * doOnTheFlyModifications is set. See + * {@link VCFStandardHeaderLines#repairStandardHeaderLines(VCFHeader)}. + * + * @param newHeader the new header to be used by this codec + * @return the actual header that is established for this codec. See {@link + * VCFStandardHeaderLines#repairStandardHeaderLines(VCFHeader)}. 
+ */ + public VCFHeader setVCFHeader(final VCFHeader newHeader) { + ValidationUtils.nonNull(newHeader); + final VCFHeaderVersion originalVersion = newHeader.getVCFHeaderVersion(); + + switch(this.policy) { + case DO_NOT_UPGRADE: + this.header = newHeader; + break; + case ONLY_INFALLIBLE_UPGRADE: + // Upgrade pre-4.3 versions to 4.2, and keep 4.3 at 4.3 + // calling this with a header that has any pre-v4.3 version will always result in a header + // with version vcfV4.2, no matter what the header version originally was, since the "repair" + // operation is essentially a transform of the header so that it conforms with header line rules + // as of 4.2 + this.header = VCFStandardHeaderLines.repairStandardHeaderLines(newHeader); + break; + case UPGRADE_OR_FAIL: + case UPGRADE_OR_FALLBACK: + this.header = VCFStandardHeaderLines.repairStandardHeaderLines(newHeader); + final Collection> errors = this.header.getValidationErrors(VCFHeader.DEFAULT_VCF_VERSION); + if (!errors.isEmpty()) { + final String message = String.format( + "Version transition from VCF version %s to %s failed with validation error(s):\n%s%s", + originalVersion.getVersionString(), VCFHeader.DEFAULT_VCF_VERSION.getVersionString(), + errors.stream() + .limit(5) + .map(VCFValidationFailure::getSourceMessage) + .collect(Collectors.joining("\n")), + errors.size() > 5 ? 
"\n+ " + (errors.size() - 5) + " additional error(s)" : "" + ); + if (this.policy == VCFVersionUpgradePolicy.UPGRADE_OR_FAIL) { + throw new TribbleException(message); + } else { + logger.info(message + ", header will be kept at original version: " + originalVersion.getVersionString()); + } + } else { + // Only upgrade if no errors resulting from version upgrading would occur + this.header.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + } + break; + default: + throw new TribbleException("Unrecognized VCF Version Upgrade Policy: " + this.policy); } + + this.version = this.header.getVCFHeaderVersion(); + return this.header; + } + + /** + * the fast decode function + * @param line the line of text for the record + * @return a feature, (not guaranteed complete) that has the correct start and stop + */ + public Feature decodeLoc(String line) { + return decodeLine(line, false); } /** - * For v4.3 up, attribute values can contain embedded percent-encoded characters which must be decoded - * on read. Return a version-aware text transformer that can decode encoded text. - * @param targetVersion the version for which a transformer is bing requested - * @return a {@link VCFTextTransformer} suitable for the targetVersion + * decode the line into a feature (VariantContext) + * @param line the line + * @return a VariantContext */ - private VCFTextTransformer getTextTransformerForVCFVersion(final VCFHeaderVersion targetVersion) { - return targetVersion != null && targetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3) ? 
- percentEncodingTextTransformer : - passThruTextTransformer; + @Override + public VariantContext decode(String line) { + return decodeLine(line, true); } private VariantContext decodeLine(final String line, final boolean includeGenotypes) { @@ -392,6 +568,7 @@ private VariantContext decodeLine(final String line, final boolean includeGenoty */ private VariantContext parseVCFLine(final String[] parts, final boolean includeGenotypes) { VariantContextBuilder builder = new VariantContextBuilder(); + builder.version(version); builder.source(getName()); // increment the line count @@ -421,7 +598,7 @@ else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) final String alts = parts[4]; builder.log10PError(parseQual(parts[5])); - final List filters = parseFilters(getCachedString(parts[6])); + final Set filters = parseFilters(getCachedString(parts[6])); if ( filters != null ) { builder.filters(new HashSet<>(filters)); } @@ -432,7 +609,7 @@ else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) // update stop with the end key if provided try { builder.stop(Integer.parseInt(attrs.get(VCFConstants.END_KEY).toString())); - } catch (Exception e) { + } catch (NumberFormatException e) { generateException("the END value in the INFO field is not valid"); } } else { @@ -499,20 +676,64 @@ protected String getCachedString(String str) { return internedString; } + /** + * parse the filter string, first checking to see if we already have parsed it in a previous attempt + * @param filterString the string to parse + * @return a set of the filters applied + */ + protected Set parseFilters(final String filterString) { + // null for unfiltered + if ( filterString.equals(VCFConstants.UNFILTERED) ) + return null; + + if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) + return Collections.emptySet(); + if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) + generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter headerLineID in vcf4", lineNo); + if 
(filterString.isEmpty()) + generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); + + // do we have the filter string cached? + if ( filterHash.containsKey(filterString) ) + return filterHash.get(filterString); + + // empty set for passes filters + final Set fFields = new HashSet<>(); + // otherwise we have to parse and cache the value + if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) + fFields.add(filterString); + else { + // Variant context uses a Set to store these, so duplicates have historically been + // dropped in previous versions. Delegate handling of warning for these to the + // specific codec subclass. + String[] filters = filterString.split(VCFConstants.FILTER_CODE_SEPARATOR); + for (int i = 0; i < filters.length; i++) { + if (!fFields.add(filters[i])) { + reportDuplicateFilterIDs(filters[i], lineNo); + } + } + } + + filterHash.put(filterString, Collections.unmodifiableSet(fFields)); + + return fFields; + } + /** * parse out the info fields * @param infoField the fields * @return a mapping of keys to objects */ - private Map parseInfo(String infoField) { - Map attributes = new HashMap(); + protected Map parseInfo(String infoField) { + Map attributes = new HashMap<>(); if ( infoField.isEmpty() ) generateException("The VCF specification requires a valid (non-zero length) info field"); if ( !infoField.equals(VCFConstants.EMPTY_INFO_FIELD) ) { - if ( infoField.indexOf('\t') != -1 || infoField.indexOf(' ') != -1 ) - generateException("The VCF specification does not allow for whitespace in the INFO field. Offending field value was \"" + infoField + "\""); + if ( infoField.indexOf('\t') != -1 ) { + generateException("The VCF specification does not allow for tab characters in the INFO field. 
Offending field value was \"" + infoField + "\""); + } List infoFields = ParsingUtils.split(infoField, VCFConstants.INFO_FIELD_SEPARATOR_CHAR); for (int i = 0; i < infoFields.size(); i++) { @@ -525,23 +746,23 @@ private Map parseInfo(String infoField) { String valueString = infoFields.get(i).substring(eqI + 1); // split on the INFO field separator - List infoValueSplit = ParsingUtils.split(valueString, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR); + final List infoValueSplit = ParsingUtils.split(valueString, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR); if ( infoValueSplit.size() == 1 ) { - value = vcfTextTransformer.decodeText(infoValueSplit.get(0)); + value = infoValueSplit.get(0); final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key); if ( headerLine != null && headerLine.getType() == VCFHeaderLineType.Flag && value.equals("0") ) { // deal with the case where a flag field has =0, such as DB=0, by skipping the add continue; } } else { - value = vcfTextTransformer.decodeText(infoValueSplit); + value = infoValueSplit; } } else { key = infoFields.get(i); final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key); if ( headerLine != null && headerLine.getType() != VCFHeaderLineType.Flag ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED && ! 
warnedAboutNoEqualsForNonFlag ) { - System.err.println("Found info key " + key + " without a = value, but the header says the field is of type " + if ( warnedAboutNoEqualsForNonFlag ) { + logger.warn("Found info key " + key + " without a = value, but the header says the field is of type " + headerLine.getType() + " but this construct is only value for FLAG type fields"); warnedAboutNoEqualsForNonFlag = true; } @@ -555,6 +776,10 @@ private Map parseInfo(String infoField) { // this line ensures that key/value pairs that look like key=; are parsed correctly as MISSING if ( "".equals(value) ) value = VCFConstants.MISSING_VALUE_v4; + if (attributes.containsKey(key)) { + reportDuplicateInfoKeyValue(key, infoField, lineNo); + } + attributes.put(key, value); } } @@ -562,6 +787,23 @@ private Map parseInfo(String infoField) { return attributes; } + /** + * Handle reporting of duplicate filter IDs + * + * @param duplicateFilterString the duplicate filter string + * @param lineNo line number of the offending line + */ + protected void reportDuplicateFilterIDs(final String duplicateFilterString, final int lineNo) {} + + /** + * Handle reporting of duplicate info line field values + * + * @param duplicateKey the key name of the field that is duplicated + * @param infoField the entire info field line + * @param lineNo line number of the offending line + */ + protected void reportDuplicateInfoKeyValue(final String duplicateKey, final String infoField, final int lineNo) { } + /** * create a an allele from an index and an array of alleles * @param index the index @@ -660,8 +902,12 @@ private static void checkAllele(String allele, boolean isRef, int lineNo) { if ( allele == null || allele.isEmpty() ) generateException(generateExceptionTextForBadAlleleBases(""), lineNo); - if ( GeneralUtils.DEBUG_MODE_ENABLED && MAX_ALLELE_SIZE_BEFORE_WARNING != -1 && allele.length() > MAX_ALLELE_SIZE_BEFORE_WARNING ) { - System.err.println(String.format("Allele detected with length %d exceeding max 
size %d at approximately line %d, likely resulting in degraded VCF processing performance", allele.length(), MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo)); + if ( MAX_ALLELE_SIZE_BEFORE_WARNING != -1 && allele.length() > MAX_ALLELE_SIZE_BEFORE_WARNING ) { + logger.warn(String.format( + "Allele detected with length %d exceeding max size %d at approximately line %d, " + + "likely resulting in degraded VCF processing performance", + allele.length(), MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo + )); } if (Allele.wouldBeSymbolicAllele(allele.getBytes())) { @@ -772,8 +1018,7 @@ public LazyGenotypesContext.LazyData createGenotypeMap(final String str, // cycle through the genotype strings boolean PlIsSet = false; for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) { - List genotypeValues = ParsingUtils.split(genotypeParts[genotypeOffset], VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); - genotypeValues = vcfTextTransformer.decodeText(genotypeValues); + final List genotypeValues = ParsingUtils.split(genotypeParts[genotypeOffset], VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); final String sampleName = sampleNameIterator.next(); final GenotypeBuilder gb = new GenotypeBuilder(sampleName); @@ -796,8 +1041,8 @@ public LazyGenotypesContext.LazyData createGenotypeMap(final String str, } else if ( missing ) { // if its truly missing (there no provided value) skip adding it to the attributes } else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) { - final List filters = parseFilters(getCachedString(genotypeValues.get(i))); - if ( filters != null ) gb.filters(filters); + final Set filters = parseFilters(getCachedString(genotypeValues.get(i))); + if ( filters != null ) gb.filters(new ArrayList<>(filters)); } else if ( genotypeValues.get(i).equals(VCFConstants.MISSING_VALUE_v4) ) { // don't add missing values to the map } else { @@ -847,8 +1092,8 @@ public LazyGenotypesContext.LazyData createGenotypeMap(final String str, } private static final int[] decodeInts(final String 
string) { - List split = ParsingUtils.split(string, ','); - int [] values = new int[split.size()]; + final List split = ParsingUtils.split(string, ','); + final int [] values = new int[split.size()]; try { for (int i = 0; i < values.length; i++) { values[i] = Integer.parseInt(split.get(i)); @@ -865,7 +1110,16 @@ private static final int[] decodeInts(final String string) { * raw VCF records */ public final void disableOnTheFlyModifications() { - doOnTheFlyModifications = false; + setVersionUpgradePolicy(VCFVersionUpgradePolicy.DO_NOT_UPGRADE); + } + + /** + * Forces all VCFCodecs to not perform any on the fly modifications to the VCF header + * of VCF records. Useful primarily for raw comparisons such as when comparing + * raw VCF records + */ + public final void setVersionUpgradePolicy(final VCFVersionUpgradePolicy policy) { + this.policy = policy; } /** @@ -880,11 +1134,11 @@ public void setRemappedSampleName( final String remappedSampleName ) { } protected void generateException(String message) { - throw new TribbleException(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); + throw new TribbleException(String.format("Failure parsing VCF file at (approximately) line number %d: %s", lineNo, message)); } protected static void generateException(String message, int lineNo) { - throw new TribbleException(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); + throw new TribbleException(String.format("Failure parsing VCF file at (approximately) line number %d: %s", lineNo, message)); } @Override diff --git a/src/main/java/htsjdk/variant/vcf/VCF3Codec.java b/src/main/java/htsjdk/variant/vcf/VCF3Codec.java index e9ca3abdf7..3c19a7f051 100644 --- a/src/main/java/htsjdk/variant/vcf/VCF3Codec.java +++ b/src/main/java/htsjdk/variant/vcf/VCF3Codec.java @@ -25,12 +25,9 @@ package htsjdk.variant.vcf; -import htsjdk.tribble.TribbleException; -import 
htsjdk.tribble.readers.LineIterator; - -import java.util.ArrayList; import java.util.Arrays; -import java.util.List; +import java.util.HashSet; +import java.util.Set; /** @@ -53,45 +50,19 @@ public class VCF3Codec extends AbstractVCFCodec { public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3"; /** - * @param reader the line reader to take header lines from - * @return the number of header lines + * Return true if this codec can handle the target version + * @param targetHeaderVersion + * @return true if this codec can handle this version */ @Override - public Object readActualHeader(final LineIterator reader) { - final List headerStrings = new ArrayList(); - - VCFHeaderVersion version = null; - boolean foundHeaderVersion = false; - while (reader.hasNext()) { - lineNo++; - final String line = reader.peek(); - if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { - final String[] lineFields = line.substring(2).split("="); - if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { - if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) - throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); - foundHeaderVersion = true; - version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); - if ( version != VCFHeaderVersion.VCF3_3 && version != VCFHeaderVersion.VCF3_2 ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv3 and does not support " + lineFields[1]); - } - headerStrings.add(reader.next()); - } - else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { - if (!foundHeaderVersion) { - throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); - } - headerStrings.add(reader.next()); - return super.parseHeaderFromLines(headerStrings, version); - } - else { - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - } - throw new 
TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); + public boolean canDecodeVersion(final VCFHeaderVersion targetHeaderVersion) { + return targetHeaderVersion == VCFHeaderVersion.VCF3_3 || targetHeaderVersion == VCFHeaderVersion.VCF3_2; } + @Override + public boolean canDecode(final String potentialInputFile) { + return canDecodeFile(potentialInputFile, VCF3_MAGIC_HEADER); + } /** * parse the filter string, first checking to see if we already have parsed it in a previous attempt @@ -99,24 +70,24 @@ else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { * @return a set of the filters applied */ @Override - protected List parseFilters(String filterString) { + protected Set parseFilters(String filterString) { // null for unfiltered if ( filterString.equals(VCFConstants.UNFILTERED) ) return null; // empty set for passes filters - List fFields = new ArrayList(); + HashSet fFields = new HashSet<>(); if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - return new ArrayList(fFields); + return new HashSet<>(fFields); if (filterString.isEmpty()) generateException("The VCF specification requires a valid filter status"); // do we have the filter string cached? 
if ( filterHash.containsKey(filterString) ) - return new ArrayList(filterHash.get(filterString)); + return new HashSet<>(filterHash.get(filterString)); // otherwise we have to parse and cache the value if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) @@ -130,7 +101,13 @@ protected List parseFilters(String filterString) { } @Override - public boolean canDecode(final String potentialInput) { - return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER); + protected void reportDuplicateFilterIDs(final String duplicateFilterString, final int lineNo) { + // no-op since this codec historically doesn't report duplicates } + + @Override + protected void reportDuplicateInfoKeyValue(final String duplicateKey, final String infoField, final int lineNo) { + // no-op since this codec historically doesn't report duplicates + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java index 71c4850f07..37ac9874e9 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java @@ -1,5 +1,7 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; + import java.util.*; /** @@ -7,16 +9,46 @@ */ public class VCFAltHeaderLine extends VCFSimpleHeaderLine { private static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFHeader.class); private static List expectedTags = Collections.unmodifiableList( new ArrayList(2) {{ - add(ID_ATTRIBUTE); - add(DESCRIPTION_ATTRIBUTE); - }} + add(ID_ATTRIBUTE); + add(DESCRIPTION_ATTRIBUTE); + }} ); public VCFAltHeaderLine(final String line, final VCFHeaderVersion version) { - super(VCFConstants.ALT_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTags)); + // Honor the requested version to choose the parser, and let validateForVersion figure out + // whether that version is valid for this line (for example, if this is called with a pre-4.0 version) + 
super(VCFConstants.ALT_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, expectedTags)); + validateForVersion(version); } + public VCFAltHeaderLine(final String id, final String description) { + super(VCFConstants.ALT_HEADER_KEY, + new LinkedHashMap() {{ + put(ID_ATTRIBUTE, id); + put(DESCRIPTION_ATTRIBUTE, description); + }} + ); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + //TODO: Should we validate/constrain these to match the 4.3 spec constraints ? + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0)) { + final VCFValidationFailure validationFailure = new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("%s header lines are not allowed in VCF version %s headers", getKey(), vcfTargetVersion)); + if (VCFUtils.isStrictVCFVersionValidation()) { + return Optional.of(validationFailure); + } else { + logger.warn(validationFailure.getFailureMessage()); + } + } + + return super.getValidationFailure(vcfTargetVersion); + } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFCodec.java b/src/main/java/htsjdk/variant/vcf/VCFCodec.java index 42f07150d1..3ebf47c02a 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCodec.java @@ -1,6 +1,6 @@ /* * Copyright (c) 2012 The Broad Institute -* +* * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without @@ -25,17 +25,10 @@ package htsjdk.variant.vcf; -import htsjdk.tribble.TribbleException; -import htsjdk.tribble.readers.LineIterator; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.LinkedList; -import java.util.List; +import java.util.*; /** - * A feature codec for the VCF 4 specification + * A feature codec for the VCF 4.0, 4.1, 4.2, and 4.3 specification versions * *

    * VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a @@ -45,7 +38,7 @@ * of related samples. Recently the format for storing next-generation read alignments has been * standardised by the SAM/BAM file format specification. This has significantly improved the * interoperability of next-generation tools for alignment, visualisation, and variant calling. - * We propose the Variant Call Format (VCF) as a standarised format for storing the most prevalent + * We propose the Variant Call Format (VCF) as a standardised format for storing the most prevalent * types of sequence variation, including SNPs, indels and larger structural variants, together * with rich annotations. VCF is usually stored in a compressed manner and can be indexed for * fast data retrieval of variants from a range of positions on the reference genome. @@ -72,91 +65,55 @@ * @since 2010 */ public class VCFCodec extends AbstractVCFCodec { - // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying + // on VariantContext to do the validation of any contradictory (or malformed) record parameters. public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; /** - * Reads all of the header from the provided iterator, but no reads no further. 
- * @param lineIterator the line reader to take header lines from - * @return The parsed header + * Return true if this codec can handle the target version + * @param targetHeaderVersion + * @return true if this codec can handle this version */ @Override - public Object readActualHeader(final LineIterator lineIterator) { - final List headerStrings = new ArrayList(); - - String line; - boolean foundHeaderVersion = false; - while (lineIterator.hasNext()) { - line = lineIterator.peek(); - lineNo++; - if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { - final String[] lineFields = line.substring(2).split("="); - if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { - if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) - throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); - foundHeaderVersion = true; - version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); - if ( ! version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]); - if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 && version != VCFHeaderVersion.VCF4_3) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]); - } - headerStrings.add(lineIterator.next()); - } - else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { - if (!foundHeaderVersion) { - throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); - } - headerStrings.add(lineIterator.next()); - super.parseHeaderFromLines(headerStrings, version); - return this.header; - } - else { - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - } - throw new TribbleException.InvalidHeader("We never saw the required CHROM 
header line (starting with one #) for the input VCF file"); + public boolean canDecodeVersion(final VCFHeaderVersion targetHeaderVersion) { + return targetHeaderVersion == VCFHeaderVersion.VCF4_0 || + targetHeaderVersion == VCFHeaderVersion.VCF4_1 || + targetHeaderVersion == VCFHeaderVersion.VCF4_2 || + targetHeaderVersion == VCFHeaderVersion.VCF4_3; } - /** - * parse the filter string, first checking to see if we already have parsed it in a previous attempt - * - * @param filterString the string to parse - * @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF) - */ @Override - protected List parseFilters(final String filterString) { - // null for unfiltered - if ( filterString.equals(VCFConstants.UNFILTERED) ) - return null; - - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) - return Collections.emptyList(); - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo); - if (filterString.isEmpty()) - generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); - - // do we have the filter string cached? 
- if ( filterHash.containsKey(filterString) ) - return filterHash.get(filterString); - - // empty set for passes filters - final List fFields = new LinkedList(); - // otherwise we have to parse and cache the value - if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) - fFields.add(filterString); - else - fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); - - filterHash.put(filterString, Collections.unmodifiableList(fFields)); + public boolean canDecode(final String potentialInput) { + return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER); + } - return fFields; + @Override + protected void reportDuplicateFilterIDs(final String duplicateFilterString, final int lineNo) { + // older versions of htsjdk have been silently dropping these for a while, but we can at least warn + logger.warn(String.format("Duplicate filter %s found on line %d", duplicateFilterString, lineNo)); } @Override - public boolean canDecode(final String potentialInput) { - return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER); + protected void reportDuplicateInfoKeyValue(final String duplicateKey, final String infoField, final int lineNo) { + logger.warn(String.format("Duplicate key %s found in %s on line %d", duplicateKey, infoField, lineNo)); } + + /** + * parse out the info fields + * @param infoField the fields + * @return a mapping of keys to objects + */ + protected Map parseInfo(String infoField) { + if (infoField.indexOf(' ') != -1) { + generateException( + String.format("Whitespace is not allowed in the INFO field in VCF version %s: %s", + version == null ? 
+ "unknown" : + version.getVersionString(), + infoField) + ); + } + return super.parseInfo(infoField); + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java index f955a434e1..028798757a 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java @@ -25,60 +25,187 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; import htsjdk.tribble.TribbleException; -import htsjdk.variant.utils.GeneralUtils; +import htsjdk.utils.ValidationUtils; import htsjdk.variant.variantcontext.GenotypeLikelihoods; import htsjdk.variant.variantcontext.VariantContext; +import java.util.Optional; +import java.util.function.BiFunction; +import java.util.regex.Pattern; + import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; /** - * a base class for compound header lines, which include info lines and format lines (so far) + * Abstract base class for compound header lines, which include INFO lines and FORMAT lines. + * + * Compound header lines are distinguished only in that are required to have TYPE and NUMBER attributes + * (VCFHeaderLineCount, a VCFHeaderLineType, and a count). 
*/ -public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine { +public abstract class VCFCompoundHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; + protected static final Log logger = Log.getInstance(VCFCompoundHeaderLine.class); + + // regex pattern corresponding to legal info/format field keys + protected static final Pattern VALID_HEADER_ID_PATTERN = Pattern.compile("^[A-Za-z_][0-9A-Za-z_.]*$"); + protected static final String UNBOUND_DESCRIPTION = "Not provided in original VCF header"; + + protected static final String NUMBER_ATTRIBUTE = "Number"; + protected static final String TYPE_ATTRIBUTE = "Type"; + + // List of expected tags that have a predefined order (used by the parser to verify order only). The + // header line class itself should verify that all required tags are present. + protected static final List expectedTagOrder = Collections.unmodifiableList( + new ArrayList(4) {{ + add(ID_ATTRIBUTE); + add(NUMBER_ATTRIBUTE); + add(TYPE_ATTRIBUTE); + add(DESCRIPTION_ATTRIBUTE); + }} + ); + + // immutable, cached binary representations of compound header line attributes + private final VCFHeaderLineType type; + private final VCFHeaderLineCount countType; + private final int count; - public enum SupportedHeaderLineType { - INFO(true), FORMAT(false); + /** + * create a VCF compound header line with count type = VCFHeaderLineCount.INTEGER + * + * @param key the key (header line type) for this header line + * @param headerLineID the id for this header line + * @param count the count for this header line, sets the count type to VCFHeaderLineCount.INTEGER + * @param type the type for this header line + * @param description the description for this header line + */ + protected VCFCompoundHeaderLine( + final String key, + final String headerLineID, + final int count, + final VCFHeaderLineType type, + final String description) + { + this(key, createAttributeMap(headerLineID, 
VCFHeaderLineCount.INTEGER, count, type, description), VCFHeader.DEFAULT_VCF_VERSION); + } - public final boolean allowFlagValues; - SupportedHeaderLineType(boolean flagValues) { - allowFlagValues = flagValues; - } + /** + * create a VCF compound header line + * + * @param key the key (header line type) for this header line + * @param headerLineID the id for this header line + * @param countType the count type for this header line + * @param type the type for this header line + * @param description the description for this header line + */ + protected VCFCompoundHeaderLine( + final String key, + final String headerLineID, + final VCFHeaderLineCount countType, + final VCFHeaderLineType type, + final String description) { + this(key, createAttributeMap(headerLineID, countType, VCFHeaderLineCount.VARIABLE_COUNT, type, description), VCFHeader.DEFAULT_VCF_VERSION); } - // the field types - private String name; - private int count = -1; - private VCFHeaderLineCount countType; - private String description; - private VCFHeaderLineType type; - private String source; - private String version; + /** + * create a VCF compound header line from an attribute map + * + * @param key the key (header line type) for this header line + * @param mapping the header line attribute map + * @param vcfVersion the VCF header version; must not be null + */ + protected VCFCompoundHeaderLine(final String key, final Map mapping, final VCFHeaderVersion vcfVersion) { + super(key, mapping); + ValidationUtils.nonNull(vcfVersion); + + this.type = decodeLineType(getGenericFieldValue(TYPE_ATTRIBUTE)); + final String countString = getGenericFieldValue(NUMBER_ATTRIBUTE); + this.countType = decodeCountType(countString, vcfVersion); + this.count = decodeCount(countString, this.countType); + validateForVersion(vcfVersion); + } + + /** + * Return the description for this header line.
+ * @return the header line's description + */ + public String getDescription() { + final String description = getGenericFieldValue(DESCRIPTION_ATTRIBUTE); + return description == null ? + UNBOUND_DESCRIPTION : + description; + } - // access methods - @Override - public String getID() { return name; } - public String getDescription() { return description; } public VCFHeaderLineType getType() { return type; } + public VCFHeaderLineCount getCountType() { return countType; } - public boolean isFixedCount() { return countType == VCFHeaderLineCount.INTEGER; } + + /** + * @return true if this header line has a fixed integer count type ({@link #getCountType()} + * equals {@link VCFHeaderLineCount#INTEGER}) + */ + public boolean isFixedCount() { return countType.isFixedCount(); } + + /** + * @return the integer count for this header line if the header has a fixed integer + * count type ({@link #isFixedCount()} is true). A TribbleException is thrown if the + * header line does not have a fixed integer count type ({@link #getCountType()} equals + * {@link VCFHeaderLineCount#INTEGER}). + * + * @throws TribbleException if the {@link VCFHeaderLineCount} is not a fixed integer + */ public int getCount() { - if (!isFixedCount()) - throw new TribbleException("Asking for header line count when type is not an integer"); + if (!isFixedCount()) { + throw new TribbleException("Header line count request when count type is not an integer"); + } return count; } public String getSource() { - return source; + return getGenericFieldValue(SOURCE_ATTRIBUTE); } public String getVersion() { - return version; + return getGenericFieldValue(VERSION_ATTRIBUTE); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + // The VCF 4.3 spec does not phrase this restriction as one on the form of the ID value of + // INFO/FORMAT lines but instead on the INFO/FORMAT fixed field key values (c.f. section 1.6.1). 
+ // However, the key values correspond to INFO/FORMAT header lines defining the attribute and its type, + // so we do the validation here + if (vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + final Optional> validationFailure = validateKeyOrID(getID()) + .map(e -> new VCFValidationFailure<>(vcfTargetVersion, this, e)); + if (validationFailure.isPresent()) { + // TODO thinking that these getValidationFailure should be a pure function and its caller + // decides whether to pass the error up or just log if not using strict validation + if (VCFUtils.isStrictVCFVersionValidation()) { + return validationFailure; + } else { + // warn for older versions - this line can't be used as a v4.3 line + logger.warn(validationFailure.get().getFailureMessage()); + } + } + } + + return super.getValidationFailure(vcfTargetVersion); + } + + /** + * @param id the candidate ID + * @return true if ID conforms to header line id requirements, otherwise false + */ + @Override + protected Optional validateKeyOrID(final String id) { + return VALID_HEADER_ID_PATTERN.matcher(id).matches() + ? Optional.empty() + : Optional.of(String.format("Key: %s does not match header line key regex: %s", id, VALID_HEADER_ID_PATTERN)); } /** @@ -113,278 +240,209 @@ public int getCount(final VariantContext vc) { } } - public void setNumberToUnbounded() { - countType = VCFHeaderLineCount.UNBOUNDED; - count = -1; - } - - // our type of line, i.e. format, info, etc - private final SupportedHeaderLineType lineType; - /** - * create a VCF format header line + * Specify annotation source + *

    + * This value is optional starting with VCFv4.2. * - * @param name the name for this header line - * @param count the count for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type + * @param source annotation source (case-insensitive, e.g. "dbsnp") */ - protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { - this(name, count, type, description, lineType, null, null); + @Deprecated // after 2.24.1 + public void setSource(final String source) { + updateGenericField(SOURCE_ATTRIBUTE, source); } /** - * create a VCF format header line + * Specify annotation version + *

    + * This value is optional starting with VCFv4.2. * - * @param name the name for this header line - * @param count the count type for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type + * @param version exact version (e.g. "138") */ - protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { - this(name, count, type, description, lineType, null, null); + @Deprecated // after version 2.24.1 + public void setVersion(final String version) { + updateGenericField(VERSION_ATTRIBUTE, version); } - /** - * create a VCF format header line - * - * @param name the name for this header line - * @param count the count for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type - * @param source annotation source (case-insensitive, e.g. "dbsnp") - * @param version exact version (e.g. 
"138") - */ - protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, String source, String version) { - super(lineType.toString(), ""); - this.name = name; - this.countType = VCFHeaderLineCount.INTEGER; - this.count = count; - this.type = type; - this.description = description; - this.lineType = lineType; - this.source = source; - this.version = version; - validate(); - } + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (!(o instanceof VCFCompoundHeaderLine)) return false; + if (!super.equals(o)) return false; - /** - * create a VCF format header line - * - * @param name the name for this header line - * @param count the count type for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type - * @param source annotation source (case-insensitive, e.g. "dbsnp") - * @param version exact version (e.g. 
"138") - */ - protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, String source, String version) { - super(lineType.toString(), ""); - this.name = name; - this.countType = count; - this.type = type; - this.description = description; - this.lineType = lineType; - this.source = source; - this.version = version; - validate(); + final VCFCompoundHeaderLine that = (VCFCompoundHeaderLine) o; + + if (count != that.count) return false; + if (type != that.type) return false; + return countType == that.countType; } - /** - * create a VCF format header line - * - * @param line the header line - * @param version the VCF header version - * @param lineType the header line type - * - */ - protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); + @Override + public int hashCode() { + int result = super.hashCode(); + result = 31 * result + type.hashCode(); + result = 31 * result + countType.hashCode(); + result = 31 * result + count; + return result; + } - final ArrayList expectedTags = new ArrayList(Arrays.asList("ID", "Number", "Type", "Description")); - final List recommendedTags; - if (version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) { - recommendedTags = Arrays.asList("Source", "Version"); + private VCFHeaderLineType decodeLineType(final String lineTypeString) { + if (lineTypeString == null) { + throw new TribbleException(String.format("A line type attribute is required for %s header lines", getKey())); } else { - recommendedTags = Collections.emptyList(); - } - final Map mapping = VCFHeaderLineTranslator.parseLine(version, line, expectedTags, recommendedTags); - name = mapping.get("ID"); - count = -1; - final String numberStr = mapping.get("Number"); - if (numberStr.equals(VCFConstants.PER_ALTERNATE_COUNT)) { - countType = VCFHeaderLineCount.A; - } else if 
(numberStr.equals(VCFConstants.PER_ALLELE_COUNT)) { - countType = VCFHeaderLineCount.R; - } else if (numberStr.equals(VCFConstants.PER_GENOTYPE_COUNT)) { - countType = VCFHeaderLineCount.G; - } else if ((version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) || - (!version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3))) { - countType = VCFHeaderLineCount.UNBOUNDED; - } else { - countType = VCFHeaderLineCount.INTEGER; - count = Integer.parseInt(numberStr); - - } - - if (count < 0 && countType == VCFHeaderLineCount.INTEGER) - throw new TribbleException.InvalidHeader("Count < 0 for fixed size VCF header field " + name); - - try { - type = VCFHeaderLineType.valueOf(mapping.get("Type")); - } catch (Exception e) { - throw new TribbleException(mapping.get("Type") + " is not a valid type in the VCF specification (note that types are case-sensitive)"); + try { + return VCFHeaderLineType.valueOf(lineTypeString); + } catch (IllegalArgumentException e) { + throw new TribbleException(String.format( + "\"%s\" is not a valid type for %s header lines (note that types are case-sensitive)", + lineTypeString, + getKey())); + } } - if (type == VCFHeaderLineType.Flag && !allowFlagValues()) - throw new IllegalArgumentException("Flag is an unsupported type for this kind of field at line - " + line); - - description = mapping.get("Description"); - if (description == null && ALLOW_UNBOUND_DESCRIPTIONS) // handle the case where there's no description provided - description = UNBOUND_DESCRIPTION; - - this.lineType = lineType; + } - if (version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) { - this.source = mapping.get("Source"); - this.version = mapping.get("Version"); + private VCFHeaderLineCount decodeCountType(final String countString, final VCFHeaderVersion vcfVersion) { + if (countString == null) { + throw new TribbleException.InvalidHeader( + String.format("A count type/value 
must be provided for %s header lines.", getID())); } - - validate(); + return VCFHeaderLineCount.decode(vcfVersion, countString); } - private void validate() { - if (type != VCFHeaderLineType.Flag && countType == VCFHeaderLineCount.INTEGER && count <= 0) - throw new IllegalArgumentException(String.format("Invalid count number, with fixed count the number should be 1 or higher: key=%s name=%s type=%s desc=%s lineType=%s count=%s", - getKey(), name, type, description, lineType, count)); - if (name == null || type == null || description == null || lineType == null) - throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s", - getKey(), name, type, description, lineType)); - if (name.contains("<") || name.contains(">")) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain angle brackets"); - if (name.contains("=")) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain an equals sign"); - - if (type == VCFHeaderLineType.Flag && count != 0) { - count = 0; - if (GeneralUtils.DEBUG_MODE_ENABLED) { - System.err.println("FLAG fields must have a count value of 0, but saw " + count + " for header line " + getID() + ". 
Changing it to 0 inside the code"); + private int decodeCount(final String countString, final VCFHeaderLineCount requestedCountType) { + int lineCount = VCFHeaderLineCount.VARIABLE_COUNT; + if (requestedCountType.isFixedCount()) { + if (countString == null) { + throw new TribbleException.InvalidHeader(String.format("Missing count value in VCF header field %s", getID())); + } + try { + lineCount = Integer.parseInt(countString); + } catch (NumberFormatException e) { + throw new TribbleException.InvalidHeader(String.format("Invalid count value %s in VCF header field %s", lineCount, getID())); + } + if (getType() == VCFHeaderLineType.Flag) { + if (lineCount != 0) { + // This check is here on behalf of INFO lines (which are the only header line type allowed to have Flag + // type). A Flag type with a count value other than 0 violates the spec (at least v4.2 and v4.3), but + // to retain backward compatibility with previous implementations, we accept (and repair) and the line here. + logger.warn(String.format("FLAG fields must have a count value of 0, but saw count %d for header line %s. A value of 0 will be used", + lineCount, + getID())); + updateGenericField(NUMBER_ATTRIBUTE, "0"); + lineCount = 0; + } + } else if (lineCount <= 0) { + throw new TribbleException.InvalidHeader( + String.format("Invalid count number %d for fixed count in header line with ID %s. 
For fixed count, the count number must be 1 or higher.", + lineCount, + getID())); } } + return lineCount; } - /** - * make a string representation of this header line - * @return a string representation - */ - @Override - protected String toStringEncoding() { - Map map = new LinkedHashMap(); - map.put("ID", name); - Object number; - switch (countType) { - case A: - number = VCFConstants.PER_ALTERNATE_COUNT; - break; - case R: - number = VCFConstants.PER_ALLELE_COUNT; - break; - case G: - number = VCFConstants.PER_GENOTYPE_COUNT; - break; - case UNBOUNDED: - number = VCFConstants.UNBOUNDED_ENCODING_v4; - break; - case INTEGER: - default: - number = count; - } - map.put("Number", number); - map.put("Type", type); - map.put("Description", description); - if (source != null) { - map.put("Source", source); - } - if (version != null) { - map.put("Version", version); - } - return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map); + // Create a backing attribute map out of VCFCompoundHeaderLine elements + private static LinkedHashMap createAttributeMap( + final String headerLineID, + final VCFHeaderLineCount countType, + final int count, + final VCFHeaderLineType type, + final String description) { + return new LinkedHashMap() { + { put(ID_ATTRIBUTE, headerLineID); } + { put(NUMBER_ATTRIBUTE, countType.encode(count)); } + { put(TYPE_ATTRIBUTE, type.encode()); } + { + // Handle the case where there's no description provided, ALLOW_UNBOUND_DESCRIPTIONS is the default + // note: if no description was provided, don't cache it, which means we don't round trip it + if (description != null) { + put(DESCRIPTION_ATTRIBUTE, description); + } + } + }; } /** - * returns true if we're equal to another compound header line - * @param o a compound header line - * @return true if equal + * Compare two VCFCompoundHeaderLine (FORMAT or INFO) lines to determine if they have compatible number types, + * and return a VCFCompoundHeaderLine that can be used to represent the result 
of merging these lines. In the + * case where the merged line requires "promoting" one of the types to the other, a new line of the appropriate + * type is created by calling the {@code compoundHeaderLineResolver} to produce new line of the correct + * subclass (INFO or FORMAT). + * + * @param line1 first line to merge + * @param line2 second line to merge + * @param conflictWarner conflict warning manager + * @param compoundHeaderLineResolver function that accepts two compound header lines of the same type (info or + * format, and returns a new header line representing the combination of the + * two input header lines + * @param type of VCFCompoundHeaderLine to merge (subclass of VCFCompoundHeaderLine) + * @return the merged line if one can be created */ - @Override - public boolean equals(final Object o) { - if ( this == o ) { - return true; + static T getMergedCompoundHeaderLine( + final T line1, + final T line2, + final VCFHeaderMerger.HeaderMergeConflictWarnings conflictWarner, + BiFunction compoundHeaderLineResolver) + { + ValidationUtils.nonNull(line1); + ValidationUtils.nonNull(line2); + ValidationUtils.validateArg(line1.getKey().equals(line2.getKey()) && line1.getID().equals(line2.getID()), + "header lines must have the same type to merge"); + T mergedLine = line1; + + if (!line1.equalsExcludingExtraAttributes(line2)) { + if (getCompoundLineDifferenceScore(line1, line2) > 1) { + // merge lines if they have zero or one mergeable differences, but if there are multiple + // differences, call the headers incompatible and bail, since we need to choose one line + // or the other as the merge line (we can't do generic field-level resolution) + throw new TribbleException( + String.format("Incompatible header merge, can't merge lines with multiple attribute differences %s/%s.", + line1, line2)); + } + if (line1.getType().equals(line2.getType())) { + // The lines have a common type. 
+ // The Number entry is an Integer that describes the number of values that can be + // included with the INFO field. For example, if the INFO field contains a single + // number, then this value should be 1. However, if the INFO field describes a pair + // of numbers, then this value should be 2 and so on. If the number of possible + // values varies, is unknown, or is unbounded, then this value should be '.'. + conflictWarner.warn("Promoting header field Number to . due to number differences in header lines: " + line1 + " " + line2); + mergedLine = compoundHeaderLineResolver.apply(line1, line2); + } else if (line1.getType() == VCFHeaderLineType.Integer && line2.getType() == VCFHeaderLineType.Float) { + // promote key to Float + conflictWarner.warn("Promoting Integer to Float in header: " + line2); + mergedLine = line2; + } else if (line1.getType() == VCFHeaderLineType.Float && line2.getType() == VCFHeaderLineType.Integer) { + // promote key to Float + conflictWarner.warn("Promoting Integer to Float in header: " + line2); + } else { + throw new IllegalStateException("Attempt to merge incompatible headers, can't merge these lines: " + line1 + " " + line2); + } } - if ( o == null || getClass() != o.getClass() || ! super.equals(o) ) { - return false; + if (!line1.getDescription().equals(line2.getDescription())) { + conflictWarner.warn("Allowing unequal description fields through: keeping " + line2 + " excluding " + line1); } - final VCFCompoundHeaderLine that = (VCFCompoundHeaderLine) o; - return equalsExcludingDescription(that) && - description.equals(that.description); - } - - @Override - public int hashCode() { - int result = super.hashCode(); - result = 31 * result + name.hashCode(); - result = 31 * result + count; - result = 31 * result + (countType != null ? 
countType.hashCode() : 0); // only nullable field according to validate() - result = 31 * result + description.hashCode(); - result = 31 * result + type.hashCode(); - result = 31 * result + lineType.hashCode(); - result = 31 * result + (source != null ? source.hashCode() : 0); - result = 31 * result + (version != null ? version.hashCode() : 0); - return result; + return mergedLine; } - public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) { + boolean equalsExcludingExtraAttributes(final VCFCompoundHeaderLine other) { return count == other.count && countType == other.countType && type == other.type && - lineType == other.lineType && - name.equals(other.name); - } - - public boolean sameLineTypeAndName(VCFCompoundHeaderLine other) { - return lineType == other.lineType && - name.equals(other.name); + getKey().equals(other.getKey()) && + getID().equals(other.getID()); } - /** - * do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true) - * @return true if we do, false otherwise - */ - abstract boolean allowFlagValues(); - - /** - * Specify annotation source - *

    - * This value is optional starting with VCFv4.2. - * - * @param source annotation source (case-insensitive, e.g. "dbsnp") - */ - public void setSource(final String source) { - this.source = source; - } - - /** - * Specify annotation version - *

    - * This value is optional starting with VCFv4.2. - * - * @param version exact version (e.g. "138") - */ - public void setVersion(final String version) { - this.version = version; + private static int getCompoundLineDifferenceScore(final T line1, final T line2) { + final int dataTypeDiffers = line1.getType().equals(line2.getType()) ? 0 : 1; // data type + final int countTypeDiffers = line1.getCountType().equals(line2.getCountType()) ? 0 : 1; // count type + // getCount is only valid if the getCountType==Integer + final int countDiffers = + (countTypeDiffers == 0 && + line1.getCountType().equals(VCFHeaderLineCount.INTEGER) && + line2.getCountType().equals(VCFHeaderLineCount.INTEGER) && + line1.getCount() != line2.getCount()) ? 1 : 0; + return dataTypeDiffers + countTypeDiffers + countDiffers; } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFConstants.java b/src/main/java/htsjdk/variant/vcf/VCFConstants.java index 64fdf2bc8e..11f12cf07c 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFConstants.java +++ b/src/main/java/htsjdk/variant/vcf/VCFConstants.java @@ -45,7 +45,7 @@ public final class VCFConstants { public static final String GENOTYPE_KEY = "GT"; public static final String GENOTYPE_POSTERIORS_KEY = "GP"; public static final String GENOTYPE_QUALITY_KEY = "GQ"; - public static final String GENOTYPE_ALLELE_DEPTHS = "AD"; //AD isn't reserved, but is specifically handled by VariantContext + public static final String GENOTYPE_ALLELE_DEPTHS = "AD"; //AD is now reserved public static final String GENOTYPE_PL_KEY = "PL"; // phred-scaled genotype likelihoods public static final String EXPECTED_ALLELE_COUNT_KEY = "EC"; @Deprecated public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; // log10 scaled genotype likelihoods @@ -86,14 +86,20 @@ public final class VCFConstants { public static final String PHASING_TOKENS = "/|\\"; // header lines - public static final String FILTER_HEADER_START = "##FILTER"; - public static final String FORMAT_HEADER_START = 
"##FORMAT"; - public static final String INFO_HEADER_START = "##INFO"; - public static final String ALT_HEADER_KEY = "ALT"; - public static final String ALT_HEADER_START = VCFHeader.METADATA_INDICATOR + ALT_HEADER_KEY ; - public static final String CONTIG_HEADER_KEY = "contig"; - public static final String CONTIG_HEADER_START = "##" + CONTIG_HEADER_KEY; + public static final String FILTER_HEADER_KEY = "FILTER"; + public static final String FILTER_HEADER_START = VCFHeader.METADATA_INDICATOR + FILTER_HEADER_KEY; + public static final int FILTER_HEADER_OFFSET = FILTER_HEADER_START.length() + 1; + + public static final String FORMAT_HEADER_KEY = "FORMAT"; + public static final String FORMAT_HEADER_START = VCFHeader.METADATA_INDICATOR + FORMAT_HEADER_KEY; + public static final int FORMAT_HEADER_OFFSET = FORMAT_HEADER_START.length() + 1; + + public static final String INFO_HEADER_KEY = "INFO"; + public static final String INFO_HEADER_START = VCFHeader.METADATA_INDICATOR + INFO_HEADER_KEY; + public static final int INFO_HEADER_OFFSET = INFO_HEADER_START.length() + 1; + public static final String ALT_HEADER_KEY = "ALT"; + public static final String ALT_HEADER_START = VCFHeader.METADATA_INDICATOR + ALT_HEADER_KEY; public static final int ALT_HEADER_OFFSET = ALT_HEADER_START.length() + 1; public static final String PEDIGREE_HEADER_KEY = "PEDIGREE"; @@ -108,6 +114,10 @@ public final class VCFConstants { public static final String META_HEADER_START = VCFHeader.METADATA_INDICATOR + META_HEADER_KEY; public static final int META_HEADER_OFFSET = META_HEADER_START.length() + 1; + public static final String CONTIG_HEADER_KEY = "contig"; + public static final String CONTIG_HEADER_START = VCFHeader.METADATA_INDICATOR + CONTIG_HEADER_KEY; + public static final int CONTIG_HEADER_OFFSET = CONTIG_HEADER_START.length() + 1; + // old indel alleles public static final char DELETION_ALLELE_v3 = 'D'; public static final char INSERTION_ALLELE_v3 = 'I'; diff --git 
a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java index 9ec50681b4..d8a19e2fa5 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java @@ -26,11 +26,14 @@ package htsjdk.variant.vcf; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.Log; import htsjdk.tribble.TribbleException; import java.util.Collections; import java.util.LinkedHashMap; import java.util.Map; +import java.util.Optional; +import java.util.regex.Pattern; /** * A special class representing a contig VCF header line. Knows the true contig order and sorts on that @@ -40,42 +43,111 @@ * @author mdepristo */ public class VCFContigHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFContigHeaderLine.class); + + final static Pattern VALID_CONTIG_ID_PATTERN = Pattern.compile("[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*"); final Integer contigIndex; + public static final String LENGTH_ATTRIBUTE = "length"; + public static final String ASSEMBLY_ATTRIBUTE = "assembly"; + public static final String MD5_ATTRIBUTE = "md5"; + public static final String URL_ATTRIBUTE = "URL"; + public static final String SPECIES_ATTRIBUTE = "species"; + /** * create a VCF contig header line * + * NOTE: This is retained for backward compatibility, but is deprecated and should not be used. 
+ * * @param line the header line * @param version the vcf header version * @param key the key for this header line + * @param contigIndex the contig index for this contig */ + @Deprecated // starting after version 2.24.1 public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, final String key, final int contigIndex) { - super(line, version, key, null, Collections.emptyList()); - if (contigIndex < 0) throw new TribbleException("The contig index is less than zero."); - this.contigIndex = contigIndex; + // deprecated because this constructor has a parameter to specify the key (??), but for + // contig lines the key has to be "contig" + this(line, version, contigIndex); + if (!VCFHeader.CONTIG_KEY.equals(key)) { + logger.warn(String.format( + "Found key \"%s\". The key for contig header lines must be %s.", + key, + VCFHeader.CONTIG_KEY)); + } + } + + /** + * create a VCF contig header line + * + * @param line the header line + * @param version the vcf header version + * @param contigIndex the contig index for this contig + */ + public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, final int contigIndex) { + this(VCFHeaderLineTranslator.parseLine( + version, line, Collections.singletonList(VCFSimpleHeaderLine.ID_ATTRIBUTE)), contigIndex); + if (!VCFHeader.CONTIG_KEY.equals(getKey())) { + logger.warn(String.format( + "Found key \"%s\". 
The key for contig header lines must be %s.", + getKey(), + VCFHeader.CONTIG_KEY)); + } + if (contigIndex < 0) { + throw new TribbleException(String.format("The contig index (%d) is less than zero.", contigIndex)); + } + validateForVersion(version); } public VCFContigHeaderLine(final Map mapping, final int contigIndex) { super(VCFHeader.CONTIG_KEY, mapping); - if (contigIndex < 0) throw new TribbleException("The contig index is less than zero."); + if (contigIndex < 0) { + throw new TribbleException(String.format("The contig index (%d) is less than zero.", contigIndex)); + } this.contigIndex = contigIndex; } - VCFContigHeaderLine(final SAMSequenceRecord sequenceRecord, final String assembly) { - // Using LinkedHashMap to preserve order of keys in contig line (ID, length, assembly) - super(VCFHeader.CONTIG_KEY, new LinkedHashMap() {{ - // Now inside an init block in an anon HashMap subclass - this.put("ID", sequenceRecord.getSequenceName()); - this.put("length", Integer.toString(sequenceRecord.getSequenceLength())); - if ( assembly != null ) this.put("assembly", assembly); - }}); - this.contigIndex = sequenceRecord.getSequenceIndex(); + /** + * Return a VCFContigHeaderLine representing a SAMSequenceRecord. + * + * NOTE: round-tripping between VCFContigHeaderLines and SAMSequenceRecords can be lossy since they + * don't necessarily have equivalent attributes, i.e., SAMSequenceRecord can have a species attribute + * that isn't defined by the VCF spec. 
+ * + * @return VCFContigHeaderLine for the SAMSequenceRecord + */ + public VCFContigHeaderLine(final SAMSequenceRecord sequenceRecord, final String assembly) { + // preserve order of keys in contig line (ID, length, assembly) + this(new LinkedHashMap() {{ + this.put(ID_ATTRIBUTE, sequenceRecord.getSequenceName()); + if (sequenceRecord.getSequenceLength() != 0) { + this.put(LENGTH_ATTRIBUTE, Integer.toString(sequenceRecord.getSequenceLength())); + } + if (assembly != null) { + if (!assembly.equals(sequenceRecord.getAssembly())) { + logger.warn(String.format( + "Inconsistent \"assembly\" attribute values found while creating VCFContigLine " + + "(with assembly \"%s\") from SAMSequenceRecord (with assembly \"%s\")", + assembly, + sequenceRecord.getAssembly())); + } + this.put(ASSEMBLY_ATTRIBUTE, assembly); + } + if (sequenceRecord.getMd5() != null) { + this.put(MD5_ATTRIBUTE, sequenceRecord.getMd5()); + } + if (sequenceRecord.getAttribute(SAMSequenceRecord.URI_TAG) != null) { + this.put(URL_ATTRIBUTE, sequenceRecord.getAttribute(SAMSequenceRecord.URI_TAG)); + } + if (sequenceRecord.getAttribute(SAMSequenceRecord.SPECIES_TAG) != null) { + this.put(SPECIES_ATTRIBUTE, sequenceRecord.getAttribute(SAMSequenceRecord.SPECIES_TAG)); + } + }}, + sequenceRecord.getSequenceIndex() + ); } - public Integer getContigIndex() { - return contigIndex; - } - /** * Get the SAMSequenceRecord that corresponds to this VCF header line. * If the VCF header line does not have a length tag, the SAMSequenceRecord returned will be set to have a length of @@ -85,20 +157,56 @@ public Integer getContigIndex() { * contig header line does not have a length. 
*/ public SAMSequenceRecord getSAMSequenceRecord() { - final String lengthString = this.getGenericFieldValue("length"); - final int length; - if (lengthString == null) { - length = SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH; + final String lengthString = this.getGenericFieldValue(LENGTH_ATTRIBUTE); + final int length; + if (lengthString == null) { + length = SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH; } else { - length = Integer.parseInt(lengthString); + length = Integer.parseInt(lengthString); + } + final SAMSequenceRecord record = new SAMSequenceRecord(this.getID(), length); + final String assemblyString = this.getGenericFieldValue(ASSEMBLY_ATTRIBUTE); + if (assemblyString != null) { + record.setAssembly(assemblyString); } - final SAMSequenceRecord record = new SAMSequenceRecord(this.getID(), length); - record.setAssembly(this.getGenericFieldValue("assembly")); - record.setSequenceIndex(this.contigIndex); - return record; + record.setSequenceIndex(this.contigIndex); + final String md5 = getGenericFieldValue(MD5_ATTRIBUTE); + if (md5 != null) { + record.setMd5(md5); + } + final String url = getGenericFieldValue(URL_ATTRIBUTE); + if (url != null) { + record.setAttribute(SAMSequenceRecord.URI_TAG, url); + } + final String species = getGenericFieldValue(SPECIES_ATTRIBUTE); + if (species != null) { + record.setSpecies(species); + } + return record; } @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + if (vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + if (!VALID_CONTIG_ID_PATTERN.matcher(getID()).matches()) { + return Optional.of(new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("Contig headerLine ID \"%s\" doesn't conform to contig ID restrictions", getID()))); + } + } + + return super.getValidationFailure(vcfTargetVersion); + } + + public Integer getContigIndex() { + return contigIndex; + } + + /** + * Note: this class has a natural ordering that is inconsistent with equals() + */ + 
@Override public boolean equals(final Object o) { if ( this == o ) { return true; @@ -120,6 +228,11 @@ public int hashCode() { /** * IT IS CRITICAL THAT THIS BE OVERRIDDEN SO WE SORT THE CONTIGS IN THE CORRECT ORDER + * + * NOTE: this class has a natural ordering that is inconsistent with equals(). This results + * in inconsistent behavior when these lines are used in the sets that are created/accepted + * by VCFHeader (ie., getMetaDataInSortedOrder will filter out VCFContigHeaderLines that are + * returned by getMetaDataInInputOrder or getContigheaderLines). */ @Override public int compareTo(final Object other) { diff --git a/src/main/java/htsjdk/variant/vcf/VCFEncoder.java b/src/main/java/htsjdk/variant/vcf/VCFEncoder.java index 9cffb45837..7be6e32de6 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFEncoder.java +++ b/src/main/java/htsjdk/variant/vcf/VCFEncoder.java @@ -15,7 +15,9 @@ import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; @@ -26,10 +28,7 @@ */ public class VCFEncoder { - /** - * The encoding used for VCF files: ISO-8859-1. When writing VCF4.3 is implemented, this should change to UTF-8. - */ - public static final Charset VCF_CHARSET = StandardCharsets.ISO_8859_1; + public static final Charset VCF_CHARSET = StandardCharsets.UTF_8; private static final String QUAL_FORMAT_STRING = "%.2f"; private static final String QUAL_FORMAT_EXTENSION_TO_TRIM = ".00"; @@ -41,6 +40,8 @@ public class VCFEncoder { private boolean outputTrailingFormatFields = false; + private final VCFTextTransformer vcfTextTransformer; + /** * Prepare a VCFEncoder that will encode records appropriate to the given VCF header, optionally * allowing missing fields in the header. 
@@ -52,6 +53,9 @@ public VCFEncoder(final VCFHeader header, final boolean allowMissingFieldsInHead this.header = header; this.allowMissingFieldsInHeader = allowMissingFieldsInHeader; this.outputTrailingFormatFields = outputTrailingFormatFields; + this.vcfTextTransformer = header.getVCFHeaderVersion().isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3) + ? new VCFPercentEncodedTextTransformer() + : new VCFPassThruTextTransformer(); } /** @@ -148,7 +152,7 @@ public void write(final Appendable vcfOutput, final VariantContext context) thro fieldIsMissingFromHeaderError(context, field.getKey(), "INFO"); } - final String outputValue = formatVCFField(field.getValue()); + final String outputValue = formatVCFField(field.getValue(), context.isFullyDecoded()); if (outputValue != null) { infoFields.put(field.getKey(), outputValue); } @@ -218,34 +222,71 @@ private void fieldIsMissingFromHeaderError(final VariantContext vc, final String } } - @SuppressWarnings("rawtypes") - String formatVCFField(final Object val) { - final String result; + String formatVCFField(final Object val, final boolean fullyDecoded) { if (val == null) { - result = VCFConstants.MISSING_VALUE_v4; + return VCFConstants.MISSING_VALUE_v4; } else if (val instanceof Double) { - result = formatVCFDouble((Double) val); + return formatVCFDouble((Double) val); } else if (val instanceof Boolean) { - result = (Boolean) val ? "" : null; // empty string for true, null for false + return (Boolean) val ? "" : null; // empty string for true, null for false } else if (val instanceof List) { - result = formatVCFField(((List) val).toArray()); + return formatList((List) val, fullyDecoded); } else if (val.getClass().isArray()) { - final int length = Array.getLength(val); - if (length == 0) { - return formatVCFField(null); + return val.getClass().getComponentType().isPrimitive() + ? 
formatPrimitiveArray(val) + : formatList(Arrays.asList((Object[]) val), fullyDecoded); + } else if (val instanceof String) { + final String s = val.toString(); + // If the VariantContext from which this string was obtained was already fully decoded, + // its in-memory representation may contain special characters which must be re-encoded, + // while strings which have not been decoded yet represent the field as read directly + // from the source VCF, so they are written back out without encoding + return fullyDecoded ? vcfTextTransformer.encodeText(s) : s; + } else { + return val.toString(); + } + } + + private static String formatPrimitiveArray(final Object v) { + final int len = Array.getLength(v); + if (len == 0) return VCFConstants.MISSING_VALUE_v4; + int i = 0; + final StringBuilder s = new StringBuilder(); + if (v instanceof int[]) { + final int[] a = (int[]) v; + for (;;) { + s.append(a[i++]); + if (i == len) break; + s.append(','); } - final StringBuilder sb = new StringBuilder( - formatVCFField(Array.get(val, 0))); - for (int i = 1; i < length; i++) { - sb.append(','); - sb.append(formatVCFField(Array.get(val, i))); + } else if (v instanceof double[]) { + final double[] a = (double[]) v; + for (;;) { + s.append(formatVCFDouble(a[i++])); + if (i == len) break; + s.append(','); + } + } else if (v instanceof long[]) { + final long[] a = (long[]) v; + for (;;) { + s.append(a[i++]); + if (i == len) break; + s.append(','); } - result = sb.toString(); - } else { - result = val.toString(); } + return s.toString(); + } - return result; + private String formatList(final List list, final boolean fullyDecoded) { + if (list.isEmpty()) return VCFConstants.MISSING_VALUE_v4; + final StringBuilder s = new StringBuilder(); + final Iterator it = list.iterator(); + for (;;) { + s.append(formatVCFField(it.next(), fullyDecoded)); + if (!it.hasNext()) break; + s.append(','); + } + return s.toString(); } /** @@ -310,7 +351,8 @@ public void addGenotypeData(final VariantContext vc, 
final Map a * @param vcfoutput VCF output * @throws IOException */ - private void appendGenotypeData(final VariantContext vc, final Map alleleMap, final List genotypeFormatKeys, final Appendable vcfoutput) throws IOException {final int ploidy = vc.getMaxPloidy(2); + private void appendGenotypeData(final VariantContext vc, final Map alleleMap, final List genotypeFormatKeys, final Appendable vcfoutput) throws IOException { + final int ploidy = vc.getMaxPloidy(2); for (final String sample : this.header.getGenotypeSamples()) { vcfoutput.append(VCFConstants.FIELD_SEPARATOR); @@ -357,7 +399,7 @@ public void addGenotypeData(final VariantContext vc, final Map a } } else { Object val = g.hasExtendedAttribute(field) ? g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4; - outputValue = formatVCFField(val); + outputValue = formatVCFField(val, vc.isFullyDecoded()); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFFileReader.java b/src/main/java/htsjdk/variant/vcf/VCFFileReader.java index c6ff6158e0..7deade9374 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFileReader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFileReader.java @@ -24,6 +24,7 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.Defaults; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.FileExtensions; @@ -35,6 +36,7 @@ import htsjdk.tribble.TribbleException; import htsjdk.variant.bcf2.BCF2Codec; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; import java.io.File; import java.io.IOException; @@ -53,14 +55,21 @@ public class VCFFileReader implements VCFReader { * Returns true if the given file appears to be a BCF file. */ public static boolean isBCF(final File file) { - return isBCF(file.toPath()); + return isBCF(file.toString()); } /** * Returns true if the given path appears to be a BCF file. 
*/ public static boolean isBCF(final Path path) { - return path.toUri().getRawPath().endsWith(FileExtensions.BCF); + return isBCF(path.toUri().getRawPath()); + } + + /** + * Returns true if the given path appears to be a BCF file. + */ + public static boolean isBCF(final String path) { + return path.endsWith(FileExtensions.BCF) || path.endsWith(FileExtensions.COMPRESSED_BCF); } /** @@ -115,6 +124,23 @@ public VCFFileReader(final File file, final File indexFile, final boolean requir return isBCF(path) ? new BCF2Codec() : new VCFCodec(); } + /** + * returns Correct Feature codec for Path depending whether + * the name seems to indicate that it's a BCF. + * + * @param path to vcf/bcf + * @return FeatureCodec for input Path + */ + private static FeatureCodec getCodecForPath(Path path, final VCFVersionUpgradePolicy policy) { + if (isBCF(path)) { + return new BCF2Codec(); + } else { + final VCFCodec codec = new VCFCodec(); + codec.setVersionUpgradePolicy(policy); + return codec; + } + } + /** * Returns the SAMSequenceDictionary from the provided VCF file. */ @@ -142,21 +168,49 @@ public VCFFileReader(final Path path, final Path indexPath) { * Allows construction of a VCFFileReader that will or will not assert the presence of an index as desired. */ public VCFFileReader(final Path path, final boolean requireIndex) { - this.reader = AbstractFeatureReader.getFeatureReader( - path.toUri().toString(), - getCodecForPath(path), - requireIndex); + this(path, requireIndex, Defaults.VCF_VERSION_TRANSITION_POLICY); } /** * Allows construction of a VCFFileReader with a specified index path. */ public VCFFileReader(final Path path, final Path indexPath, final boolean requireIndex) { + this(path, indexPath, requireIndex, Defaults.VCF_VERSION_TRANSITION_POLICY); + } + + /** + * Constructs a VCFFileReader that requires the index to be present. 
+ */ + public VCFFileReader(final Path path, final VCFVersionUpgradePolicy policy) { + this(path, true, policy); + } + + /** + * Constructs a VCFFileReader with a specified index. + */ + public VCFFileReader(final Path path, final Path indexPath, final VCFVersionUpgradePolicy policy) { + this(path, indexPath, true, policy); + } + + /** + * Allows construction of a VCFFileReader that will or will not assert the presence of an index as desired. + */ + public VCFFileReader(final Path path, final boolean requireIndex, final VCFVersionUpgradePolicy policy) { + this.reader = AbstractFeatureReader.getFeatureReader( + path.toUri().toString(), + getCodecForPath(path, policy), + requireIndex); + } + + /** + * Allows construction of a VCFFileReader with a specified index path. + */ + public VCFFileReader(final Path path, final Path indexPath, final boolean requireIndex, final VCFVersionUpgradePolicy policy) { this.reader = AbstractFeatureReader.getFeatureReader( - path.toUri().toString(), - indexPath.toUri().toString(), - getCodecForPath(path), - requireIndex); + path.toUri().toString(), + indexPath.toUri().toString(), + getCodecForPath(path, policy), + requireIndex); } /** diff --git a/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java index 6ca8f3f532..101ff304c6 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java @@ -25,26 +25,40 @@ package htsjdk.variant.vcf; -import java.util.Arrays; -import java.util.Collections; +import htsjdk.tribble.TribbleException; + +import java.util.*; /** * @author ebanks * - * A class representing a key=value entry for FILTER fields in the VCF header + * A class representing FILTER fields in the VCF header */ -public class VCFFilterHeaderLine extends VCFSimpleHeaderLine { - +public class VCFFilterHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; + private 
static List requiredTagOrder = Collections.unmodifiableList( + new ArrayList(2) {{ + add(ID_ATTRIBUTE); + add(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE); + }} + ); + /** * create a VCF filter header line * - * @param name the name for this header line + * @param id the headerLineID for this header line * @param description the description for this header line */ - public VCFFilterHeaderLine(final String name, final String description) { - super("FILTER", name, description); + public VCFFilterHeaderLine(final String id, final String description) { + super(VCFConstants.FILTER_HEADER_KEY, + new LinkedHashMap(2) {{ + put(ID_ATTRIBUTE, id); + put(DESCRIPTION_ATTRIBUTE, description); + }} + ); + validate(); } /** @@ -52,29 +66,32 @@ public VCFFilterHeaderLine(final String name, final String description) { * @param name */ public VCFFilterHeaderLine(final String name) { - super("FILTER", name, name); + this(name, name); } /** - * create a VCF info header line + * create a VCF filter header line * * @param line the header line * @param version the vcf header version */ public VCFFilterHeaderLine(final String line, final VCFHeaderVersion version) { - super(line, version, "FILTER", Arrays.asList("ID", "Description"), Collections.emptyList()); + super(VCFConstants.FILTER_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, requiredTagOrder)); + validate(); + validateForVersion(version); } - @Override - public boolean shouldBeAddedToDictionary() { - return true; + private void validate() { + if (getDescription() == null) { + throw new TribbleException.InvalidHeader("Missing Description attribute in filter header line"); + } } - + /** * get the "Description" field * @return the "Description" field */ public String getDescription() { - return getGenericFieldValue("Description"); + return getGenericFieldValue(DESCRIPTION_ATTRIBUTE); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java index 
74f4d5e5e3..1e927b7d05 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java @@ -26,38 +26,75 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; + /** * @author ebanks *

    * Class VCFFormatHeaderLine *

    *

    - * A class representing a key=value entry for genotype FORMAT fields in the VCF header

    + * A class representing genotype FORMAT fields in the VCF header

    */ public class VCFFormatHeaderLine extends VCFCompoundHeaderLine { + private static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFFormatHeaderLine.class); public VCFFormatHeaderLine(String name, int count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.FORMAT); - if (type == VCFHeaderLineType.Flag) - throw new IllegalArgumentException("Flag is an unsupported type for format fields"); + super(VCFConstants.FORMAT_HEADER_KEY, name, count, type, description); + validate(); } public VCFFormatHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.FORMAT); + super(VCFConstants.FORMAT_HEADER_KEY, name, count, type, description); + validate(); } public VCFFormatHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.FORMAT); + super(VCFConstants.FORMAT_HEADER_KEY, + VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrder), + version); + validate(); + validateForVersion(version); } - // format fields do not allow flag values (that wouldn't make much sense, how would you encode this in the genotype). - @Override - boolean allowFlagValues() { - return false; + /** + * Compare two VCFFormatHeaderLine objects to determine if they have compatible number types, and return a + * VCFFormatHeaderLine that represents the result of merging these two lines. + * + * @param formatLine1 first format line to merge + * @param formatLine2 second format line to merge + * @param conflictWarner conflict warning emitter + * @return a merged VCFFormatHeaderLine + */ + public static VCFFormatHeaderLine getMergedFormatHeaderLine( + final VCFFormatHeaderLine formatLine1, + final VCFFormatHeaderLine formatLine2, + final VCFHeaderMerger.HeaderMergeConflictWarnings conflictWarner) + { + ValidationUtils. 
nonNull(formatLine1); + ValidationUtils. nonNull(formatLine2); + ValidationUtils. nonNull(conflictWarner); + + // delegate to the generic VCFCompoundHeaderLine merger, passing a resolver lambda + return VCFCompoundHeaderLine.getMergedCompoundHeaderLine( + formatLine1, + formatLine2, + conflictWarner, + (l1, l2) -> new VCFFormatHeaderLine( + l1.getID(), + VCFHeaderLineCount.UNBOUNDED, + l1.getType(), + l1.getDescription()) + ); } - @Override - public boolean shouldBeAddedToDictionary() { - return true; + private void validate() { + if (this.getType() == VCFHeaderLineType.Flag) { + throw new TribbleException("Flag is an unsupported type for format fields: " + this.toStringEncoding()); + } } + } \ No newline at end of file diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeader.java b/src/main/java/htsjdk/variant/vcf/VCFHeader.java index c39bef5684..e1a0cf7a4e 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeader.java @@ -27,60 +27,49 @@ import htsjdk.beta.plugin.HtsHeader; import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.Log; import htsjdk.tribble.TribbleException; import htsjdk.tribble.util.ParsingUtils; import htsjdk.utils.ValidationUtils; -import htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.VariantContextComparator; import java.io.Serializable; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - +import java.util.*; +import java.util.stream.Collectors; /** - * A class to represent a VCF header + * A class to represent a VCF header. + * + * A VCFHeader has a "current" VCFHeaderVersion that is established when the header is constructed. 
If + * metadata lines are provided to the constructor, a ##fileformat line must be included, and all lines + * in that are provided must be valid for the specified version. If no metadata lines are initially + * provided, the default version {@link VCFHeader#DEFAULT_VCF_VERSION} will be used. + * + * Each line in the list is always guaranteed to be valid for the current version, and any line added must + * conform to the current version (as defined by the VCF specification). If a new line is added that fails to + * validate against the current version, or a new line that changes the current version, and an existing line + * in the list fails to validate against the new version, an exception will be thrown. * - * @author aaron - * NOTE: This class stores header lines in lots of places. The original author noted that this should - * be cleaned up at some point in the future (jgentry - 5/2013) + * Once a header version is established, it can be changed by adding a new file format/version line (see + * {@link VCFHeader#makeHeaderVersionLine)} (the new version line will replace any existing line), but only + * if the new version is newer than the previous version. Attempts to move the version to an older version + * will result in an exception. 
*/ public class VCFHeader implements HtsHeader, Serializable { public static final long serialVersionUID = 1L; + protected static final Log logger = Log.getInstance(VCFHeader.class); + public static final VCFHeaderVersion DEFAULT_VCF_VERSION = VCFHeaderVersion.VCF4_3; // the mandatory header fields public enum HEADER_FIELDS { CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO } - /** - * The VCF version for this header; once a header version is established, it can only be - * changed subject to version transition rules defined by - * {@link #validateVersionTransition(VCFHeaderVersion, VCFHeaderVersion)} - */ - private VCFHeaderVersion vcfHeaderVersion; - - // the associated meta data - private final Set mMetaData = new LinkedHashSet(); - private final Map mInfoMetaData = new LinkedHashMap(); - private final Map mFormatMetaData = new LinkedHashMap(); - private final Map mFilterMetaData = new LinkedHashMap(); - private final Map mOtherMetaData = new LinkedHashMap(); - private final Map contigMetaData = new LinkedHashMap<>(); + // header meta data + private final VCFMetaDataLines mMetaData = new VCFMetaDataLines(); - // the list of auxillary tags - private final List mGenotypeSampleNames = new ArrayList(); + // the list of auxiliary tags + private final List mGenotypeSampleNames = new ArrayList<>(); // the character string that indicates meta data public static final String METADATA_INDICATOR = "##"; @@ -108,59 +97,74 @@ public enum HEADER_FIELDS { private boolean writeCommandLine = true; /** - * Create an empty VCF header with no header lines and no samples + * Create an empty VCF header with no header lines and no samples. Defaults to + * VCF version {@link VCFHeader#DEFAULT_VCF_VERSION}. 
*/ public VCFHeader() { - this(Collections.emptySet(), Collections.emptySet()); + this(makeHeaderVersionLineSet(DEFAULT_VCF_VERSION), Collections.emptySet()); } /** - * create a VCF header, given a list of meta data and auxiliary tags + * Create a VCF header, given a list of meta data and auxiliary tags. The provided metadata + * header line list MUST contain a version (fileformat) line in order to establish the version + * for the header, and each metadata line must be valid for that version. * - * @param metaData the meta data associated with this header + * @param metaData the meta data associated with this header + * @throws TribbleException if the provided header line metadata does not include a header line that + * establishes the VCF version for the lines, or if any line does not conform to the established + * version */ public VCFHeader(final Set metaData) { - mMetaData.addAll(metaData); - removeVCFVersionLines(mMetaData); - createLookupEntriesForAllHeaderLines(); - checkForDeprecatedGenotypeLikelihoodsKey(); + this(metaData, Collections.emptySet()); } /** - * Creates a deep copy of the given VCFHeader, duplicating all its metadata and + * Creates a copy of the given VCFHeader, duplicating all its metadata and * sample names. */ public VCFHeader(final VCFHeader toCopy) { - this(toCopy.mMetaData, toCopy.mGenotypeSampleNames); + this(toCopy.getMetaDataInInputOrder(), toCopy.mGenotypeSampleNames); } /** - * create a VCF header, given a list of meta data and auxiliary tags + * Create a VCF header, given a set of meta data and auxiliary tags. The provided metadata + * list MUST contain a version (fileformat) line in order to establish the version + * for this header, and each metadata line must be valid for that version. 
* - * @param metaData the meta data associated with this header + * @param metaData set of meta data associated with this header * @param genotypeSampleNames the sample names + * @throws TribbleException if the provided header line metadata does not include a header line that + * establishes the VCF version for the lines, or if any line does not conform to the established + * version */ public VCFHeader(final Set metaData, final Set genotypeSampleNames) { - this(metaData, new ArrayList(genotypeSampleNames)); + this(metaData, new ArrayList<>(genotypeSampleNames)); } /** - * create a VCF header, given a target version, a list of meta data and auxiliary tags + * Create a versioned VCF header. * - * @param vcfHeaderVersion the vcf header version for this header, can not be null - * @param metaData the meta data associated with this header - * @param genotypeSampleNames the sample names + * @param metaData The metadata lines for this header. The provided metadata + * header line list MUST contain a version (fileformat) line in order to establish the version + * for this header, and each metadata line must be valid for that version. + * @param genotypeSampleNames Sample names for this header. 
+ * @throws TribbleException if the provided header line metadata does not include a header line that + * establishes the VCF version for the lines, or if any line does not conform to the established + * version */ - public VCFHeader(final VCFHeaderVersion vcfHeaderVersion, final Set metaData, final Set genotypeSampleNames) { - this(metaData, new ArrayList(genotypeSampleNames)); - ValidationUtils.nonNull(vcfHeaderVersion); - setVCFHeaderVersion(vcfHeaderVersion); - } - public VCFHeader(final Set metaData, final List genotypeSampleNames) { - this(metaData); + ValidationUtils.nonNull(metaData); + ValidationUtils.nonNull(genotypeSampleNames); - if ( genotypeSampleNames.size() != new HashSet(genotypeSampleNames).size() ) + // propagate the lines and establish the version for this header; note that if multiple version + // lines are presented in the set, a warning will be issued, only the last one will be retained, + // and the header version will be established using the last version line encountered + mMetaData.addMetaDataLines(metaData); + final VCFHeaderVersion vcfHeaderVersion = initializeHeaderVersion(); + mMetaData.validateMetaDataLines(vcfHeaderVersion); + + checkForDeprecatedGenotypeLikelihoodsKey(); + if ( genotypeSampleNames.size() != new HashSet<>(genotypeSampleNames).size() ) throw new TribbleException.InvalidHeader("BUG: VCF header has duplicate sample names"); mGenotypeSampleNames.addAll(genotypeSampleNames); @@ -168,50 +172,30 @@ public VCFHeader(final Set metaData, final List genotypeS buildVCFReaderMaps(genotypeSampleNames); } - /** - * Establish the header version for this header. If the header version has already been established - * for this header, the new version will be subject to version transition validation. 
- * @param vcfHeaderVersion - * @throws TribbleException if the requested header version is not compatible with the existing version - */ - public void setVCFHeaderVersion(final VCFHeaderVersion vcfHeaderVersion) { - validateVersionTransition(this.vcfHeaderVersion, vcfHeaderVersion); - this.vcfHeaderVersion = vcfHeaderVersion; + /** + * Get the header version for this header. + * @return the VCFHeaderVersion for this header. will not be null + */ + public VCFHeaderVersion getVCFHeaderVersion() { + return mMetaData.getVCFVersion(); } /** - * Throw if {@code fromVersion} is not compatible with a {@code toVersion}. Generally, any version before - * version 4.2 can be up-converted to version 4.2, but not to version 4.3. Once a header is established as - * version 4.3, it cannot be up or down converted, and it must remain at version 4.3. - * @param fromVersion current version. May be null, in which case {@code toVersion} can be any version - * @param toVersion new version. Cannot be null. - * @throws TribbleException if {@code fromVersion} is not compatible with {@code toVersion} + * Adds a new line to the VCFHeader. If a duplicate line is already exists (same key/ID pair for + * structured lines, or duplicate content for unstructured lines with identical keys), the new + * line will replace the existing line. 
+ * + * @param headerLine header line to attempt to add */ - public static void validateVersionTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { - ValidationUtils.nonNull(toVersion); - - final String errorMessageFormatString = "VCF cannot be automatically promoted from %s to %s"; - - // fromVersion can be null, in which case anything goes (any transition from null is legal) - if (fromVersion != null) { - if (toVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - if (!fromVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - // we're trying to go from pre-v4.3 to v4.3+ - throw new TribbleException(String.format(errorMessageFormatString, fromVersion, toVersion)); - } - - } else if (fromVersion.equals(VCFHeaderVersion.VCF4_3)) { - // we're trying to go from v4.3 to pre-v4.3 - throw new TribbleException(String.format(errorMessageFormatString, fromVersion, toVersion)); - } - } - } + public void addMetaDataLine(final VCFHeaderLine headerLine) { + // propagate the new line to the metadata lines object, and if the version changed, validate + // the lines against the new version + final VCFHeaderVersion oldHeaderVersion = mMetaData.getVCFVersion(); + mMetaData.addMetaDataLine(headerLine); + final VCFHeaderVersion newHeaderVersion = mMetaData.getVCFVersion(); + validateVersionTransition(headerLine, oldHeaderVersion, newHeaderVersion); - /** - * @return the VCFHeaderVersion for this header. Can be null. 
- */ - public VCFHeaderVersion getVCFHeaderVersion() { - return vcfHeaderVersion; + checkForDeprecatedGenotypeLikelihoodsKey(); } /** @@ -220,81 +204,58 @@ public VCFHeaderVersion getVCFHeaderVersion() { * using this header (i.e., read by the VCFCodec) will have genotypes * occurring in the same order * - * @param genotypeSampleNamesInAppearenceOrder genotype sample names, must iterator in order of appearance + * @param genotypeSampleNamesInAppearanceOrder genotype sample names, must iterator in order of appearance */ - private void buildVCFReaderMaps(final Collection genotypeSampleNamesInAppearenceOrder) { - sampleNamesInOrder = new ArrayList(genotypeSampleNamesInAppearenceOrder.size()); - sampleNameToOffset = new HashMap(genotypeSampleNamesInAppearenceOrder.size()); + private void buildVCFReaderMaps(final Collection genotypeSampleNamesInAppearanceOrder) { + sampleNamesInOrder = new ArrayList<>(genotypeSampleNamesInAppearanceOrder.size()); + sampleNameToOffset = new HashMap<>(genotypeSampleNamesInAppearanceOrder.size()); int i = 0; - for (final String name : genotypeSampleNamesInAppearenceOrder) { + for (final String name : genotypeSampleNamesInAppearanceOrder) { sampleNamesInOrder.add(name); sampleNameToOffset.put(name, i++); } Collections.sort(sampleNamesInOrder); } - /** - * Adds a new line to the VCFHeader. If there is an existing header line of the - * same type with the same key, the new line is not added and the existing line - * is preserved. + * Return all contig line in SORTED order, where the sort order is determined by contig index. + * Note that this behavior differs from other VCFHeader methods that return lines in input order. * - * @param headerLine header line to attempt to add - */ - public void addMetaDataLine(final VCFHeaderLine headerLine) { - // Try to create a lookup entry for the new line. If this succeeds (because there was - // no line of this type with the same key), add the line to our master list of header - // lines in mMetaData. 
- if ( addMetadataLineLookupEntry(headerLine) ) { - mMetaData.add(headerLine); - checkForDeprecatedGenotypeLikelihoodsKey(); - } - } - - /** - * @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present + * @return all of the VCF header lines of the ##contig form in SORTED order, or an empty list if none were present */ public List getContigLines() { - // this must preserve input order - return Collections.unmodifiableList(new ArrayList<>(contigMetaData.values())); - } + // this must return lines in SORTED order + return mMetaData.getContigLines(); + } /** - * Returns the contigs in this VCF file as a SAMSequenceDictionary. Returns null if contigs lines are - * not present in the header. If contig lines are missing length tags, they will be created with - * length set to SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH. Records with unknown length will match any record with - * the same name when evaluated by SAMSequenceRecord.isSameSequence. + * Returns the contigs in this VCF Header as a SAMSequenceDictionary. + * + * @return Returns null if contig lines are not present in the header. + * @throws TribbleException if one or more contig lines do not have length + * information. */ public SAMSequenceDictionary getSequenceDictionary() { + // this must ensure that the lines used to create the dictionary are sorted by contig index final List contigHeaderLines = this.getContigLines(); - if (contigHeaderLines.isEmpty()) return null; - - final List sequenceRecords = new ArrayList(contigHeaderLines.size()); - for (final VCFContigHeaderLine contigHeaderLine : contigHeaderLines) { - final SAMSequenceRecord samSequenceRecord = contigHeaderLine.getSAMSequenceRecord(); - sequenceRecords.add(samSequenceRecord); - } - - return new SAMSequenceDictionary(sequenceRecords); + return contigHeaderLines.isEmpty() ? 
null : + new SAMSequenceDictionary( + contigHeaderLines.stream() + .map(contigLine -> contigLine.getSAMSequenceRecord()) + .collect(Collectors.toCollection(ArrayList::new)) + ); } /** - * Completely replaces the contig records in this header with those in the given SAMSequenceDictionary. + * Completely replaces all contig header lines in this header with ones derived from the given SAMSequenceDictionary. + * + * @param dictionary SAMSequenceDictionary to use to create VCFContigHeaderLines for this header */ public void setSequenceDictionary(final SAMSequenceDictionary dictionary) { - this.contigMetaData.clear(); - - // Also need to remove contig record lines from mMetaData - final List toRemove = new ArrayList(); - for (final VCFHeaderLine line : mMetaData) { - if (line instanceof VCFContigHeaderLine) { - toRemove.add(line); - } - } - mMetaData.removeAll(toRemove); - for (final SAMSequenceRecord record : dictionary.getSequences()) { - addMetaDataLine(new VCFContigHeaderLine(record, record.getAssembly())); + getContigLines().forEach(hl -> mMetaData.removeMetaDataLine(hl)); + if (dictionary != null) { + dictionary.getSequences().forEach(r -> addMetaDataLine(new VCFContigHeaderLine(r, r.getAssembly()))); } } @@ -305,128 +266,12 @@ public VariantContextComparator getVCFRecordComparator() { /** * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present */ - public List getFilterLines() { - final List filters = new ArrayList(); - for (final VCFHeaderLine line : mMetaData) { - if ( line instanceof VCFFilterHeaderLine ) { - filters.add((VCFFilterHeaderLine)line); - } - } - return filters; - } - - /** - * @return all of the VCF ID-based header lines in their original file order, or an empty list if none were present - */ - public List getIDHeaderLines() { - final List lines = new ArrayList(); - for (final VCFHeaderLine line : mMetaData) { - if (line instanceof VCFIDHeaderLine) { - lines.add((VCFIDHeaderLine)line); - } - } - 
return lines; - } - - /** - * Remove all lines with a VCF version tag from the provided set of header lines - */ - private void removeVCFVersionLines( final Set headerLines ) { - final List toRemove = new ArrayList(); - for (final VCFHeaderLine line : headerLines) { - if (VCFHeaderVersion.isFormatString(line.getKey())) { - toRemove.add(line); - } - } - headerLines.removeAll(toRemove); - } + public List getFilterLines() { return mMetaData.getFilterLines(); } /** - * Creates lookup table entries for all header lines in mMetaData. + * @return all of the VCFSimpleHeaderLine (ID) lines in their original file order, or an empty list if none are present */ - private void createLookupEntriesForAllHeaderLines() { - for (final VCFHeaderLine line : mMetaData) { - addMetadataLineLookupEntry(line); - } - } - - /** - * Add a single header line to the appropriate type-specific lookup table (but NOT to the master - * list of lines in mMetaData -- this must be done separately if desired). - * - * If a header line is present that has the same key as an existing line, it will not be added. A warning - * will be shown if this occurs when GeneralUtils.DEBUG_MODE_ENABLED is true, otherwise this will occur - * silently. 
- * - * @param line header line to attempt to add to its type-specific lookup table - * @return true if the line was added to the appropriate lookup table, false if there was an existing - * line with the same key and the new line was not added - */ - private boolean addMetadataLineLookupEntry(final VCFHeaderLine line) { - if ( line instanceof VCFInfoHeaderLine ) { - final VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line; - return addMetaDataLineMapLookupEntry(mInfoMetaData, infoLine.getID(), infoLine); - } else if ( line instanceof VCFFormatHeaderLine ) { - final VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; - return addMetaDataLineMapLookupEntry(mFormatMetaData, formatLine.getID(), formatLine); - } else if ( line instanceof VCFFilterHeaderLine ) { - final VCFFilterHeaderLine filterLine = (VCFFilterHeaderLine)line; - return addMetaDataLineMapLookupEntry(mFilterMetaData, filterLine.getID(), filterLine); - } else if ( line instanceof VCFContigHeaderLine ) { - return addContigMetaDataLineLookupEntry((VCFContigHeaderLine) line); - } else { - return addMetaDataLineMapLookupEntry(mOtherMetaData, line.getKey(), line); - } - } - - /** - * Add a contig header line to the lookup list for contig lines (contigMetaData). If there's - * already a contig line with the same ID, does not add the line. - * - * Note: does not add the contig line to the master list of header lines in mMetaData -- - * this must be done separately if desired. 
- * - * @param line contig header line to add - * @return true if line was added to the list of contig lines, otherwise false - */ - private boolean addContigMetaDataLineLookupEntry(final VCFContigHeaderLine line) { - // if we are trying to add a contig for the same ID - if (contigMetaData.containsKey(line.getID())) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Found duplicate VCF contig header lines for " + line.getID() + "; keeping the first only" ); - } - // do not add this contig if it exists - return false; - } - contigMetaData.put(line.getID(), line); - return true; - } - - /** - * Add a header line to the provided map at a given key. If the key already exists, it will not be replaced. - * If it does already exist and GeneralUtils.DEBUG_MODE_ENABLED is true, it will issue warnings about duplicates, - * otherwise it will silently leave the existing key/line pair as is. - * - * Note: does not add the header line to the master list of header lines in mMetaData -- - * this must be done separately if desired. - * - * @param map a map from each key to the associated VCFHeaderLine - * @param key the key to insert this line at - * @param line the line to insert at this key - * @param a type of vcf header line that extends VCFHeaderLine - * @return true if the line was added to the map, false if it was not added because there's already a line with that key - */ - private boolean addMetaDataLineMapLookupEntry(final Map map, final String key, final T line) { - if ( map.containsKey(key) ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Found duplicate VCF header lines for " + key + "; keeping the first only" ); - } - return false; - } - - map.put(key, line); - return true; - } + public List getIDHeaderLines() { return mMetaData.getIDHeaderLines(); } /** * Check for the presence of a format line with the deprecated key {@link VCFConstants#GENOTYPE_LIKELIHOODS_KEY}. 
@@ -435,12 +280,14 @@ private boolean addMetaDataLineMapLookupEntry(final Ma */ private void checkForDeprecatedGenotypeLikelihoodsKey() { if ( hasFormatLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && ! hasFormatLine(VCFConstants.GENOTYPE_PL_KEY) ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no " - + VCFConstants.GENOTYPE_PL_KEY + " field. We now only manage PL fields internally" - + " automatically adding a corresponding PL field to your VCF header"); - } - addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); + logger.warn("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no " + + VCFConstants.GENOTYPE_PL_KEY + " field. We now only manage PL fields internally" + + " automatically adding a corresponding PL field to your VCF header"); + addMetaDataLine(new VCFFormatHeaderLine( + VCFConstants.GENOTYPE_PL_KEY, + VCFHeaderLineCount.G, + VCFHeaderLineType.Integer, + "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); } } @@ -451,48 +298,44 @@ private void checkForDeprecatedGenotypeLikelihoodsKey() { * @return a set of the header fields, in order */ public Set getHeaderFields() { - return new LinkedHashSet(Arrays.asList(HEADER_FIELDS.values())); + return new LinkedHashSet<>(Arrays.asList(HEADER_FIELDS.values())); } /** - * get the meta data, associated with this header, in sorted order + * get the meta data, associated with this header, in input order * * @return a set of the meta data */ - public Set getMetaDataInInputOrder() { - return makeGetMetaDataSet(mMetaData); - } - - public Set getMetaDataInSortedOrder() { - return makeGetMetaDataSet(new TreeSet(mMetaData)); - } + public Set getMetaDataInInputOrder() { return mMetaData.getMetaDataInInputOrder(); } - private Set 
makeGetMetaDataSet(final Set headerLinesInSomeOrder) { - final Set lines = new LinkedHashSet(); - if (vcfHeaderVersion != null && vcfHeaderVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - // always propagate version 4.3+ to prevent these header lines from magically being back-versioned to < 4.3 - lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_3.getFormatString(), VCFHeaderVersion.VCF4_3.getVersionString())); - } else { - lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString())); - } - lines.addAll(headerLinesInSomeOrder); - return Collections.unmodifiableSet(lines); - } + /** + * Get the metadata associated with this header in sorted order. + * + * @return Metadata lines in sorted order (based on lexicographical sort of string encodings). + */ + public Set getMetaDataInSortedOrder() { return mMetaData.getMetaDataInSortedOrder(); } /** * Get the VCFHeaderLine whose key equals key. Returns null if no such line exists - * @param key - * @return + * + * Deprecated. Use {@link #getMetaDataLines(String)}. see https://github.com/samtools/hts-specs/issues/602 + * + * @param key the key to use to find header lines to return + * @return the header line with key "key", or null if none is present */ + @Deprecated // starting after version 2.24.1 public VCFHeaderLine getMetaDataLine(final String key) { - for (final VCFHeaderLine line: mMetaData) { - if ( line.getKey().equals(key) ) - return line; - } - - return null; + return mMetaData.getMetaDataLines(key).stream().findFirst().orElse(null); } + /** + * Get the VCFHeaderLines whose key equals key. Returns an empty list if no such lines exist. 
+ * + * @param key the key to use to find header lines to return + * @return the header lines with key "key" + */ + public Collection getMetaDataLines(final String key) { return mMetaData.getMetaDataLines(key); } + /** * get the genotyping sample names * @@ -532,40 +375,32 @@ public int getColumnCount() { /** * Returns the INFO HeaderLines in their original ordering */ - public Collection getInfoHeaderLines() { - return mInfoMetaData.values(); - } + public Collection getInfoHeaderLines() { return mMetaData.getInfoHeaderLines(); } /** * Returns the FORMAT HeaderLines in their original ordering */ - public Collection getFormatHeaderLines() { - return mFormatMetaData.values(); - } + public Collection getFormatHeaderLines() { return mMetaData.getFormatHeaderLines(); } /** - * @param id the header key name + * @param id the id of the requested header line * @return the meta data line, or null if there is none */ public VCFInfoHeaderLine getInfoHeaderLine(final String id) { - return mInfoMetaData.get(id); + return mMetaData.getInfoHeaderLine(id); } /** - * @param id the header key name + * @param id the id of the requested header line * @return the meta data line, or null if there is none */ - public VCFFormatHeaderLine getFormatHeaderLine(final String id) { - return mFormatMetaData.get(id); - } + public VCFFormatHeaderLine getFormatHeaderLine(final String id) { return mMetaData.getFormatHeaderLine(id); } /** - * @param id the header key name + * @param id the id of the requested header line * @return the meta data line, or null if there is none */ - public VCFFilterHeaderLine getFilterHeaderLine(final String id) { - return mFilterMetaData.get(id); - } + public VCFFilterHeaderLine getFilterHeaderLine(final String id) { return mMetaData.getFilterHeaderLine(id); } public boolean hasInfoLine(final String id) { return getInfoHeaderLine(id) != null; @@ -580,24 +415,82 @@ public boolean hasFilterLine(final String id) { } /** - * @param key the header key name + * Deprecated. 
Use {@link #getOtherHeaderLines(String)}. see https://github.com/samtools/hts-specs/issues/602 + * + * @param key the key of the requested header line * @return the meta data line, or null if there is none */ + @Deprecated // starting after version 2.24.1 (this selects one from what can be many) public VCFHeaderLine getOtherHeaderLine(final String key) { - return mOtherMetaData.get(key); + final Collection otherLines = mMetaData.getOtherHeaderLines(); + for (final VCFHeaderLine next: otherLines) { + if (next.getKey().equals(key)) { + // note that this returns the first match it finds, which is why this method is deprecated + return next; + } + } + return null; } /** - * Returns the other HeaderLines in their original ordering + * Returns all "other" VCFHeaderLines, in their original (input) order, where "other" means any + * VCFHeaderLine that is not a contig, info, format or filter header line. + */ + public Collection getOtherHeaderLines() { return mMetaData.getOtherHeaderLines(); } + + /** + * Returns "other" HeaderLines that have the key "key", in their original ordering, where "other" + * means any VCFHeaderLine that is not a contig, info, format or filter header line. + */ + public List getOtherHeaderLines(final String key) { + return mMetaData.getOtherHeaderLines().stream().filter(hl -> hl.getKey().equals(key)).collect(Collectors.toList()); + } + + /** + * Adds a single "other" VCFHeaderLine that has key "key". Any lines with that key that already exist + * in the header will be removed. This method can only be used to set unique non-structured (non-ID) + * header lines. + * + * @param uniqueLine the unique line to add + * @throws TribbleException if the line to be added is an ID line. 
*/ - public Collection getOtherHeaderLines() { - return mOtherMetaData.values(); + public void addOtherHeaderLineUnique(final VCFHeaderLine uniqueLine) { + if (uniqueLine.isIDHeaderLine()) { + throw new TribbleException(String.format("Only non-ID header lines can be added using this method: %s", uniqueLine)); + } + getOtherHeaderLines(uniqueLine.getKey()).forEach(hl -> mMetaData.removeMetaDataLine(hl)); + addMetaDataLine(uniqueLine); + } + + /** + * Returns a single "other" VCFHeaderLine that has the key "key", where "other" + * means any VCFHeaderLine that is not a contig, info, format or filter header line. If more than + * one such line is available, throws a TribbleException. + * + * @param key the key to match + * @return a single VCFHeaderLine, or null if none + * @throws TribbleException if more than one other line matches the key + */ + public VCFHeaderLine getOtherHeaderLineUnique(final String key) { + final List lineList = getOtherHeaderLines(key); + if (lineList.isEmpty()) { + return null; + } else if (lineList.size() > 1) { + throw new TribbleException( + String.format( + "More than one \"other\" header line matches the key \"%s\". Use getOtherHeaderLines() to retrieve multiple lines: %s", + key, + lineList.stream().map(VCFHeaderLine::toString).collect(Collectors.joining(",")))); + } else { + return lineList.get(0); + } } /** * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output. * @return true if additional engine headers will be written to the VCF */ + @Deprecated // starting after version 2.24.1 public boolean isWriteEngineHeaders() { return writeEngineHeaders; } @@ -606,6 +499,7 @@ public boolean isWriteEngineHeaders() { * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output. 
* @param writeEngineHeaders true if additional engine headers will be written to the VCF */ + @Deprecated // starting after version 2.24.1 public void setWriteEngineHeaders(final boolean writeEngineHeaders) { this.writeEngineHeaders = writeEngineHeaders; } @@ -614,6 +508,7 @@ public void setWriteEngineHeaders(final boolean writeEngineHeaders) { * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF. * @return true if the command line will be written to the VCF */ + @Deprecated // starting after version 2.24.1 public boolean isWriteCommandLine() { return writeCommandLine; } @@ -622,6 +517,7 @@ public boolean isWriteCommandLine() { * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF. * @param writeCommandLine true if the command line will be written to the VCF */ + @Deprecated // starting after version 2.24.1 public void setWriteCommandLine(final boolean writeCommandLine) { this.writeCommandLine = writeCommandLine; } @@ -640,10 +536,103 @@ public HashMap getSampleNameToOffset() { @Override public String toString() { - final StringBuilder b = new StringBuilder(); - b.append("[VCFHeader:"); - for ( final VCFHeaderLine line : mMetaData ) - b.append("\n\t").append(line); - return b.append("\n]").toString(); + return mMetaData.toString(); + } + + /** + * Obtain a valid fileformat/version line for the requestedVersion + * @param requestedVersion the version for which a version line should be obtained + * @return the version line + */ + public static VCFHeaderLine makeHeaderVersionLine(final VCFHeaderVersion requestedVersion) { + return new VCFHeaderLine(requestedVersion.getFormatString(), requestedVersion.getVersionString()); + } + + /** + * Obtain a VCFHeaderLine set containing only a fileformat/version line for the requestedVersion + * @param requestedVersion the version for which a version line should be obtained + * @return a VCFHeaderLine set containing only fileformat/version 
line for the requestedVersion + */ + public static Set makeHeaderVersionLineSet(final VCFHeaderVersion requestedVersion) { + return new LinkedHashSet() {{ add(VCFHeader.makeHeaderVersionLine(requestedVersion)); }}; + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final VCFHeader vcfHeader = (VCFHeader) o; + + if (samplesWereAlreadySorted != vcfHeader.samplesWereAlreadySorted) return false; + if (writeEngineHeaders != vcfHeader.writeEngineHeaders) return false; + if (writeCommandLine != vcfHeader.writeCommandLine) return false; + if (!mMetaData.equals(vcfHeader.mMetaData)) return false; + if (mGenotypeSampleNames != null ? !mGenotypeSampleNames.equals(vcfHeader.mGenotypeSampleNames) : + vcfHeader.mGenotypeSampleNames != null) + return false; + if (sampleNamesInOrder != null ? !sampleNamesInOrder.equals(vcfHeader.sampleNamesInOrder) : + vcfHeader.sampleNamesInOrder != null) + return false; + return sampleNameToOffset != null ? sampleNameToOffset.equals(vcfHeader.sampleNameToOffset) : + vcfHeader.sampleNameToOffset == null; } + + @Override + public int hashCode() { + int result = mMetaData.hashCode(); + result = 31 * result + (mGenotypeSampleNames != null ? mGenotypeSampleNames.hashCode() : 0); + result = 31 * result + (samplesWereAlreadySorted ? 1 : 0); + result = 31 * result + (sampleNamesInOrder != null ? sampleNamesInOrder.hashCode() : 0); + result = 31 * result + (sampleNameToOffset != null ? sampleNameToOffset.hashCode() : 0); + result = 31 * result + (writeEngineHeaders ? 1 : 0); + result = 31 * result + (writeCommandLine ? 1 : 0); + return result; + } + + /** + * Establish the version for this header using the (required) ##fileformat metadata line in the metadata list. 
+ * @throws TribbleException if no ##fileformat line is included in the metadata lines + */ + private VCFHeaderVersion initializeHeaderVersion() { + final VCFHeaderVersion metaDataVersion = mMetaData.getVCFVersion(); + if (metaDataVersion == null) { + //we dont relax this even if VCFUtils.getStrictVCFVersionValidation() == false, since that + //would confound subsequent header version management + throw new TribbleException("The VCFHeader metadata must include a ##fileformat (version) header line"); + } + return metaDataVersion; + } + + public Collection> getValidationErrors(final VCFHeaderVersion targetVersion) { + return mMetaData.getValidationErrors(targetVersion); + } + + private void validateVersionTransition( + final VCFHeaderLine newHeaderLine, + final VCFHeaderVersion currentVersion, + final VCFHeaderVersion newVersion) { + final int compareTo = newVersion.compareTo(currentVersion); + + // We only allow going forward to a newer version, not backwards to an older one, since there + // is really no way to validate old header lines (pre vcfV4.2). If the version moved forward, + // revalidate all the lines, otherwise only validate the new header line. 
+ if (compareTo < 0) { + throw new TribbleException(String.format( + "When changing a header version, the new header version %s must be > the previous version %s", + newVersion, + currentVersion)); + } else if (compareTo > 0) { + logger.debug(() -> String.format("Updating VCFHeader version from %s to %s", + currentVersion.getVersionString(), + newVersion.getVersionString())); + + // the version moved forward, so validate ALL of the existing lines in the list to ensure + // that the transition is valid + mMetaData.validateMetaDataLines(newVersion); + } else { + newHeaderLine.validateForVersion(newVersion); + } + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java index 0d07a83078..c1bec06d47 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java @@ -26,28 +26,23 @@ package htsjdk.variant.vcf; import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; import java.io.Serializable; import java.util.Map; - +import java.util.Optional; /** - * @author ebanks - *

    - * Class VCFHeaderLine - *

    - *

    - * A class representing a key=value entry in the VCF header - *

    + *

    A class representing a key=value entry in the VCF header, and the base class for structured header lines. + * Header lines are immutable, and derived classes should maintain immutability. + *

    */ public class VCFHeaderLine implements Comparable, Serializable { public static final long serialVersionUID = 1L; - protected static final boolean ALLOW_UNBOUND_DESCRIPTIONS = true; - protected static final String UNBOUND_DESCRIPTION = "Not provided in original VCF header"; - - private String mKey = null; - private String mValue = null; + // immutable - we don't want to let the hash value change + private final String mKey; + private final String mValue; /** * create a VCF header line @@ -56,14 +51,9 @@ public class VCFHeaderLine implements Comparable, Serializable { * @param value the value for this header line */ public VCFHeaderLine(String key, String value) { - if ( key == null ) - throw new IllegalArgumentException("VCFHeaderLine: key cannot be null"); - if ( key.contains("<") || key.contains(">") ) - throw new IllegalArgumentException("VCFHeaderLine: key cannot contain angle brackets"); - if ( key.contains("=") ) - throw new IllegalArgumentException("VCFHeaderLine: key cannot contain an equals sign"); mKey = key; mValue = value; + validate(); } /** @@ -76,21 +66,98 @@ public String getKey() { } /** - * Get the value + * Get the value. May be null. * - * @return the value + * @return the value. may be null (for subclass implementations that use structured values) */ public String getValue() { return mValue; } /** - * By default the header lines won't be added to the dictionary, unless this method will be override (for example in FORMAT, INFO or FILTER header lines) + * @return true if this is a structured header line (has a unique ID, and key/value pairs), otherwise false + */ + public boolean isIDHeaderLine() { return false; } + + /** + * Return the unique ID for this line. Returns null iff {@link #isIDHeaderLine()} is false. + * @return the line's ID, or null if isIDHeaderLine() is false + */ + public String getID() { return null; } + + /** + * Validate the state of this header line. Require the key be valid as an "id". 
+ */ + private void validate() { + final Optional validationFailure = validateKeyOrID(mKey); + if (validationFailure.isPresent()) { + throw new TribbleException(validationFailure.get()); + } + } + + /** + * Validates this header line against {@code vcfTargetVersion}. + * Subclasses can override this to provide line type-specific version validation, and the + * overrides should also call super.getValidationFailure to allow each class in the class hierarchy + * to do class-level validation. * - * @return false + * @return Optional containing a {@link VCFValidationFailure} describing validation failure if this + * line fails validation, otherwise Optional.empty(). */ - public boolean shouldBeAddedToDictionary() { - return false; + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + // If this header line is itself a fileformat/version line, + // make sure it doesn't clash with the requested vcfTargetVersion. + if (VCFHeaderVersion.isFormatString(getKey())) { + if (!vcfTargetVersion.getFormatString().equals(getKey()) || + !vcfTargetVersion.getVersionString().equals(getValue()) + ) { + return Optional.of(new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("The target version (%s) is incompatible with the header line's content.", + vcfTargetVersion))); + } + } else if (getKey().equals(VCFConstants.PEDIGREE_HEADER_KEY)) { + // previous to vcf4.3, PEDIGREE header lines are not modeled as VCFPedigreeHeaderLine because they + // were not structured header lines (had no ID), so we need to check HERE to see if an attempt is + // being made to use one of those old-style pedigree lines in a newer-versioned header, and reject + // it if so + if (vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3) && ! 
(this instanceof VCFPedigreeHeaderLine)) { + return Optional.of(new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("A pedigree line with no ID cannot be merged with version %s", vcfTargetVersion))); + } + } + + return Optional.empty(); + } + + /** + * Validate that the header line conforms to {@code vcfTargetVersion}. + * @param vcfTargetVersion + * @throws {@link TribbleException.VersionValidationFailure} if this header line fails to conform + */ + public void validateForVersion(final VCFHeaderVersion vcfTargetVersion) { + final Optional> error = getValidationFailure(vcfTargetVersion); + if (error.isPresent()) { + throw new TribbleException.VersionValidationFailure(error.get().getSourceMessage()); + } + } + + /** + * Validate a string that is to be used as a unique id or key field. + */ + protected Optional validateKeyOrID(final String keyString) { + if (keyString == null) { + return Optional.of("VCFHeaderLine: key cannot be null or empty"); + } else if ( keyString.contains("<") || keyString.contains(">") ) { + return Optional.of(String.format("VCFHeaderLine: key %s contains illegal character: angle brackets", keyString)); + } else if ( keyString.contains("=") ) { + return Optional.of(String.format("VCFHeaderLine: key %s contains illegal character: equals sign", keyString)); + } else { + return Optional.empty(); + } } public String toString() { @@ -136,15 +203,17 @@ public int compareTo(Object other) { * @param line the line * @return true if the line is a VCF meta data line, or false if it is not */ - public static boolean isHeaderLine(String line) { + @Deprecated // starting after version 2.24.1 + static boolean isHeaderLine(String line) { return line != null && !line.isEmpty() && VCFHeader.HEADER_INDICATOR.equals(line.substring(0,1)); } /** - * create a string of a mapping pair for the target VCF version + * create a string of a mapping pair * @param keyValues a mapping of the key->value pairs to output * @return a string, correctly 
formatted */ + @Deprecated // starting after version 2.24.1 public static String toStringEncoding(Map keyValues) { StringBuilder builder = new StringBuilder(); builder.append('<'); @@ -158,10 +227,10 @@ public static String toStringEncoding(Map keyValues) { builder.append(entry.getKey()); builder.append('='); builder.append(entry.getValue().toString().contains(",") || - entry.getValue().toString().contains(" ") || - entry.getKey().equals("Description") || - entry.getKey().equals("Source") || // As per VCFv4.2, Source and Version should be surrounded by double quotes - entry.getKey().equals("Version") ? "\""+ escapeQuotes(entry.getValue().toString()) + "\"" : entry.getValue()); + entry.getValue().toString().contains(" ") || + entry.getKey().equals("Description") || + entry.getKey().equals("Source") || // As per VCFv4.2, Source and Version should be surrounded by double quotes + entry.getKey().equals("Version") ? "\""+ escapeQuotes(entry.getValue().toString()) + "\"" : entry.getValue()); } builder.append('>'); return builder.toString(); diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java index 080153a990..24195c73d3 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java @@ -25,9 +25,78 @@ package htsjdk.variant.vcf; +import htsjdk.utils.ValidationUtils; + /** * the count encodings we use for fields in VCF header lines */ public enum VCFHeaderLineCount { INTEGER, A, R, G, UNBOUNDED; + + // A default int value used to represent an integral count value (not a count *type*) when the + // actual count is derived and not a fixed integer (i.e., when isFixedCount()==false) + public static final int VARIABLE_COUNT = -1; + + /** Return true if this line uses a fixed (integer) count. 
**/ + public boolean isFixedCount() { return this.equals(INTEGER); } + + /** + * Decode a header line count string and return the corresponding VCFHeaderLineCount enum value. + * If the value is not recognized as a valid constant, assume the string represents a fixed, numeric + * value, and return Integer. The caller should convert and validate the actual value. + * + * @param vcfVersion + * @param countTypeString + * @return + */ + protected static VCFHeaderLineCount decode(final VCFHeaderVersion vcfVersion, final String countTypeString) { + ValidationUtils.nonNull(vcfVersion); + ValidationUtils.nonNull(countTypeString); + + if (countTypeString.equals(VCFConstants.PER_ALTERNATE_COUNT)) { + return A; + } else if (countTypeString.equals(VCFConstants.PER_ALLELE_COUNT)) { + return R; + } else if (countTypeString.equals(VCFConstants.PER_GENOTYPE_COUNT)) { + return G; + } else if ( + (vcfVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && countTypeString.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) || + (!vcfVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && countTypeString.equals(VCFConstants.UNBOUNDED_ENCODING_v3))) { + return VCFHeaderLineCount.UNBOUNDED; + } else { + return VCFHeaderLineCount.INTEGER; // assume integer + } + } + + /** + * Encode a count type as a string suitable for serialization to a VCF header. Note this is + * not version aware and defaults to VCFv4 format. + * + * @param actualCount Must be the special value {@code VARIABLE_COUNT} unless this object is {@code VCFHeaderLineCount.INTEGER}. + * @return String encoding of this enum, or the {@code actualCount} if the type of this count + * is VCFHeaderLineCount.INTEGER. + * + * @throws IllegalArgumentException if {@code actualCount} is not the special value {@code VARIABLE_COUNT} and this + * is not the {@code VCFHeaderLineCount.INTEGER} enum object. 
+ */ + public String encode(final int actualCount) { + if (this != INTEGER && actualCount != VARIABLE_COUNT) { + // Should only supply an actualCount if the count type == INTEGER + throw new IllegalArgumentException("Inconsistent header line number encoding request"); + } + switch (this) { + case A: + return VCFConstants.PER_ALTERNATE_COUNT; + case R: + return VCFConstants.PER_ALLELE_COUNT; + case G: + return VCFConstants.PER_GENOTYPE_COUNT; + case UNBOUNDED: + return VCFConstants.UNBOUNDED_ENCODING_v4; + case INTEGER: + return Integer.toString(actualCount); + } + throw new IllegalStateException("Unexpected VCFHeaderLineCount enum value"); + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java index 6c83574fee..2397e28641 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java @@ -34,7 +34,7 @@ import java.util.Map; /** - * A class for translating between vcf header versions + * A class for translating between vcf header versions and corresponding header line parsers. */ public class VCFHeaderLineTranslator { private static final Map mapping; @@ -50,57 +50,57 @@ public class VCFHeaderLineTranslator { mapping = Collections.unmodifiableMap(map); } + /** + * Parse a VCFHeaderLine for the given version. + * + * @param version VCFHeaderVersion of the header line + * @param valueLine the header line string + * @param expectedTagOrder List of expected tags (interpreted differently by the VCF3 and VCF4 parsers). + * @return a mapping of the tags parsed out. Note that the order of attributes is significant (ID must be + * first) and this should return a LinkedHashMap in order to preserve attribute order. 
+ */ public static Map parseLine(VCFHeaderVersion version, String valueLine, List expectedTagOrder) { - return parseLine(version, valueLine, expectedTagOrder, Collections.emptyList()); - } - - public static Map parseLine(VCFHeaderVersion version, String valueLine, List expectedTagOrder, List recommendedTags) { - return mapping.get(version).parseLine(valueLine, expectedTagOrder, recommendedTags); + return mapping.get(version).parseLine(valueLine, expectedTagOrder); } } - +/** + * Parse a VCFHeaderLine. + */ interface VCFLineParser { /** * parse a VCF line - * - * @see #parseLine(String, List, List) VCFv4.2+ recommended tags support - * - * @param valueLine the line - * @param expectedTagOrder List of expected tags - * @return a mapping of the tags parsed out - */ - default Map parseLine(String valueLine, List expectedTagOrder) { - return parseLine(valueLine, expectedTagOrder, Collections.emptyList()); - } - - /** - * parse a VCF line - * - * The recommended tags were introduced in VCFv4.2. - * Older implementations may throw an exception when the recommendedTags field is not empty. - * - * We use a list to represent tags as we assume there will be a very small amount of them, - * so using a {@code Set} is overhead. - * + * + * @see #parseLine(String, List) VCFv4.2+ recommended tags support + * * @param valueLine the line * @param expectedTagOrder List of expected tags - * @param recommendedTags List of tags that may or may not be present. Use an empty list instead of NULL for none. * @return a mapping of the tags parsed out */ - Map parseLine(String valueLine, List expectedTagOrder, List recommendedTags); + Map parseLine(String valueLine, List expectedTagOrder); } - /** * a class that handles the to and from disk for VCF 4 lines */ class VCF4Parser implements VCFLineParser { - + + /** + * Parse a VCFHeaderLine. The expectedTagOrder list prescribes the order in which tags should appear, but + * all tags are treated as optional. 
Additional tags are allowed after the expected tags, and may appear in + * any order. It is the caller's responsibility to validate that all required tags are present and that + * any additional "optional" tags are valid. + * + * @param valueLine the header line string + * @param expectedTagOrder List of tags that are required to appear in the order they're expected. Additional + * "extra" tags are allowed after the tags in this list, and must be validated by + * the caller. + * @return a mapping of all tags parsed out + */ @Override - public Map parseLine(String valueLine, List expectedTagOrder, List recommendedTags) { + public Map parseLine(String valueLine, List expectedTagOrder) { // our return map - Map ret = new LinkedHashMap(); + Map ret = new LinkedHashMap<>(); // a builder to store up characters as we go StringBuilder builder = new StringBuilder(); @@ -159,28 +159,30 @@ public Map parseLine(String valueLine, List expectedTagO throw new TribbleException.InvalidHeader("Unclosed quote in header line value " + valueLine); } - // validate the tags against the expected list - index = 0; + + // Validate the order of all discovered tags against requiredTagOrder. All tags are treated as + // "optional". Succeeding does not mean that all expected tags in the list were seen. Also, all + // structured header lines can have "extra" tags, with no order specified, so additional tags + // are tolerated. 
if ( expectedTagOrder != null ) { - if (ret.keySet().isEmpty() && !expectedTagOrder.isEmpty()) { - throw new TribbleException.InvalidHeader("Header with no tags is not supported when there are expected tags in line " + valueLine); + // If there are N expected tags present in the parsed header, the first N tags must exactly + // match the order of the expected tags list, the remaining tags are considered optional + int numExpectedTagsPresent = 0; + for (final String expectedTag : expectedTagOrder) { + if (ret.containsKey(expectedTag)) numExpectedTagsPresent++; } - for ( String str : ret.keySet() ) { - if (index < expectedTagOrder.size()) { - if (!expectedTagOrder.get(index).equals(str)) { - if (expectedTagOrder.contains(str)) { - throw new TribbleException.InvalidHeader("Tag " + str + " in wrong order (was #" + (index+1) + ", expected #" + (expectedTagOrder.indexOf(str)+1) + ") in line " + valueLine); - } else if (recommendedTags.contains(str)) { - throw new TribbleException.InvalidHeader("Recommended tag " + str + " must be listed after all expected tags in line " + valueLine); - } - else { - throw new TribbleException.InvalidHeader("Unexpected tag " + str + " in line " + valueLine); - } - } + index = 0; + for (final String str : ret.keySet()) { + if (index == numExpectedTagsPresent) { + break; // done - end of requiredTagOrder list + } else if (!expectedTagOrder.get(index).equals(str)) { + throw new TribbleException.InvalidHeader( + String.format("Unexpected tag or tag order for tag \"%s\" in line %s", str, valueLine)); } index++; } } + return ret; } } @@ -188,13 +190,9 @@ public Map parseLine(String valueLine, List expectedTagO class VCF3Parser implements VCFLineParser { @Override - public Map parseLine(String valueLine, List expectedTagOrder, List recommendedTags) { - if (!recommendedTags.isEmpty()) { - throw new TribbleException.InternalCodecException("Recommended tags are not allowed in VCFv3.x"); - } - + public Map parseLine(String valueLine, List 
expectedTagOrder) { // our return map - Map ret = new LinkedHashMap(); + Map ret = new LinkedHashMap<>(); // a builder to store up characters as we go StringBuilder builder = new StringBuilder(); @@ -211,20 +209,34 @@ public Map parseLine(String valueLine, List expectedTagO for (char c: valueLine.toCharArray()) { switch (c) { case ('\"') : inQuote = !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it - case (',') : if (!inQuote) { ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); builder = new StringBuilder(); break; } // drop the current key value to the return map + case (',') : + if (!inQuote) { + ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); + builder = new StringBuilder(); + break; + } // drop the current key value to the return map default: builder.append(c); // otherwise simply append to the current string } index++; } ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); - // validate the tags against the expected list + // Validate that: + // we have no more tags than are expected + // the ones we have are in the expected list + // they appear in the same order as in the expected list + // This does no checking for missing tags; all tags are treated as optional + // index = 0; - if (tagIndex != expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size()); + if (tagIndex != expectedTagOrder.size()) { + throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size()); + } for (String str : ret.keySet()){ - if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); + if (!expectedTagOrder.get(index).equals(str)) { + throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); + } index++; } return ret; } -} +} \ No newline at end of file diff --git 
a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java index 785449de89..88432f0b18 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java @@ -25,9 +25,37 @@ package htsjdk.variant.vcf; +import htsjdk.utils.ValidationUtils; + /** * the type encodings we use for fields in VCF header lines */ public enum VCFHeaderLineType { - Integer, Float, String, Character, Flag; + Integer, + Float, + String, + Character, + Flag; + + /** + * Decode a header line count string and return the corresponding VCFHeaderLineCount enum value. + * If the value is not recognized as a valid constant, we assume the string represents a numeric + * value and return Integer. The caller should convert and validate the value. + * + * @param lineTypeString + * @return VCFHeaderLineType for {@code lineTypeString} + */ + protected static VCFHeaderLineType decode(final String lineTypeString) { + ValidationUtils.nonNull(lineTypeString); + return VCFHeaderLineType.valueOf(lineTypeString); + } + + /** + * Encode this line type as a string suitable for serialization to a VCF header. Note this is + * not version specific and defaults to VCFv42. 
+ * + * The serialized encoding is the simple name of the enum constant + * @return string encoding of this line type + */ + String encode() { return this.toString(); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderMerger.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderMerger.java new file mode 100644 index 0000000000..becbf64eb1 --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderMerger.java @@ -0,0 +1,286 @@ +package htsjdk.variant.vcf; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceDictionaryUtils; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Class used to produce a set of header lines resulting from the merger of one or more input VCFHeaders. + *

    + * The resulting lines have a version line matching the highest version of any of the input headers. + *

    + * The headers to be merged must conform to certain requirements: + * Some header sets cannot be merged, and will result in an exception being thrown: +

      + *
    • Headers must have a version that is at least VCF v4.2. Headers from older versions may not be merged (note + * that older headers that are read from input files are automatically "converted" to VCF v4.2 by VCFCodec; see + * {@link AbstractVCFCodec#setVCFHeader(VCFHeader)}).
    • + *
    • any header that contains a header line that doesn't conform to the resulting (highest) version of any + * header in the merge list
    • + *
    • any header that has a sequence dictionary that is incompatible with any other merged header's + * sequence dictionary. All headers must either share a common sequence dictionary, or have a sequence dictionary + * that is a subset of the common sequence dictionary that is taken from the remaining headers.
    • + *
    + */ +public class VCFHeaderMerger { + + /** + * Merge all header lines in a set of headers into a single set of header lines. The resulting set includes + * all unique lines that appeared in any header; duplicates of lines are excluded from the result set. Equivalent + * header lines are reduced to a single representative header line. The resulting set contains a ##fileformat + * version line for the newest version seen in any of the headers provided in the input header collection, + * and all lines in the merged set are compatible with that version. + * + * @param headers the headers to merge + * @param emitWarnings true if warnings should be emitted + * @return a set of merged VCFHeaderLines + * @throws TribbleException if any header has a version < VCFv4.2, or if any header line in any + * input header is not compatible the newest version selected from amongst all headers provided, or if any + * header has a sequence dictionary that is incompatible with any other header's sequence dictionary + */ + public static Set getMergedHeaderLines(final Collection headers, final boolean emitWarnings) { + ValidationUtils.nonNull(headers, "headers"); + ValidationUtils.validateArg(!headers.isEmpty(), "headers collection must be non empty"); + + // use a VCFMetaDataLines object to accumulate header lines + final VCFMetaDataLines mergedMetaData = new VCFMetaDataLines(); + final HeaderMergeConflictWarnings conflictWarner = new HeaderMergeConflictWarnings(emitWarnings); + + final VCFHeaderVersion newestVersion = getNewestHeaderVersion(headers); + final SAMSequenceDictionary commonSequenceDictionary = getCommonSequenceDictionaryOrThrow(headers, conflictWarner); + + for (final VCFHeader sourceHeader : headers) { + for (final VCFHeaderLine line : sourceHeader.getMetaDataInSortedOrder()) { + final String key = line.getKey(); + if (VCFHeaderVersion.isFormatString(key) || key.equals(VCFHeader.CONTIG_KEY)) { + // drop all version and contig lines, and at the end we'll set the 
version and + // commonSequenceDictionary + continue; + } + + // Structured header lines are only considered equal if they have identical key, id, and + // attribute/value pairs, but for merging we need to reduce lines that have the same key/id pairs + // but different attributes to a single line. So use the more permissive "findEquivalentHeaderLine" + // to detect equivalent lines, and delegate to the individual header line implementations to do the + // smart reconciliation. + final VCFHeaderLine other = mergedMetaData.findEquivalentHeaderLine(line); + if (other != null && !line.equals(other)) { + if (key.equals(VCFConstants.FORMAT_HEADER_KEY)) { + // Delegate to the FORMAT line resolver + mergedMetaData.addMetaDataLine( + VCFFormatHeaderLine.getMergedFormatHeaderLine( + (VCFFormatHeaderLine) line, + (VCFFormatHeaderLine) other, + conflictWarner) + ); + } else if (key.equals(VCFConstants.INFO_HEADER_KEY)) { + // Delegate to the INFO line resolver + mergedMetaData.addMetaDataLine( + VCFInfoHeaderLine.getMergedInfoHeaderLine( + (VCFInfoHeaderLine) line, + (VCFInfoHeaderLine) other, + conflictWarner) + ); + } else if (line.isIDHeaderLine()) { + // equivalent ID header line, but not a compound(format/info) line, and also not strictly equal + // to the existing line: preserve the existing line (this *may* drop attributes/values if the + // dropped line has additional attributes) + conflictWarner.warn( + String.format("Dropping duplicate header line %s during header merge, retaining equivalent line %s", + line, + other)); + } else { + // a non-structured line with a duplicate key of an existing line, but a different value, + // retain the new line in addition to the old one + mergedMetaData.addMetaDataLine(line); + } + } else { + mergedMetaData.addMetaDataLine(line); + } + } + } + return makeMergedMetaDataSet(mergedMetaData, newestVersion, commonSequenceDictionary, conflictWarner); + } + + // Create the final set of all of our merged header lines. 
Start with the version line for the new + // version, add in the lines from the merged set, use the resulting list to create a header, add the common + // sequence dictionary to that, and then extract and return the resulting set of lines in sorted order + private static Set makeMergedMetaDataSet( + final VCFMetaDataLines mergedMetaData, + final VCFHeaderVersion newestVersion, + final SAMSequenceDictionary commonSequenceDictionary, + final HeaderMergeConflictWarnings conflictWarner) { + + if (conflictWarner.emitWarnings) { + mergedMetaData.getValidationErrors(newestVersion) + .forEach(validationError -> conflictWarner.warn(validationError.getFailureMessage())); + } + + final Set mergedLines = VCFHeader.makeHeaderVersionLineSet(newestVersion); + mergedLines.addAll(mergedMetaData.getMetaDataInInputOrder()); + final VCFHeader mergedHeader = new VCFHeader(mergedLines, Collections.emptySet()); + if (commonSequenceDictionary != null) { + mergedHeader.setSequenceDictionary(commonSequenceDictionary); + } else { + conflictWarner.warn( + "The header lines resulting from a header merge contain no contig lines because none " + + "of the input headers contains a sequence dictionary."); + } + + return new LinkedHashSet<>(mergedHeader.getMetaDataInSortedOrder()); + } + + // Find the newest version af any header in the input set, and return that to use as the target + // version for the merged lines. 
+ private static VCFHeaderVersion getNewestHeaderVersion(final Collection vcfHeaders) { + VCFHeaderVersion newestVersion = null; + for (final VCFHeader header : vcfHeaders) { + final VCFHeaderVersion vcfVersion = header.getVCFHeaderVersion(); + if (!vcfVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) { + throw new TribbleException(String.format( + "Cannot merge a VCFHeader with version (%s) that is older than version %s", + header.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_2)); + } + if (newestVersion == null || (vcfVersion.ordinal() > newestVersion.ordinal())) { + newestVersion = vcfVersion; + } + } + return newestVersion; + } + + // Create a common sequence dictionary from the set of dictionaries in VCFHeaders. The headers must + // either have identical dictionaries, or contain a common superset dictionary where individual dictionaries + // contain a dictionary that is subset of that common superset. Otherwise throw. + private static SAMSequenceDictionary getCommonSequenceDictionaryOrThrow( + final Collection headers, + final HeaderMergeConflictWarnings conflictWarner) { + SAMSequenceDictionary candidateDictionary = null; + + // Because we're doing pairwise comparisons and always selecting the best dictionary as + // our running candidate, we need to visit the headers in order of dictionary size + // (largest first). This prevents a premature failure where an individual pairwise + // comparison erroneously fails because the source is pairwise incompatible with the + // running candidate, and the common superset exists but we just haven't seen it yet. 
+ final List headersByDictionarySize = new ArrayList<>(headers); + headersByDictionarySize.sort(((Comparator) + (hdr1, hdr2) -> Integer.compare(getDictionarySize(hdr1), getDictionarySize(hdr2))).reversed()); + + for ( final VCFHeader sourceHeader : headersByDictionarySize ) { + final SAMSequenceDictionary sourceDictionary = sourceHeader.getSequenceDictionary(); + if (sourceDictionary != null) { + if (candidateDictionary == null) { + candidateDictionary = sourceDictionary; + } else { + // first, compare with checkContigOrdering on + final SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility compatibility = + SAMSequenceDictionaryUtils.compareDictionaries( + candidateDictionary, + sourceDictionary, + true); + switch (compatibility) { + case IDENTICAL: // existing candidateDictionary is identical to sourceDictionary, so keep it + case SUPERSET: // existing candidateDictionary is a superset of sourceDictionary, so keep it + break; + + case COMMON_SUBSET: // fall through + case DIFFERENT_INDICES: + // There exists a common subset of contigs, but for merging purposes we have a slightly + // stricter requirement, that one dictionary is a superset of the other. So try the + // comparison again with checkContigOrdering off, in both directions. If one is a + // superset of the other, retain the superset. 
+ if (SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility.SUPERSET == + SAMSequenceDictionaryUtils.compareDictionaries( + candidateDictionary, + sourceDictionary, + false)) { + break; // keep our candidate + } else if (SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility.SUPERSET == + SAMSequenceDictionaryUtils.compareDictionaries( + sourceDictionary, + candidateDictionary, + false)) { + candidateDictionary = sourceDictionary; // take the sourceDictionary as the new candidate + } else { + // dictionaries are disjoint, and we have no basis to choose a merge order for the + // non-common contigs, so give up + throw new TribbleException( + createHeaderDictionaryFailureMessage( + candidateDictionary, sourceHeader, sourceDictionary, compatibility)); + } + break; + + case NO_COMMON_CONTIGS: // no overlap between dictionaries + case UNEQUAL_COMMON_CONTIGS: // common subset has contigs that have the same name but different lengths + case NON_CANONICAL_HUMAN_ORDER: // human reference detected but the order of the contigs is non-standard (lexicographic, for example) + case OUT_OF_ORDER: // the two dictionaries overlap but the overlapping contigs occur in different + default: + throw new TribbleException( + createHeaderDictionaryFailureMessage( + candidateDictionary, sourceHeader, sourceDictionary, compatibility)); + } + } + } else { + conflictWarner.warn( + String.format( + "Merging header with no sequence dictionary: %s", + getHeaderFragmentForDisplay(sourceHeader))); + } + } + return candidateDictionary; + } + + private static Integer getDictionarySize(final VCFHeader hdr) { + final SAMSequenceDictionary dictionary = hdr.getSequenceDictionary(); + return dictionary == null ? 
0 : dictionary.size(); + } + + private static String createHeaderDictionaryFailureMessage( + final SAMSequenceDictionary commonSequenceDictionary, + final VCFHeader sourceHeader, + final SAMSequenceDictionary sourceSequenceDictionary, + final SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility failureReason) { + // return a nice long message that includes as much of the offending context as is reasonable, + // without printing the entire context, since the headers and sequence dictionaries can have + // thousands of entries + return String.format( + "Can't merge VCF headers with incompatible sequence dictionaries, merge failed due to %s:" + + "\n\nHeader dictionary:\n\n%1.2000s\n\nis incompatible with the common dictionary:\n\n%1.2000s\n\n merging VCF header:\n\n%1.2000s\n", + failureReason, + sourceSequenceDictionary.getSequences().stream().map(SAMSequenceRecord::toString).collect(Collectors.joining("\n")), + commonSequenceDictionary.getSequences().stream().map(SAMSequenceRecord::toString).collect(Collectors.joining("\n")), + getHeaderFragmentForDisplay(sourceHeader)); + } + + private static String getHeaderFragmentForDisplay(final VCFHeader sourceHeader) { + return sourceHeader.getContigLines().stream().map(VCFContigHeaderLine::toString).collect(Collectors.joining("\n")); + } + + /** Only displays a warning if warnings are enabled and an identical warning hasn't been already issued */ + static final class HeaderMergeConflictWarnings { + boolean emitWarnings; + final Set alreadyIssued = new HashSet<>(); + + protected HeaderMergeConflictWarnings(final boolean emitWarnings ) { + this.emitWarnings = emitWarnings; + } + + public void warn(final String msg) { + if ( emitWarnings && ! 
alreadyIssued.contains(msg) ) { + alreadyIssued.add(msg); + VCFHeader.logger.warn(msg); + } + } + } +} diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java index 43f43c65c3..454d567300 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java @@ -26,6 +26,7 @@ package htsjdk.variant.vcf; import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; /** * information that identifies each header version @@ -47,7 +48,7 @@ public enum VCFHeaderVersion { * @param vString the version string * @param fString the format string */ - VCFHeaderVersion(String vString, String fString) { + VCFHeaderVersion(String vString, String fString) { this.versionString = vString; this.formatString = fString; } @@ -67,7 +68,8 @@ public static VCFHeaderVersion toHeaderVersion(String version) { /** * are we a valid version string of some type - * @param version the version string + * @param version the version string (the part of the header line that specifies the version, + * i.e., "VCFv4.3" if the line is "##fileformat=VCFv4.3") * @return true if we're valid of some type, false otherwise */ public static boolean isVersionString(String version){ @@ -75,7 +77,8 @@ public static boolean isVersionString(String version){ } /** - * are we a valid format string for some type + * are we a valid format string for some type (the key part of the header line that specifies a version, + * i.e., "fileformat" if the line is "##fileformat=VCFv4.3") * @param format the format string * @return true if we're valid of some type, false otherwise */ @@ -87,8 +90,16 @@ public static boolean isFormatString(String format){ return false; } - public static VCFHeaderVersion getHeaderVersion(String versionLine) { - String[] lineFields = versionLine.split("="); + /** + * + * @param versionLine a VCF header version line, including the leading meta data indicator, + 
* for example "##fileformat=VCFv4.2" + * @return the VCFHeaderVersion for this string + * @throws TribbleException.InvalidHeader if the string is not a version string for a recognized supported version + */ + public static VCFHeaderVersion fromHeaderVersionLine(final String versionLine) { + ValidationUtils.nonNull(versionLine, "version line"); + final String[] lineFields = versionLine.split("="); if ( lineFields.length != 2 || !isFormatString(lineFields[0].substring(2)) ) throw new TribbleException.InvalidHeader(versionLine + " is not a valid VCF version line"); @@ -98,6 +109,13 @@ public static VCFHeaderVersion getHeaderVersion(String versionLine) { return toHeaderVersion(lineFields[1]); } + /** + * @return A VCF "##fileformat=version" metadata string for the supplied version. + */ + public String toHeaderVersionLine() { + return String.format("%s%s=%s", VCFHeader.METADATA_INDICATOR, getFormatString(), getVersionString()); + } + /** * Utility function to clean up a VCF header string * @@ -125,4 +143,5 @@ public String getVersionString() { public String getFormatString() { return formatString; } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java index 13df34bc87..410409ca12 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java @@ -26,44 +26,88 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; +import htsjdk.utils.ValidationUtils; + +import java.util.Optional; + /** - * @author ebanks *

    * Class VCFInfoHeaderLine *

    *

    - * A class representing a key=value entry for INFO fields in the VCF header + * A class representing an INFO field in the VCF header *

    */ public class VCFInfoHeaderLine extends VCFCompoundHeaderLine { - public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.INFO); - } + private static final long serialVersionUID = 1L; + + protected final static Log logger = Log.getInstance(VCFFormatHeaderLine.class); public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.INFO); + super(VCFConstants.INFO_HEADER_KEY, name, count, type, description); + } + + public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) { + super(VCFConstants.INFO_HEADER_KEY, name, count, type, description); } public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description, String source, String version) { - super(name, count, type, description, SupportedHeaderLineType.INFO, source, version); + super(VCFConstants.INFO_HEADER_KEY, name, count, type, description); + this.updateGenericField(SOURCE_ATTRIBUTE, source); + this.updateGenericField(VERSION_ATTRIBUTE, version); } public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, String source, String version) { - super(name, count, type, description, SupportedHeaderLineType.INFO, source, version); + super(VCFConstants.INFO_HEADER_KEY, name, count, type, description); + this.updateGenericField(SOURCE_ATTRIBUTE, source); + this.updateGenericField(VERSION_ATTRIBUTE, version); } public VCFInfoHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.INFO); + super(VCFConstants.INFO_HEADER_KEY, + VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrder), + version + ); + validateForVersion(version); } - // info fields allow flag values - @Override - boolean allowFlagValues() { - return true; + /** + * Compare two 
VCFInfoHeaderLine objects to determine if they have compatible number types, and return a + * VCFInfoHeaderLine that represents the result of merging these two lines. + * + * @param infoLine1 first info line to merge + * @param infoLine2 second info line to merge + * @param conflictWarner conflict warning emitter + * @return a merged VCFInfoHeaderLine + */ + public static VCFInfoHeaderLine getMergedInfoHeaderLine( + final VCFInfoHeaderLine infoLine1, + final VCFInfoHeaderLine infoLine2, + final VCFHeaderMerger.HeaderMergeConflictWarnings conflictWarner) + { + ValidationUtils. nonNull(infoLine1); + ValidationUtils. nonNull(infoLine2); + ValidationUtils. nonNull(conflictWarner); + + // delegate to the generic VCFCompoundHeaderLine merger, passing a resolver lambda + return VCFCompoundHeaderLine.getMergedCompoundHeaderLine( + infoLine1, + infoLine2, + conflictWarner, + (l1, l2) -> new VCFInfoHeaderLine( + l1.getID(), + VCFHeaderLineCount.UNBOUNDED, + l1.getType(), + l1.getDescription()) + ); } @Override - public boolean shouldBeAddedToDictionary() { - return true; + protected Optional validateKeyOrID(final String id) { + return id.equals(VCFConstants.THOUSAND_GENOMES_KEY) + ? Optional.empty() + : super.validateKeyOrID(id); } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java new file mode 100644 index 0000000000..3055c93889 --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java @@ -0,0 +1,554 @@ +package htsjdk.variant.vcf; + +import htsjdk.annotations.InternalAPI; +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; + +/** + * Class for managing the set of VCFHeaderLines maintained by a VCFHeader. 
+ * + * Since this class is used to incrementally build up a set of header lines for use with a VCFHeader, + * it does not require that the list always contain a file format line (its VCFHeader's job to enforce + * that condition). + * + * This class maintains several invariants: + * + * - The "current version" of the lines is tracked by recording whether a version line (a line that + * establishes the VCFHeaderVersion, such as format/fileformat line) has been added to the list. If + * no version line has been added, the list will have a null current version; if a version line has + * been added, it will have a non-null version. If the version line is manually removed, the "current + * version" is reset to null. + * + * - Each contig line that is retained is guaranteed to have a unique contig index. This does + * NOT guarantee that the contig indices are contiguous, or ordered, only that they are unique. + * + * - Each structured (ID) line for a given key will have a unique ID. Any new line that has the same + * key/ID pair as an existing line will replace the previous line. (Previous htsjdk implementations + * preserve such lines in a master line list, but would silently drop them from the typed + * lookup lists, so such duplicates would never be returned in queries for typed lines such as + * getInfoHeaderLines(), but would still be serialized on write.) + * + * This class does NOT validate that the lines contained are valid for the current version (that is + * the caller's responsibility). + */ +//Visible to allow disq Kryo registration for serialization +@InternalAPI +final class VCFMetaDataLines implements Serializable { + public static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFMetaDataLines.class); + + // Master map of all header lines (including file format version lines and contig header lines) + private final Map mMetaData = new LinkedHashMap<>(); + + // Map of contig index to contig header line. 
Must be kept in sync with the mMetaData map + private final Map contigIndexMap = new LinkedHashMap<>(); + + // Current version for lines included in the list. May be null. Must be kept in sync with the + // contents of the mMetaData map. + private VCFHeaderVersion vcfVersion; + + /** + * Add all metadata lines from Set. If an equivalent line already exists (any existing file format + * line if the new line is an unstructured file format line; any existing identical line if the new + * line is an unstructured non-file format line; or any existing line with a duplicate key/ID pair + * if the new line is a structured line), only the new line will be retained. + * + * @param newMetaData Set of lines to be added to the list. + * @throws IllegalArgumentException if a version is established or if any line fails validation for that version + */ + public void addMetaDataLines(final Set newMetaData) { + newMetaData.forEach(this::addMetaDataLine); + } + + /** + * Add a metadata line to the list. If an equivalent line already exists (any existing file format + * line if the new line is an unstructured file format line; any existing identical line if the new + * line is an unstructured non-file format line; or any existing line with a duplicate key/ID pair + * if the new line is a structured line), only the new line will be retained. 
+ * + * @param newMetaDataLine header line to attempt to add + * @returns an existing (equivalent) header line that was replaced by newMetaDataLine, if any, + * otherwise null + */ + public VCFHeaderLine addMetaDataLine(final VCFHeaderLine newMetaDataLine) { + ValidationUtils.nonNull(newMetaDataLine, "metadata line"); + + if (VCFHeaderVersion.isFormatString(newMetaDataLine.getKey())) { + // for format lines, we need to remove any existing format line (which may have a different key + // than the new line, since old VCF versions use a different format key than modern versions) + return updateVersion(newMetaDataLine); + } else { + // Enforce restriction that contig and ALT line IDs cannot share IDs (c.f. VCF 4.3 spec section 1.4.7) + // We do not store them in the same namespace so that we can distinguish cases of two lines + // of the same type clashing vs an ALT line clashing with an existing contig line or vice versa + switch (newMetaDataLine.getKey()) { + case VCFConstants.CONTIG_HEADER_KEY: + validateContigAndALTLinesDisjoint(VCFConstants.ALT_HEADER_KEY, newMetaDataLine.getID()); + break; + case VCFConstants.ALT_HEADER_KEY: + validateContigAndALTLinesDisjoint(VCFConstants.CONTIG_HEADER_KEY, newMetaDataLine.getID()); + break; + } + + // otherwise, see if there is an equivalent line that the new line will replace + final HeaderLineMapKey newMapKey = makeKeyForLine(newMetaDataLine); + final VCFHeaderLine equivalentMetaDataLine = mMetaData.get(newMapKey); + if (equivalentMetaDataLine == null) { + createNewMapEntry(newMapKey, newMetaDataLine); + } else { + replaceExistingMapEntry(newMapKey, equivalentMetaDataLine, newMetaDataLine); + } + return equivalentMetaDataLine; + } + } + + private void validateContigAndALTLinesDisjoint(final String namespace, final String id) { + if (mMetaData.containsKey(makeKey(namespace, id))) { + throw new IllegalStateException( + String.format("ALT and contig line IDs must be disjoint, but both were found for ID: %s", id)); + } + } + + 
/** + * Remove an equivalent metadata line from the list. This is the inverse of addMetaDataLine, and removes + * any equivalent line that already exists (any existing file format line if the line to be removed is + * an unstructured file format line; any existing identical line if the line to be removed is an unstructured + * non-file format line, or any existing line with a duplicate key/ID pair if the line to be removed is a + * structured line). + * + * The removed value is returned, and can be used by the caller to determine if the removed line has a + * different value than the line presented. + * + * @param lineToRemove the header line to remove + * @return The actual header line removed, or null of no equivalent header line was found to remove + */ + public VCFHeaderLine removeMetaDataLine(final VCFHeaderLine lineToRemove) { + VCFHeaderLine removedLine = null; + if (VCFHeaderVersion.isFormatString(lineToRemove.getKey()) && vcfVersion != null) { + final VCFHeaderVersion versionToRemove = VCFHeaderVersion.toHeaderVersion(lineToRemove.getValue()); + if (versionToRemove.equals(vcfVersion)) { + // simulate "removal" of the line by recreating the line that we're dropping as the return value + removedLine = VCFHeader.makeHeaderVersionLine(versionToRemove); + vcfVersion = null; + } + } else { + removedLine = mMetaData.remove(makeKeyForLine(lineToRemove)); + // only synchronize the dependent contig map variables if a line was ACTUALLY removed + if (removedLine != null && lineToRemove.isIDHeaderLine() && lineToRemove.getKey().equals(VCFHeader.CONTIG_KEY)) { + removeFromContigIndexMap((VCFContigHeaderLine) removedLine); + } + } + return removedLine; + } + + /** + * @return the version for any contained version line. 
may be null if no file format version + * line is in the list + */ + public VCFHeaderVersion getVCFVersion() { + return vcfVersion; + } + + /** + * Return the existing line from the list that is "equivalent" to the query line, where + * equivalent is defined as having the same key and value for unstructured header lines, + * or the same key and ID, but not necessarily the same value, for structured header lines. + * The "equivalent" line returned by this method is not guaranteed to be equal to the + * queryLine, in the case where the queryLine is an ID line. + * + * The method is a way to ask "if the queryLine were added to this object via addMetaDataLine, + * what line, if any, would it replace". + * + * Note that for file format (VCF version) lines, this returns an existing file format line + * if there is one, even if the key is different than the query line (since that behavior + * mirrors the behavior of addMetaDataLine and removeMetaDataLine). + * + * @param queryLine the source line to use to check for equivalents + * @return The existing header line of the type/key provided, otherwise NULL. + */ + public VCFHeaderLine findEquivalentHeaderLine(final VCFHeaderLine queryLine) { + if (VCFHeaderVersion.isFormatString(queryLine.getKey())) { + return vcfVersion == null ? + null : + VCFHeader.makeHeaderVersionLine(vcfVersion); + } else { + return mMetaData.get(makeKeyForLine(queryLine)); + } + } + + /** + * Validate all metadata lines, excluding the file format line against a target version. + * Throws {@link TribbleException.VersionValidationFailure} if any line is incompatible with the given version. 
+ * @param targetVersion the target version to validate against + * @throws {@link TribbleException.VersionValidationFailure} if any existing line fails to validate against + * {@code targetVersion} + */ + //TODO: we need to tell users how to resolve the case where this fails due to version validation + //i.e, use a custom upgrade tool + public void validateMetaDataLines(final VCFHeaderVersion targetVersion) { + mMetaData.values().forEach(headerLine -> { + if (!VCFHeaderVersion.isFormatString(headerLine.getKey())) { + headerLine.validateForVersion(targetVersion); + } + }); + } + + /** + * Get a list of validation failures for all metadata lines (except the file format line) against + * a target version. + * + * @param targetVersion the target version to validate against + * @return an Collection describing the lines that failed to validate + * incompatible with targetVersion. The collections is empty if validation succeeded for all lines. + */ + public Collection> getValidationErrors(final VCFHeaderVersion targetVersion) { + return mMetaData.values().stream() + .filter(line -> !VCFHeaderVersion.isFormatString(line.getKey())) + .map(l -> l.getValidationFailure(targetVersion)) + .filter(Optional::isPresent) + .map(Optional::get) + .collect(Collectors.toList()); + } + + /** + * get the meta data, associated with this header, in input order + * + * @return a set of the meta data + */ + public Set getMetaDataInInputOrder() { + return makeMetaDataLineSet(mMetaData.values()); + } + + /** + * get the meta data, associated with this header, in SORTED order + * + * @return a set of the meta data + */ + public Set getMetaDataInSortedOrder() { + // Use an intermediate TreeSet to get the correct sort order (via the header line + // comparators), but return an (unmodifiable) LinkedHashSet because TreeSet has a + // `contains` implementation based on comparator equality that can lead to inconsistent + // results for header line types like VCFContigHeaderLine that have a compareTo 
+ // implementation that is inconsistent with equals. + return makeMetaDataLineSet(new TreeSet<>(mMetaData.values())); + } + + /** + * @return all of the structured (ID) lines in their original file order, or an empty list if none were present + */ + public List getIDHeaderLines() { + return mMetaData.values().stream() + .filter(VCFHeaderLine::isIDHeaderLine) + .map(hl -> (VCFSimpleHeaderLine) hl) + .collect(Collectors.toCollection(ArrayList::new)); + } + + /** + * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present + */ + public List getFilterLines() { + return mMetaData.values().stream() + .filter(hl -> hl.getKey().equals(VCFConstants.FILTER_HEADER_KEY)) + .map(hl -> (VCFFilterHeaderLine) hl) + .collect(Collectors.toCollection(ArrayList::new)); + } + + /** + * @return all of the VCF header lines of the ##contig form in SORTED order, or an empty list if none were present + */ + public List getContigLines() { + return Collections.unmodifiableList(new ArrayList<>(new TreeSet<>(contigIndexMap.values()))); + } + + /** + * Get the VCFHeaderLine(s) whose key equals key. 
Returns null if no such line exists + * @param key the VCFHeaderLine key to use to locate the headerline + * @return collection of VCFHeaderLine + */ + public Collection getMetaDataLines(final String key) { + return mMetaData.values().stream() + .filter(hl -> hl.getKey().equals(key)).collect(Collectors.toList()); + } + + /** + * Returns the INFO VCFHeaderLine in their original ordering + */ + public Collection getInfoHeaderLines() { + return mMetaData.values().stream() + .filter(hl -> hl.getKey().equals(VCFConstants.INFO_HEADER_KEY)) + .map(hl -> (VCFInfoHeaderLine) hl) + .collect(Collectors.toCollection(ArrayList::new)); + } + + /** + * Returns the FORMAT VCFHeaderLine in their original ordering + */ + public Collection getFormatHeaderLines() { + return mMetaData.values().stream() + .filter(hl -> hl.getKey().equals(VCFConstants.FORMAT_HEADER_KEY)) + .map(hl -> (VCFFormatHeaderLine) hl) + .collect(Collectors.toCollection(ArrayList::new)); + } + + /** + * @param id the id of the requested header line + * @return the VCFHeaderLine info line, or null if there is none + */ + public VCFInfoHeaderLine getInfoHeaderLine(final String id) { + return (VCFInfoHeaderLine) mMetaData.get(makeKey(VCFConstants.INFO_HEADER_KEY, id)); + } + + /** + * @param id the id of the requested header format line + * @return the meta data line, or null if there is none + */ + public VCFFormatHeaderLine getFormatHeaderLine(final String id) { + return (VCFFormatHeaderLine) mMetaData.get(makeKey(VCFConstants.FORMAT_HEADER_KEY, id)); + } + + /** + * @param id the id of the requested header line + * @return the meta data line, or null if there is none + */ + public VCFFilterHeaderLine getFilterHeaderLine(final String id) { + return (VCFFilterHeaderLine) mMetaData.get(makeKey(VCFConstants.FILTER_HEADER_KEY, id)); + } + + /** + * Returns the other VCFHeaderLines in their original ordering, where "other" means any + * VCFHeaderLine that is not a contig, info, format or filter header line. 
+ */ + public Collection getOtherHeaderLines() { + return getMetaDataInInputOrder().stream().filter( + hl -> + !hl.getKey().equals(VCFConstants.CONTIG_HEADER_KEY) && + !hl.getKey().equals(VCFConstants.INFO_HEADER_KEY) && + !hl.getKey().equals(VCFConstants.FILTER_HEADER_KEY) && + !hl.getKey().equals(VCFConstants.FORMAT_HEADER_KEY) + ) + .collect(Collectors.toCollection(ArrayList::new)); + } + + /** + * A version/fileformat header line representing the version for these lines, otherwise null. + * @return The version file format header line if a version has been established, otherwise null. + */ + public VCFHeaderLine getFileFormatLine() { + return vcfVersion == null ? null : VCFHeader.makeHeaderVersionLine(vcfVersion); + } + + @Override + public String toString() { + final StringBuilder b = new StringBuilder(); + b.append("[VCFMetaDataLines:"); + for ( final VCFHeaderLine line : mMetaData.values() ) + b.append("\n\t").append(line); + return b.append("\n]").toString(); + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (!(o instanceof VCFMetaDataLines)) return false; + + final VCFMetaDataLines that = (VCFMetaDataLines) o; + + return mMetaData.equals(that.mMetaData); + } + + @Override + public int hashCode() { + return mMetaData.hashCode(); + } + + /** + * Generate a unique key for a VCFHeaderLine. If the header line is a VCFStructuredHeaderLine, the key + * is the concatenation of the VCFHeaderLine's key (i.e., the type of the VCFHeaderLine) and the ID for + * that VCFHeaderLine (with a ":" separator). Otherwise, we use the concatenation of the OTHER_KEY, the + * VCFHeaderLine's key, and a nonce value to ensure that unstructured lines never collide with structured + * lines, and also can have duplicate identical instances. 
+ * + * @param headerLine the {@link VCFHeaderLine} for which a key should be returned + * @return the generated HeaderLineMapKey + */ + private HeaderLineMapKey makeKeyForLine(final VCFHeaderLine headerLine) { + if (headerLine.isIDHeaderLine()) { + // these are required to have a unique ID, so use the line key as the key, and the id as the constraint + return makeKey(headerLine.getKey(), headerLine.getID()); + } else { + // Allow duplicate unstructured "other" keys, as long as they have different values. Use + // the line key as the key, and the line hashcode as the constraint. + // + // The previous implementation dropped duplicate keys for unstructured lines, but the spec doesn't + // require these to be unique (only to have unique values). This implementation is more permissive in + // that it allows lines with duplicate keys to accumulate as long as they have different values, but + // retains only one with a unique value. + return makeKey(headerLine.getKey(), Integer.toString(headerLine.hashCode())); + } + } + + // Create a VCFHeaderLine hashmap key given a key and an id + private HeaderLineMapKey makeKey(final String nameSpace, final String id) { return new HeaderLineMapKey(nameSpace, id); } + + private void createNewMapEntry(final HeaderLineMapKey newMapKey, final VCFHeaderLine newMetaDataLine) { + // for creation of a new entry, call updateMapEntry, but validate that it ALWAYS returns the + final VCFHeaderLine existingLine = updateMapEntry(newMapKey, newMetaDataLine); + if (existingLine != null ) { + throw new TribbleException(String.format( + "Internal header synchronization error - found unexpected previous value %s while adding %s", + existingLine, + newMetaDataLine)); + } + } + + private VCFHeaderLine updateMapEntry(final HeaderLineMapKey newMapKey, final VCFHeaderLine newMetaDataLine) { + final VCFHeaderLine existingLine = mMetaData.put(newMapKey, newMetaDataLine); + if (newMetaDataLine.isIDHeaderLine() && 
newMetaDataLine.getKey().equals(VCFHeader.CONTIG_KEY)) { + addToContigIndexMap((VCFContigHeaderLine) newMetaDataLine); + } + return existingLine; + } + + // We can't just blindly replace a line in the map based on the key using map.put, because the contig + // map will get out of sync if the line being replaced is a contig line that has a different contig + // index than the line being replaced. So replace the line in two atomic operations; first remove + // the old line and it's corresponding contig index entry, then add the new contig line and it's + // corresponding contig index entry. + private VCFHeaderLine replaceExistingMapEntry( + final HeaderLineMapKey newMapKey, + final VCFHeaderLine existingMetaDataLine, + final VCFHeaderLine newMetaDataLine) { + removeFromMapOrThrow(existingMetaDataLine); + logger.debug(() -> + "Replacing existing header metadata line: " + + existingMetaDataLine.toStringEncoding() + + " with header metadata line: " + + newMetaDataLine.toStringEncoding() + + "."); + createNewMapEntry(newMapKey, newMetaDataLine); + return existingMetaDataLine; + } + + // remove a line that is expected to be currently in the list, and throw if the line + // isn't found, or if the removed line is different (not equal to) the line to remove + private void removeFromMapOrThrow(final VCFHeaderLine lineToRemove) { + final VCFHeaderLine removedLine = removeMetaDataLine(lineToRemove); + if (removedLine == null || !removedLine.equals(lineToRemove)) { + // sanity check since in this case there should ALWAYS be a non-null line that was removed + // that is an exact duplicate of the "existingLine" + throw new TribbleException(String.format("Internal header synchronization error %s/%s", + lineToRemove, + removedLine == null ? 
"null line" : removedLine)); + } + } + + //add the new line to our contig index map + private void addToContigIndexMap(final VCFContigHeaderLine newContigLine) { + final VCFContigHeaderLine collidingContigLine = contigIndexMap.get(newContigLine.getContigIndex()); + if (collidingContigLine != null && !collidingContigLine.equals(newContigLine)) { + if (collidingContigLine.getID().equals(newContigLine.getID())) { + // the new line has the same contig ID and index as an existing line, but differ in + // some other attribute, so accept it but log a warning + logger.warn(String.format( + "Replacing an existing contig header line (%s) with a new, similar line that has different attributes (%s)", + collidingContigLine, + newContigLine)); + } else { + // the new contig line collides with an existing contig index, but specifies a different + // contig name, so reject it + throw new TribbleException(String.format( + "Attempt to replace a contig header line (%s) that has the same contig index as an existing line (%s)", + newContigLine, + collidingContigLine)); + } + } + contigIndexMap.put(newContigLine.getContigIndex(), newContigLine); + } + + // remove the contig header line from the contig index map + private void removeFromContigIndexMap(final VCFContigHeaderLine existingContigLine) { + // this remove overload only removes the specified object if its actually in the map + contigIndexMap.remove(existingContigLine.getContigIndex(), existingContigLine); + } + + // First, check for existing header lines that establish a header version. Whenever a new one is + // added, we need to remove the previous version line, validate all remaining lines against the new + // version, then add the new version line, and update our version state. We have to explicitly + // call isFormatString, and manually update the lines, since there is more than one header line key + // that can change the version. 
In some cases this will result in removing a line fileformat/version + // line with one key and replacing it with a line that has a different key. + private final VCFHeaderLine updateVersion(final VCFHeaderLine newMetaDataLine) { + ValidationUtils.validateArg( + VCFHeaderVersion.isFormatString(newMetaDataLine.getKey()), + "a file format line is required"); + + final VCFHeaderVersion newVCFVersion = VCFHeaderVersion.toHeaderVersion(newMetaDataLine.getValue()); + + if (vcfVersion == null) { + logger.debug("Establishing header metadata version ", newVCFVersion); + } else if (!newVCFVersion.equals(vcfVersion)) { + logger.debug(() -> + "Updating header metadata version from " + + vcfVersion + + " to " + + newVCFVersion); + } + + final VCFHeaderLine oldVersionLine = getFileFormatLine(); + vcfVersion = newVCFVersion; + return oldVersionLine; + } + + // make a new metadata line set to hand out to callers that includes + private Set makeMetaDataLineSet(final Collection orderedLines) { + if (vcfVersion != null) { + final Set orderedSet = new LinkedHashSet<>(orderedLines.size() + 1); + orderedSet.add(VCFHeader.makeHeaderVersionLine(vcfVersion)); + orderedSet.addAll(orderedLines); + return Collections.unmodifiableSet(orderedSet); + } else { + return Collections.unmodifiableSet(new LinkedHashSet<>(orderedLines)); + } + } + + // composite keys used by the metadata lines map + private static class HeaderLineMapKey implements Serializable { + public static final long serialVersionUID = 1L; + + final String key; + final String constraint; + + public HeaderLineMapKey(final String key, final String constraint) { + this.key = key; + this.constraint = constraint; + } + + public final String getKey() { return key; } + public final String getConstraint() { return constraint; } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final HeaderLineMapKey that = (HeaderLineMapKey) o; + + if 
(!key.equals(that.key)) return false; + return constraint.equals(that.constraint); + } + + @Override + public int hashCode() { + int result = key.hashCode(); + result = 31 * result + constraint.hashCode(); + return result; + } + } + +} + diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java index 991faa806f..d8cd83b8bb 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java @@ -1,13 +1,41 @@ package htsjdk.variant.vcf; +import java.util.Map; +import java.util.Optional; + /** - * A class representing META fields in the VCF header + * A class representing META fields in the VCF header. */ public class VCFMetaHeaderLine extends VCFSimpleHeaderLine { private static final long serialVersionUID = 1L; public VCFMetaHeaderLine(final String line, final VCFHeaderVersion version) { - super(VCFConstants.META_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, null)); + // We need to use the V4 parser directly, since the V3 parser requires ALL permissible/expected + // tags to be supplied, which is inconsistent with modern structured header lines that allow + // other tags. So let validateForVersion detect any version incompatibility, ie., if this is ever + // called with a V3 version. 
+ super(VCFConstants.META_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTagOrder)); + validateForVersion(version); + } + + public VCFMetaHeaderLine(final Map mapping) { + super(VCFConstants.META_HEADER_KEY, mapping); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + return Optional.of( + new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("%s header lines are not allowed in VCF version %s headers", + getKey(), + vcfTargetVersion + ))); + } + + return super.getValidationFailure(vcfTargetVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFPassThruTextTransformer.java b/src/main/java/htsjdk/variant/vcf/VCFPassThruTextTransformer.java index 24abed8eb0..55c172391c 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFPassThruTextTransformer.java +++ b/src/main/java/htsjdk/variant/vcf/VCFPassThruTextTransformer.java @@ -27,4 +27,14 @@ public String decodeText(final String rawPart) { public List decodeText(final List rawParts) { return rawParts; } + + /** + * No-op encoder for a single string + * @param rawPart the raw string to be decoded + * @return the raw string with no transformation done + */ + @Override + public String encodeText(final String rawPart) { + return rawPart; + } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java index 33f163e8dc..f5bd71c474 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java @@ -1,13 +1,51 @@ package htsjdk.variant.vcf; +import java.util.Map; +import java.util.Optional; + /** - * A class representing PEDIGREE fields in the VCF header + * A class representing PEDIGREE fields in the VCF header. Applicable starting with version VCFv4.3. 
+ * + * ##PEDIGREE= + * ##PEDIGREE= + * ##PEDIGREE= + * ##PEDIGREE= */ public class VCFPedigreeHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; public VCFPedigreeHeaderLine(String line, VCFHeaderVersion version) { - super(VCFConstants.PEDIGREE_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, null)); + // We need to use the V4 parser directly, since the V3 parser requires ALL permissible/expected + // tags to be supplied, which is inconsistent with modern structured header lines that allow + // other tags. So let validateForVersion detect any version incompatibility, ie., if this is ever + // called with a V3 version. + super(VCFConstants.PEDIGREE_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTagOrder)); + validateForVersion(version); + } + + public VCFPedigreeHeaderLine(final Map mapping) { + super(VCFConstants.PEDIGREE_HEADER_KEY, mapping); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + // previous to VCFv4.3, the PEDIGREE line did not have an ID. Such lines are not modeled by this + // class (since it is derived from VCFSimpleHeaderLine). Therefore instances of this class always + // represent VCFv4.3 or higher. So throw if the requested version is less than 4.3. 
+ final String message = String.format("%s header lines are not allowed in VCF version %s headers", + getKey(), + vcfTargetVersion + ); + if (VCFUtils.isStrictVCFVersionValidation()) { + return Optional.of(new VCFValidationFailure<>(vcfTargetVersion, this, message)); + } else { + logger.warn(message); + } + } + + return super.getValidationFailure(vcfTargetVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFPercentEncodedTextTransformer.java b/src/main/java/htsjdk/variant/vcf/VCFPercentEncodedTextTransformer.java index 4c8015eaa5..b98b36e3f3 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFPercentEncodedTextTransformer.java +++ b/src/main/java/htsjdk/variant/vcf/VCFPercentEncodedTextTransformer.java @@ -1,8 +1,8 @@ package htsjdk.variant.vcf; -import htsjdk.tribble.TribbleException; - +import java.util.Arrays; import java.util.List; +import java.util.function.IntPredicate; import java.util.stream.Collectors; /** @@ -10,65 +10,204 @@ * of characters that have special meaning in VCF. 
*/ public class VCFPercentEncodedTextTransformer implements VCFTextTransformer { - final static private String ENCODING_SENTINEL_STRING = "%"; - final static private char ENCODING_SENTNEL_CHAR = '%'; - final static private int ENCODING_BASE_RADIX = 16; + private static final char ENCODING_SENTINEL_CHAR = '%'; + + private static final byte invalidHexEncoding = ~0; + private static final byte maxPossibleHexDigit = 'f' + 1; + private static final byte[] hexToBytes = new byte[maxPossibleHexDigit]; + private static final char[] bytesToHex = { + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', + }; + + static { + Arrays.fill(hexToBytes, invalidHexEncoding); + for (byte i = '0'; i <= '9'; i++) hexToBytes[i] = (byte) (i - '0'); + for (byte i = 'A'; i <= 'F'; i++) hexToBytes[i] = (byte) (10 + i - 'A'); + for (byte i = 'a'; i <= 'f'; i++) hexToBytes[i] = (byte) (10 + i - 'a'); + } /** - * Transform a single string, replacing % encoded values with their corresponding text. + * Transform a single string, replacing percent encoded values with their corresponding text. * * @param rawPart the raw string to be decoded * @return the decoded string - * @throws TribbleException if the the encoding is uninterpretable */ @Override public String decodeText(final String rawPart) { - return decodePercentEncodedChars(rawPart); + return percentDecode(rawPart); } /** - * Transform a list of strings, replacing % encoded values with their corresponding text in each string. + * Transform a list of strings, replacing percent encoded values with their corresponding text in each string. 
* - * @param rawParts a list of raw strings + * @param rawParts a list of raw strings * @return a list of decoded strings - * @throws TribbleException if the the encoding is uninterpretable */ @Override public List decodeText(final List rawParts) { - return rawParts.stream().map(this::decodeText).collect(Collectors.toList()); + return rawParts.stream().map(VCFPercentEncodedTextTransformer::percentDecode).collect(Collectors.toList()); } /** - * Transform input strings containing embedded percent=encoded characters. For example, when given the + * Transform input strings containing embedded percent encoded characters. For example, when given the * string '%3D%41' will return the string '=A'. + *

    + * This method is permissive in the input it accepts. Capitalized and lower case percent encoding are both + * accepted, although the VCF spec only allows capitalized encoding. Uninterpretable escape sequences + * (the % character followed by fewer than 2 characters before the end of the string, or the % sentinel + * followed by 2 characters either of which does not match the regular expression [0-9A-Fa-f]) are passed through + * uninterpreted. + *

    + * If the input text does not contain any valid percent encoded sequences, a new string is not allocated, + * and the original string is returned. * - * @param rawText a string containing zero or more embedded encodings + * @param rawString a string containing zero or more embedded encodings * @return a string with all encoded characters replaced with the corresponding character - * @throws TribbleException if the the encoding is uninterpretable */ - protected static String decodePercentEncodedChars(final String rawText) { - if (rawText.contains(ENCODING_SENTINEL_STRING)) { - StringBuilder builder = new StringBuilder(rawText.length()); - for (int i = 0; i < rawText.length(); i++) { - final char c = rawText.charAt(i); - if (c == ENCODING_SENTNEL_CHAR && ((i + 2) < rawText.length())) { - try { - final char[] trans = Character.toChars(Integer.parseInt(rawText.substring(i + 1, i + 3), ENCODING_BASE_RADIX)); - if (trans.length != 1) { - throw new TribbleException(String.format("escape sequence '%c' corresponds to an invalid encoding in '%s'", c, rawText)); - } - builder.append(trans[0]); - i += 2; - } catch (IllegalArgumentException e) { - builder.append(c); + public static String percentDecode(final String rawString) { + int matches = 0; + final int length = rawString.length(); + // A valid percent encoding requires at least 3 characters (the % character and 2 hex digits) + // so we do not scan for % characters in the last 2 characters of the string + // The spec does not specify how "truncated" encodings (% followed by fewer than 2 hex digits + // before the string ends) should be interpreted, but we treat them as literal characters + // and append them uninterpreted + for (int i = 0, l = length - 2; i < l; i++) { + if (rawString.charAt(i) == ENCODING_SENTINEL_CHAR) matches++; + } + + if (matches == 0) { + return rawString; + } else { + final StringBuilder s = new StringBuilder(length - 2 * matches); + int lastMatchEnd = 0; + int matched = 0; + for (int i = 0; ; 
i++) { + if (rawString.charAt(i) == ENCODING_SENTINEL_CHAR) { + final int hiDecoded = hexDigitToInt(rawString.charAt(++i)); + final int loDecoded = hexDigitToInt(rawString.charAt(++i)); + // Only decode and append the character if both characters after the % were interpretable + // as hex digits + if ((hiDecoded | loDecoded) != invalidHexEncoding) { + // Append on the portion of the original string that came before this matching character + s.append(rawString, lastMatchEnd, i - 2); + s.append((char) ((hiDecoded << 4) | (loDecoded & 0x0F))); + lastMatchEnd = i + 1; + } + matched++; + + // Found all sequences to decode in the string, so append the rest of the original string + if (matched == matches) { + s.append(rawString, lastMatchEnd, length); + return s.toString(); } - } else { - builder.append(c); } } - return builder.toString(); } - return rawText; } + private static int hexDigitToInt(final char c) { + return c < maxPossibleHexDigit ? hexToBytes[c] : invalidHexEncoding; + } + + /** + * Transform a single string, percent encoding values that have special meanings in VCF. + * + * @param rawPart the raw string to be encoded + * @return the encoded string + */ + @Override + public String encodeText(final String rawPart) { + return percentEncode(rawPart); + } + + /** + * Transform a single string, percent encoding values that have special meanings in VCF. + * + * @param rawPart the raw string to be encoded + * @return the encoded string + */ + public static String percentEncode(final String rawPart) { + return percentEncode(rawPart, VCFPercentEncodedTextTransformer::isVCFSpecialChar); + } + + /** + * Transform a single string, percent encoding values that have special meanings in VCF. + *

    + * This method is suitable for encoding a header value in a key=value pair that is of type String (e.g. Description) + * which have fewer restrictions than fields in the body of the VCF such as INFO and FORMAT. + * + * @param rawString String to encode + * @return the encoded string + */ + public static String percentEncodeHeaderText(final String rawString) { + return percentEncode(rawString, VCFPercentEncodedTextTransformer::isHeaderSpecialChar); + } + + private static String percentEncode(final String rawString, final IntPredicate charPredicate) { + int matches = 0; + final int length = rawString.length(); + for (int i = 0; i < length; i++) { + if (charPredicate.test(rawString.charAt(i))) matches++; + } + + if (matches == 0) { + return rawString; + } else { + final StringBuilder s = new StringBuilder(length + 2 * matches); + int lastMatchEnd = 0; + int matched = 0; + for (int i = 0; ; i++) { + final char c = rawString.charAt(i); + if (charPredicate.test(c)) { + // Append on the portion of the original string that came before this matching character + s.append(rawString, lastMatchEnd, i); + s.append(ENCODING_SENTINEL_CHAR); + s.append(bytesToHex[c >>> 4]); + s.append(bytesToHex[c & 0x0F]); + + lastMatchEnd = i + 1; + matched++; + + // Found all matching characters in the string, so append the rest of the original string + if (matched == matches) { + s.append(rawString, lastMatchEnd, length); + return s.toString(); + } + } + } + } + } + + // Characters that have special meaning in the value part of a structured header line key=value pair. + // Note that this is less restrictive than the full set of characters with special meaning in VCF. 
+ // Space and comma are allowed due to the double-quoting introduced in VCF 4.2, and '=' is allowed because + // key=value pairs are comma-delimited, so internal '=' is unambiguously part of the value as long as ',' is quoted + private static boolean isHeaderSpecialChar(final int c) { + switch (c) { + case '\n': + case '\t': + case '\r': + case '%': + return true; + default: + return false; + } + } + + private static boolean isVCFSpecialChar(final int c) { + switch (c) { + case '\n': + case '\t': + case '\r': + case '%': + case ',': + case ':': + case ';': + case '=': + return true; + default: + return false; + } + } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java b/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java index 8fe9b67d6d..cbefb13237 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java +++ b/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java @@ -27,8 +27,7 @@ public VCFRecordCodec(final VCFHeader header) { public VCFRecordCodec(final VCFHeader header, final boolean allowMissingFieldsInHeader) { this.vcfEncoder = new VCFEncoder(header, allowMissingFieldsInHeader, false); - // Explicitly set the version because it's not available in the header itself. 
- this.vcfDecoder.setVCFHeader(header, VCFHeaderVersion.VCF4_2); + this.vcfDecoder.setVCFHeader(header); } @Override diff --git a/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java index 973a976baa..7c45e9a1b2 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java @@ -1,13 +1,42 @@ package htsjdk.variant.vcf; +import java.util.Map; +import java.util.Optional; + /** - * A class representing SAMPLE fields in the VCF header */ public class VCFSampleHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; public VCFSampleHeaderLine(String line, VCFHeaderVersion version) { - super(VCFConstants.SAMPLE_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, null)); + // We need to use the V4 parser directly, since the V3 parser requires ALL permissible/expected + // tags to be supplied, which is inconsistent with modern structured header lines that allow + // other tags. So let validateForVersion detect any version incompatibility, ie., if this is ever + // called with a V3 version. 
+ super(VCFConstants.SAMPLE_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTagOrder)); + validateForVersion(version); + } + + public VCFSampleHeaderLine(final Map mapping) { + super(VCFConstants.SAMPLE_HEADER_KEY, mapping); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0)) { + final String message = String.format("%s header lines are not allowed in VCF version %s headers", + getKey(), + vcfTargetVersion + ); + if (VCFUtils.isStrictVCFVersionValidation()) { + return Optional.of(new VCFValidationFailure<>(vcfTargetVersion, this, message)); + } else { + logger.warn(message); + } + } + + return super.getValidationFailure(vcfTargetVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java index 12b45e5bc9..a5271114d4 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java @@ -1,5 +1,5 @@ /* -* Copyright (c) 2012 The Broad Institute +* Copyright (c) 2017 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -25,98 +25,120 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; + +import java.util.ArrayList; import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; - +import java.util.Optional; /** - * @author ebanks - * - * A class representing a key=value entry for simple VCF header types + * An abstract class representing a VCF metadata line with a key and attribute=value pairs, one of + * which represents an ID. The key determines the "type" of the structured header line (i.e., contig, FILTER, + * INFO, ALT, PEDIGREE, META). 
+ * + * The attribute/value pairs are ordered. The first entry in the map must be an ID attribute (used by the + * VCFHeader to ensure that no two structured header lines that share the same key in a given header have the + * same ID). */ public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine { - - private String name; - private Map genericFields = new LinkedHashMap(); + private static final long serialVersionUID = 1L; + protected static final Log logger = Log.getInstance(VCFSimpleHeaderLine.class); public static final String ID_ATTRIBUTE = "ID"; public static final String DESCRIPTION_ATTRIBUTE = "Description"; + public static final String SOURCE_ATTRIBUTE = "Source"; + public static final String VERSION_ATTRIBUTE = "Version"; + + // List of expected tags (for this base class, its ID only; subclasses with more required tags + // should use a custom tag order if more required tags are expected + protected static final List expectedTagOrder = Collections.unmodifiableList( + new ArrayList(1) {{ add(ID_ATTRIBUTE); }}); + + // Map used to retain the attribute/value pairs, in original order. The first entry in the map must be + // an ID field. The entire map must be immutable to prevent hash values from changing, since these are + // often stored in Sets. Its not ACTUALLY immutable in orderto allow for special cases where subclasses + // have to be able to "repair" header lines (via a call to updateGenericField) during constructor validation. + // + // Otherwise the values here should never change during the lifetime of the header line. + private final Map genericFields = new LinkedHashMap(); /** - * create a VCF filter header line - * - * @param key the key for this header line - * @param name the name for this header line - * @param description description for this header line + * Constructor that accepts a key and string that represetns the rest of the line (after the ##KEY="). 
+ * @param key the key to use for this line + * @param line the value part of the line + * @param version the target version to validate the line against */ - public VCFSimpleHeaderLine(String key, String name, String description) { - super(key, ""); - Map map = new LinkedHashMap(1); - map.put(DESCRIPTION_ATTRIBUTE, description); - initialize(name, map); + public VCFSimpleHeaderLine(final String key, final String line, final VCFHeaderVersion version) { + this(key, VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrder)); + validate(); + validateForVersion(version); } /** - * create a VCF info header line - * - * @see #VCFSimpleHeaderLine(String, VCFHeaderVersion, String, List, List) VCFv4.2+ recommended tags support + * Key cannot be null or empty. * - * @param line the header line - * @param version the vcf header version - * @param key the key for this header line - * @param expectedTagOrdering the tag ordering expected for this header line + * @param key key to use for this header line. can not be null. + * @param id id name to use for this line + * @param description string that will be added as a "Description" tag to this line */ - public VCFSimpleHeaderLine(final String line, final VCFHeaderVersion version, final String key, final List expectedTagOrdering) { - this(line, version, key, expectedTagOrdering, Collections.emptyList()); + public VCFSimpleHeaderLine(final String key, final String id, final String description) { + super(key, ""); + genericFields.put(ID_ATTRIBUTE, id); + genericFields.put(DESCRIPTION_ATTRIBUTE, description); + validate(); } /** - * create a VCF info header line + * Key cannot be null or empty. + * + * Note that for attributes where the order is significant, use a LinkedHashMap + * to ensure that attribute order is honored. 
* - * @param line the header line - * @param version the vcf header version - * @param key the key for this header line - * @param expectedTagOrdering the tag ordering expected for this header line - * @param recommendedTags tags that are optional for this header line + * @param key key to use for this header line. can not be null. + * @param attributeMapping field mappings to use. may not be null. must contain an "ID" field to use as + * a unique id for this line */ - public VCFSimpleHeaderLine(final String line, final VCFHeaderVersion version, final String key, final List expectedTagOrdering, final List recommendedTags) { - this(key, VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering, recommendedTags)); - } - - public VCFSimpleHeaderLine(final String key, final Map mapping) { + public VCFSimpleHeaderLine(final String key, final Map attributeMapping) { super(key, ""); - name = mapping.get(ID_ATTRIBUTE); - initialize(name, mapping); + ValidationUtils.nonNull(attributeMapping, "An attribute map is required for structured header lines"); + genericFields.putAll(attributeMapping); + validate(); } - /** - * Returns the String value associated with the given key. Returns null if there is no value. Key - * must not be null. 
- */ - String getGenericFieldValue(final String key) { - return this.genericFields.get(key); - } + /** + * @return true if this is a structured header line (has a unique ID and multiple key/value pairs), + * otherwise false + */ + @Override + public boolean isIDHeaderLine() { return true; } - protected void initialize(String name, Map genericFields) { - if ( name == null || genericFields == null || genericFields.isEmpty() ) - throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s", super.getKey(), name)); - if ( name.contains("<") || name.contains(">") ) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain angle brackets"); - if ( name.contains("=") ) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain an equals sign"); + /** + * Return the unique ID for this line. Returns null iff isIDHeaderLine is false. + * @return + */ + @Override + public String getID() { + return getGenericFieldValue(ID_ATTRIBUTE); + } - this.name = name; - this.genericFields.putAll(genericFields); + /** + * Returns the String value associated with the given key. Returns null if there is no value. Key + * must not be null. + */ + public String getGenericFieldValue(final String key) { + return this.genericFields.get(key); } - @Override - protected String toStringEncoding() { - Map map = new LinkedHashMap(); - map.put(ID_ATTRIBUTE, name); - map.putAll(genericFields); - return getKey() + "=" + VCFHeaderLine.toStringEncoding(map); + /** + * Returns a list of all attributes for this header line. 
+ */ + public Map getGenericFields() { + return Collections.unmodifiableMap(this.genericFields); } @Override @@ -129,28 +151,112 @@ public boolean equals( final Object o ) { } final VCFSimpleHeaderLine that = (VCFSimpleHeaderLine) o; - return name.equals(that.name) && - genericFields.equals(that.genericFields); + return genericFields.equals(that.genericFields); } @Override public int hashCode() { int result = super.hashCode(); - result = 31 * result + name.hashCode(); result = 31 * result + genericFields.hashCode(); return result; } + /** + * create a string of a mapping pair for the target VCF version + * @return a string, correctly formatted + */ @Override - public String getID() { - return name; + protected String toStringEncoding() { + //NOTE: this preserves/round-trips "extra" attributes such as SOURCE, VERSION, etc. + final StringBuilder s = new StringBuilder(); + s.append(getKey()); + s.append('='); + s.append('<'); + boolean notFirst = false; + for (final Map.Entry e : genericFields.entrySet()) { + if (notFirst) { + s.append(','); + } else { + notFirst = true; + } + + final String k = e.getKey(); + final String v = e.getValue(); + s.append(k); + s.append('='); + s.append(encodeAttributeValueForSerialization(k, v)); + } + s.append('>'); + + return s.toString(); } + // Called by VCFInfoHeaderLine to allow repairing of VCFInfoLines that have a Flag type and a non-zero count + // (the combination of which is forbidden by the spec, but which we tolerate for backward compatibility with + // previous versions of htsjdk, which silently repaired these). + // + // Replaces the original generic fields map with another immutable map with the updated value. + protected void updateGenericField(final String attributeName, final String value) { + genericFields.put(attributeName, value); + } /** - * @return a map of all pairs of fields and values in this header line + * Return true if the attribute name requires quotes. 
+ * @param attributeName name of the attribute being serialized + * @return boolean indicating whether the value should be embedded in quotes during serialization */ - public Map getGenericFields() { - return Collections.unmodifiableMap(genericFields); + protected boolean getIsQuotableAttribute(final String attributeName) { + // the (VCF4.3) spec says that the DESCRIPTION, SOURCE, and VERSION attributes should be quoted + // for INFO/FORMAT lines, but htsjdk seems to have historically quoted these for all structured + // header lines + return attributeName.equals(DESCRIPTION_ATTRIBUTE) || + attributeName.equals(SOURCE_ATTRIBUTE) || + attributeName.equals(VERSION_ATTRIBUTE); + } + + /** + * Return true if the attribute name allows percent encoding. + * @param attributeName name of the attribute being serialized + * @return boolean indicating whether the value may be percent encoded serialization + */ + protected boolean isPercentEncodableAttribute(final String attributeName) { + // As of VCF4.3 attribute values containing characters that have special meanings can be percent encoded. + // ID, NUMBER and TYPE values do not permit values that would require percent encoding, so they are excluded, + // but all other attributes may potentially be percent encoded. 
+ return !(attributeName.equals(VCFSimpleHeaderLine.ID_ATTRIBUTE) || + attributeName.equals(VCFCompoundHeaderLine.NUMBER_ATTRIBUTE) || + attributeName.equals(VCFCompoundHeaderLine.TYPE_ATTRIBUTE)); + } + + private void validate() { + if ( genericFields.isEmpty() || !genericFields.keySet().stream().findFirst().get().equals(ID_ATTRIBUTE)) { + throw new TribbleException( + String.format("The required ID tag is missing or not the first attribute: key=%s", super.getKey())); + } + } + + // Perform all text transformations required to encode an attribute value + private String encodeAttributeValueForSerialization(final String attribute, final String originalValue) { + final String quotedAttributeValue = quoteAttributeValueForSerialization(attribute, originalValue); + return isPercentEncodableAttribute(attribute) + ? VCFPercentEncodedTextTransformer.percentEncodeHeaderText(quotedAttributeValue) + : quotedAttributeValue; + } + + // Add quotes around any attribute value that contains a space or comma, or is supposed to be quoted by + // definition per the spec (i.e., Description, Source, Version for INFO lines). + private String quoteAttributeValueForSerialization(final String attribute, final String originalValue) { + return originalValue.contains(",") || originalValue.contains(" ") || getIsQuotableAttribute(attribute) ? 
+ "\""+ escapeQuotes(originalValue) + "\"" : + originalValue; } - } + + private static String escapeQuotes(final String value) { + // java escaping in a string literal makes this harder to read than it should be + // without string literal escaping and quoting the regex would be: replaceAll( ([^\])" , $1\" ) + // ie replace: something that's not a backslash ([^\]) followed by a double quote + // with: the thing that wasn't a backslash ($1), followed by a backslash, followed by a double quote + return value.replaceAll("([^\\\\])\"", "$1\\\\\""); + } + +} \ No newline at end of file diff --git a/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java b/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java index 6e9e713a20..1032762f0d 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java +++ b/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java @@ -51,15 +51,21 @@ public class VCFStandardHeaderLines { /** * Enabling this causes us to repair header lines even if only their descriptions differ. */ - private final static boolean REPAIR_BAD_DESCRIPTIONS = false; - private static Standards formatStandards = new Standards(); - private static Standards infoStandards = new Standards(); + private static Standards formatStandards = new Standards<>(); + private static Standards infoStandards = new Standards<>(); /** * Walks over the VCF header and repairs the standard VCF header lines in it, returning a freshly * allocated {@link VCFHeader} with standard VCF header lines repaired as necessary. 
*/ public static VCFHeader repairStandardHeaderLines(final VCFHeader oldHeader) { + if (oldHeader.getVCFHeaderVersion().isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + // the "repair" operation effectively upgrades old header lines to v4.2 format, + // but we don't "back-version" headers that are already newer than v4.2, so skip + // repair for newer headers + return oldHeader; + } + final Set newLines = new LinkedHashSet(oldHeader.getMetaDataInInputOrder().size()); for ( VCFHeaderLine line : oldHeader.getMetaDataInInputOrder() ) { if ( line instanceof VCFFormatHeaderLine ) { @@ -67,17 +73,17 @@ public static VCFHeader repairStandardHeaderLines(final VCFHeader oldHeader) { } else if ( line instanceof VCFInfoHeaderLine) { line = infoStandards.repair((VCFInfoHeaderLine) line); } - newLines.add(line); } + //NOTE that its possible for this to fail in the (probably rare) case that the repaired + //lines (which are "version-less") fail validation against the header version final VCFHeader repairedHeader = new VCFHeader(newLines, oldHeader.getGenotypeSamples()); - final VCFHeaderVersion oldHeaderVersion = oldHeader.getVCFHeaderVersion(); - if (oldHeaderVersion != null && oldHeaderVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - // this needs to maintain version 4.3 (and not back-version to v4.2), so propagate - // the old version only for v4.3 - repairedHeader.setVCFHeaderVersion(oldHeaderVersion); - } + + // the "repair" operation effectively upgrades old header lines to v4.2 format, so the new header should + // reflect that since it may no longer conform to it's original version + // new header reflects that + repairedHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); return repairedHeader; } @@ -159,26 +165,30 @@ private static void registerStandard(final VCFFormatHeaderLine line) { // static { // FORMAT lines - registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); - 
registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality")); - registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); + // This line's count changed from UNBOUNDED in VCF 4.2 to 1 in VCF 4.3, but we keep it at UNBOUNDED + // because VCFStandardHeaderLines is now mainly a facility for upgrading headers from pre-4.2 versions + // to conform to the 4.2 spec. 
+ // Version upgrading for other versions is more difficult, so we do not rely on VCFStandardHeaderLines registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter")); registerStandard(new VCFFormatHeaderLine(VCFConstants.PHASE_SET_KEY, 1, VCFHeaderLineType.Integer, "Phasing set (typically the position of the first variant in the set)")); registerStandard(new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); // INFO lines - registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as 
listed")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); } private static class Standards { @@ -191,7 +201,7 @@ public T repair(final T line) { final boolean badCount = line.isFixedCount() && ! badCountType && line.getCount() != standard.getCount(); final boolean badType = line.getType() != standard.getType(); final boolean badDesc = ! line.getDescription().equals(standard.getDescription()); - final boolean needsRepair = badCountType || badCount || badType || (REPAIR_BAD_DESCRIPTIONS && badDesc); + final boolean needsRepair = badCountType || badCount || badType; if ( needsRepair ) { if ( GeneralUtils.DEBUG_MODE_ENABLED ) { @@ -201,7 +211,17 @@ public T repair(final T line) { + (badCount ? " -- counts disagree; header has " + line.getCount() + " but standard is " + standard.getCount() : "") + (badDesc ? 
" -- descriptions disagree; header has '" + line.getDescription() + "' but standard is '" + standard.getDescription() + "'": "")); } - return standard; + // Create a new set so we can modify it without mutating the standard line + final Set additionalFields = new HashSet<>(line.getGenericFields().keySet()); + additionalFields.removeAll(standard.getGenericFields().keySet()); + + if (additionalFields.isEmpty()) { + return standard; + } else { + // We need to handle the case where a line has nonstandard attributes, but also additional + // attributes of its own that would be lost if we simply returned the standard line + return mergeStandardLine(standard, line, additionalFields); + } } else { return line; } @@ -210,6 +230,26 @@ public T repair(final T line) { } } + private T mergeStandardLine(final T standard, final T line, final Set additionalFields) { + // Create a new line identical to the standard line + final VCFCompoundHeaderLine mergedLine; + if (standard instanceof VCFFormatHeaderLine) { + mergedLine = standard.isFixedCount() + ? new VCFFormatHeaderLine(standard.getID(), standard.getCount(), standard.getType(), standard.getDescription()) + : new VCFFormatHeaderLine(standard.getID(), standard.getCountType(), standard.getType(), standard.getDescription()); + } else { + mergedLine = standard.isFixedCount() + ? 
new VCFInfoHeaderLine(standard.getID(), standard.getCount(), standard.getType(), standard.getDescription()) + : new VCFInfoHeaderLine(standard.getID(), standard.getCountType(), standard.getType(), standard.getDescription()); + } + + final Map originalGenericFields = line.getGenericFields(); + for (final String field : additionalFields) { + mergedLine.updateGenericField(field, originalGenericFields.get(field)); + } + return (T) mergedLine; + } + public Set addToHeader(final Set headerLines, final Collection IDs, final boolean throwErrorForMissing) { final Set missing = new HashSet(); for ( final String ID : IDs ) { diff --git a/src/main/java/htsjdk/variant/vcf/VCFTextTransformer.java b/src/main/java/htsjdk/variant/vcf/VCFTextTransformer.java index 36f842b20a..f928507b01 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFTextTransformer.java +++ b/src/main/java/htsjdk/variant/vcf/VCFTextTransformer.java @@ -28,4 +28,12 @@ public interface VCFTextTransformer { */ List decodeText(final List rawParts); + /** + * Encode a single string. 
+ * + * @param rawPart the raw string to be encoded + * @return the encoded string + * @throws TribbleException if the the encoding is unencodable + */ + String encodeText(final String rawPart); } diff --git a/src/main/java/htsjdk/variant/vcf/VCFUtils.java b/src/main/java/htsjdk/variant/vcf/VCFUtils.java index 6d0e2d7b68..3599da7edc 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFUtils.java +++ b/src/main/java/htsjdk/variant/vcf/VCFUtils.java @@ -25,110 +25,59 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.Defaults; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.FileExtensions; -import htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import java.io.File; +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; import java.io.IOException; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Collectors; public class VCFUtils { private static final Pattern INF_OR_NAN_PATTERN = Pattern.compile("^(?[-+]?)((?(INF|INFINITY))|(?NAN))$", Pattern.CASE_INSENSITIVE); + private static final boolean DEFAULT_VCF_STRICT_VERSION_VALIDATION = true; - public static Set smartMergeHeaders(final Collection headers, final boolean emitWarnings) throws IllegalStateException { - // We need to maintain the order of the VCFHeaderLines, otherwise they will be scrambled in the returned Set. - // This will cause problems for VCFHeader.getSequenceDictionary and anything else that implicitly relies on the line ordering. 
- final LinkedHashMap map = new LinkedHashMap<>(); // from KEY.NAME -> line - final HeaderConflictWarner conflictWarner = new HeaderConflictWarner(emitWarnings); - final Set headerVersions = new HashSet<>(2); + // a global mutable static - is there an alternative ? + // there isn't any other reasonable place to keep this state + private static boolean vcfStrictVersionValidation = true; - // todo -- needs to remove all version headers from sources and add its own VCF version line - for (final VCFHeader source : headers) { - for (final VCFHeaderLine line : source.getMetaDataInSortedOrder()) { - - enforceHeaderVersionMergePolicy(headerVersions, source.getVCFHeaderVersion()); - String key = line.getKey(); - if (line instanceof VCFIDHeaderLine) - key = key + "-" + ((VCFIDHeaderLine) line).getID(); - - if (map.containsKey(key)) { - final VCFHeaderLine other = map.get(key); - if (line.equals(other)) { - // continue; - } else if (!line.getClass().equals(other.getClass())) { - throw new IllegalStateException("Incompatible header types: " + line + " " + other); - } else if (line instanceof VCFFilterHeaderLine) { - final String lineName = ((VCFFilterHeaderLine) line).getID(); - final String otherName = ((VCFFilterHeaderLine) other).getID(); - if (!lineName.equals(otherName)) - throw new IllegalStateException("Incompatible header types: " + line + " " + other); - } else if (line instanceof VCFCompoundHeaderLine) { - final VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine) line; - final VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine) other; - - // if the names are the same, but the values are different, we need to quit - if (!(compLine).equalsExcludingDescription(compOther)) { - if (compLine.getType().equals(compOther.getType())) { - // The Number entry is an Integer that describes the number of values that can be - // included with the INFO field. For example, if the INFO field contains a single - // number, then this value should be 1. 
However, if the INFO field describes a pair - // of numbers, then this value should be 2 and so on. If the number of possible - // values varies, is unknown, or is unbounded, then this value should be '.'. - conflictWarner.warn(line, "Promoting header field Number to . due to number differences in header lines: " + line + " " + other); - compOther.setNumberToUnbounded(); - } else if (compLine.getType() == VCFHeaderLineType.Integer && compOther.getType() == VCFHeaderLineType.Float) { - // promote key to Float - conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); - map.put(key, compOther); - } else if (compLine.getType() == VCFHeaderLineType.Float && compOther.getType() == VCFHeaderLineType.Integer) { - // promote key to Float - conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); - } else { - throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other); - } - } - if (!compLine.getDescription().equals(compOther.getDescription())) - conflictWarner.warn(line, "Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine); - } else { - // we are not equal, but we're not anything special either - conflictWarner.warn(line, "Ignoring header line already in map: this header line = " + line + " already present header = " + other); - } - } else { - map.put(key, line); - } - } - } - - // returning a LinkedHashSet so that ordering will be preserved. Ensures the contig lines do not get scrambled. - return new LinkedHashSet<>(map.values()); - } + /** + * Determine if strict VCF version validation is enabled. Defaults to true. Strict version validation + * ensures that all VCF contents (header and variant contexts) conforms to the established header version. + * This should only be disabled when absolutely necessary. 
+ * + * @return true if strict version validation is enabled + */ + public static boolean isStrictVCFVersionValidation() { return Defaults.STRICT_VCF_VERSION_VALIDATION; } - // Reject attempts to merge a VCFv4.3 header with any other version - private static void enforceHeaderVersionMergePolicy( - final Set headerVersions, - final VCFHeaderVersion candidateVersion) { - if (candidateVersion != null) { - headerVersions.add(candidateVersion); - if (headerVersions.size() > 1 && headerVersions.contains(VCFHeaderVersion.VCF4_3)) { - throw new IllegalArgumentException( - String.format("Attempt to merge version %s header with incompatible header version %s", - VCFHeaderVersion.VCF4_3.getVersionString(), - headerVersions.stream() - .filter(hv -> !hv.equals(VCFHeaderVersion.VCF4_3)) - .map(VCFHeaderVersion::getVersionString) - .collect(Collectors.joining(" ")))); - } - } + /** + * The headers passed in must be version >= 4.2 (older headers that are read in via AbstractVCFCodecs + * are "repaired" and stamped as VCF4.2 when they're read in). 
+ * + * @param headers the set of headers to merge + * @param emitWarnings true if warning should be emitted by the merge + * @return + * @throws {@link htsjdk.tribble.TribbleException} if any header has a version < vcfV4.2 + * @throws {@link htsjdk.tribble.TribbleException} if any header cannot be upgraded to the newest version amongst + * all headers provided + */ + public static Set smartMergeHeaders( + final Collection headers, + final boolean emitWarnings) { + return VCFHeaderMerger.getMergedHeaderLines(headers, emitWarnings); } /** @@ -149,8 +98,8 @@ public static Set withUpdatedContigsAsLines(final Set withUpdatedContigsAsLines(final Set oldLines, final File referenceFile, final SAMSequenceDictionary refDict, final boolean referenceNameOnly) { final Set lines = new LinkedHashSet<>(oldLines.size()); - for (final VCFHeaderLine line : oldLines) { - if (line instanceof VCFContigHeaderLine) + for ( final VCFHeaderLine line : oldLines ) { + if ( line.isIDHeaderLine() && line.getKey().equals(VCFConstants.CONTIG_HEADER_KEY) ) continue; // skip old contig lines if (line.getKey().equals(VCFHeader.REFERENCE_KEY)) continue; // skip the old reference key @@ -184,17 +133,14 @@ public static List makeContigHeaderLines(final SAMSequenceD final File referenceFile) { final List lines = new ArrayList<>(); final String assembly = referenceFile != null ? 
getReferenceAssembly(referenceFile.getName()) : null; - for (final SAMSequenceRecord contig : refDict.getSequences()) - lines.add(makeContigHeaderLine(contig, assembly)); + for ( final SAMSequenceRecord contig : refDict.getSequences() ) + lines.add(new VCFContigHeaderLine(contig, assembly)); return lines; } + @Deprecated private static VCFContigHeaderLine makeContigHeaderLine(final SAMSequenceRecord contig, final String assembly) { - final Map map = new LinkedHashMap<>(3); - map.put("ID", contig.getSequenceName()); - map.put("length", String.valueOf(contig.getSequenceLength())); - if (assembly != null) map.put("assembly", assembly); - return new VCFContigHeaderLine(map, contig.getSequenceIndex()); + return new VCFContigHeaderLine(contig, assembly); } /** @@ -295,22 +241,4 @@ else if (refPath.contains("hg38")) return assembly; } - /** - * Only displays a warning if warnings are enabled and an identical warning hasn't been already issued - */ - private static final class HeaderConflictWarner { - boolean emitWarnings; - Set alreadyIssued = new HashSet<>(); - - private HeaderConflictWarner(final boolean emitWarnings) { - this.emitWarnings = emitWarnings; - } - - public void warn(final VCFHeaderLine line, final String msg) { - if (GeneralUtils.DEBUG_MODE_ENABLED && emitWarnings && !alreadyIssued.contains(line.getKey())) { - alreadyIssued.add(line.getKey()); - System.err.println(msg); - } - } - } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFValidationFailure.java b/src/main/java/htsjdk/variant/vcf/VCFValidationFailure.java new file mode 100644 index 0000000000..c6f0ad8708 --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFValidationFailure.java @@ -0,0 +1,63 @@ +package htsjdk.variant.vcf; + +import htsjdk.utils.ValidationUtils; + +/** + * A class representing a VCF validation failure. 
+ * @param a type representing the object that is being validated + */ +class VCFValidationFailure { + private final VCFHeaderVersion targetVersion; + private final T source; + private final String sourceMessage; + + /** + * A VCF validation failure. + * + * @param targetVersion the version for which validation failed. + * @param source the source object being validated + * @param sourceMessage the validation failure reason + */ + public VCFValidationFailure(final VCFHeaderVersion targetVersion, final T source, final String sourceMessage) { + ValidationUtils.nonNull(targetVersion); + ValidationUtils.nonNull(source); + ValidationUtils.nonNull(sourceMessage); + + this.targetVersion = targetVersion; + this.source = source; + this.sourceMessage = sourceMessage; + } + + /** + * @return the source object being validated + */ + public T getSource() { + return source; + } + + /** + * @return The validation failure reason. + */ + public String getSourceMessage() { + return sourceMessage; + } + + /** + * @return A formatted message describing the validation failure reason and target version. + */ + public String getFailureMessage() { + return String.format( + "Failure validating %s for reason %s, target version %s", + source.toString(), + sourceMessage, + targetVersion); + } + + /** + * @return The version for which validation failed. May be null. 
+ */ + public VCFHeaderVersion getTargetVersion() { + return targetVersion; + } + +} diff --git a/src/main/java/htsjdk/variant/vcf/VCFVersionUpgrader.java b/src/main/java/htsjdk/variant/vcf/VCFVersionUpgrader.java new file mode 100644 index 0000000000..8db6a1883c --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFVersionUpgrader.java @@ -0,0 +1,30 @@ +package htsjdk.variant.vcf; + +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; + +import java.util.Collection; + +final class VCFVersionUpgrader { + public static void getOutputVersion(final VCFHeader header, final VCFVersionUpgradePolicy policy) { + // Guaranteed to not be null + final VCFHeaderVersion currentVersion = header.getVCFHeaderVersion(); + switch (policy) { + case ONLY_INFALLIBLE_UPGRADE: + // 4.3+ lines are output as the latest version, pre-4.3 lines are output as 4.2 + final VCFHeaderVersion newVersion = currentVersion.isAtLeastAsRecentAs(VCFHeader.DEFAULT_VCF_VERSION) + ? VCFHeader.DEFAULT_VCF_VERSION + : VCFHeaderVersion.VCF4_2; + header.addMetaDataLine(VCFHeader.makeHeaderVersionLine(newVersion)); + case UPGRADE_OR_FALLBACK: + final Collection> failures = header.getValidationErrors(VCFHeader.DEFAULT_VCF_VERSION); + if (failures.isEmpty()) { + header.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + } + break; + case UPGRADE_OR_FAIL: + // If validation fails, simply pass the exception through + header.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + break; + } + } +} diff --git a/src/test/java/htsjdk/beta/codecs/variants/vcf/HtsVCFCodecTest.java b/src/test/java/htsjdk/beta/codecs/variants/vcf/HtsVCFCodecTest.java index 2ea873212e..fa5201b754 100644 --- a/src/test/java/htsjdk/beta/codecs/variants/vcf/HtsVCFCodecTest.java +++ b/src/test/java/htsjdk/beta/codecs/variants/vcf/HtsVCFCodecTest.java @@ -105,15 +105,6 @@ public void testRoundTripVCFThroughStream(final IOPath inputPath, final HtsVersi } } - 
@Test(expectedExceptions = IllegalArgumentException.class) - public void testRejectWritingV43HeaderAsV42() { - // read vcf v4.3 and try to write it to a vcf v4.2 (header is rejected) - final IOPath outputPath = IOUtils.createTempPath("rejectWrite43HeaderVCF", ".vcf"); - readWriteVCFToPath(new HtsPath(VARIANTS_TEST_DIR + "variant/vcf43/all43Features.vcf"), - outputPath, - VCFCodecV4_3.VCF_V43_VERSION); - } - @DataProvider(name="gzipSuffixTests") private Object[][] gzipSuffixTests() { return new Object[][] { diff --git a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java new file mode 100644 index 0000000000..7167fa8f12 --- /dev/null +++ b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java @@ -0,0 +1,248 @@ +package htsjdk.samtools; + +import htsjdk.HtsjdkTest; +import htsjdk.samtools.util.Interval; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static htsjdk.samtools.SAMSequenceDictionaryUtils.*; +import static htsjdk.samtools.SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility.*; + +public final class SAMSequenceDictionaryUtilsTest extends HtsjdkTest { + + @DataProvider( name = "testSequenceRecordsAreEquivalentDataProvider" ) + public Object[][] testSequenceRecordsAreEquivalentDataProvider() { + final SAMSequenceRecord CHRM_HG19 = new SAMSequenceRecord("chrM", 16571); + final SAMSequenceRecord CHR_NONSTANDARD1 = new SAMSequenceRecord("NonStandard1", 8675309); + final SAMSequenceRecord CHR1_HG19_WITH_UNKNOWN_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH); + final SAMSequenceRecord CHR1_HG19_WITH_DIFFERENT_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), 123456); + return new Object[][]{ + {CHR1_HG19, CHR1_HG19, true}, + {CHR1_HG19, CHRM_HG19, 
false}, + {CHR1_HG19, CHR_NONSTANDARD1, false}, + {null, null, true}, + {CHR1_HG19, null, false}, + {null, CHR1_HG19, false}, + {CHR1_HG19, CHR1_HG19_WITH_UNKNOWN_LENGTH, true}, + {CHR1_HG19, CHR1_HG19_WITH_DIFFERENT_LENGTH, false}, + {CHR1_HG19_WITH_UNKNOWN_LENGTH, CHR1_HG19, true}, + {CHR1_HG19_WITH_DIFFERENT_LENGTH, CHR1_HG19, false}, + }; + } + + @Test(dataProvider = "testSequenceRecordsAreEquivalentDataProvider") + public void testSequenceRecordsAreEquivalent(final SAMSequenceRecord one, final SAMSequenceRecord two, final boolean expected){ + final boolean actual = SAMSequenceDictionaryUtils.sequenceRecordsAreEquivalent(one, two); + Assert.assertEquals(actual, expected); + } + + @DataProvider( name = "SequenceDictionaryDataProvider" ) + public Object[][] generateSequenceDictionaryTestData() { + final SAMSequenceRecord CHRM_HG19 = new SAMSequenceRecord("chrM", 16571); + final SAMSequenceRecord CHR_NONSTANDARD1 = new SAMSequenceRecord("NonStandard1", 8675309); + final SAMSequenceRecord CHR_NONSTANDARD2 = new SAMSequenceRecord("NonStandard2", 8675308); + final SAMSequenceRecord CHR1_HG19_WITH_UNKNOWN_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH); + final SAMSequenceRecord CHR1_HG19_WITH_DIFFERENT_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), 123456); + + final SAMSequenceRecord CHR1_HG19_WITH_ATTRIBUTES = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), CHR1_HG19.getSequenceLength()); + CHR1_HG19_WITH_ATTRIBUTES.setAttribute("M5", "0dec9660ec1efaaf33281c0d5ea2560f"); + CHR1_HG19_WITH_ATTRIBUTES.setAttribute("UR", "file:/foo/bar"); + + final List hg19AllContigsIntervalSet = Arrays.asList( + new Interval("chrM", 1, 1), + new Interval("chr1", 1, 1), + new Interval("chr2", 1, 1), + new Interval("chr10", 1, 1)); + final List hg19PartialContigsIntervalSet = Arrays.asList( + new Interval("chrM", 1, 1), + new Interval("chr1", 1, 1)); + + return new Object[][] { + // Identical dictionaries: + 
{Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), IDENTICAL, false, false}, + { Arrays.asList(CHR1_B37), Arrays.asList(CHR1_B37), IDENTICAL, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), Arrays.asList(CHR1_HG19), IDENTICAL, false, false}, + + // Dictionaries with a common subset: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, true}, + // If requireSuperset == true, we should get an exception upon COMMON_SUBSET: + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, true, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHRM_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, 
CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD1), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD2), COMMON_SUBSET, false, false}, + // If requireSuperset == true, we should get an exception upon COMMON_SUBSET: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, false, false}, + // If checkContigOrdering == false, ordering of the common contigs should not matter: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR10_HG19, CHR1_HG19), COMMON_SUBSET, false, false}, + + // Dictionaries with no common contigs: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, true, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_B37), NO_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, false, false}, + { 
Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, false, false}, + + // Dictionaries with unequal common contigs: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), Arrays.asList(CHR1_HG19), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, true, true}, + { Arrays.asList(CHR1_B36), Arrays.asList(CHR1_B37), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B36, CHR2_B36, CHR10_B36), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18, CHR_NONSTANDARD2), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, false, false}, + + // One or both dictionaries in non-canonical human order: + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, true, true}, + { 
Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, true, true}, + { Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), NON_CANONICAL_HUMAN_ORDER, false, true}, + // If checkContigOrdering == false, we should not get NON_CANONICAL_HUMAN_ORDER: + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, false}, + + // Dictionaries with a common subset, but different relative ordering within that subset + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, true, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHRM_HG19), OUT_OF_ORDER, false, true}, + { 
Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHRM_HG19, CHR1_HG19), OUT_OF_ORDER, false, true}, + { Arrays.asList(CHR1_B37, CHR2_B37), Arrays.asList(CHR2_B37, CHR1_B37), OUT_OF_ORDER, false, true}, + // If checkContigOrdering == false, we should not get OUT_OF_ORDER: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET,false, false}, + + // Dictionaries with a common subset in the same relative order, but with different indices. + // This will only throw an exception during validation if checkContigOrdering is true + + // These have checkContigOrdering == true, so we expect DIFFERENT_INDICES and an exception: + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, false, true}, + // Setting requireSuperset == true should make no difference here (we should still get DIFFERENT_INDICES and an exception): + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, 
CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, false, true}, + + // Same test cases as above, but these have checkContigOrdering == false, so we expect SUPERSET or COMMON_SUBSET instead of DIFFERENT_INDICES, and no exception: + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, false, false}, + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, false, false}, + + // tests 
for SUPERSET + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, true, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, false, false}, + // Extended attributes should be ignored when determining whether a superset exists: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES, CHR10_HG19), SUPERSET, false, false} + }; + } + + @Test( dataProvider = "SequenceDictionaryDataProvider" ) + public void testSequenceDictionaryComparison( final List firstDictionaryContigs, + final List secondDictionaryContigs, + final SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, + final boolean requireSuperset, + final boolean checkContigOrdering) { + + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + final String testDescription = String.format("First dictionary: %s Second dictionary: %s", + SAMSequenceDictionaryUtils.getDictionaryAsString(firstDictionary), + 
SAMSequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); + + final SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility reportedCompatibility = + SAMSequenceDictionaryUtils.compareDictionaries(firstDictionary, secondDictionary, checkContigOrdering); + + Assert.assertTrue(reportedCompatibility == dictionaryCompatibility, + String.format("Dictionary comparison should have returned %s but instead returned %s. %s", + dictionaryCompatibility, reportedCompatibility, testDescription)); + } + + @DataProvider(name = "StandardValidationIgnoresContigOrderData") + public Object[][] getStandardValidationIgnoresContigOrderData() { + return new Object[][] { + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19) }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19) }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19) }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR10_HG19, CHR1_HG19) }, + + }; + } + + private SAMSequenceDictionary createSequenceDictionary( final List contigs ) { + final List clonedContigs = new ArrayList<>(contigs.size()); + + // Clone the individual SAMSequenceRecords to avoid contig-index issues with shared objects + // across multiple dictionaries in tests + for ( SAMSequenceRecord contig : contigs ) { + clonedContigs.add(contig.clone()); + } + + return new SAMSequenceDictionary(clonedContigs); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testGetContigNamesListExpectingException() { + getContigNamesList(null); + } + + @Test + public void testGetContigNamesList() { + + final SAMSequenceDictionary samSequenceDictionary = new SAMSequenceDictionary(Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37)); + + Assert.assertEquals(getContigNamesList(samSequenceDictionary), Arrays.asList("1", "2", "10")); + } +} \ No newline at end of file diff --git a/src/test/java/htsjdk/samtools/SamStreamsTest.java 
b/src/test/java/htsjdk/samtools/SamStreamsTest.java index d08a14dabf..7611c762f3 100644 --- a/src/test/java/htsjdk/samtools/SamStreamsTest.java +++ b/src/test/java/htsjdk/samtools/SamStreamsTest.java @@ -28,6 +28,7 @@ import htsjdk.samtools.seekablestream.SeekableFileStream; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.seekablestream.SeekableStreamFactory; +import htsjdk.samtools.util.IOUtil; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -44,7 +45,7 @@ public class SamStreamsTest extends HtsjdkTest { public void testDataFormat(final String inputFile, final boolean isGzippedSAMFile, final boolean isBAMFile, final boolean isCRAMFile) throws Exception { final File input = new File(TEST_DATA_DIR, inputFile); try(final InputStream fis = new BufferedInputStream(new FileInputStream(input))) { //must be buffered or the isGzippedSAMFile will blow up - Assert.assertEquals(SamStreams.isGzippedSAMFile(fis), isGzippedSAMFile, "isGzippedSAMFile:" + inputFile); + Assert.assertEquals(IOUtil.isGZIPInputStream(fis), isGzippedSAMFile, "isGzippedSAMFile:" + inputFile); Assert.assertEquals(SamStreams.isBAMFile(fis), isBAMFile, "isBAMFile:" + inputFile); Assert.assertEquals(SamStreams.isCRAMFile(fis), isCRAMFile, "isCRAMFile:" + inputFile); } diff --git a/src/test/java/htsjdk/tribble/index/IndexFactoryTest.java b/src/test/java/htsjdk/tribble/index/IndexFactoryTest.java index e127fd4b2f..77a5902ea0 100644 --- a/src/test/java/htsjdk/tribble/index/IndexFactoryTest.java +++ b/src/test/java/htsjdk/tribble/index/IndexFactoryTest.java @@ -196,12 +196,10 @@ public void testCreateTabixIndexFromVCF( @DataProvider(name = "bcfDataFactory") public Object[][] getBCFData(){ return new Object[][] { - //TODO: this needs more test cases, including block compressed and indexed, but bcftools can't - // generate indices for BCF2.1 files, which is all HTSJDK can read, and htsjdk also can't read/write - // block 
compressed BCFs (https://github.com/samtools/htsjdk/issues/946) - new Object[] { - new File("src/test/resources/htsjdk/variant/serialization_test.bcf") - } + {new File("src/test/resources/htsjdk/variant/serialization_test_unzipped.bcf")}, + // TODO: this needs more test cases, including block compressed and indexed + // The test below, with a bgzipped BCF, fails +// {new File("src/test/resources/htsjdk/variant/serialization_test.bcf")}, }; } diff --git a/src/test/java/htsjdk/utils/BCFToolsTestUtils.java b/src/test/java/htsjdk/utils/BCFToolsTestUtils.java new file mode 100644 index 0000000000..c6c4234f8e --- /dev/null +++ b/src/test/java/htsjdk/utils/BCFToolsTestUtils.java @@ -0,0 +1,136 @@ +package htsjdk.utils; + +import htsjdk.samtools.util.FileExtensions; +import htsjdk.samtools.util.ProcessExecutor; +import htsjdk.samtools.util.RuntimeIOException; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; + +public class BCFToolsTestUtils { + private static final String BCFTOOLS_BINARY_ENV_VARIABLE = "HTSJDK_BCFTOOLS_BIN"; + public static final String expectedBCFtoolsVersion = "1.14"; + + /** + * @return true if bcftools is available, otherwise false + */ + public static boolean isBCFToolsAvailable() { + final String binPath = getBCFToolsBin(); + final Path binFile = Paths.get(binPath); + return Files.exists(binFile); + } + + /** + * @throws RuntimeException if bcftools executable is not available + */ + public static void assertBCFToolsAvailable() { + if (!isBCFToolsAvailable()) { + throw new RuntimeException(String.format( + "No bcftools executable can be found." 
+ + " The %s environment variable must be set to the name of the local bcftools executable.", + BCFTOOLS_BINARY_ENV_VARIABLE + )); + } + } + + /** + * @return the name and location of the local bcftools executable as specified by the environment + * variable HTSJDK_BCFTOOLS_BIN, or the default value of "/usr/local/bin/bcftools" if the environment + * variable is not set + */ + public static String getBCFToolsBin() { + final String bcftoolsPath = System.getenv(BCFTOOLS_BINARY_ENV_VARIABLE); + return bcftoolsPath == null ? "/usr/local/bin/bcftools" : bcftoolsPath; + } + + /** + * Execute a bcftools command line if a local bcftools executable is available see {@link #isBCFToolsAvailable()}. + * + * @param commandLine bcftools command line string, excluding the "bcftools" prefix. For example: + * {@code "view my.vcf > my.bcf"} + * @return the {@link ProcessExecutor.ExitStatusAndOutput} resulting from the command execution, if + * the command succeeds + * @throws RuntimeException if the command fails, or if a local bcftools executable is not available. + */ + public static ProcessExecutor.ExitStatusAndOutput executeBCFToolsCommand(final String commandLine) { + assertBCFToolsAvailable(); + final String commandString = String.format("%s %s", getBCFToolsBin(), commandLine); + final ProcessExecutor.ExitStatusAndOutput processStatus = + ProcessExecutor.executeAndReturnInterleavedOutput(commandString); + if (processStatus.exitStatus != 0) { + // bcftools seems to write some errors to stdout + throw new RuntimeException( + String.format( + "Failure code %d returned from bcftools command %s\n (stderr: %.500s)\n (stdout: %.500s)\n", + processStatus.exitStatus, + commandString, + processStatus.stderr == null ? "" : processStatus.stderr, + processStatus.stdout == null ? "" : processStatus.stdout + ) + ); + } + return processStatus; + } + + /** + * Convert an input VCF file to a temporary BCF file using the bcftools "view" command. 
The temp
+     * file will be deleted when the process exits. Use {@link #isBCFToolsAvailable()} to determine if it's safe
+     * to use this method.
+     *
+     * @param inputVCF input file to convert
+     * @param commandLineOptions additional command line options (--input-fmt-option or --output-fmt-option)
+     * @return a temporary file containing the bcftools-generated results.
+     */
+    public static File VCFtoBCF(
+            final File inputVCF,
+            final String commandLineOptions
+    ) {
+        assertBCFToolsAvailable();
+        try {
+            final File tempBCFFile = File.createTempFile("bcftoolsTemporaryBCF", FileExtensions.BCF);
+            tempBCFFile.deleteOnExit();
+            final String commandString = String.format(
+                    "view %s %s -o %s",
+                    commandLineOptions == null ? "" : commandLineOptions,
+                    inputVCF.getAbsolutePath(),
+                    tempBCFFile.getAbsolutePath()
+            );
+            executeBCFToolsCommand(commandString);
+            return tempBCFFile;
+        } catch (final IOException e) {
+            throw new RuntimeIOException(e);
+        }
+    }
+
+    /**
+     * Convert an input BCF file to a temporary VCF file using the bcftools "view" command. The temp
+     * file will be deleted when the process exits. Use {@link #isBCFToolsAvailable()} to determine if it's safe
+     * to use this method.
+     *
+     * @param inputBCF input file to convert
+     * @param commandLineOptions additional command line options (--input-fmt-option or --output-fmt-option)
+     * @return a temporary file containing the bcftools-generated results.
+     */
+    public static File BCFToVCF(
+            final File inputBCF,
+            final String commandLineOptions
+    ) {
+        assertBCFToolsAvailable();
+        try {
+            final File tempVCFFile = File.createTempFile("bcftoolsTemporaryVCF", FileExtensions.VCF); tempVCFFile.deleteOnExit(); // prefix must not embed the input path (path separators are illegal in a temp-file prefix); delete on exit as documented
+            final String commandString = String.format(
+                    "view %s %s -o %s",
+                    commandLineOptions == null ? "" : commandLineOptions,
+                    inputBCF.getAbsolutePath(),
+                    tempVCFFile.getAbsolutePath()
+            );
+            executeBCFToolsCommand(commandString);
+            return tempVCFFile;
+        } catch (final IOException e) {
+            throw new RuntimeIOException(e);
+        }
+    }
+}
diff --git a/src/test/java/htsjdk/utils/BCFToolsTestUtilsTest.java b/src/test/java/htsjdk/utils/BCFToolsTestUtilsTest.java
new file mode 100644
index 0000000000..9fd5451f55
--- /dev/null
+++ b/src/test/java/htsjdk/utils/BCFToolsTestUtilsTest.java
@@ -0,0 +1,35 @@
+package htsjdk.utils;
+
+import htsjdk.HtsjdkTest;
+import htsjdk.samtools.util.ProcessExecutor;
+import org.testng.Assert;
+import org.testng.SkipException;
+import org.testng.annotations.Test;
+
+public class BCFToolsTestUtilsTest extends HtsjdkTest {
+
+    @Test
+    public void testBCFToolsIsAvailable() {
+        Assert.assertTrue(BCFToolsTestUtils.isBCFToolsAvailable());
+    }
+
+    @Test
+    public void testBCFToolsVersion() {
+        if (!BCFToolsTestUtils.isBCFToolsAvailable()) {
+            throw new SkipException("bcftools not available on local device");
+        }
+        // If this test runs, but fails because version validation fails, then the local bcftools version is
+        // not the one expected by the htsjdk tests
+        final ProcessExecutor.ExitStatusAndOutput processStatus = BCFToolsTestUtils.executeBCFToolsCommand("--version");
+        Assert.assertTrue(processStatus.stdout.contains(BCFToolsTestUtils.expectedBCFtoolsVersion));
+    }
+
+
+    @Test(expectedExceptions = RuntimeException.class)
+    public void testBCFToolsPresentButCommandFails() {
+        if (!BCFToolsTestUtils.isBCFToolsAvailable()) {
+            throw new SkipException("bcftools not available on local device");
+        }
+        BCFToolsTestUtils.executeBCFToolsCommand("--notABcftoolsCommand");
+    }
+}
diff --git a/src/test/java/htsjdk/variant/VariantBaseTest.java b/src/test/java/htsjdk/variant/VariantBaseTest.java
index dc59309e7b..58e6cef658 100644
--- a/src/test/java/htsjdk/variant/VariantBaseTest.java
+++ b/src/test/java/htsjdk/variant/VariantBaseTest.java
@@ -29,11 +29,15 @@
import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.Tuple; +import htsjdk.tribble.AbstractFeatureReader; +import htsjdk.tribble.FeatureReader; +import htsjdk.tribble.TribbleException; import htsjdk.utils.ValidationUtils; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; +import htsjdk.variant.vcf.VCFCodec; import htsjdk.variant.vcf.VCFConstants; -import htsjdk.variant.vcf.VCFFileReader; import htsjdk.variant.vcf.VCFHeader; import org.testng.Assert; @@ -141,8 +145,16 @@ public static SAMSequenceDictionary createArtificialSequenceDictionary() { */ public static Tuple> readEntireVCFIntoMemory(final Path vcfPath) { ValidationUtils.nonNull(vcfPath); - try ( final VCFFileReader vcfReader = new VCFFileReader(vcfPath, false) ){ - return new Tuple<>(vcfReader.getFileHeader(), vcfReader.iterator().toList()); + final VCFCodec codec = new VCFCodec(); + codec.setVersionUpgradePolicy(VCFVersionUpgradePolicy.UPGRADE_OR_FALLBACK); + try (final FeatureReader reader = AbstractFeatureReader.getFeatureReader( + vcfPath.toUri().toString(), + codec, + false + )) { + return new Tuple<>((VCFHeader) reader.getHeader(), reader.iterator().toList()); + } catch (final IOException e) { + throw new TribbleException("Could not create an iterator from a feature reader.", e); } } @@ -242,7 +254,7 @@ private static void assertAttributesEquals(final Map actual, Map } else { // it's ok to have a binding in x -> null that's absent in y - Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other"); + Assert.assertTrue(isMissingAttribute(actualValue), act.getKey() + " present in one but not in the other"); } expectedKeys.remove(act.getKey()); } diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java new file mode 100644 
index 0000000000..43b8329993 --- /dev/null +++ b/src/test/java/htsjdk/variant/bcf2/BCF2DictionaryTest.java @@ -0,0 +1,203 @@ +package htsjdk.variant.bcf2; + +import htsjdk.tribble.TribbleException; +import htsjdk.variant.VariantBaseTest; +import htsjdk.variant.vcf.VCFContigHeaderLine; +import htsjdk.variant.vcf.VCFFilterHeaderLine; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.List; + +public class BCF2DictionaryTest extends VariantBaseTest { + + @DataProvider(name = "dictionaryProvider") + public Object[][] dictionaryProvider() { + final List cases = new ArrayList<>(); + + final List inputLines = new ArrayList<>(); + int counter = 0; + inputLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + inputLines.add(new VCFFilterHeaderLine("l" + counter++)); + inputLines.add(new VCFFilterHeaderLine("l" + counter++)); + inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); + inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); + inputLines.add(new VCFInfoHeaderLine("A" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFInfoHeaderLine("A" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFHeaderLine("x", "misc")); + inputLines.add(new VCFHeaderLine("y", "misc")); + inputLines.add(new VCFFilterHeaderLine("aFilter", "misc")); + inputLines.add(new VCFFormatHeaderLine("A" + counter++, VCFHeaderLineCount.UNBOUNDED, 
VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFFormatHeaderLine("A" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet<>(inputLines)); + + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) { + final BCF2Dictionary dict = BCF2Dictionary.makeBCF2StringDictionary(inputHeader, version); + cases.add(new Object[]{dict}); + } + + return cases.toArray(new Object[0][]); + } + + @Test(dataProvider = "dictionaryProvider") + public void testCreateDictionary(final BCF2Dictionary dict) { + final int dict_size = dict.size(); + Assert.assertEquals(8, dict_size); + } + + + @DataProvider(name = "invalidIDXProvider") + public Object[][] invalidIDXProvider() { + final List cases = new ArrayList<>(); + // String lines with inconsistent IDX + { + final LinkedHashSet lines = new LinkedHashSet<>(); + lines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + + final VCFHeader header = new VCFHeader(lines); + cases.add(new Object[]{header, BCFVersion.BCF2_2Version, true}); + } + { + final LinkedHashSet lines = new LinkedHashSet<>(); + lines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + + final VCFHeader header = new VCFHeader(lines); + cases.add(new Object[]{header, BCFVersion.BCF2_2Version, true}); + } + // Contig lines with inconsistent IDX + { + final LinkedHashSet lines = new LinkedHashSet<>(); + lines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + lines.add(new VCFContigHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION, + 3 + )); + lines.add(new VCFContigHeaderLine( + "", + 
VCFHeader.DEFAULT_VCF_VERSION, + 4 + )); + + final VCFHeader header = new VCFHeader(lines); + cases.add(new Object[]{header, BCFVersion.BCF2_2Version, false}); + } + { + final LinkedHashSet lines = new LinkedHashSet<>(); + lines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + lines.add(new VCFContigHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION, + 3 + )); + lines.add(new VCFContigHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION, + 4 + )); + + final VCFHeader header = new VCFHeader(lines); + cases.add(new Object[]{header, BCFVersion.BCF2_2Version, false}); + } + + // Headers with one IDX mapped to multiple strings/contigs + { + final LinkedHashSet lines = new LinkedHashSet<>(); + lines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + + final VCFHeader header = new VCFHeader(lines); + cases.add(new Object[]{header, BCFVersion.BCF2_2Version, true}); + } + + return cases.toArray(new Object[0][]); + } + + @Test(dataProvider = "invalidIDXProvider", expectedExceptions = TribbleException.class) + public void invalidIDXUsage(final VCFHeader header, final BCFVersion version, final boolean isString) { + if (isString) { + BCF2Dictionary.makeBCF2StringDictionary(header, version); + } else { + BCF2Dictionary.makeBCF2ContigDictionary(header, version); + } + } + + @Test + public void testOutOfOrderAndMissingIDX() { + final LinkedHashSet lines = new LinkedHashSet<>(); + lines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + final VCFHeader header = new VCFHeader(lines); + + final BCF2Dictionary stringDict = 
BCF2Dictionary.makeBCF2StringDictionary(header, BCFVersion.BCF2_2Version); + Assert.assertEquals(stringDict.get(6), "FOO"); + Assert.assertEquals(stringDict.get(4), "BAR"); + Assert.assertEquals(stringDict.get(2), "BAZ"); + } + + @Test + public void testLinesWithDifferentKeySameIDShareIDX() { + final LinkedHashSet lines = new LinkedHashSet<>(); + lines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + lines.add(new VCFInfoHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + lines.add(new VCFFormatHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + lines.add(new VCFFilterHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION + )); + final VCFHeader header = new VCFHeader(lines); + + final BCF2Dictionary stringDict = BCF2Dictionary.makeBCF2StringDictionary(header, BCFVersion.BCF2_2Version); + Assert.assertEquals(stringDict.get(2), "FOO"); + } +} diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2EncoderDecoderUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2EncoderDecoderUnitTest.java index d0d3a88fe2..5d888f76fd 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2EncoderDecoderUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2EncoderDecoderUnitTest.java @@ -1,33 +1,33 @@ /* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ package htsjdk.variant.bcf2; // the imports for unit testing. 
+ import htsjdk.variant.VariantBaseTest; -import htsjdk.variant.variantcontext.writer.BCF2Encoder; import org.testng.Assert; import org.testng.annotations.BeforeSuite; import org.testng.annotations.DataProvider; @@ -45,9 +45,9 @@ public class BCF2EncoderDecoderUnitTest extends VariantBaseTest { private final double FLOAT_TOLERANCE = 1e-6; - final List primitives = new ArrayList(); - final List basicTypes = new ArrayList(); - final List forCombinations = new ArrayList(); + final List primitives = new ArrayList<>(); + final List basicTypes = new ArrayList<>(); + final List forCombinations = new ArrayList<>(); @BeforeSuite public void before() { @@ -63,23 +63,23 @@ public void before() { primitives.add(new BCF2TypedValue(-1, BCF2Type.INT8)); primitives.add(new BCF2TypedValue(100, BCF2Type.INT8)); primitives.add(new BCF2TypedValue(-100, BCF2Type.INT8)); - primitives.add(new BCF2TypedValue(-127, BCF2Type.INT8)); // last value in range - primitives.add(new BCF2TypedValue( 127, BCF2Type.INT8)); // last value in range + primitives.add(new BCF2TypedValue(-120, BCF2Type.INT8)); // last value in range + primitives.add(new BCF2TypedValue(127, BCF2Type.INT8)); // last value in range // medium ints primitives.add(new BCF2TypedValue(-1000, BCF2Type.INT16)); primitives.add(new BCF2TypedValue(1000, BCF2Type.INT16)); primitives.add(new BCF2TypedValue(-128, BCF2Type.INT16)); // first value in range - primitives.add(new BCF2TypedValue( 128, BCF2Type.INT16)); // first value in range - primitives.add(new BCF2TypedValue(-32767, BCF2Type.INT16)); // last value in range - primitives.add(new BCF2TypedValue( 32767, BCF2Type.INT16)); // last value in range + primitives.add(new BCF2TypedValue(128, BCF2Type.INT16)); // first value in range + primitives.add(new BCF2TypedValue(-32760, BCF2Type.INT16)); // last value in range + primitives.add(new BCF2TypedValue(32767, BCF2Type.INT16)); // last value in range // larger ints primitives.add(new BCF2TypedValue(-32768, BCF2Type.INT32)); // first value 
in range - primitives.add(new BCF2TypedValue( 32768, BCF2Type.INT32)); // first value in range + primitives.add(new BCF2TypedValue(32768, BCF2Type.INT32)); // first value in range primitives.add(new BCF2TypedValue(-100000, BCF2Type.INT32)); primitives.add(new BCF2TypedValue(100000, BCF2Type.INT32)); - primitives.add(new BCF2TypedValue(-2147483647, BCF2Type.INT32)); + primitives.add(new BCF2TypedValue(-2147483640, BCF2Type.INT32)); primitives.add(new BCF2TypedValue(2147483647, BCF2Type.INT32)); // floats @@ -116,7 +116,7 @@ public void before() { primitives.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ", BCF2Type.CHAR)); // missing values - for ( BCF2Type type : BCF2Type.values() ) { + for (final BCF2Type type : BCF2Type.values()) { primitives.add(new BCF2TypedValue(null, type)); } @@ -124,7 +124,7 @@ public void before() { forCombinations.add(new BCF2TypedValue(100, BCF2Type.INT8)); forCombinations.add(new BCF2TypedValue(-100, BCF2Type.INT8)); forCombinations.add(new BCF2TypedValue(-128, BCF2Type.INT16)); // first value in range - forCombinations.add(new BCF2TypedValue( 128, BCF2Type.INT16)); // first value in range + forCombinations.add(new BCF2TypedValue(128, BCF2Type.INT16)); // first value in range forCombinations.add(new BCF2TypedValue(-100000, BCF2Type.INT32)); forCombinations.add(new BCF2TypedValue(100000, BCF2Type.INT32)); forCombinations.add(new BCF2TypedValue(0.0, BCF2Type.FLOAT)); @@ -135,7 +135,7 @@ public void before() { forCombinations.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ", BCF2Type.CHAR)); // missing values - for ( BCF2Type type : BCF2Type.values() ) { + for (final BCF2Type type : BCF2Type.values()) { forCombinations.add(new BCF2TypedValue(null, type)); } } @@ -146,16 +146,16 @@ public void before() { // // -------------------------------------------------------------------------------- - private class BCF2TypedValue { + private 
static class BCF2TypedValue { final BCF2Type type; final Object value; private BCF2TypedValue(final int value, final BCF2Type type) { - this(new Integer(value), type); + this(Integer.valueOf(value), type); } private BCF2TypedValue(final double value, final BCF2Type type) { - this(new Double(value), type); + this(Double.valueOf(value), type); } private BCF2TypedValue(final Object value, final BCF2Type type) { @@ -163,7 +163,9 @@ private BCF2TypedValue(final Object value, final BCF2Type type) { this.value = value; } - public boolean isMissing() { return value == null; } + public boolean isMissing() { + return value == null; + } @Override public String toString() { @@ -179,68 +181,56 @@ public String toString() { @DataProvider(name = "BCF2EncodingTestProviderBasicTypes") public Object[][] BCF2EncodingTestProviderBasicTypes() { - List tests = new ArrayList(); - for ( BCF2TypedValue tv : basicTypes ) - tests.add(new Object[]{Arrays.asList(tv)}); + final List tests = new ArrayList<>(); + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) + for (final BCF2TypedValue tv : basicTypes) + tests.add(new Object[]{Collections.singletonList(tv), version}); return tests.toArray(new Object[][]{}); } private interface EncodeMe { - public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException; + void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException; } @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2BasicTypesWithStaticCalls(final List toEncode) throws IOException { - testBCF2BasicTypesWithEncodeMe(toEncode, - new EncodeMe() { - @Override - public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException { - switch ( tv.type ) { - case INT8: - case INT16: - case INT32: - encoder.encodeTypedInt((Integer)tv.value, tv.type); - break; - case FLOAT: - encoder.encodeTypedFloat((Double)tv.value); - break; - case CHAR: - encoder.encodeTypedString((String)tv.value); - 
break; - } - } - }); - } - - @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2BasicTypesWithObjectType(final List toEncode) throws IOException { - testBCF2BasicTypesWithEncodeMe(toEncode, - new EncodeMe() { - @Override - public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException { - encoder.encodeTyped(tv.value, tv.type); - } - }); + public void testBCF2BasicTypesWithStaticCalls(final List toEncode, final BCFVersion version) throws IOException { + testBCF2BasicTypesWithEncodeMe( + toEncode, + (encoder, tv) -> { + switch (tv.type) { + case INT8: + case INT16: + case INT32: + encoder.encodeTypedInt((Integer) tv.value, tv.type); + break; + case FLOAT: + encoder.encodeTypedFloat((Double) tv.value); + break; + case CHAR: + encoder.encodeTypedString((String) tv.value); + break; + } + }, + version + ); } @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2BasicTypesWithObjectNoType(final List toEncode) throws IOException { - testBCF2BasicTypesWithEncodeMe(toEncode, - new EncodeMe() { - @Override - public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException { - encoder.encode(tv.value); - } - }); + public void testBCF2BasicTypesWithObjectType(final List toEncode, final BCFVersion version) throws IOException { + testBCF2BasicTypesWithEncodeMe( + toEncode, + (encoder, tv) -> encoder.encodeTyped(tv.value, tv.type), + version + ); } - public void testBCF2BasicTypesWithEncodeMe(final List toEncode, final EncodeMe func) throws IOException { - for ( final BCF2TypedValue tv : toEncode ) { - BCF2Encoder encoder = new BCF2Encoder(); + public void testBCF2BasicTypesWithEncodeMe(final List toEncode, final EncodeMe func, final BCFVersion version) throws IOException { + for (final BCF2TypedValue tv : toEncode) { + final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); func.encode(encoder, tv); - BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); + 
final BCF2Decoder decoder = BCF2Decoder.getDecoder(version, encoder.getRecordBytes()); final Object decoded = decoder.decodeTypedValue(); Assert.assertNotNull(decoded); @@ -250,20 +240,25 @@ public void testBCF2BasicTypesWithEncodeMe(final List toEncode, } @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2EncodingVectors(final List toEncode) throws IOException { - for ( final BCF2TypedValue tv : toEncode ) { - for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) { - BCF2Encoder encoder = new BCF2Encoder(); - List expected = Collections.nCopies(length, tv.value); - encoder.encodeTyped(expected, tv.type); - - BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); + public void testBCF2EncodingVectors(final List toEncode, final BCFVersion version) throws IOException { + for (final BCF2TypedValue tv : toEncode) { + for (final int length : Arrays.asList(2, 5, 10, 15, 20, 25)) { + final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); + final List expected = Collections.nCopies(length, tv.value); + if (tv.type == BCF2Type.CHAR && !expected.isEmpty()) { + encoder.encodeTypedString(encoder.compactStrings((List) expected)); + } else { + encoder.encodeType(expected.size(), tv.type); + encoder.encodeRawValues(expected, tv.type); + } + + final BCF2Decoder decoder = BCF2Decoder.getDecoder(version, encoder.getRecordBytes()); final Object decoded = decoder.decodeTypedValue(); Assert.assertTrue(decoded instanceof List); - final List decodedList = (List)decoded; + final List decodedList = (List) decoded; Assert.assertEquals(decodedList.size(), expected.size()); - for ( Object decodedValue : decodedList ) + for (final Object decodedValue : decodedList) myAssertEquals(tv, decodedValue); } } @@ -271,16 +266,17 @@ public void testBCF2EncodingVectors(final List toEncode) throws @DataProvider(name = "BCF2EncodingTestProviderSingletons") public Object[][] BCF2EncodingTestProviderSingletons() { - List tests = new ArrayList(); - for ( 
BCF2TypedValue tv : primitives ) - tests.add(new Object[]{Arrays.asList(tv)}); + final List tests = new ArrayList<>(); + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) + for (final BCF2TypedValue tv : primitives) + tests.add(new Object[]{Collections.singletonList(tv), version}); return tests.toArray(new Object[][]{}); } @Test(dataProvider = "BCF2EncodingTestProviderSingletons") - public void testBCF2EncodingSingletons(final List toEncode) throws IOException { - final byte[] record = encodeRecord(toEncode); - decodeRecord(toEncode, record); + public void testBCF2EncodingSingletons(final List toEncode, final BCFVersion version) throws IOException { + final byte[] record = encodeRecord(toEncode, version); + decodeRecord(toEncode, record, version); } // ----------------------------------------------------------------- @@ -291,29 +287,30 @@ public void testBCF2EncodingSingletons(final List toEncode) thro @DataProvider(name = "BCF2EncodingTestProviderSequences") public Object[][] BCF2EncodingTestProviderSequences() { - List tests = new ArrayList(); - for ( BCF2TypedValue tv1 : forCombinations ) - for ( BCF2TypedValue tv2 : forCombinations ) - for ( BCF2TypedValue tv3 : forCombinations ) - tests.add(new Object[]{Arrays.asList(tv1, tv2, tv3)}); + final List tests = new ArrayList<>(); + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) + for (final BCF2TypedValue tv1 : forCombinations) + for (final BCF2TypedValue tv2 : forCombinations) + for (final BCF2TypedValue tv3 : forCombinations) + tests.add(new Object[]{Arrays.asList(tv1, tv2, tv3), version}); return tests.toArray(new Object[][]{}); } @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2EncodingVectorsWithMissing(final List toEncode) throws IOException { - for ( final BCF2TypedValue tv : toEncode ) { - if ( tv.type != BCF2Type.CHAR ) { - for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) { + public void testBCF2EncodingVectorsWithMissing(final List 
toEncode, final BCFVersion version) throws IOException { + for (final BCF2TypedValue tv : toEncode) { + if (tv.type != BCF2Type.CHAR) { + for (final int length : Arrays.asList(2, 5, 10, 15, 20, 25)) { final byte td = BCF2Utils.encodeTypeDescriptor(1, tv.type); - final BCF2Encoder encoder = new BCF2Encoder(); - for ( int i = 0; i < length; i++ ) { + final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); + for (int i = 0; i < length; i++) { encoder.encodeRawValue(i % 2 == 0 ? null : tv.value, tv.type); } - final BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); + final BCF2Decoder decoder = BCF2Decoder.getDecoder(version, encoder.getRecordBytes()); - for ( int i = 0; i < length; i++ ) { + for (int i = 0; i < length; i++) { final Object decoded = decoder.decodeTypedValue(td); myAssertEquals(i % 2 == 0 ? new BCF2TypedValue(null, tv.type) : tv, decoded); } @@ -323,9 +320,9 @@ public void testBCF2EncodingVectorsWithMissing(final List toEnco } @Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingSingletons") - public void testBCF2EncodingTestProviderSequences(final List toEncode) throws IOException { - final byte[] record = encodeRecord(toEncode); - decodeRecord(toEncode, record); + public void testBCF2EncodingTestProviderSequences(final List toEncode, final BCFVersion version) throws IOException { + final byte[] record = encodeRecord(toEncode, version); + decodeRecord(toEncode, record, version); } // ----------------------------------------------------------------- @@ -334,20 +331,58 @@ public void testBCF2EncodingTestProviderSequences(final List toE // // ----------------------------------------------------------------- + @DataProvider(name = "Strings") + public Object[][] stringsProvider() { + final List tests = new ArrayList<>(); + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) { + tests.add(new Object[]{"", version}); + tests.add(new Object[]{" ", version}); + tests.add(new Object[]{"s", 
version}); + tests.add(new Object[]{"sss", version}); + } + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "Strings") + public void testEncodingOfListOfString(final String s, final BCFVersion version) throws IOException { + final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); + encoder.encodeTypedString(s); + + final BCF2Decoder decoder = BCF2Decoder.getDecoder(version, encoder.getRecordBytes()); + final String decoded = decoder.decodeUnexplodedString(); + + Assert.assertEquals(s, decoded); + } + @DataProvider(name = "ListOfStrings") - public Object[][] listOfStringsProvider() { - List tests = new ArrayList(); - tests.add(new Object[]{Arrays.asList("s1", "s2"), ",s1,s2"}); - tests.add(new Object[]{Arrays.asList("s1", "s2", "s3"), ",s1,s2,s3"}); - tests.add(new Object[]{Arrays.asList("s1", "s2", "s3", "s4"), ",s1,s2,s3,s4"}); + public Object[][] listofStringsProvider() { + final List tests = new ArrayList<>(); + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) { + for (final int padding : Arrays.asList(0, 1, 5)) { + tests.add(new Object[]{Collections.emptyList(), padding, version}); + tests.add(new Object[]{Collections.singletonList("s"), padding, version}); + tests.add(new Object[]{Arrays.asList("s", ""), padding, version}); + tests.add(new Object[]{Arrays.asList("s", "ss", "sss"), padding, version}); + } + } return tests.toArray(new Object[][]{}); } @Test(dataProvider = "ListOfStrings") - public void testEncodingListOfString(List strings, String expected) throws IOException { - final String collapsed = BCF2Utils.collapseStringList(strings); - Assert.assertEquals(collapsed, expected); - Assert.assertEquals(BCF2Utils.explodeStringList(collapsed), strings); + public void testEncodingOfListOfString(final List strings, final int padding, final BCFVersion version) { + final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); + final byte[] bytes = encoder.compactStrings(strings); + final int paddedSize = bytes.length + 
padding; + encoder.encodeRawString(bytes, paddedSize); + + final BCF2Decoder decoder = BCF2Decoder.getDecoder(version, encoder.getRecordBytes()); + final List decodedStrings = decoder.decodeExplodedStrings(paddedSize, ','); + + // Padding values not included + Assert.assertEquals(strings, decodedStrings); + + // The decoder should have drained all the remaining padding values from the stream + Assert.assertTrue(decoder.blockIsFullyDecoded()); } // ----------------------------------------------------------------- @@ -358,16 +393,16 @@ public void testEncodingListOfString(List strings, String expected) thro @DataProvider(name = "BestIntTypeTests") public Object[][] BestIntTypeTests() { - List tests = new ArrayList(); - tests.add(new Object[]{Arrays.asList(1), BCF2Type.INT8}); + final List tests = new ArrayList<>(); + tests.add(new Object[]{Collections.singletonList(1), BCF2Type.INT8}); tests.add(new Object[]{Arrays.asList(1, 10), BCF2Type.INT8}); tests.add(new Object[]{Arrays.asList(1, 10, 100), BCF2Type.INT8}); tests.add(new Object[]{Arrays.asList(1, -1), BCF2Type.INT8}); tests.add(new Object[]{Arrays.asList(1, 1000), BCF2Type.INT16}); tests.add(new Object[]{Arrays.asList(1, 1000, 10), BCF2Type.INT16}); tests.add(new Object[]{Arrays.asList(1, 1000, 100), BCF2Type.INT16}); - tests.add(new Object[]{Arrays.asList(1000), BCF2Type.INT16}); - tests.add(new Object[]{Arrays.asList(100000), BCF2Type.INT32}); + tests.add(new Object[]{Collections.singletonList(1000), BCF2Type.INT16}); + tests.add(new Object[]{Collections.singletonList(100000), BCF2Type.INT32}); tests.add(new Object[]{Arrays.asList(100000, 10), BCF2Type.INT32}); tests.add(new Object[]{Arrays.asList(100000, 100), BCF2Type.INT32}); tests.add(new Object[]{Arrays.asList(100000, 1, -10), BCF2Type.INT32}); @@ -376,22 +411,21 @@ public Object[][] BestIntTypeTests() { } @Test(dataProvider = "BestIntTypeTests") - public void determineBestEncoding(final List ints, final BCF2Type expectedType) throws IOException { + 
public void determineBestEncoding(final List ints, final BCF2Type expectedType) { Assert.assertEquals(BCF2Utils.determineIntegerType(ints), expectedType); Assert.assertEquals(BCF2Utils.determineIntegerType(toPrimitive(ints.toArray(new Integer[0]))), expectedType); } - private static int[] toPrimitive ( final Integer[] array ) { - if ( array == null ) { + private static int[] toPrimitive(final Integer[] array) { + if (array == null) { return null; - } - else if ( array.length == 0 ) { + } else if (array.length == 0) { return new int[0]; } final int[] result = new int[array.length]; for (int i = 0; i < array.length; i++) { - result[i] = array[i].intValue(); + result[i] = array[i]; } return result; } @@ -403,20 +437,20 @@ else if ( array.length == 0 ) { // ----------------------------------------------------------------- @Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingTestProviderSequences") - public void testReadAndSkipWithMultipleBlocks(final List block) throws IOException { - testReadAndSkipWithMultipleBlocks(block, forCombinations); - testReadAndSkipWithMultipleBlocks(forCombinations, block); + public void testReadAndSkipWithMultipleBlocks(final List block, final BCFVersion version) throws IOException { + testReadAndSkipWithMultipleBlocks(block, forCombinations, version); + testReadAndSkipWithMultipleBlocks(forCombinations, block, version); } - public void testReadAndSkipWithMultipleBlocks(final List block1, final List block2) throws IOException { - final byte[] record1 = encodeRecord(block1); - final byte[] record2 = encodeRecord(block2); + public void testReadAndSkipWithMultipleBlocks(final List block1, final List block2, final BCFVersion version) throws IOException { + final byte[] record1 = encodeRecord(block1, version); + final byte[] record2 = encodeRecord(block2, version); // each record is individually good - decodeRecord(block1, record1); - decodeRecord(block2, record2); + decodeRecord(block1, record1, 
version); + decodeRecord(block2, record2, version); - BCF2Decoder decoder = new BCF2Decoder(); + final BCF2Decoder decoder = BCF2Decoder.getDecoder(version); // test setting decoder.setRecordBytes(record1); @@ -426,7 +460,7 @@ public void testReadAndSkipWithMultipleBlocks(final List block1, // test combining the streams final byte[] combined = combineRecords(record1, record2); - final List combinedObjects = new ArrayList(block1); + final List combinedObjects = new ArrayList<>(block1); combinedObjects.addAll(block2); // the combined bytes is the same as the combined objects @@ -447,70 +481,60 @@ public void testReadAndSkipWithMultipleBlocks(final List block1, // // Test encoding / decoding arrays of ints // - // This checks that we can encode and decode correctly with the - // low-level decodeIntArray function arrays of values. This - // has to be pretty comprehensive as decodeIntArray is a highly optimized + // This checks that we can correctly encode and decode int[] with + // the low-level decodeIntArray function arrays. This has to be + // pretty comprehensive as decodeIntArray is a highly optimized // piece of code with lots of edge cases. The values we are encoding // don't really matter -- just that the values come back as expected. // + // decodeIntArray is only meant to decode arrays that are guaranteed + // to not have internal missing values, but may be missing (or EOV) + // padded, so we are interested in whether the encoder correctly + // truncates padded arrays while draining the stream. 
// ----------------------------------------------------------------- - @DataProvider(name = "IntArrays") - public Object[][] makeIntArrays() { - List tests = new ArrayList(); + @DataProvider(name = "BCF2_2IntArrays") + public Object[][] IntArrays() { + final List tests = new ArrayList<>(); + for (final BCFVersion version : BCFVersion.SUPPORTED_VERSIONS) { + for (final int nValues : Arrays.asList(0, 1, 2, 5, 10, 100)) { + for (final int nPad : Arrays.asList(0, 1, 2, 5, 10, 100)) { + final int nElements = nValues + nPad; - for ( int nValues : Arrays.asList(0, 1, 2, 5, 10, 100) ) { - for ( int nPad : Arrays.asList(0, 1, 2, 5, 10, 100) ) { - int nElements = nValues + nPad; + final int[] vs = new int[nValues]; - List values = new ArrayList(nElements); + // add nValues from 0 to nValues - 1 + for (int i = 0; i < nValues; i++) + vs[i] = i; - // add nValues from 0 to nValues - 1 - for ( int i = 0; i < nValues; i++ ) - values.add(i); - - // add nPad nulls - for ( int i = 0; i < nPad; i++ ) - values.add(null); - - tests.add(new Object[]{values}); + tests.add(new Object[]{vs, nElements, version}); + } } } return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "IntArrays") - public void testIntArrays(final List ints) throws IOException { - final BCF2Encoder encoder = new BCF2Encoder(); - encoder.encodeTyped(ints, BCF2Type.INT16); + @Test(dataProvider = "BCF2_2IntArrays") + public void testBCF2_2IntArrays(final int[] ints, final int paddedSize, final BCFVersion version) throws IOException { + final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); + encoder.encodeTypedVecInt(ints, paddedSize); - final BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); - - final byte typeDescriptor = decoder.readTypeDescriptor(); + final BCF2Decoder decoder = BCF2Decoder.getDecoder(version, encoder.getRecordBytes()); // read the int[] with the low-level version + final byte typeDescriptor = decoder.readTypeDescriptor(); final int size = 
decoder.decodeNumberOfElements(typeDescriptor); final int[] decoded = decoder.decodeIntArray(typeDescriptor, size); - if ( isMissing(ints) ) { - // we expect that the result is null in this case - Assert.assertNull(decoded, "Encoded all missing values -- expected null"); + if (ints.length == 0) { + Assert.assertNull(decoded); } else { - // we expect at least some values to come back - Assert.assertTrue(decoded.length > 0, "Must have at least 1 element for non-null encoded data"); - - // check corresponding values - for ( int i = 0; i < ints.size(); i++ ) { - final Integer expected = ints.get(i); + // Padding values not included + Assert.assertEquals(ints.length, decoded.length); - if ( expected == null ) { - Assert.assertTrue(decoded.length <= i, "we expect decoded to be truncated for missing values"); - } else { - Assert.assertTrue(decoded.length > i, "we expected at least " + i + " values in decoded array"); - Assert.assertEquals(decoded[i], (int)expected); - } - } + // The decoder should have drained all the remaining padding values from the stream + Assert.assertTrue(decoder.blockIsFullyDecoded()); } } @@ -520,24 +544,17 @@ public void testIntArrays(final List ints) throws IOException { // // ----------------------------------------------------------------- - private final byte[] combineRecords(final byte[] record1, final byte[] record2) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); + private byte[] combineRecords(final byte[] record1, final byte[] record2) throws IOException { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); baos.write(record1); baos.write(record2); return baos.toByteArray(); } - private final byte[] encodeRecord(final List toEncode) throws IOException { - BCF2Encoder encoder = new BCF2Encoder(); - - for ( final BCF2TypedValue tv : toEncode ) { - if ( tv.isMissing() ) - encoder.encodeTypedMissing(tv.type); - else { - final BCF2Type encodedType = encoder.encode(tv.value); - if ( tv.type != 
null ) // only if we have an expectation - Assert.assertEquals(encodedType, tv.type); - } + private byte[] encodeRecord(final List toEncode, final BCFVersion version) throws IOException { + final BCF2Encoder encoder = BCF2Encoder.getEncoder(version); + for (final BCF2TypedValue tv : toEncode) { + encoder.encodeTyped(tv.value, tv.type); } // check output @@ -547,12 +564,12 @@ private final byte[] encodeRecord(final List toEncode) throws IO return record; } - private final void decodeRecord(final List toEncode, final byte[] record) throws IOException { - decodeRecord(toEncode, new BCF2Decoder(record)); + private void decodeRecord(final List toEncode, final byte[] record, final BCFVersion version) throws IOException { + decodeRecord(toEncode, BCF2Decoder.getDecoder(version, record)); } - private final void decodeRecord(final List toEncode, final BCF2Decoder decoder) throws IOException { - for ( final BCF2TypedValue tv : toEncode ) { + private void decodeRecord(final List toEncode, final BCF2Decoder decoder) throws IOException { + for (final BCF2TypedValue tv : toEncode) { Assert.assertFalse(decoder.blockIsFullyDecoded()); final Object decoded = decoder.decodeTypedValue(); @@ -562,25 +579,17 @@ private final void decodeRecord(final List toEncode, final BCF2D Assert.assertTrue(decoder.blockIsFullyDecoded()); } - private final void myAssertEquals(final BCF2TypedValue tv, final Object decoded) { - if ( tv.value == null ) { // special needs for instanceof double - Assert.assertEquals(decoded, tv.value); - } else if ( tv.type == BCF2Type.FLOAT ) { // need tolerance for floats, and they aren't null + private void myAssertEquals(final BCF2TypedValue tv, final Object decoded) { + if (tv.value == null) { // special needs for instanceof double + Assert.assertNull(decoded); + } else if (tv.type == BCF2Type.FLOAT) { // need tolerance for floats, and they aren't null Assert.assertTrue(decoded instanceof Double); - final double valueFloat = (Double)tv.value; - final double 
decodedFloat = (Double)decoded; + final double valueFloat = (Double) tv.value; + final double decodedFloat = (Double) decoded; VariantBaseTest.assertEqualsDoubleSmart(decodedFloat, valueFloat, FLOAT_TOLERANCE); } else Assert.assertEquals(decoded, tv.value); } - - private final boolean isMissing(final List values) { - if ( values != null ) - for ( Integer value : values ) - if ( value != null ) - return false; - return true; - } } \ No newline at end of file diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java new file mode 100644 index 0000000000..7c5583c99f --- /dev/null +++ b/src/test/java/htsjdk/variant/bcf2/BCF2FieldWriter/BCF2FieldEncoderTest.java @@ -0,0 +1,465 @@ +package htsjdk.variant.bcf2.BCF2FieldWriter; + +import htsjdk.variant.VariantBaseTest; +import htsjdk.variant.bcf2.BCF2Type; +import htsjdk.variant.bcf2.BCF2Utils; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.bcf2.BCF2Encoder; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +public class BCF2FieldEncoderTest extends VariantBaseTest { + + private static final BCF2Encoder.BCF2_2Encoder ENCODER = new BCF2Encoder.BCF2_2Encoder(); + private static final 
BCF2FieldEncoder.AtomicIntFieldEncoder ATOMIC_INT = new BCF2FieldEncoder.AtomicIntFieldEncoder(ENCODER); + private static final BCF2FieldEncoder.AtomicFloatFieldEncoder ATOMIC_FLOAT = new BCF2FieldEncoder.AtomicFloatFieldEncoder(ENCODER); + private static final BCF2FieldEncoder.CharFieldEncoder CHAR = new BCF2FieldEncoder.CharFieldEncoder(ENCODER); + private static final BCF2FieldEncoder.StringFieldEncoder STRING = new BCF2FieldEncoder.StringFieldEncoder(ENCODER); + private static final BCF2FieldEncoder.VecIntFieldEncoder VEC_INT = new BCF2FieldEncoder.VecIntFieldEncoder(ENCODER); + private static final BCF2FieldEncoder.VecFloatFieldEncoder VEC_FLOAT = new BCF2FieldEncoder.VecFloatFieldEncoder(ENCODER); + + + @DataProvider(name = "fieldEncoderCases") + public static Object[][] fieldEncoderCases() { + final List cases = new ArrayList<>(); + + // Integer encoding + { + for (final BCF2Type intType : BCF2Utils.INTEGER_TYPES_BY_SIZE) { + final int byteWidth = intType.getSizeInBytes(); + final List intsToEncode = Arrays.asList(1, -1, null, 1 << (byteWidth * 8 - 2)); + final ByteBuffer bytes = ByteBuffer.allocate(intsToEncode.size() * byteWidth); + for (final Object o : intsToEncode) { + final int i = o == null ? intType.getMissingBytes() : (Integer) o; + for (int shift = 0; shift < byteWidth; shift++) { + bytes.put((byte) (i >> (shift * 8))); + } + } + cases.add(new Object[]{ + ATOMIC_INT, + intsToEncode, + bytes.array(), + }); + } + } + + // Float encoding + { + final int byteWidth = BCF2Type.FLOAT.getSizeInBytes(); + final List floatsToEncode = Arrays.asList(1.0, -1.0, null, Double.NaN, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY); + final ByteBuffer bytes = ByteBuffer.allocate(floatsToEncode.size() * byteWidth); + for (final Object o : floatsToEncode) { + final int i = o == null ? 
BCF2Type.FLOAT.getMissingBytes() : Float.floatToRawIntBits((float) (double) (Double) o); + for (int shift = 0; shift < byteWidth; shift++) { + bytes.put((byte) (i >> (shift * 8))); + } + } + cases.add(new Object[]{ + ATOMIC_FLOAT, + floatsToEncode, + bytes.array(), + }); + } + + // Char encoding + // TODO see https://github.com/samtools/hts-specs/issues/618 + { + final List stringsToEncode = Arrays.asList("str", null, "\0a\0"); + final int maxByteWidth = stringsToEncode + .stream() + .mapToInt(o -> o == null ? 0 : ((String) o).getBytes(StandardCharsets.UTF_8).length) + .max().getAsInt(); + final ByteBuffer bytes = ByteBuffer.allocate(stringsToEncode.size() * maxByteWidth); + for (final Object o : stringsToEncode) { + final byte[] b = o == null ? new byte[0] : ((String) o).getBytes(StandardCharsets.UTF_8); + bytes.put(b); + for (int i = maxByteWidth - b.length; i > 0; i--) bytes.put((byte) 0); + } + cases.add(new Object[]{ + CHAR, + stringsToEncode, + bytes.array(), + }); + } + + // String encoding + // TODO see https://github.com/samtools/hts-specs/issues/618 + { + final List stringsToEncode = Arrays.asList("st", null, Arrays.asList("a", "b"), new String[]{"a", "b"}); + final byte[] bytes = new byte[]{ + 's', 't', '\0', // padding + '\0', '\0', '\0', // null values should be encoded as all NULL bytes + 'a', ',', 'b', // lists of strings joined with , + 'a', ',', 'b', // arrays of strings joined with , + }; + cases.add(new Object[]{ + STRING, + stringsToEncode, + bytes, + }); + } + + // Vector of integers encoding + { + for (final BCF2Type intType : BCF2Utils.INTEGER_TYPES_BY_SIZE) { + final int byteWidth = intType.getSizeInBytes(); + final List vecsToEncode = Arrays.asList( + Arrays.asList(null, 1), // Internal null should be missing bytes, not EOV + new int[]{1}, // Short vector should be EOV padded + null, // Entirely missing vector should start with one MISSING, then be EOV padded + 1 << (byteWidth * 8 - 2) // Atomic value should be treated as vector of size 1 + 
); + final int nValues = 2; + final ByteBuffer bytes = ByteBuffer.allocate(nValues * vecsToEncode.size() * byteWidth); + final int[] ints = new int[]{ + intType.getMissingBytes(), 1, + 1, intType.getEOVBytes(), + intType.getMissingBytes(), intType.getEOVBytes(), + 1 << (byteWidth * 8 - 2), intType.getEOVBytes(), + }; + for (final int i : ints) { + for (int shift = 0; shift < byteWidth; shift++) { + bytes.put((byte) (i >> (shift * 8))); + } + } + cases.add(new Object[]{ + VEC_INT, + vecsToEncode, + bytes.array(), + }); + } + } + + // Vector of floats encoding + { + final int byteWidth = BCF2Type.FLOAT.getSizeInBytes(); + final List vecsToEncode = Arrays.asList( + Arrays.asList(null, 1.0), // Internal null should be missing bytes, not EOV + new double[]{1.0}, // Short vector should be EOV padded + null, // Entirely missing vector should start with one MISSING, then be EOV padded + Double.NaN // Atomic value should be treated as vector of size 1 + ); + final int nValues = 2; + final ByteBuffer bytes = ByteBuffer.allocate(nValues * vecsToEncode.size() * byteWidth); + final int[] ints = new int[]{ + BCF2Type.FLOAT.getMissingBytes(), Float.floatToRawIntBits(1.0f), + Float.floatToRawIntBits(1.0f), BCF2Type.FLOAT.getEOVBytes(), + BCF2Type.FLOAT.getMissingBytes(), BCF2Type.FLOAT.getEOVBytes(), + Float.floatToRawIntBits((float) Double.NaN), BCF2Type.FLOAT.getEOVBytes(), + }; + for (final int i : ints) { + for (int shift = 0; shift < byteWidth; shift++) { + bytes.put((byte) (i >> (shift * 8))); + } + } + cases.add(new Object[]{ + VEC_FLOAT, + vecsToEncode, + bytes.array(), + }); + } + + return cases.toArray(new Object[0][]); + } + + @Test(dataProvider = "fieldEncoderCases") + public static void testFieldEncoders( + final BCF2FieldEncoder encoder, + final List objects, + final byte[] expectedBytes + ) throws IOException { + for (final Object o : objects) { + encoder.load(o); + } + encoder.encode(); + Assert.assertEquals(expectedBytes, ENCODER.getRecordBytes()); + } + + + 
@DataProvider(name = "siteWriterCases") + public static Object[][] siteWriterCases() { + final List cases = new ArrayList<>(); + + // Generic encoder + { + final VCFInfoHeaderLine info = new VCFInfoHeaderLine("genericKey", 2, VCFHeaderLineType.Integer, "test"); + final BCF2FieldWriter.SiteAttributeWriter writer = new BCF2FieldWriter.SiteAttributeWriter(info, 1, ENCODER); + final VariantContext vc1 = new VariantContextBuilder() + .attribute("genericKey", 1) + .chr("dummy") + .alleles("A") + .make(); + final byte[] bytes1 = new byte[]{ + 0x21, // 2 8-bit ints + 1, (byte) BCF2Type.INT8.getEOVBytes() // Field writer should pad out array to 2 elements to match header count + }; + cases.add(new Object[]{ + writer, vc1, bytes1, + }); + + final VariantContext vc2 = new VariantContextBuilder() + .chr("dummy") + .alleles("A") + .make(); + final byte[] bytes2 = new byte[]{ + 0x01, // Field writer should directly write typed missing, ignoring header count + }; + cases.add(new Object[]{ + writer, vc2, bytes2, + }); + } + + // Flag writer + { + final VCFInfoHeaderLine info = new VCFInfoHeaderLine("genericKey", 0, VCFHeaderLineType.Flag, "test"); + final BCF2FieldWriter.SiteFlagWriter writer = new BCF2FieldWriter.SiteFlagWriter(info, 1, ENCODER); + final VariantContext vc = new VariantContextBuilder() + .attribute("genericKey", true) + .chr("dummy") + .alleles("A") + .make(); + final byte[] bytes = new byte[]{ + 0x00, // MISSING type just used as a filler value + }; + cases.add(new Object[]{ + writer, vc, bytes, + }); + } + return cases.toArray(new Object[0][]); + } + + @Test(dataProvider = "siteWriterCases") + public void testSiteWriters( + final BCF2FieldWriter.SiteWriter writer, + final VariantContext vc, + final byte[] expectedBytes + ) throws IOException { + // Skip writing key so that we don't get key in output + writer.encode(vc); + Assert.assertEquals(expectedBytes, ENCODER.getRecordBytes()); + } + + + @DataProvider(name = "genotypeWriterCases") + public static Object[][] 
genotypeWriterCases() { + final List cases = new ArrayList<>(); + + // Generic encoder + { + final VCFFormatHeaderLine info = new VCFFormatHeaderLine("genericKey", 2, VCFHeaderLineType.Integer, "test"); + final BCF2FieldWriter.GenotypeAttributeWriter writer = new BCF2FieldWriter.GenotypeAttributeWriter(info, 1, ENCODER); + final VariantContext vc = new VariantContextBuilder() + .attribute("genericKey", 1) + .chr("dummy") + .genotypes(new GenotypeBuilder() + .name("sample") + .attribute("genericKey", 1) + .make() + ) + .alleles("A") + .make(); + final byte[] bytes = new byte[]{ + 0x21, // 2 8-bit ints + 1, (byte) BCF2Type.INT8.getEOVBytes() // Field writer should pad out array to 2 elements to match header count + }; + cases.add(new Object[]{ + writer, vc, Collections.singletonList("sample"), bytes, + }); + } + + // FT encoder + { + final VCFFormatHeaderLine info = new VCFFormatHeaderLine("FT", 1, VCFHeaderLineType.String, "test"); + final BCF2FieldWriter writer = BCF2FieldWriter.createGenotypeWriter(info, 1, ENCODER); + final VariantContext vc = new VariantContextBuilder() + .chr("dummy") + .genotypes( + new GenotypeBuilder() + .name("hasFilter") + .filter("f") + .make(), + new GenotypeBuilder() + .name("noFilter") + .unfiltered() // should be encoded as PASS + .make() + ) + .alleles("A") + .make(); + final byte[] bytes = new byte[]{ + 0x47, // Strings of length 4 + 'f', 0, 0, 0, + 'P', 'A', 'S', 'S', + }; + cases.add(new Object[]{ + writer, vc, Arrays.asList("hasFilter", "noFilter"), bytes, + }); + } + + // GT encoder + { + final VCFFormatHeaderLine format = new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "test"); + final Allele ref = Allele.REF_A; + final Allele alt = Allele.ALT_T; + + final BCF2FieldWriter writer = BCF2FieldWriter.createGenotypeWriter(format, 1, ENCODER); + { + final VariantContext vc = new VariantContextBuilder() + .chr("dummy") + .alleles(Arrays.asList(ref, alt)) + .genotypes( + new GenotypeBuilder() + .name("refAlt") + 
.alleles(Arrays.asList(ref, alt)) + .make(), + new GenotypeBuilder() + .name("refAltPhased") + .alleles(Arrays.asList(ref, alt)) + .phased(true) + .make(), + new GenotypeBuilder() + .name("missingMissing") + .alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)) + .make(), + new GenotypeBuilder() + .name("haploid") + .alleles(Collections.singletonList(ref)) + .make() + ) + .make(); + final byte[] bytes = new byte[]{ + 0x21, // 2 8-bit ints + 0x02, 0x04, + 0x02, 0x05, + 0x00, 0x00, + 0x02, (byte) 0x81, + }; + cases.add(new Object[]{ + writer, vc, + vc.getGenotypes().stream().map(Genotype::getSampleName).collect(Collectors.toList()), + bytes, + }); + } + + // Test encoding for a VC entirely missing genotype data + { + final VariantContext vcMissingGenotypes = new VariantContextBuilder() + .chr("dummy") + .alleles(Arrays.asList(ref, alt)) + .genotypes( + new GenotypeBuilder() + .name("refAlt") + .alleles(Arrays.asList(ref, alt)) + .make() + ) + .make(); + final byte[] bytes = new byte[]{ + 0x21, // 2 8-bit ints + (byte) 0, (byte) 0 + }; + cases.add(new Object[]{ + writer, vcMissingGenotypes, + Collections.singletonList("sampleNameNotPresentInGenotype"), + bytes, + }); + } + } + + // Inline integer encoder + { + final VCFFormatHeaderLine format = new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "test"); + final BCF2FieldWriter writer = BCF2FieldWriter.createGenotypeWriter(format, 1, ENCODER); + final VariantContext vc = new VariantContextBuilder() + .chr("dummy") + .genotypes( + new GenotypeBuilder() + .name("small") + .DP(2) + .make(), + new GenotypeBuilder() + .name("big") + .DP(256) + .make() + ) + .alleles("A") + .make(); + + final byte[] bytes = new byte[]{ + 0x12, // 1 16-bit int + 0x02, 0x00, + (byte) 256, 256 >> 8, + }; + + cases.add(new Object[]{ + writer, vc, + vc.getGenotypes().stream().map(Genotype::getSampleName).collect(Collectors.toList()), + bytes, + }); + } + + // Inline vector of integer encoder + { + final VCFFormatHeaderLine format = new 
VCFFormatHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "test"); + final Allele ref = Allele.REF_A; + final Allele alt = Allele.ALT_T; + final BCF2FieldWriter writer = BCF2FieldWriter.createGenotypeWriter(format, 1, ENCODER); + final VariantContext vc = new VariantContextBuilder() + .chr("dummy") + .alleles(Arrays.asList(ref, alt)) + .genotypes( + new GenotypeBuilder() + .name("small") + .alleles(Arrays.asList(ref, alt)) + .PL(new int[]{1, 2}) + .make(), + new GenotypeBuilder() + .name("big") + .alleles(Arrays.asList(ref, alt)) + .PL(new int[]{256}) // should pad out + .make() + ) + .make(); + + final byte[] bytes = new byte[]{ + 0x32, // 3 16-bit ints + 0x01, 0x00, 0x02, 0x00, (byte) BCF2Type.INT16.getEOVBytes(), (byte) (BCF2Type.INT16.getEOVBytes() >> 8), + (byte) 256, 256 >> 8, (byte) BCF2Type.INT16.getEOVBytes(), (byte) (BCF2Type.INT16.getEOVBytes() >> 8), (byte) BCF2Type.INT16.getEOVBytes(), (byte) (BCF2Type.INT16.getEOVBytes() >> 8) + }; + + cases.add(new Object[]{ + writer, vc, + vc.getGenotypes().stream().map(Genotype::getSampleName).collect(Collectors.toList()), + bytes, + }); + } + return cases.toArray(new Object[0][]); + } + + @Test(dataProvider = "genotypeWriterCases") + public void testGenotypeWriters( + final BCF2FieldWriter.GenotypeWriter writer, + final VariantContext vc, + final List sampleNames, + final byte[] expectedBytes + ) throws IOException { + writer.encode(vc, sampleNames); + Assert.assertEquals(expectedBytes, ENCODER.getRecordBytes()); + } +} diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java index 91804c48dc..e07d23cfd9 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java @@ -34,9 +34,8 @@ import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFIDHeaderLine; 
+import htsjdk.variant.vcf.VCFHeaderVersion; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import htsjdk.variant.vcf.VCFSimpleHeaderLine; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -51,47 +50,6 @@ * Tests for BCF2Utils */ public final class BCF2UtilsUnitTest extends VariantBaseTest { - @DataProvider(name = "CollapseExpandTest") - public Object[][] makeCollapseExpandTest() { - List tests = new ArrayList(); - tests.add(new Object[]{Arrays.asList("A"), "A", false}); - tests.add(new Object[]{Arrays.asList("A", "B"), ",A,B", true}); - tests.add(new Object[]{Arrays.asList("AB"), "AB", false}); - tests.add(new Object[]{Arrays.asList("AB", "C"), ",AB,C", true}); - tests.add(new Object[]{Arrays.asList(), "", false}); - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "CollapseExpandTest") - public void testCollapseExpandTest(final List in, final String expectedCollapsed, final boolean isCollapsed) { - final String actualCollapsed = BCF2Utils.collapseStringList(in); - Assert.assertEquals(actualCollapsed, expectedCollapsed); - Assert.assertEquals(BCF2Utils.isCollapsedString(actualCollapsed), isCollapsed); - if ( isCollapsed ) - Assert.assertEquals(BCF2Utils.explodeStringList(actualCollapsed), in); - } - - @Test - public void testCreateDictionary() { - final List inputLines = new ArrayList(); - int counter = 0; - inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); - inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); - inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, 
VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFHeaderLine("x", "misc")); - inputLines.add(new VCFHeaderLine("y", "misc")); - inputLines.add(new VCFSimpleHeaderLine("GATKCommandLine","z","misc")); - inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - final int inputLineCounter = counter; - final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet(inputLines)); - final ArrayList dict = BCF2Utils.makeDictionary(inputHeader); - final int dict_size = dict.size(); - Assert.assertEquals(7,dict_size); - } /** * Wrapper class for HeaderOrderTestProvider test cases to prevent TestNG from calling toString() @@ -102,7 +60,7 @@ private static class HeaderOrderTestCase { public final VCFHeader testHeader; public final boolean expectedConsistent; - public HeaderOrderTestCase( final VCFHeader inputHeader, final VCFHeader testHeader, final boolean expectedConsistent ) { + public HeaderOrderTestCase(final VCFHeader inputHeader, final VCFHeader testHeader, final boolean expectedConsistent) { this.inputHeader = inputHeader; this.testHeader = testHeader; this.expectedConsistent = expectedConsistent; @@ -111,41 +69,44 @@ public HeaderOrderTestCase( final VCFHeader inputHeader, final VCFHeader testHea @DataProvider(name = "HeaderOrderTestProvider") public Object[][] makeHeaderOrderTestProvider() { - final List inputLines = new ArrayList(); - final List extraLines = new ArrayList(); + final List inputLines = new ArrayList<>(); + final List extraLines = new ArrayList<>(); int counter = 0; - inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); - inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); - inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - inputLines.add(new 
VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + // We prefix all the line IDs with "l" because as of VCF 4.3, IDs cannot start with a number + inputLines.add(new VCFFilterHeaderLine("l" + counter++)); + inputLines.add(new VCFFilterHeaderLine("l" + counter++)); + inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "l" + counter++), counter)); + inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "l" + counter++), counter)); + inputLines.add(new VCFInfoHeaderLine("l" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFInfoHeaderLine("l" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFFormatHeaderLine("l" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFFormatHeaderLine("l" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); final int inputLineCounter = counter; - final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet(inputLines)); + final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet<>(inputLines)); - extraLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); - extraLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - extraLines.add(new 
VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - extraLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + extraLines.add(new VCFFilterHeaderLine("l" + counter++)); + extraLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "l" + counter++), counter)); + extraLines.add(new VCFInfoHeaderLine("l" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + extraLines.add(new VCFFormatHeaderLine("l" + counter++, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); extraLines.add(new VCFHeaderLine("x", "misc")); extraLines.add(new VCFHeaderLine("y", "misc")); - List tests = new ArrayList(); - for ( final int extrasToTake : Arrays.asList(0, 1, 2, 3) ) { + final List tests = new ArrayList<>(); + for (final int extrasToTake : Arrays.asList(0, 1, 2, 3)) { final List empty = Collections.emptyList(); final List> permutations = extrasToTake == 0 - ? Collections.singletonList(empty) - : GeneralUtils.makePermutations(extraLines, extrasToTake, false); - for ( final List permutation : permutations ) { - for ( int i = -1; i < inputLines.size(); i++ ) { - final List allLines = new ArrayList(inputLines); - if ( i >= 0 ) + ? 
Collections.singletonList(empty) + : GeneralUtils.makePermutations(extraLines, extrasToTake, false); + for (final List permutation : permutations) { + for (int i = -1; i < inputLines.size(); i++) { + final List allLines = new ArrayList<>(inputLines); + if (i >= 0) allLines.remove(i); allLines.addAll(permutation); - final VCFHeader testHeader = new VCFHeader(new LinkedHashSet(allLines)); + allLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + final VCFHeader testHeader = new VCFHeader(new LinkedHashSet<>(allLines)); final boolean expectedConsistent = expectedConsistent(testHeader, inputLineCounter); tests.add(new Object[]{new HeaderOrderTestCase(inputHeader, testHeader, expectedConsistent)}); } @@ -154,18 +115,18 @@ public Object[][] makeHeaderOrderTestProvider() { // sample name tests final List> sampleNameTests = Arrays.asList( - new ArrayList(), - Arrays.asList("A"), - Arrays.asList("A", "B"), - Arrays.asList("A", "B", "C")); - for ( final List inSamples : sampleNameTests ) { - for ( final List testSamples : sampleNameTests ) { + new ArrayList<>(), + Collections.singletonList("A"), + Arrays.asList("A", "B"), + Arrays.asList("A", "B", "C")); + for (final List inSamples : sampleNameTests) { + for (final List testSamples : sampleNameTests) { final VCFHeader inputHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), inSamples); final List> permutations = testSamples.isEmpty() - ? Collections.singletonList(testSamples) - : GeneralUtils.makePermutations(testSamples, testSamples.size(), false); - for ( final List testSamplesPermutation : permutations ) { + ? 
Collections.singletonList(testSamples) + : GeneralUtils.makePermutations(testSamples, testSamples.size(), false); + for (final List testSamplesPermutation : permutations) { final VCFHeader testHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), testSamplesPermutation); final boolean expectedConsistent = testSamples.equals(inSamples); tests.add(new Object[]{new HeaderOrderTestCase(inputHeaderWithSamples, testHeaderWithSamples, expectedConsistent)}); @@ -179,14 +140,15 @@ public Object[][] makeHeaderOrderTestProvider() { private static boolean expectedConsistent(final VCFHeader combinationHeader, final int minCounterForInputLines) { final List ids = new ArrayList(); for ( final VCFHeaderLine line : combinationHeader.getMetaDataInInputOrder() ) { - if ( line instanceof VCFIDHeaderLine) { - ids.add(Integer.valueOf(((VCFIDHeaderLine) line).getID())); + if ( line.isIDHeaderLine()) { + // Strip off "l" prefix + ids.add(Integer.valueOf(line.getID().substring(1))); } } // as long as the start contains all of the ids up to minCounterForInputLines in order - for ( int i = 0; i < minCounterForInputLines; i++ ) - if ( i >= ids.size() || ids.get(i) != i ) + for (int i = 0; i < minCounterForInputLines; i++) + if (i >= ids.size() || ids.get(i) != i) return false; return true; @@ -197,32 +159,8 @@ private static boolean expectedConsistent(final VCFHeader combinationHeader, fin // even when the header file is slightly different // @Test(dataProvider = "HeaderOrderTestProvider") - public void testHeaderOrder( final HeaderOrderTestCase testCase ) { + public void testHeaderOrder(final HeaderOrderTestCase testCase) { final boolean actualOrderConsistency = BCF2Utils.headerLinesAreOrderedConsistently(testCase.testHeader, testCase.inputHeader); Assert.assertEquals(actualOrderConsistency, testCase.expectedConsistent); } - - - private void assertListsAreEquivalent(final List a, final List b) { - Assert.assertEquals(a.size(), b.size()); - for (int i=0; i tests = new 
ArrayList(); - tests.add(new Object[]{Object.class, null, Collections.emptyList()}); - tests.add(new Object[]{Integer.class, 1, Arrays.asList(1)}); - tests.add(new Object[]{Integer.class, new int[]{1, 2, 3}, Arrays.asList(1, 2, 3)}); - tests.add(new Object[]{String.class, Arrays.asList("X", "Y"), Arrays.asList("X", "Y")}); - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "toListTestProvider") - public void testToList(final Class cls, final Object input, final List expectedOutput) { - assertListsAreEquivalent(BCF2Utils.toList(cls, input), expectedOutput); - } - - } diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java index 7a99916c5b..7256c9c967 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java @@ -1,34 +1,36 @@ /* -* Copyright (c) 2017 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ + * Copyright (c) 2017 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ package htsjdk.variant.bcf2; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.TestUtil; +import htsjdk.samtools.util.Tuple; import htsjdk.tribble.Tribble; import htsjdk.tribble.readers.PositionalBufferedStream; +import htsjdk.utils.BCFToolsTestUtils; import htsjdk.variant.VariantBaseTest; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; @@ -37,30 +39,40 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.VariantContextTestProvider; -import htsjdk.variant.variantcontext.writer.*; -import htsjdk.variant.vcf.*; +import htsjdk.variant.variantcontext.writer.Options; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; +import htsjdk.variant.vcf.VCFFileReader; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; import java.io.FileInputStream; import java.io.IOException; +import java.nio.file.Path; import java.util.ArrayList; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; +import java.util.zip.GZIPInputStream; /** * @author amila - *

    - * Class BCF2WriterUnitTest - *

    - * This class tests out the ability of the BCF writer to correctly write BCF files + *

    + * Class BCF2WriterUnitTest + *

    + * This class tests out the ability of the BCF writer to correctly write BCF files */ public class BCF2WriterUnitTest extends VariantBaseTest { @@ -72,17 +84,18 @@ public class BCF2WriterUnitTest extends VariantBaseTest { * @return a fake VCF header */ private static VCFHeader createFakeHeader() { - final SAMSequenceDictionary sequenceDict = createArtificialSequenceDictionary(); + final SAMSequenceDictionary sequenceDict = VariantBaseTest.createArtificialSequenceDictionary(); final Set metaData = new HashSet<>(); final Set additionalColumns = new HashSet<>(); + metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); metaData.add(new VCFHeaderLine("two", "2")); additionalColumns.add("extra1"); additionalColumns.add("extra2"); final VCFHeader header = new VCFHeader(metaData, additionalColumns); - header.addMetaDataLine(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.String, "x")); + header.addMetaDataLine(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "x")); header.addMetaDataLine(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "x")); header.addMetaDataLine(new VCFFormatHeaderLine("BB", 1, VCFHeaderLineType.String, "x")); - header.addMetaDataLine(new VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.String, "x")); + header.addMetaDataLine(new VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "x")); header.setSequenceDictionary(sequenceDict); return header; } @@ -101,25 +114,23 @@ private void createTemporaryDirectory() { public void testWriteAndReadBCF() throws IOException { final File bcfOutputFile = File.createTempFile("testWriteAndReadVCF.", ".bcf", tempDir); bcfOutputFile.deleteOnExit(); - final VCFHeader header = createFakeHeader(); + final VCFHeader header = BCF2WriterUnitTest.createFakeHeader(); try (final VariantContextWriter writer = new VariantContextWriterBuilder() - .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) - .unsetOption(Options.INDEX_ON_THE_FLY) - .build()) { 
+ .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build() + ) { writer.writeHeader(header); writer.add(createVC(header)); writer.add(createVC(header)); } - VariantContextTestProvider.VariantContextContainer container = VariantContextTestProvider - .readAllVCs(bcfOutputFile, new BCF2Codec()); + final VariantContextTestProvider.VariantContextContainer container = VariantContextTestProvider + .readAllVCs(bcfOutputFile, new BCF2Codec()); int counter = 0; - final Iterator it = container.getVCs().iterator(); - while (it.hasNext()) { - it.next(); + for (final VariantContext ignored : container.getVCs()) { counter++; } Assert.assertEquals(counter, 2); - } @@ -131,21 +142,20 @@ public void testWriteAndReadBCFWithIndex() throws IOException { final File bcfOutputFile = File.createTempFile("testWriteAndReadVCF.", ".bcf", tempDir); bcfOutputFile.deleteOnExit(); Tribble.indexFile(bcfOutputFile).deleteOnExit(); - final VCFHeader header = createFakeHeader(); + final VCFHeader header = BCF2WriterUnitTest.createFakeHeader(); try (final VariantContextWriter writer = new VariantContextWriterBuilder() - .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) - .setOptions(EnumSet.of(Options.INDEX_ON_THE_FLY)) - .build()) { + .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) + .setOptions(EnumSet.of(Options.INDEX_ON_THE_FLY)) + .build() + ) { writer.writeHeader(header); writer.add(createVC(header)); writer.add(createVC(header)); } - VariantContextTestProvider.VariantContextContainer container = VariantContextTestProvider - .readAllVCs(bcfOutputFile, new BCF2Codec()); + final VariantContextTestProvider.VariantContextContainer container = VariantContextTestProvider + .readAllVCs(bcfOutputFile, new BCF2Codec()); int counter = 0; - final Iterator it = container.getVCs().iterator(); - while (it.hasNext()) { - it.next(); + for (final VariantContext 
ignored : container.getVCs()) { counter++; } Assert.assertEquals(counter, 2); @@ -161,41 +171,43 @@ public void testWriteAndReadBCFHeaderless() throws IOException { final File bcfOutputHeaderlessFile = File.createTempFile("testWriteAndReadBCFHeaderless.", ".bcf", tempDir); bcfOutputHeaderlessFile.deleteOnExit(); - final VCFHeader header = createFakeHeader(); + final VCFHeader header = BCF2WriterUnitTest.createFakeHeader(); // we write two files, bcfOutputFile with the header, and bcfOutputHeaderlessFile with just the body try (final VariantContextWriter fakeBCFFileWriter = new VariantContextWriterBuilder() - .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) - .unsetOption(Options.INDEX_ON_THE_FLY) - .build()) { + .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build() + ) { fakeBCFFileWriter.writeHeader(header); // writes header } try (final VariantContextWriter fakeBCFBodyFileWriter = new VariantContextWriterBuilder() - .setOutputFile(bcfOutputHeaderlessFile).setReferenceDictionary(header.getSequenceDictionary()) - .unsetOption(Options.INDEX_ON_THE_FLY) - .build()) { + .setOutputFile(bcfOutputHeaderlessFile).setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build() + ) { fakeBCFBodyFileWriter.setHeader(header); // does not write header fakeBCFBodyFileWriter.add(createVC(header)); fakeBCFBodyFileWriter.add(createVC(header)); } - VariantContextTestProvider.VariantContextContainer container; - - try (final PositionalBufferedStream headerPbs = new PositionalBufferedStream(new FileInputStream(bcfOutputFile)); - final PositionalBufferedStream bodyPbs = new PositionalBufferedStream(new FileInputStream(bcfOutputHeaderlessFile))) { + try (final PositionalBufferedStream headerPbs = + new PositionalBufferedStream(new GZIPInputStream(new FileInputStream(bcfOutputFile))); + final PositionalBufferedStream bodyPbs = 
+ new PositionalBufferedStream(new GZIPInputStream(new FileInputStream(bcfOutputHeaderlessFile))) + ) { - BCF2Codec codec = new BCF2Codec(); + final BCF2Codec codec = new BCF2Codec(); codec.readHeader(headerPbs); // we use the header information read from identical file with header+body to read just the body of second file int counter = 0; while (!bodyPbs.isDone()) { - VariantContext vc = codec.decode(bodyPbs); + codec.decode(bodyPbs); counter++; } Assert.assertEquals(counter, 2); } - } /** @@ -207,42 +219,45 @@ public void testReadAndWritePhasedBCF() throws IOException { final File bcfOutputFile = File.createTempFile("testWriteAndReadBCFHeaderless.", ".bcf", tempDir); bcfOutputFile.deleteOnExit(); - try ( VCFFileReader vcfFile = new VCFFileReader(vcfInputFile); - - VariantContextWriter bcfWriter = new VariantContextWriterBuilder().setOutputFile(bcfOutputFile).setReferenceDictionary(vcfFile.getFileHeader().getSequenceDictionary()).build(); - - ) { - bcfWriter.writeHeader(vcfFile.getFileHeader()); - - for (VariantContext vc : vcfFile.iterator().toList()) { - Assert.assertEquals(vc.getGenotypes().stream().filter(Genotype::isPhased).count(), 2); - bcfWriter.add(vc); + try (final VCFFileReader vcfFile = new VCFFileReader(vcfInputFile)) { + try (final VariantContextWriter bcfWriter = new VariantContextWriterBuilder() + .setOutputFile(bcfOutputFile) + .setReferenceDictionary(vcfFile.getFileHeader().getSequenceDictionary()) + .build() + ) { + bcfWriter.writeHeader(vcfFile.getFileHeader()); + for (final VariantContext vc : vcfFile.iterator().toList()) { + Assert.assertEquals(vc.getGenotypes().stream().filter(Genotype::isPhased).count(), 2); + bcfWriter.add(vc); + } } - bcfWriter.close(); // Reading the VCF and writing it to a BCF final File vcfOutputFile = File.createTempFile("testWriteAndReadBCFHeaderless.", ".vcf", tempDir); vcfOutputFile.deleteOnExit(); - try (final PositionalBufferedStream headerPbs = new PositionalBufferedStream(new FileInputStream(bcfOutputFile)); - 
VariantContextWriter vcfWriter = new VariantContextWriterBuilder().setOutputFile(vcfOutputFile).setReferenceDictionary(vcfFile.getFileHeader().getSequenceDictionary()).build(); - ) { + try (final PositionalBufferedStream headerPbs = + new PositionalBufferedStream(new GZIPInputStream(new FileInputStream(bcfOutputFile))); + final VariantContextWriter vcfWriter = new VariantContextWriterBuilder() + .setOutputFile(vcfOutputFile) + .setReferenceDictionary(vcfFile.getFileHeader().getSequenceDictionary()) + .build() + ) { vcfWriter.writeHeader(vcfFile.getFileHeader()); - BCF2Codec codec = new BCF2Codec(); + final BCF2Codec codec = new BCF2Codec(); codec.readHeader(headerPbs); // we use the header information read from identical file with header+body to read just the body of second file while (!headerPbs.isDone()) { - VariantContext vc = codec.decode(headerPbs); + final VariantContext vc = codec.decode(headerPbs); Assert.assertEquals(vc.getGenotypes().stream().filter(Genotype::isPhased).count(), 2); vcfWriter.add(vc); } - vcfWriter.close(); } - try (VCFFileReader vcfOutput = new VCFFileReader(vcfInputFile);) { - for (VariantContext vc : vcfOutput.iterator().toList()) { + try (final VCFFileReader vcfOutput = new VCFFileReader(vcfInputFile)) { + for (final VariantContext vc : vcfOutput.iterator().toList()) { Assert.assertEquals(vc.getGenotypes().stream().filter(Genotype::isPhased).count(), 2); } } @@ -254,12 +269,13 @@ public void testWriteHeaderTwice() throws IOException { final File bcfOutputFile = File.createTempFile("testWriteAndReadVCF.", ".bcf", tempDir); bcfOutputFile.deleteOnExit(); - final VCFHeader header = createFakeHeader(); + final VCFHeader header = BCF2WriterUnitTest.createFakeHeader(); // prevent writing header twice try (final VariantContextWriter writer = new VariantContextWriterBuilder() - .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) - .unsetOption(Options.INDEX_ON_THE_FLY) - .build()) { + 
.setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build() + ) { writer.writeHeader(header); writer.writeHeader(header); } @@ -270,12 +286,13 @@ public void testChangeHeaderAfterWritingHeader() throws IOException { final File bcfOutputFile = File.createTempFile("testWriteAndReadVCF.", ".bcf", tempDir); bcfOutputFile.deleteOnExit(); - final VCFHeader header = createFakeHeader(); + final VCFHeader header = BCF2WriterUnitTest.createFakeHeader(); // prevent changing header if it's already written try (final VariantContextWriter writer = new VariantContextWriterBuilder() - .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) - .unsetOption(Options.INDEX_ON_THE_FLY) - .build()) { + .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build() + ) { writer.writeHeader(header); writer.setHeader(header); } @@ -286,12 +303,13 @@ public void testChangeHeaderAfterWritingBody() throws IOException { final File bcfOutputFile = File.createTempFile("testWriteAndReadVCF.", ".bcf", tempDir); bcfOutputFile.deleteOnExit(); - final VCFHeader header = createFakeHeader(); + final VCFHeader header = BCF2WriterUnitTest.createFakeHeader(); // prevent changing header if part of body is already written try (final VariantContextWriter writer = new VariantContextWriterBuilder() - .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) - .unsetOption(Options.INDEX_ON_THE_FLY) - .build()) { + .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build() + ) { writer.setHeader(header); writer.add(createVC(header)); writer.setHeader(header); @@ -304,7 +322,7 @@ public void testChangeHeaderAfterWritingBody() throws IOException { * @param header the VCF header * @return a VCFRecord */ - private VariantContext 
createVC(final VCFHeader header) { + private static VariantContext createVC(final VCFHeader header) { final List alleles = new ArrayList<>(); final Map attributes = new HashMap<>(); final GenotypesContext genotypes = GenotypesContext.create(header.getGenotypeSamples().size()); @@ -314,14 +332,121 @@ private VariantContext createVC(final VCFHeader header) { attributes.put("DP", "50"); for (final String name : header.getGenotypeSamples()) { - final Genotype gt = new GenotypeBuilder(name, alleles.subList(1, 2)).GQ(0).attribute("BB", "1").phased(true) - .make(); + final Genotype gt = new GenotypeBuilder(name, alleles.subList(1, 2)) + .GQ(0).attribute("BB", "1") + .phased(true) + .make(); genotypes.add(gt); } return new VariantContextBuilder("RANDOM", "1", 1, 1, alleles) - .genotypes(genotypes).attributes(attributes).make(); + .genotypes(genotypes).attributes(attributes).make(); } + @DataProvider + public Object[][] bcftoolsReadsHtsjdkOutputProvider() { + return new Object[][]{ + {"phased.vcf"}, + {"test1.vcf"}, + {"test2.vcf"}, + {"NA12891.vcf"}, + {"NA12891.fp.vcf"}, + {"structuralvariants.vcf"}, + {"ex2.vcf"}, + {"test.vcf.bgz"}, + {"vcf43/all43Features.utf8.vcf"}, + // This test fails because the BCF decoder cannot distinguish between a vector of Characters and a String +// {"missingStringAndCharacterTest.vcf"}, + }; + } + @Test(dataProvider = "bcftoolsReadsHtsjdkOutputProvider") + public void testBCFToolsReadsHtsjdkOutput(final String testFile) throws IOException { + // Take an input VCF and read it into memory as our expected output + // Take the same VCF and write it out as a BCF using htsjdk's BCF2Writer, use bcftools to convert from + // BCF back to VCF, and read the converted VCF into memory again as our actual output + final Path path = new File(VariantBaseTest.variantTestDataRoot + testFile).toPath(); + final Tuple> expectedVCF = readEntireVCFIntoMemory(path); + final VCFHeader header = expectedVCF.a; + final List expectedVariantContexts = expectedVCF.b; + + 
final File bcfOutputFile = File.createTempFile("testBCFToolsRoundTrip" + testFile, ".bcf", tempDir); + bcfOutputFile.deleteOnExit(); + + try (final VariantContextWriter writer = new VariantContextWriterBuilder() + .setOutputFile(bcfOutputFile).setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build() + ) { + writer.writeHeader(header); + for (final VariantContext vc : expectedVariantContexts) { + writer.add(vc.fullyDecode(header, false)); + } + } + + final Path converted = BCFToolsTestUtils.BCFToVCF(bcfOutputFile, "").toPath(); + final Tuple> actualVCF = readEntireVCFIntoMemory(converted); + final List actualVariantContexts = actualVCF.b; + + // Don't compare the headers, since they might contain extraneous lines, and the BCF codec isn't responsible + // for headers + Assert.assertEquals(expectedVariantContexts.size(), actualVCF.b.size()); + final int length = expectedVariantContexts.size(); + for (int i = 0; i < length; i++) { + // Fully decode both variant contexts so that we're comparing actual objects and not their string + // representations, which can be different without affecting semantics, e.g. number of digits in a double + VariantBaseTest.assertVariantContextsAreEqual( + actualVariantContexts.get(i).fullyDecode(header, false), + expectedVariantContexts.get(i).fullyDecode(header, false) + ); + } + } + + @DataProvider + public Object[][] htsjdkReadsBCFToolsOutputProvider() { + return new Object[][]{ + {"phased.vcf"}, + {"test1.vcf"}, + {"test2.vcf"}, + {"NA12891.vcf"}, + {"NA12891.fp.vcf"}, + {"structuralvariants.vcf"}, + {"ex2.vcf"}, + {"test.vcf.bgz"}, + // TODO bcftools does not convert '.' into the MISSING value for Character (0x07), + // but writes it out as literal '.' which causes this test to fail when we compare '.' 
against null,
+            // see https://github.com/samtools/hts-specs/issues/618
+//            {"missingStringAndCharacterTest.vcf"},
+            // bcftools does not do decoding of percent-encoded VCFs, so its BCF output contains the literal characters
+//            {"vcf43/all43Features.utf8.vcf"}
+        };
+    }
+
+    @Test(dataProvider = "htsjdkReadsBCFToolsOutputProvider")
+    public void testHtsjdkReadsBCFToolsOutput(final String testFile) {
+        // Take an input VCF and read it into memory as our expected output
+        // Take the same VCF and convert it to BCF using bcftools, then read the BCF into memory again as our actual output
+        final Path path = new File(VariantBaseTest.variantTestDataRoot + testFile).toPath();
+        final Tuple> expectedVCF = readEntireVCFIntoMemory(path);
+        final VCFHeader header = expectedVCF.a;
+        final List expectedVariantContexts = expectedVCF.b;
+
+        final File converted = BCFToolsTestUtils.VCFtoBCF(path.toFile(), "");
+        final VCFFileReader reader = new VCFFileReader(converted, false);
+
+        final List actualVariantContexts = reader.iterator().stream().collect(Collectors.toList());
+
+        // Don't compare the headers, since they might contain extraneous lines, and the BCF codec isn't responsible
+        // for headers
+        Assert.assertEquals(expectedVariantContexts.size(), actualVariantContexts.size());
+        final int length = expectedVariantContexts.size();
+        for (int i = 0; i < length; i++) {
+            // Fully decode both variant contexts so that we're comparing actual objects and not their string
+            // representations, which can be different without affecting semantics, e.g. 
number of digits in a double + VariantBaseTest.assertVariantContextsAreEqual( + actualVariantContexts.get(i).fullyDecode(header, false), + expectedVariantContexts.get(i).fullyDecode(header, false) + ); + } + } } diff --git a/src/test/java/htsjdk/variant/bcf2/BCFCodecTest.java b/src/test/java/htsjdk/variant/bcf2/BCFCodecTest.java index 39fce34b18..5c0af6c761 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCFCodecTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCFCodecTest.java @@ -13,24 +13,13 @@ import java.io.IOException; public class BCFCodecTest extends VariantBaseTest { - final String TEST_DATA_DIR = "src/test/resources/htsjdk/variant/"; - - // should reject bcf v2.2 on read, see issue https://github.com/samtools/htsjdk/issues/1323 - @Test(expectedExceptions = TribbleException.class) - private void testRejectBCFVersion22() throws IOException { - BCF2Codec bcfCodec = new BCF2Codec(); - try (final FileInputStream fis = new FileInputStream(new File(TEST_DATA_DIR, "BCFVersion22Uncompressed.bcf")); - final PositionalBufferedStream pbs = new PositionalBufferedStream(fis)) { - bcfCodec.readHeader(pbs); - } - } + private static final String TEST_DATA_DIR = "src/test/resources/htsjdk/variant/"; @Test - private void testBCFCustomVersionCompatibility() throws IOException { + public void testBCFCustomVersionCompatibility() throws IOException { final BCF2Codec bcfCodec = new BCF2Codec() { @Override protected void validateVersionCompatibility(final BCFVersion supportedVersion, final BCFVersion actualVersion) { - return; } }; diff --git a/src/test/java/htsjdk/variant/variantcontext/GenotypeBuilderTest.java b/src/test/java/htsjdk/variant/variantcontext/GenotypeBuilderTest.java index 5e3f0b9eb8..caed6dbdf8 100644 --- a/src/test/java/htsjdk/variant/variantcontext/GenotypeBuilderTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/GenotypeBuilderTest.java @@ -25,16 +25,85 @@ package htsjdk.variant.variantcontext; +import htsjdk.tribble.TribbleException; import 
htsjdk.variant.VariantBaseTest; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; public class GenotypeBuilderTest extends VariantBaseTest { + @Test(expectedExceptions = TribbleException.class) + public void testRejectDuplicateFilters() { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + gb.filter("x;y;x"); + } + + @Test(expectedExceptions = TribbleException.class) + public void testRejectDuplicateFiltersCollection() { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + gb.filters(Arrays.asList("x", "y", "x")); + } + + @Test(expectedExceptions = TribbleException.class) + public void testRejectDuplicateFiltersArray() { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + gb.filters("x", "y", "x"); + } + + @DataProvider + public Object[][] illegalFilterNameProvider() { + return new Object[][]{ + // Reserved string 0 + {"0"}, + // Contains whitespace + {"a b"}, + // Contains separator + {"a;b"} + }; + } + + @Test(dataProvider = "illegalFilterNameProvider", expectedExceptions = TribbleException.class) + public void testRejectIllegalFilterName(final String filter) { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + gb.filters(Collections.singletonList(filter)); + } + + @DataProvider + public Object[][] illegalFilterSeparatorPlacementProvider() { + return new Object[][]{ + // Begins with ; + {";a"}, + // Ends with ; + {"a;"}, + // Contains adjacent internal ; + {"a;;b"} + }; + } + + @Test(dataProvider = "illegalFilterSeparatorPlacementProvider", expectedExceptions = TribbleException.class) + public void testRejectIllegalFilterSeparatorPlacement(final String filter) { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + gb.filters(filter); + } + + @Test(expectedExceptions = TribbleException.class) + public void testRejectMissingWithValueFilterString() { + final 
GenotypeBuilder gb = new GenotypeBuilder("test"); + gb.filters("a;."); + } + + @Test + public void testAcceptMissingFilterString() { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + gb.filters("."); + } + @Test public void testMakeWithShallowCopy() { final GenotypeBuilder gb = new GenotypeBuilder("test"); diff --git a/src/test/java/htsjdk/variant/variantcontext/VariantContextBuilderTest.java b/src/test/java/htsjdk/variant/variantcontext/VariantContextBuilderTest.java index c8871bd2be..fdd95e1e14 100644 --- a/src/test/java/htsjdk/variant/variantcontext/VariantContextBuilderTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/VariantContextBuilderTest.java @@ -1,5 +1,6 @@ package htsjdk.variant.variantcontext; +import htsjdk.tribble.TribbleException; import htsjdk.variant.VariantBaseTest; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -258,7 +259,7 @@ public static Object[][] illegalFilterStrings() { }; } - @Test(dataProvider = "illegalFilterStrings", expectedExceptions = IllegalStateException.class) + @Test(dataProvider = "illegalFilterStrings", expectedExceptions = TribbleException.class) public void testFilterCannotUseBadFilters(final String filter) { final Set filters = new HashSet<>(); filters.add(filter); @@ -322,7 +323,7 @@ public void testCanResetFilters() { builder.filter("mayIPlease?"); } - @Test(expectedExceptions = IllegalStateException.class) + @Test(expectedExceptions = TribbleException.class) public void testCantCreateNullFilter(){ final VariantContextBuilder builder = new VariantContextBuilder("source", "contig", 1, 1, Arrays.asList(Tref, C, G)).filter("TEST"); builder.filters((String)null); diff --git a/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java b/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java index 8cff545f78..91efd8bcf0 100644 --- a/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java +++ 
b/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java @@ -27,11 +27,9 @@ import htsjdk.HtsjdkTest; import htsjdk.tribble.FeatureCodec; -import htsjdk.tribble.FeatureCodecHeader; import htsjdk.tribble.Tribble; import htsjdk.tribble.readers.LineIterator; import htsjdk.tribble.readers.LineIteratorImpl; -import htsjdk.tribble.readers.PositionalBufferedStream; import htsjdk.tribble.readers.SynchronousLineReader; import htsjdk.variant.VariantBaseTest; import htsjdk.variant.bcf2.BCF2Codec; @@ -48,7 +46,8 @@ import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; - +import htsjdk.variant.vcf.VCFIterator; +import htsjdk.variant.vcf.VCFIteratorBuilder; import org.testng.Assert; import java.io.BufferedInputStream; @@ -221,6 +220,7 @@ private final static void addHeaderLine(final Set metaData, final private static void createSyntheticHeader() { Set metaData = new TreeSet<>(); + metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); addHeaderLine(metaData, "STRING1", 1, VCFHeaderLineType.String); addHeaderLine(metaData, "END", 1, VCFHeaderLineType.Integer); addHeaderLine(metaData, "STRING3", 3, VCFHeaderLineType.String); @@ -754,29 +754,9 @@ public void remove() { } } public static VariantContextContainer readAllVCs(final File input, final BCF2Codec codec) throws IOException { - PositionalBufferedStream headerPbs = new PositionalBufferedStream(new FileInputStream(input)); - FeatureCodecHeader header = codec.readHeader(headerPbs); - headerPbs.close(); - - final PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(input)); - pbs.skip(header.getHeaderEnd()); - - final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); - return new VariantContextTestProvider.VariantContextContainer(vcfHeader, new VariantContextTestProvider.VCIterable(codec, vcfHeader) { - @Override - public boolean hasNext() { - try { - return 
!pbs.isDone(); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - @Override - public Object nextSource() { - return pbs; - } - }); + final VCFIterator iterator = new VCFIteratorBuilder().open(input); + final VCFHeader vcfHeader = iterator.getHeader(); + return new VariantContextTestProvider.VariantContextContainer(vcfHeader, () -> iterator); } public static VariantContextContainer readAllVCs(final File input, final VCFCodec codec) throws FileNotFoundException { @@ -867,7 +847,7 @@ public static void assertEquals(final Genotype actual, final Genotype expected) // inline attributes Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp"); - Assert.assertTrue(Arrays.equals(actual.getAD(), expected.getAD())); + Assert.assertEquals(actual.getAD(), expected.getAD(), "Genotype ad"); Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq"); Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL"); Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD"); diff --git a/src/test/java/htsjdk/variant/variantcontext/VariantContextUnitTest.java b/src/test/java/htsjdk/variant/variantcontext/VariantContextUnitTest.java index 8613be1e01..085bf6d10e 100644 --- a/src/test/java/htsjdk/variant/variantcontext/VariantContextUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/VariantContextUnitTest.java @@ -41,6 +41,7 @@ import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFFileReader; +import htsjdk.variant.vcf.VCFHeader; import org.testng.Assert; import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeSuite; @@ -1200,7 +1201,7 @@ private VariantContext createTestVariantContext(final List alleles, fina // most of the fields are not important to the tests, we just need alleles and gc set properly return new VariantContext("genotypes", VCFConstants.EMPTY_ID_FIELD, snpLoc, snpLocStart, snpLocStop, alleles, gc, VariantContext.NO_LOG10_PERROR, filters, attributes, - 
fullyDecoded, toValidate); + fullyDecoded, VCFHeader.DEFAULT_VCF_VERSION, toValidate); } // validateReferenceBases: PASS conditions @@ -1296,7 +1297,7 @@ private VariantContext createTestVariantContextRsIds(final String rsId) { return new VariantContext("genotypes", rsId, snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T), GenotypesContext.NO_GENOTYPES, VariantContext.NO_LOG10_PERROR, filters, attributes, - fullyDecoded, toValidate); + fullyDecoded, VCFHeader.DEFAULT_VCF_VERSION, toValidate); } private Set makeRsIDsSet(final String... rsIds) { return new HashSet<>(Arrays.asList(rsIds)); diff --git a/src/test/java/htsjdk/variant/variantcontext/writer/AsyncVariantContextWriterUnitTest.java b/src/test/java/htsjdk/variant/variantcontext/writer/AsyncVariantContextWriterUnitTest.java index 9e7f7e45cb..379130407c 100644 --- a/src/test/java/htsjdk/variant/variantcontext/writer/AsyncVariantContextWriterUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/writer/AsyncVariantContextWriterUnitTest.java @@ -89,7 +89,7 @@ public void testWriteAndReadAsyncVCFHeaderless() throws IOException { writer.add(createVC(header)); } final VCFCodec codec = new VCFCodec(); - codec.setVCFHeader(header, VCFHeaderVersion.VCF4_2); + codec.setVCFHeader(header); try (final FileInputStream fis = new FileInputStream(fakeVCFFile)) { final AsciiLineReaderIterator iterator = new AsciiLineReaderIterator(new AsciiLineReader(fis)); @@ -110,6 +110,7 @@ public void testWriteAndReadAsyncVCFHeaderless() throws IOException { */ public static VCFHeader createFakeHeader(final Set metaData, final Set additionalColumns, final SAMSequenceDictionary sequenceDict) { + metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); metaData.add(new VCFHeaderLine("two", "2")); additionalColumns.add("extra1"); additionalColumns.add("extra2"); diff --git a/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java 
b/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java index ca2afcbec0..83376588c3 100644 --- a/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java @@ -29,12 +29,9 @@ import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.samtools.util.FileExtensions; import htsjdk.samtools.util.TestUtil; -import htsjdk.tribble.AbstractFeatureReader; -import htsjdk.tribble.FeatureReader; import htsjdk.tribble.Tribble; import htsjdk.tribble.readers.AsciiLineReader; import htsjdk.tribble.readers.AsciiLineReaderIterator; -import htsjdk.tribble.util.TabixUtils; import htsjdk.variant.VariantBaseTest; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; @@ -43,10 +40,18 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.vcf.VCFCodec; +import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFFileReader; +import htsjdk.variant.vcf.VCFFormatHeaderLine; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFHeaderVersion; +import htsjdk.variant.vcf.VCFStandardHeaderLines; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; import java.io.File; import java.io.FileInputStream; @@ -56,16 +61,10 @@ import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - /** * @author aaron *

    @@ -108,31 +107,23 @@ public void testBasicWriteAndRead(final String extension) throws IOException { writer.add(createVC(header)); writer.add(createVC(header)); writer.close(); - final VCFCodec codec = new VCFCodec(); - final FeatureReader reader = AbstractFeatureReader.getFeatureReader(fakeVCFFile.getAbsolutePath(), codec, false); - final VCFHeader headerFromFile = (VCFHeader)reader.getHeader(); + final VCFFileReader reader = new VCFFileReader(fakeVCFFile.toPath(), false, VCFVersionUpgradePolicy.DO_NOT_UPGRADE); + final VCFHeader headerFromFile = reader.getHeader(); int counter = 0; // validate what we're reading in validateHeader(headerFromFile, sequenceDict); - try { - final Iterator it = reader.iterator(); - while(it.hasNext()) { - it.next(); - counter++; - } - Assert.assertEquals(counter, 2); - } - catch (final IOException e ) { - throw new RuntimeException(e.getMessage()); + for (final VariantContext variantContext : reader) { + counter++; } + Assert.assertEquals(counter, 2); } /** test, using the writer and reader, that we can output and input a VCF body without problems */ - @Test(dataProvider = "vcfExtensionsDataProvider") + @Test(dataProvider = "vcfHeaderlessExtensionsDataProvider") public void testWriteAndReadVCFHeaderless(final String extension) throws IOException { final File fakeVCFFile = File.createTempFile("testWriteAndReadVCFHeaderless.", extension, tempDir); fakeVCFFile.deleteOnExit(); @@ -154,7 +145,7 @@ public void testWriteAndReadVCFHeaderless(final String extension) throws IOExcep writer.add(createVC(header)); } final VCFCodec codec = new VCFCodec(); - codec.setVCFHeader(header, VCFHeaderVersion.VCF4_2); + codec.setVCFHeader(header); try (BlockCompressedInputStream bcis = new BlockCompressedInputStream(fakeVCFFile); FileInputStream fis = new FileInputStream(fakeVCFFile)) { @@ -228,8 +219,14 @@ public void testChangeHeaderAfterWritingBody() { */ private static VCFHeader createFakeHeader(final Set metaData, final Set additionalColumns, final 
SAMSequenceDictionary sequenceDict) { - metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString())); + metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); metaData.add(new VCFHeaderLine("two", "2")); + // Explicitly add GT, AD, and BB keys because the .bcf tests that use this fake header require that the header + // contain INFO/FORMAT lines for all the attributes written + metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY)); + metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY)); + metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); + metaData.add(new VCFFormatHeaderLine("BB", 1, VCFHeaderLineType.Integer, "test key")); additionalColumns.add("extra1"); additionalColumns.add("extra2"); final VCFHeader ret = new VCFHeader(metaData, additionalColumns); @@ -330,13 +327,21 @@ public void TestWritingLargeVCF(final String extension) throws FileNotFoundExcep @DataProvider(name = "vcfExtensionsDataProvider") public Object[][]vcfExtensionsDataProvider() { return new Object[][] { - // TODO: BCF doesn't work because header is not properly constructed. - // {".bcf"}, + {FileExtensions.BCF}, {FileExtensions.VCF}, {FileExtensions.COMPRESSED_VCF} }; } + // Testing writing headerless files does not make sense for .bcf because BCF's strong typing makes writing + // bodies without headers impossible, so we only test VCF and compressed VCF with headerless writing + @DataProvider(name = "vcfHeaderlessExtensionsDataProvider") + public Object[][]vcfHeaderlessExtensionsDataProvider() { + return new Object[][] { + {FileExtensions.VCF}, + {FileExtensions.COMPRESSED_VCF} + }; + } /** * A test to ensure that if we add a line to a VCFHeader it will persist through @@ -369,7 +374,7 @@ public void testModifyHeader() { * * A test to check that we can't write VCF with missing header. 
*/ - @Test(dataProvider = "vcfExtensionsDataProvider", expectedExceptions = IllegalStateException.class) + @Test(dataProvider = "vcfHeaderlessExtensionsDataProvider", expectedExceptions = IllegalStateException.class) public void testWriteWithEmptyHeader(final String extension) throws IOException { final File fakeVCFFile = File.createTempFile("testWriteAndReadVCFHeaderless.", extension, tempDir); metaData = new HashSet<>(); diff --git a/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java b/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java index 273b0f24af..8bdc321b51 100644 --- a/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java +++ b/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java @@ -13,6 +13,7 @@ import java.util.Iterator; import java.util.List; + public class AbstractVCFCodecTest extends VariantBaseTest { @Test @@ -31,11 +32,28 @@ public void shouldPreserveSymbolicAlleleCase() { public void TestSpanDelParseAlleles() { final List list = VCF3Codec.parseAlleles("A", Allele.SPAN_DEL_STRING, 0); } + @DataProvider(name="AllVCFCodecs") + public Object[][] allVCFCodecs() { + return new Object[][] { + {new VCF3Codec() }, + {new VCFCodec() }, + }; + } + + @Test(dataProvider = "AllVCFCodecs") + public void TestSpanDelParseAlleles(final AbstractVCFCodec vcfCodec){ + // TODO: why is there no Assert here ?? 
+ vcfCodec.parseAlleles("A", Allele.SPAN_DEL_STRING, 0); + } @Test(expectedExceptions = TribbleException.class) public void TestSpanDelParseAllelesException() { final List list1 = VCF3Codec.parseAlleles(Allele.SPAN_DEL_STRING, "A", 0); } + @Test(dataProvider = "AllVCFCodecs", expectedExceptions = TribbleException.class) + public void TestSpanDelParseAllelesException(final AbstractVCFCodec vcfCodec){ + vcfCodec.parseAlleles(Allele.SPAN_DEL_STRING, "A", 0); + } @DataProvider(name = "thingsToTryToDecode") public Object[][] getThingsToTryToDecode() { @@ -47,16 +65,49 @@ public Object[][] getThingsToTryToDecode() { }; } - @Test(dataProvider = "thingsToTryToDecode") - public void testCanDecodeFile(String potentialInput, boolean canDecode) { - Assert.assertEquals(AbstractVCFCodec.canDecodeFile(potentialInput, VCFCodec.VCF4_MAGIC_HEADER), canDecode); - } + @Test(dataProvider = "thingsToTryToDecode") + public void testCanDecodeFile(String potentialInput, boolean canDecode) { + //TODO: add VCF43Codec when available + //TODO: its not sufficient to test for ANY v4 prefix since it will succeed on 4.3 as well + Assert.assertEquals(AbstractVCFCodec.canDecodeFile(potentialInput, VCFCodec.VCF4_MAGIC_HEADER), canDecode); + } - @Test - public void testGetTabixFormat() { - Assert.assertEquals(new VCFCodec().getTabixFormat(), TabixFormat.VCF); - Assert.assertEquals(new VCF3Codec().getTabixFormat(), TabixFormat.VCF); - } + @Test(dataProvider = "AllVCFCodecs") + public void testGetTabixFormat(final AbstractVCFCodec vcfCodec) { + Assert.assertEquals(vcfCodec.getTabixFormat(), TabixFormat.VCF); + } + + @DataProvider(name="otherHeaderLines") + public Object[][] otherHeaderLines() { + return new Object[][] { + { "key=<", new VCFHeaderLine("key", "<") }, + // taken from Funcotator test file as ##ID= + // technically, this is invalid due to the lack of an "ID" attribute, but it should still parse + // into a VCFHeaderLine (just not a VCFSimpleHeaderLine) + { "ID=", + new VCFHeaderLine("ID", 
"") }, + }; + } + + @Test(dataProvider="otherHeaderLines") + public void testGetOtherHeaderLine(final String headerLineString, final VCFHeaderLine headerLine) { + Assert.assertEquals(new VCFCodec().getOtherHeaderLine(headerLineString, VCFHeaderVersion.VCF4_2), headerLine); + } + + @DataProvider(name="badOtherHeaderLines") + public Object[][] badOtherHeaderLines() { + return new Object[][] { + { "=" }, + { "=<" }, + { "=<>" }, + { "key" }, + }; + } + + @Test(dataProvider="badOtherHeaderLines", expectedExceptions=TribbleException.InvalidHeader.class) + public void testBadOtherHeaderLine(final String headerLineString) { + Assert.assertNull(new VCFCodec().getOtherHeaderLine(headerLineString, VCFHeaderVersion.VCF4_2)); + } @Test public void testGLnotOverridePL() { diff --git a/src/test/java/htsjdk/variant/vcf/VCFAltHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFAltHeaderLineUnitTest.java new file mode 100644 index 0000000000..ed6a1d2b96 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFAltHeaderLineUnitTest.java @@ -0,0 +1,43 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFAltHeaderLineUnitTest extends HtsjdkTest { + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + }; + } + + private static final String ALT_STRING = ""; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + final VCFAltHeaderLine vcfLine = new VCFAltHeaderLine(ALT_STRING, 
vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFAltHeaderLine(ALT_STRING, vcfAllowedVersion); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java b/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java index cbc027ab5d..d11af08105 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java @@ -10,6 +10,10 @@ import htsjdk.variant.VariantBaseTest; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.Options; +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -39,27 +43,40 @@ public class VCFCodec43FeaturesTest extends VariantBaseTest { private static final Path TEST_43_UTF8_FILE = TEST_PATH.resolve("all43Features.utf8.vcf"); private static final Path TEST_43_UTF8_GZ_FILE = TEST_PATH.resolve("all43FeaturesCompressed.utf8.vcf.gz"); + private static final Path TEST_42_PEDIGREE_FILE = TEST_PATH.resolve("42Pedigree.vcf"); + private static final Path TEST_INVALID_43_CONTIG_NAME_FILE = TEST_PATH.resolve("invalid43ContigName.vcf"); + private static final Path TEST_VALID_43_CONTIG_NAME_FILE = TEST_PATH.resolve("valid43ContigName.vcf"); + private static final Path TEST_42_AUTOMATICALLY_CONVERTIBLE_FILE = TEST_PATH.resolve("42AutomaticallyConvertible.vcf"); + @DataProvider(name="all43Files") private Object[][] 
allVCF43Files() { return new Object[][] { // a .vcf, .vcf.gz, .vcf with UTF8 chars, and .vcf.gz with UTF8 chars - { TEST_43_FILE }, - { TEST_43_UTF8_FILE }, - { TEST_43_GZ_FILE }, - { TEST_43_UTF8_GZ_FILE } + + // these first two files have a duplicate INFO header line in them that differ + // from each other only by virtue of having different descriptions: + //WARNING 2021-02-23 15:37:13 VCFMetaDataLines Attempt to add header line (INFO=) collides with existing line header line (INFO=). + // The existing line will be retained + { TEST_43_FILE, 69 }, + { TEST_43_UTF8_FILE, 69 }, + + { TEST_43_GZ_FILE, 70 }, + { TEST_43_UTF8_GZ_FILE, 70 } }; } @Test(dataProvider="all43Files") - public void testReadAllVCF43Features(final Path testFile) { + public void testReadAllVCF43Features(final Path testFile, final int expectedHeaderLineCount) { final Tuple> entireVCF = readEntireVCFIntoMemory(testFile); - Assert.assertEquals(entireVCF.a.getMetaDataInInputOrder().size(), 70); + Assert.assertEquals(entireVCF.a.getMetaDataInInputOrder().size(), expectedHeaderLineCount); Assert.assertEquals(entireVCF.b.size(), 25); } @Test(dataProvider="all43Files") - public void testVCF43SampleLine(final Path testFile) { + public void testVCF43SampleLine(final Path testFile, int ignored) { // ##SAMPLE= final VCFSampleHeaderLine sampleLine = getHeaderLineFromTestFile( @@ -77,7 +94,7 @@ public void testVCF43SampleLine(final Path testFile) { } @Test(dataProvider="all43Files") - public void testVCF43AltLine(final Path testFile) { + public void testVCF43AltLine(final Path testFile, int ignored) { // ##ALT= final VCFAltHeaderLine altLine = getHeaderLineFromTestFile( testFile, @@ -90,7 +107,7 @@ public void testVCF43AltLine(final Path testFile) { } @Test(dataProvider="all43Files") - public void testVCF43PedigreeLine(final Path testFile) { + public void testVCF43PedigreeLine(final Path testFile, int ignored) { // ##PEDIGREE= final VCFPedigreeHeaderLine pedigreeLine = getHeaderLineFromTestFile( testFile, @@ 
-116,7 +133,7 @@ public void testV43PedigreeParsing() { } @Test(dataProvider="all43Files") - public void testVCF43MetaLine(final Path testFile) { + public void testVCF43MetaLine(final Path testFile, int ignored) { // ##META= final VCFMetaHeaderLine metaLine = getHeaderLineFromTestFile( testFile, @@ -129,12 +146,12 @@ public void testVCF43MetaLine(final Path testFile) { } @Test(dataProvider="all43Files") - public void testVCF43PercentEncoding(final Path testFile) { + public void testVCF43PercentEncoding(final Path testFile, int ignored) { final Tuple> entireVCF = readEntireVCFIntoMemory(testFile); // 1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE // AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=fil%3AteredInBoth - final VariantContext vc = entireVCF.b.get(0); + final VariantContext vc = entireVCF.b.get(0).fullyDecode(entireVCF.a, false); Assert.assertEquals(vc.getContig(), "1"); Assert.assertEquals(vc.getStart(), 327); // set=fil%3AteredInBoth @@ -142,7 +159,7 @@ public void testVCF43PercentEncoding(final Path testFile) { } @Test(dataProvider="all43Files") - public void testSymbolicAlternateAllele(final Path testFile) { + public void testSymbolicAlternateAllele(final Path testFile, int ignored) { final Tuple> entireVCF = readEntireVCFIntoMemory(testFile); // 1 327 . 
T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE @@ -158,6 +175,81 @@ public void testSymbolicAlternateAllele(final Path testFile) { Assert.assertEquals(symbolicAlternateAllele, Allele.create(Allele.UNSPECIFIED_ALTERNATE_ALLELE_STRING)); } + @Test(dataProvider = "all43Files") + public void testReadWriteRoundTrip(final Path testFile, final int ignored) throws IOException { + // Make sure 4.3 files round trip through reading into memory, writing, then reading back in + final Tuple> readVCF = readEntireVCFIntoMemory(testFile); + final VCFHeader readHeader = readVCF.a; + + final File out = File.createTempFile("testReadWriteRoundTrip", testFile.getFileName().toString()); + out.deleteOnExit(); + + final VariantContextWriter writer = new VariantContextWriterBuilder() + .setOutputFile(out) + .unsetOption(Options.INDEX_ON_THE_FLY) + .unsetOption(Options.DO_NOT_WRITE_GENOTYPES) + .build(); + + writer.writeHeader(readHeader); + for (final VariantContext vc : readVCF.b) { + writer.add(vc.fullyDecode(readHeader, false)); + } + + writer.close(); + + final Tuple> writeVCF = readEntireVCFIntoMemory(out.toPath()); + final VCFHeader writeHeader = writeVCF.a; + + Assert.assertNotNull(readHeader.getVCFHeaderVersion()); + Assert.assertNotNull(writeHeader.getVCFHeaderVersion()); + + Assert.assertEquals(readHeader.getMetaDataInSortedOrder(), writeHeader.getMetaDataInSortedOrder()); + Assert.assertEquals(readHeader.getInfoHeaderLines(), writeHeader.getInfoHeaderLines()); + Assert.assertEquals(readHeader.getFormatHeaderLines(), writeHeader.getFormatHeaderLines()); + + Assert.assertEqualsNoOrder(readHeader.getFilterLines().toArray(), writeHeader.getFilterLines().toArray()); + Assert.assertEqualsNoOrder(readHeader.getContigLines().toArray(), writeHeader.getContigLines().toArray()); + + for (int i = 0; i < writeVCF.b.size(); i++) { + VariantBaseTest.assertVariantContextsAreEqual( + writeVCF.b.get(i).fullyDecode(writeHeader, false), + readVCF.b.get(i).fullyDecode(readHeader, false) + ); + } + } + + 
@DataProvider(name = "automaticUpConversionTestFiles") + private Object[][] automaticUpConversionTestFiles() { + return new Object[][]{ + {TEST_42_PEDIGREE_FILE, VCFHeaderVersion.VCF4_2}, + {TEST_INVALID_43_CONTIG_NAME_FILE, VCFHeaderVersion.VCF4_2}, + {TEST_VALID_43_CONTIG_NAME_FILE, VCFHeaderVersion.VCF4_3}, + {TEST_42_AUTOMATICALLY_CONVERTIBLE_FILE, VCFHeaderVersion.VCF4_3} + }; + } + + @Test(dataProvider = "automaticUpConversionTestFiles") + public void testAutomaticUpConversion(final Path testFile, final VCFHeaderVersion expectedVersion) throws IOException { + // Pre 4.3 files which can be automatically converted to 4.3 should be + // and files which cannot should be left as 4.2 + final Tuple> readVCF = readEntireVCFIntoMemory(testFile); + + final File out = File.createTempFile("test", testFile.getFileName().toString()); + out.deleteOnExit(); + + final VariantContextWriter writer = new VariantContextWriterBuilder() + .setOutputFile(out) + .unsetOption(Options.INDEX_ON_THE_FLY) + .unsetOption(Options.DO_NOT_WRITE_GENOTYPES) + .build(); + + writer.writeHeader(readVCF.a); + writer.close(); + + final Tuple> writeVCF = readEntireVCFIntoMemory(out.toPath()); + Assert.assertEquals(writeVCF.a.getVCFHeaderVersion(), expectedVersion); + } + @DataProvider(name="all43IndexableFiles") private Object[][] allVCF43IndexableFiles() { return new Object[][] { @@ -241,7 +333,7 @@ public void testVCF43PercentEncodingWithUTF8() { // given a vcf file, extract a header line with the given key and ID, cast to the target // header line type (T) via the transformer function - private static T getHeaderLineFromTestFile( + private static T getHeaderLineFromTestFile( final Path testVCFFile, final String key, final String ID, @@ -265,5 +357,4 @@ private static List getIDHeaderLinesWithKey(final VCFHeader heade .collect(Collectors.toList()); return headerLines; } - } diff --git a/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java 
b/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java index f94435a833..d0cc69d565 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java @@ -25,22 +25,287 @@ package htsjdk.variant.vcf; +import htsjdk.tribble.TribbleException; import htsjdk.variant.VariantBaseTest; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; /** - * User: ebanks - * Date: Apr 2, 2014 + * Tests for VCFCompoundHeaderLine. + * + * NOTE: This class uses VCFInfoHeaderLine instances to test shared VCFCompoundHeaderLine functionality since + * VCFCompoundHeaderLine abstract. */ public class VCFCompoundHeaderLineUnitTest extends VariantBaseTest { + @DataProvider (name = "badOrMissingAttributes") + public Object[][] getMissingAttributes() { + return new Object[][] { + {""}, // no Type + {""}, // no Type + {""}, // no Number + {""}, // bogus Type + {""}, // bogus Number + }; + } + + @Test(dataProvider= "badOrMissingAttributes", expectedExceptions=TribbleException.class) + public void testBadOrMissingAttributes(final String lineString) { + new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + } + + @DataProvider (name = "acceptedAttributes") + public Object[][] getAcceptedAttributes() { + return new Object[][] { + {"", "Description", "foo"}, + //next two cases from https://github.com/samtools/htsjdk/issues/517 + {"", "Version", "3"}, + {"", "Source", "mySource"}, + }; + } + + @Test(dataProvider= "acceptedAttributes") + public void testAcceptedAttributes(final String lineString, final String attribute, final String expectedValue) { + final VCFCompoundHeaderLine headerline = new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(headerline.getGenericFieldValue(attribute), expectedValue); + } + + 
@DataProvider (name = "invalidIDs") + public Object[][] getInvalidLines() { + return new Object[][] { + // ID cannot start with number + {""}, + // ID cannot start with '.'' + {""}, + // Test that IDs with the special thousand genomes key as a prefix are rejected + // The thousand genomes key is only accepted for VCFInfoHeaderLine and is tested in VCFInfoHeaderLineUnitTest + {""}, + // Contains invalid character '&' + {""}, + }; + } + + @Test(dataProvider = "invalidIDs", expectedExceptions = TribbleException.VersionValidationFailure.class) + public void testGetValidationError(final String lineString) { + new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + } + + @DataProvider (name = "headerLineTypes") + public Object[][] getHeaderLineTypes() { + return new Object[][] { + {"", VCFHeaderLineType.Float}, + {"", VCFHeaderLineType.Integer}, + {"", VCFHeaderLineType.String}, + {"", VCFHeaderLineType.Character}, + // Number must be 0 for flag type + {"", VCFHeaderLineType.Flag}, + }; + } + + @Test(dataProvider = "headerLineTypes") + public void testGetType(final String lineString, final VCFHeaderLineType expectedType) { + final VCFCompoundHeaderLine headerline = new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(headerline.getType(), expectedType); + } + + @DataProvider (name = "headerLineCountTypes") + public Object[][] getLineCountTypes() { + return new Object[][] { + {"", VCFHeaderLineCount.A}, + {"", VCFHeaderLineCount.R}, + {"", VCFHeaderLineCount.G}, + {"", VCFHeaderLineCount.INTEGER}, + {"", VCFHeaderLineCount.UNBOUNDED}, + }; + } + + @Test(dataProvider= "headerLineCountTypes") + public void testGetLineCountType(final String lineString, final VCFHeaderLineCount expectedCountType) { + final VCFCompoundHeaderLine headerline = new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(headerline.getCountType(), expectedCountType); + Assert.assertEquals(headerline.isFixedCount(), 
expectedCountType == VCFHeaderLineCount.INTEGER); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectIntegerTypeWithNegativeCount() { + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION); + } + @Test - public void supportsVersionFields() { - final String line = ""; - new VCFInfoHeaderLine(line, VCFHeaderVersion.VCF4_2); - // if we don't support version fields then we should fail before we ever get here - Assert.assertTrue(true); + public void testRepairFlagTypeWithNegativeCount() { + final VCFInfoHeaderLine infoLine = new VCFInfoHeaderLine("", + VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(infoLine.getCount(), 0); } + + @DataProvider(name = "validHeaderIDs") + public Object[][] validHeaderIDs() { + return new Object[][] { + // 1000 Genomes ID key requires special handling + {new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION)}, + // Test all characters allowed after first character + {new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION)}, + {new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION)}, + {new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION)}, + // ID can start with underscore _ + {new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION)}, + }; + } + + @Test(dataProvider = "validHeaderIDs") + public void testValidHeaderIDs(final VCFCompoundHeaderLine line) { + line.validateForVersion(VCFHeader.DEFAULT_VCF_VERSION); + } + + @DataProvider(name = "invalidHeaderIDs") + public Object[][] invalidHeaderIDs() { + return new Object[][] { + // 1000G key is only allowed for INFO lines, not FORMAT + {new VCFFormatHeaderLine("", VCFHeaderVersion.VCF4_2)}, + // Key with 1000G key as prefix should be rejected + {new VCFInfoHeaderLine("", VCFHeaderVersion.VCF4_2)}, + // Key cannot start with number + {new VCFInfoHeaderLine("", VCFHeaderVersion.VCF4_2)}, + // Key containing invalid character - + {new VCFInfoHeaderLine("", VCFHeaderVersion.VCF4_2)}, + }; + } + + @Test(dataProvider = "invalidHeaderIDs") + public 
void testPre43LenientHandling(final VCFCompoundHeaderLine line) { + line.validateForVersion(VCFHeaderVersion.VCF4_2); + } + + @Test(dataProvider = "invalidHeaderIDs", expectedExceptions = TribbleException.class) + public void testInvalidHeaderIDs(final VCFCompoundHeaderLine line) { + line.validateForVersion(VCFHeader.DEFAULT_VCF_VERSION); + } + + @DataProvider (name = "equalsData") + public Object[][] getEqualsData() { + return new Object[][] { + //pos + {"", + "", true}, + {"", + "", true}, + {"", + "", true}, + {"", + "", true}, + {"", + "", true}, + + //neg + {"", + "", false}, // different ID + {"", + "", false}, // different Type + {"", + "", false}, // different Number + {"", + "", false}, // different integer Number + {"", + "", false}, // different description + {"", + "", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), // merged result, promote to float + }, + { + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION) // merged result, promote to float + }, + { + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION) // merged result, resolve as new unbounded + }, + }; + } + + @Test(dataProvider = "mergeCompatibleInfoLines") + public void testMergeIncompatibleInfoLines(final VCFInfoHeaderLine line1, final VCFInfoHeaderLine line2, final VCFInfoHeaderLine expectedLine) { + VCFCompoundHeaderLine mergedLine = VCFCompoundHeaderLine.getMergedCompoundHeaderLine( + line1, + line2, + new VCFHeaderMerger.HeaderMergeConflictWarnings(false), + (l1, l2) -> new VCFInfoHeaderLine( + l1.getID(), + VCFHeaderLineCount.UNBOUNDED, + l1.getType(), + l1.getDescription()) + ); + Assert.assertEquals(mergedLine, expectedLine); + } + + 
@DataProvider(name = "mergeIncompatibleInfoLines") + public Object[][] getMergeIncompatibleInfoLines() { + return new Object[][]{ + { + new VCFInfoHeaderLine("",VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + }, + { + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + }, + }; + } + + @Test(dataProvider = "mergeIncompatibleInfoLines", expectedExceptions=TribbleException.class) + public void testMergeIncompatibleInfoLines(final VCFInfoHeaderLine line1, final VCFInfoHeaderLine line2) { + VCFCompoundHeaderLine.getMergedCompoundHeaderLine( + line1, + line2, + new VCFHeaderMerger.HeaderMergeConflictWarnings(false), + (l1, l2) -> { throw new IllegalArgumentException("lambda should never execute - this exception should never be thrown"); } + ); + } + + @Test + public void testEncodeWithUnescapedQuotes() { + + VCFFilterHeaderLine unescapedFilterLine = new VCFFilterHeaderLine( + "aFilter", + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); + + final String encodedAttributes = unescapedFilterLine.toStringEncoding(); + assertNotNull(encodedAttributes); + + final String expectedEncoding = "FILTER="; + assertEquals(encodedAttributes, expectedEncoding); + } + + @Test + public void testEncodeWithEscapedQuotes() { + + VCFFilterHeaderLine escapedFilterLine = new VCFFilterHeaderLine("aFilter", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || ANNOTATION <= 2.0]"); + final String encodedAttributes = escapedFilterLine.toStringEncoding(); + assertNotNull(encodedAttributes); + + final String expectedEncoding = "FILTER="; + assertEquals(encodedAttributes, expectedEncoding); + } + } diff --git a/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java new file mode 100644 index 0000000000..8c4ef944f5 --- /dev/null +++ 
b/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java @@ -0,0 +1,183 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.TreeSet; + +public class VCFContigHeaderLineUnitTest extends HtsjdkTest { + + @DataProvider(name = "allowedIDs") + public Object[][] getAllowedIDs() { + return new Object[][]{ + {"", "1"}, + {"", "10"}, + {"", "X"}, + {"", "Y"}, + {"", "MT"}, + {"", "NC_007605"}, + {"", "GL000191.1"}, + {"", "HLA-A*01:01:01:01"}, //https://github.com/samtools/hts-specs/issues/124 + }; + } + + @Test(dataProvider= "allowedIDs") + public void testAllowedIDs(final String lineString, final String expectedIDString) { + final VCFContigHeaderLine headerline = new VCFContigHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION, 0); + Assert.assertEquals(headerline.getID(), expectedIDString); + } + + @DataProvider(name = "invalidIDs") + public Object[][] getInvalidIDs() { + return new Object[][]{ + // IDs cannot start with '*' + {""}, + // IDs cannot start with '=' + // The parser cannot handle attributes starting with '=' so we cannot express this test case + // {""}, + // IDs cannot contain '{' + {""}, + }; + } + + @Test(dataProvider = "invalidIDs", expectedExceptions = TribbleException.VersionValidationFailure.class) + public void testInvalidIDs(final String lineString) { + new VCFContigHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION, 1); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectNegativeIndex() { + new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, -1); + } + + @DataProvider(name = "allowedAttributes") + public Object[][] getAllowedAttributes() { + return new Object[][] { + {"", "ID", "contig1"}, // 
https://github.com/samtools/htsjdk/issues/389 (no length) + {"", "length", "100"}, + {"", "taxonomy", "Homo sapiens"}, + {"", "assembly", "b37"}, + {"", "md5", "1a258fe76dfc8abd926f81f0e9b82ed7"}, + {"", + "URL", "http://www.refserve.org:8080/path/"}, + {"", + "species", "Homo sapiens"}, + }; + } + + @Test(dataProvider= "allowedAttributes") + public void testAllowedAttributes(final String lineString, final String attribute, final String expectedValue) { + final VCFContigHeaderLine headerline = new VCFContigHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION, 0); + Assert.assertEquals(headerline.getGenericFieldValue(attribute), expectedValue); + } + + @Test + public void testRoundTripThroughSequenceRecord() { + final VCFContigHeaderLine contigLine = new VCFContigHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION, + 0); + + final String lengthString = "100"; + final String assemblyString = "b37"; + final String md5String = "1a258fe76dfc8abd926f81f0e9b82ed7"; + final String URLString = "http://www.refserve.org:8080/path/"; + final String speciesString = "Homo sapiens"; + + final SAMSequenceRecord sequenceRecord = contigLine.getSAMSequenceRecord(); + + Assert.assertEquals(Integer.toString(sequenceRecord.getSequenceLength()), lengthString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.LENGTH_ATTRIBUTE), lengthString); + + Assert.assertEquals(sequenceRecord.getAssembly(), assemblyString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.ASSEMBLY_ATTRIBUTE), assemblyString); + + Assert.assertEquals(sequenceRecord.getMd5(), md5String); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.MD5_ATTRIBUTE), md5String); + + Assert.assertEquals(sequenceRecord.getAttribute(SAMSequenceRecord.URI_TAG), URLString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.URL_ATTRIBUTE), URLString); + + Assert.assertEquals(sequenceRecord.getAttribute(SAMSequenceRecord.SPECIES_TAG), 
speciesString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.SPECIES_ATTRIBUTE), speciesString); + + // now turn the SAMSequenceRecord back into a contig line, and compare the result to the + // original contig line + Assert.assertEquals( + new VCFContigHeaderLine(sequenceRecord, assemblyString), + contigLine); + } + + @DataProvider (name = "hashEqualsCompareData") + public Object[][] getHashEqualsCompareData() { + return new Object[][] { + + // For contig lines, equals and hash depend on the id, all other attributes, and the contig index, + // but compareTo only cares about the index. + + // line, index, line, line, index -> expected hash equals, expected equals, expected compare, + {"", 0, "", 0, true, true, 0 }, // identical + {"", 0, "", 1, false, false, -1 }, // identical except contig index + {"", 1, "", 0, false, false, 1 }, // identical except contig index + + {"", 0, "", 0, false, false, 0 }, // identical except attributes + {"", 0, "", 1, false, false, -1 }, // different attributes, different index + + {"", 0, "", 0, false, false, 0 }, // identical except ID + // different ID, same attributes and index, -> not equal, different hash, compare==0 + {"", 0, "", 0, false, false, 0 }, // different ID, attributes, same index + }; + } + + @Test(dataProvider = "hashEqualsCompareData") + public void testHashEqualsCompare( + final String line1, + final int index1, + final String line2, + final int index2, + final boolean expectedHashEquals, + final boolean expectedEquals, + final int expectedCompare) + { + final VCFContigHeaderLine headerLine1 = new VCFContigHeaderLine(line1, VCFHeader.DEFAULT_VCF_VERSION, index1); + final VCFContigHeaderLine headerLine2 = new VCFContigHeaderLine(line2, VCFHeader.DEFAULT_VCF_VERSION, index2); + + Assert.assertEquals(headerLine1.hashCode() == headerLine2.hashCode(), expectedHashEquals); + Assert.assertEquals(headerLine1.equals(headerLine2), expectedEquals); + 
Assert.assertEquals(headerLine1.compareTo(headerLine2), expectedCompare); + } + + @Test + public void testSortOrder() { + + final List expectedLineOrder = new ArrayList() {{ + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 1)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 2)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 10)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 20)); + }}; + + final TreeSet sortedLines = new TreeSet<>( + new ArrayList() {{ + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 20)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 10)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 1)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 2)); + }} + ); + + final Iterator sortedIt = sortedLines.iterator(); + for (final VCFContigHeaderLine cl : expectedLineOrder) { + Assert.assertTrue(sortedIt.hasNext()); + Assert.assertEquals(cl, sortedIt.next()); + } + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java b/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java index 547549aa81..f51589783b 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java @@ -148,6 +148,7 @@ public void testMissingFormatFields(final VCFEncoder encoder, final VariantConte private static Set createSyntheticMetadata() { final Set metaData = new TreeSet<>(); + metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); metaData.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0)); metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "x")); diff --git a/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java b/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java index 383d272a8d..b6835e1d25 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java +++ 
b/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java @@ -75,7 +75,10 @@ Object[][] pathsData() { {TEST_DATA_DIR + "Vcf4.2WithSourceVersionInfoFields.vcf", null, false, true}, // // // should reject bcf v2.2 on read, see issue https://github.com/samtools/htsjdk/issues/1323 - {TEST_DATA_DIR + "BCFVersion22Uncompressed.bcf", null, false, false} + {TEST_DATA_DIR + "BCFVersion22Uncompressed.bcf", null, false, true}, + + // Test that gzipped BCFs can be read + {TEST_DATA_DIR + "bcfV22.bcf", null, false, true} }; } diff --git a/src/test/java/htsjdk/variant/vcf/VCFFormatHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFFormatHeaderLineUnitTest.java new file mode 100644 index 0000000000..1e07ff9c2d --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFFormatHeaderLineUnitTest.java @@ -0,0 +1,19 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * Test conditions that are unique to FORMAT lines (not covered by VCFCompoundHeaderLineUnitTest). 
+ */ +public class VCFFormatHeaderLineUnitTest extends HtsjdkTest { + + // FORMAT lines aren't allowed to have type==Flag + @Test(expectedExceptions=TribbleException.class) + public void testRejectInfoLineWithFlagField() { + new VCFFormatHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java index 73116f53f0..0ddacd7ec7 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java @@ -102,12 +102,12 @@ private Object[][] getInvalidHeaderLines() { List sourceVersion = Arrays.asList("Source", "Version"); return new Object[][]{ // to parse, expected, recommended, error message - {"", idDesc, none, "Tag Description in wrong order (was #1, expected #2)"}, - {"", idDesc, none, "Unexpected tag Desc"}, - {"<>", idDesc, none, "Unexpected tag "}, + {"", idDesc, none, "Unexpected tag or tag order for tag \"Description\""}, - {"", idDesc, sourceVersion, "Recommended tag Source must be listed after all expected tags"}, - {"", idDesc, sourceVersion, "Recommended tag Source must be listed after all expected tags"} + {"", idDesc, sourceVersion, + "Unexpected tag or tag order for tag \"Source\""}, + {"", idDesc, sourceVersion, + "Unexpected tag or tag order for tag \"Source\""} }; } @@ -119,7 +119,7 @@ private static void callTranslator(final String line, VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, line, expectedTagOrder); } else { - VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, line, expectedTagOrder, recommendedTags); + VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, line, expectedTagOrder); } } @@ -153,13 +153,4 @@ private Object[][] getVcfV3Versions() { }; } - @Test(dataProvider = "vcfv3", expectedExceptions = TribbleException.class) - public void testVcfV3FailsRecommendedTags(final 
VCFHeaderVersion vcfVersion) { - VCFHeaderLineTranslator.parseLine( - vcfVersion, - "", - Arrays.asList("ID"), - Arrays.asList("Description") - ); - } } diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java index e04d3c69c8..f02ccd0585 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java @@ -1,6 +1,9 @@ package htsjdk.variant.vcf; +import htsjdk.tribble.TribbleException; import htsjdk.variant.VariantBaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.LinkedHashMap; @@ -9,46 +12,139 @@ import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertNotNull; + public class VCFHeaderLineUnitTest extends VariantBaseTest { @Test public void testEncodeVCFHeaderLineWithUnescapedQuotes() { - final Map attributes = new LinkedHashMap<>(); attributes.put("ID", "VariantFiltration"); attributes.put("CommandLineOptions", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); - final String encodedAttributes = VCFHeaderLine.toStringEncoding(attributes); + final VCFSimpleHeaderLine simpleHeaderLine = new VCFSimpleHeaderLine("someKey", attributes); + final String encodedAttributes = simpleHeaderLine.toStringEncoding(); assertNotNull(encodedAttributes); - final String expectedEncoding = ""; + final String expectedEncoding = "someKey="; assertEquals(encodedAttributes, expectedEncoding); } @Test public void testEncodeVCFHeaderLineWithEscapedQuotes() { - final Map attributes = new LinkedHashMap<>(); attributes.put("ID", "VariantFiltration"); attributes.put("CommandLineOptions", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || ANNOTATION <= 2.0]"); - final String encodedAttributes = VCFHeaderLine.toStringEncoding(attributes); + final VCFSimpleHeaderLine 
simpleHeaderLine = new VCFSimpleHeaderLine("someKey", attributes); + final String encodedAttributes = simpleHeaderLine.toStringEncoding(); assertNotNull(encodedAttributes); - final String expectedEncoding = ""; + final String expectedEncoding = "someKey="; assertEquals(encodedAttributes, expectedEncoding); } - @Test(expectedExceptions = { IllegalArgumentException.class }, expectedExceptionsMessageRegExp = "Invalid count number, with fixed count the number should be 1 or higher: .*") - public void testFormatNumberExeptions() { + @Test + public void testIsNotStructuredHeaderLine() { + VCFHeaderLine hl = new VCFHeaderLine("key", "value"); + Assert.assertFalse(hl.isIDHeaderLine()); + Assert.assertNull(hl.getID()); + } + + @Test + public void testStringEncoding() { + VCFHeaderLine hl = new VCFHeaderLine("key", "value"); + Assert.assertEquals(hl.toStringEncoding(), "key=value"); + } + + @DataProvider(name = "headerLineEquals") + public Object[][] headerLineEquals() { + return new Object[][]{ + { + new VCFHeaderLine("key", "value"), + new VCFHeaderLine("key", "value"), + true + }, + { + new VCFHeaderLine("key", "value1"), + new VCFHeaderLine("key", "value2"), + false + }, + { + new VCFHeaderLine("key1", "value"), + new VCFHeaderLine("key2", "value"), + false + }, + { + new VCFHeaderLine("key1", "value1"), + new VCFHeaderLine("key2", "value2"), + false + } + }; + } + + @Test(dataProvider = "headerLineEquals") + public void testEquals(final VCFHeaderLine hl1, final VCFHeaderLine hl2, final boolean expectedEquals) { + Assert.assertEquals(hl1.equals(hl2), expectedEquals); + } + + @DataProvider(name = "invalidHeaderLineKeys") + public Object[][] invalidHeaderLineKeys() { + return new Object[][]{ + {null}, + {"embedded<"}, + {"embedded="}}; + } + + @Test(dataProvider = "invalidHeaderLineKeys", expectedExceptions=TribbleException.class) + public void testInvalidKeys(final String testKey) { + new VCFHeaderLine(testKey, ""); + } + + @DataProvider(name = "vcfVersions") + public 
Object[][] vcfVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } + + @Test(dataProvider = "vcfVersions") + public void testValidateForVersion(final VCFHeaderVersion vcfVersion) { + VCFHeader.makeHeaderVersionLine(vcfVersion).validateForVersion(vcfVersion); + } + + @DataProvider(name = "incompatibleVersions") + public Object[][] incompatibleVersionPairs() { + return new Object[][]{ + // each pair just has to be different + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2} + }; + } + + @Test(dataProvider="incompatibleVersions", expectedExceptions= TribbleException.VersionValidationFailure.class) + public void testValidateForVersionFails(final VCFHeaderVersion vcfVersion, final VCFHeaderVersion incompatibleVersion) { + VCFHeader.makeHeaderVersionLine(vcfVersion).validateForVersion(incompatibleVersion); + } + + @Test(expectedExceptions = { TribbleException.InvalidHeader.class }, expectedExceptionsMessageRegExp = ".*For fixed count, the count number must be 1 or higher.") + public void testFormatNumberExceptions() { new VCFFormatHeaderLine("test", 0, VCFHeaderLineType.Integer, ""); } - @Test(expectedExceptions = { IllegalArgumentException.class }, expectedExceptionsMessageRegExp = "Invalid count number, with fixed count the number should be 1 or higher: .*") - public void testInfoNumberExeptions() { + @Test(expectedExceptions = { TribbleException.InvalidHeader.class }, expectedExceptionsMessageRegExp = ".*For fixed count, the count number must be 1 or higher.") + public void testInfoNumberExceptions() { new VCFInfoHeaderLine("test", 0, 
VCFHeaderLineType.Integer, diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java new file mode 100644 index 0000000000..818aae84a0 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java @@ -0,0 +1,553 @@ +package htsjdk.variant.vcf; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.tribble.TribbleException; +import htsjdk.variant.VariantBaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.IntStream; + +import static htsjdk.variant.vcf.VCFConstants.PEDIGREE_HEADER_KEY; + +public class VCFHeaderMergerUnitTest extends VariantBaseTest { + + @DataProvider(name="mergeValidVersions") + public Object[][] getMergeValidVersions() { + + // only v4.2+ headers can be merged, merge result version is always the highest version presented + return new Object[][] { + // headers to merge, expected result version + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_2}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_2}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + 
{Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_2 }, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + }; + } + + @DataProvider(name="mergeInvalidVersions") + public Object[][] getMergeInvalidVersions() { + // only v4.2+ headers can be merged + return new Object[][] { + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_1)}, + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_2)}, + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_3)}, + + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_1)}, + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_2)}, + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_3)}, + + {Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_1)}, + {Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_3)}, + + {Arrays.asList(VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_1, 
VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_3)}, + + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_1)}, + + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_1)}, + }; + } + + @Test(dataProvider="mergeValidVersions") + public void testMergeValidVersions(final List headerVersions, final VCFHeaderVersion expectedVersion) { + // merge the headers, and then verify that the merged lines have the expected version by + // instantiating a VCFMetaDataLines instance to determine the resulting version + final Set mergedHeaderLines = doHeaderMergeForVersions(headerVersions); + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + metaDataLines.addMetaDataLines(mergedHeaderLines); + Assert.assertEquals(metaDataLines.getVCFVersion(), expectedVersion); + + // now create a new header using the merged VersionLines, and make sure *it* has the expected version + final VCFHeader mergedHeader = new VCFHeader(mergedHeaderLines); + Assert.assertEquals(mergedHeader.getVCFHeaderVersion(), expectedVersion); + + // also verify that all the header lines in the merged set are also in the resulting header + Assert.assertEquals(mergedHeader.getMetaDataInInputOrder(), mergedHeaderLines); + } + + @Test(dataProvider="mergeInvalidVersions", expectedExceptions = TribbleException.class) + public void testMergeInvalidVersions(final List headerVersions) { + 
doHeaderMergeForVersions(headerVersions); + } + + @Test(expectedExceptions = TribbleException.class) + public void testMergeWithValidationFailure() { + // test mixing header versions where the old version header has a line that fails validation + // using the resulting (newer) version + + // create a 4.2 header with a 4.2 style pedigree line (one that has no ID) + final Set oldHeaderLines = VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_2); + oldHeaderLines.add(new VCFHeaderLine(PEDIGREE_HEADER_KEY, "")); + final VCFHeader oldHeader = new VCFHeader(oldHeaderLines); + Assert.assertEquals(oldHeader.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_2); + + // now create a simple 4.3 header; the merge should fail because the old PEDIGREE line isn't valid + // for 4.3 (for which pedigree lines mut have an ID) + final VCFHeader newHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_3)); + Assert.assertEquals(newHeader.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_3); + + VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(oldHeader, newHeader),true); + } + + private Set doHeaderMergeForVersions(final List headerVersions) { + // This is a somewhat sketchy way to write a test...for each header we create here, we're + // using the same fixed set of VCF42-conforming VCFHeader lines, and then we add a fileformat + // line with whatever VCFVersion the test calls for. Its conceivable that as time goes on + // and we add new versions, the VCFHeader constructor could throw if any of the lines don't + // conform to the requested version. 
+ final List headerList = new ArrayList<>(headerVersions.size()); + for (final VCFHeaderVersion version : headerVersions) { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); + metaDataSet.add(VCFHeader.makeHeaderVersionLine(version)); + final VCFHeader header = new VCFHeader(metaDataSet); + Assert.assertEquals(header.getVCFHeaderVersion(), version); + headerList.add(header); + } + + return VCFUtils.smartMergeHeaders(headerList, false); + } + + @DataProvider(name = "subsetHeaders") + public Iterator getSubsetHeaders() { + final List headerLineList = new ArrayList<>(new VCFHeaderUnitTestData().getTestMetaDataLinesSet()); + final Collection mergeTestCase = new ArrayList<>(); + // For each header line in the list of test lines, create a test case consisting of a pair of headers, + // one of which is a header created with all of the lines, and one of which is a subset of the full header + // with one line removed. Skip the case where the line to be removed is a fileformat line, since thats + // required to create a header. + for (int i = 0; i < headerLineList.size(); i++) { + // take the header line set and remove the ith line, unless its a fileformat line, since if we remove + // that, then we won't be able to create a header using the resulting lines at all. 
+ final VCFHeaderLine candidateLine = headerLineList.get(i); + if (!VCFHeaderVersion.isFormatString(candidateLine.getKey())) { + List subsetList = new ArrayList<>(headerLineList); + subsetList.remove(i); + mergeTestCase.add( + new Object[] { + new VCFHeader(VCFHeaderUnitTestData.getTestMetaDataLinesSet()), + new VCFHeader(new LinkedHashSet<>(subsetList)) + }); + } + } + + return mergeTestCase.iterator(); + } + + @Test(dataProvider = "subsetHeaders") + public void testMergeSubsetHeaders( + final VCFHeader fullHeader, + final VCFHeader subsetHeader) + { + final List headerList = new ArrayList() {{ + add(fullHeader); + add(subsetHeader); + add(subsetHeader); + }}; + Assert.assertEquals( + VCFHeaderMerger.getMergedHeaderLines(headerList, false), + fullHeader.getMetaDataInSortedOrder()); + + // now again, in the reverse order + final List reverseHeaderList = new ArrayList() {{ + add(subsetHeader); + add(subsetHeader); + add(fullHeader); + }}; + Assert.assertEquals( + VCFHeaderMerger.getMergedHeaderLines(reverseHeaderList, false), + fullHeader.getMetaDataInSortedOrder()); + } + + @Test + public void testDictionaryMergeDuplicateFile() { + final VCFHeader headerOne = new VCFFileReader(new File(variantTestDataRoot + "diagnosis_targets_testfile.vcf"), false).getFileHeader(); + final VCFHeader headerTwo = new VCFHeader(headerOne); // deep copy + final List sampleList = new ArrayList<>(); + sampleList.addAll(headerOne.getSampleNamesInOrder()); + + // Check that the two dictionaries start out the same + headerOne.getSequenceDictionary().assertSameDictionary(headerTwo.getSequenceDictionary()); + + // Run the merge command + final VCFHeader mergedHeader = new VCFHeader(VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(headerOne, headerTwo), false), sampleList); + + // Check that the mergedHeader's sequence dictionary matches the first two + mergedHeader.getSequenceDictionary().assertSameDictionary(headerOne.getSequenceDictionary()); + } + + 
@DataProvider(name="dictionaryMergePositive") + private Object[][] getDictionaryMergePositive() { + return new Object[][] { + // input dictionary list, expected merged dictionary + { + // one dictionary + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // two identical dictionaries + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // three different subsets; superset first + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 10)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(7, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(3, 2)) + ), + createTestSAMDictionary(1, 10) + }, + { + // three different subsets; superset second + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(7, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 10)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(3, 2)) + ), + createTestSAMDictionary(1, 10) + }, + { + // three different subsets; superset third (requires the merge implementation to sort on dictionary size) + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(7, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(3, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 10)) + ), + createTestSAMDictionary(1, 10) + }, + { + // one non-null dictionary, one null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null) + ), + createTestSAMDictionary(1, 2) + }, + { + // one non-null dictionary, one null, in reverse direction + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(null), + 
createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: non-null, null, null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(null) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: null, non-null, null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: null, null, non-null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: non-null, null, non-null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: subset, null, superset + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 10)) + ), + createTestSAMDictionary(1, 10) + }, + { + // all null dictionaries + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(null) + ), + null + } + }; + } + + @Test(dataProvider = "dictionaryMergePositive") + private void testDictionaryMergePositive( + final List sourceHeaders, final SAMSequenceDictionary expectedDictionary) { + final Set mergedHeaderLines = VCFHeaderMerger.getMergedHeaderLines(sourceHeaders, false); + final VCFHeader mergedHeader = new 
VCFHeader(mergedHeaderLines); + Assert.assertEquals(mergedHeader.getSequenceDictionary(), expectedDictionary); + } + + @DataProvider(name="dictionaryMergeNegative") + private Object[][] getDictionaryMergeNegative() { + final SAMSequenceDictionary forwardDictionary = createTestSAMDictionary(1, 2); + final SAMSequenceDictionary reverseDictionary = createReverseDictionary(forwardDictionary); + + return new Object[][] { + { + // SequenceDictionaryCompatibility.NO_COMMON_CONTIGS + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(5, 2))) + }, + { + // SequenceDictionaryCompatibility.OUT_OF_ORDER + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(forwardDictionary), + createTestVCFHeaderWithSAMDictionary(reverseDictionary)) + }, + { + // SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS common subset has contigs that have the same name but different lengths + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createDictionaryWithLengths(100)), + createTestVCFHeaderWithSAMDictionary(createDictionaryWithLengths(200))) + }, + { + // SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER human reference detected but the order of the contigs is non-standard (lexicographic, for example) + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createDictionaryInCanonicalHumanOrder()), + createTestVCFHeaderWithSAMDictionary(createDictionaryInNonCanonicalHumanOrder())) + }, + { + // three mutually disjoint dictionaries, no superset + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(5, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(4, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(6, 2)) + ) + }, + }; + } + + @Test(dataProvider = "dictionaryMergeNegative", expectedExceptions = TribbleException.class) + private void testDictionaryMergeNegative(final List sourceHeaders) { + 
VCFHeaderMerger.getMergedHeaderLines(sourceHeaders, false); + } + + @Test + final void testDuplicateNonStructuredKeys() { + // merge 2 headers, one has "##sample=foo", one has "##sample=bar", both should survive the merge + final VCFHeaderLine fooLine = new VCFHeaderLine("sample", "foo"); + final Set fooLines = VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION); + fooLines.add(fooLine); + final VCFHeader fooHeader = new VCFHeader(fooLines); + + final VCFHeaderLine barLine = new VCFHeaderLine("sample", "bar"); + final Set barLines = VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION); + barLines.add(barLine); + final VCFHeader barHeader = new VCFHeader(barLines); + + final Set mergedLines = VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(fooHeader, barHeader), false); + Assert.assertEquals(mergedLines.size(), 3); + Assert.assertTrue(mergedLines.contains(fooLine)); + Assert.assertTrue(mergedLines.contains(barLine)); + } + + @DataProvider(name = "compatibleInfoLines") + public Object[][] getMergerData() { + return new Object[][]{ + // 2 lines to merge, expected result + { + // mixed number, promote to "." 
+ new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + { + // mixed number type, promote to float + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + { + // mixed number type in reverse direction, promote to float + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + }; + } + + @Test(dataProvider = "compatibleInfoLines") + public void testMergeCompatibleInfoLines(final VCFInfoHeaderLine line1, final VCFInfoHeaderLine line2, final VCFInfoHeaderLine expectedLine, final String id) { + final VCFHeader hdr1 = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION), Collections.EMPTY_SET); + hdr1.addMetaDataLine(line1); + + final VCFHeader hdr2 = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION), Collections.EMPTY_SET); + hdr2.addMetaDataLine(line2); + + final VCFHeader mergedHeader = new VCFHeader(VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(hdr1, hdr2), true)); + Assert.assertEquals(mergedHeader.getInfoHeaderLine(id), expectedLine); + } + + @DataProvider(name = "mergeIncompatibleInfoLines") + public Object[][] getMergeIncompatibleInfoLines() { + return new Object[][]{ + // 2 lines to merge, expected result + { + // mixed number AND number type (multiple different attributes) + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + { + // mixed number AND number type (multiple 
different attributes), reverse direction + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + }; + } + + @Test(dataProvider = "mergeIncompatibleInfoLines", expectedExceptions=TribbleException.class) + public void testMergeIncompatibleInfoLines(final VCFInfoHeaderLine line1, final VCFInfoHeaderLine line2, final VCFInfoHeaderLine expectedLine, final String id) { + final VCFHeader hdr1 = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION), Collections.EMPTY_SET); + hdr1.addMetaDataLine(line1); + final VCFHeader hdr2 = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION), Collections.EMPTY_SET); + hdr2.addMetaDataLine(line2); + new VCFHeader(VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(hdr1, hdr2), true)); + } + + private final SAMSequenceDictionary createTestSAMDictionary(final int startSequence, final int numSequences) { + final SAMSequenceDictionary samDictionary = new SAMSequenceDictionary(); + IntStream.range(startSequence, startSequence + numSequences).forEachOrdered( + i -> samDictionary.addSequence(new SAMSequenceRecord(Integer.toString(i), i))); + return samDictionary; + } + + private final VCFHeader createTestVCFHeaderWithSAMDictionary(final SAMSequenceDictionary samDictionary) { + final VCFHeader vcfHeader = createTestVCFHeader(); + vcfHeader.setSequenceDictionary(samDictionary); + return vcfHeader; + } + + private SAMSequenceDictionary createDictionaryInNonCanonicalHumanOrder() { + final List sequences = new ArrayList<>(); + sequences.add(new SAMSequenceRecord("1", 100)); + sequences.add(new SAMSequenceRecord("10", 100)); + sequences.add(new SAMSequenceRecord("2", 100)); + return new SAMSequenceDictionary(sequences); + } + + private SAMSequenceDictionary createDictionaryInCanonicalHumanOrder() { + final List sequences = new ArrayList<>(); 
+ sequences.add(new SAMSequenceRecord("1", 100)); + sequences.add(new SAMSequenceRecord("2", 100)); + sequences.add(new SAMSequenceRecord("10", 100)); + return new SAMSequenceDictionary(sequences); + } + + private SAMSequenceDictionary createDictionaryWithLengths(final int length) { + final List sequences = new ArrayList<>(); + sequences.add(new SAMSequenceRecord("1", length)); + sequences.add(new SAMSequenceRecord("2", length)); + sequences.add(new SAMSequenceRecord("3", length)); + return new SAMSequenceDictionary(sequences); + } + + private SAMSequenceDictionary createReverseDictionary(final SAMSequenceDictionary forwardDictionary){ + // its not sufficient to reuse the existing sequences by just reordering them, since + // SAMSequenceDictionary *mutates* the sequence indices to match the input order. So we need + // to create the new sequence dictionary using entirely new sequence records, and let + // SAMSequenceDictionary assign them indices that match the input order. + final List reverseSequences = new ArrayList<>(forwardDictionary.getSequences()); + Collections.reverse(reverseSequences); + final SAMSequenceDictionary reverseDictionary = new SAMSequenceDictionary(); + + int count = 0; + for (final SAMSequenceRecord samSequenceRecord : reverseSequences) { + final SAMSequenceRecord newSequenceRecord = new SAMSequenceRecord( + samSequenceRecord.getSequenceName(), + samSequenceRecord.getSequenceLength()); + reverseDictionary.addSequence(newSequenceRecord); + Assert.assertEquals(newSequenceRecord.getSequenceIndex(), count); + count++; + } + return reverseDictionary; + } + + private final VCFHeader createTestVCFHeader() { + return new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION)); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java index e4d5099eda..188375ba58 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java +++ 
b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java @@ -27,9 +27,7 @@ import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; -import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.FileExtensions; -import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.TestUtil; import htsjdk.tribble.TribbleException; import htsjdk.tribble.readers.AsciiLineReader; @@ -39,69 +37,80 @@ import htsjdk.variant.VariantBaseTest; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.Options; +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import org.testng.Assert; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.*; -import java.math.BigInteger; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.EnumSet; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; import java.util.stream.Collectors; -/** - * Created by IntelliJ IDEA. - * User: aaron - * Date: Jun 30, 2010 - * Time: 3:32:08 PM - * To change this template use File | Settings | File Templates. 
- */ public class VCFHeaderUnitTest extends VariantBaseTest { - private File tempDir; - - private VCFHeader createHeader(String headerStr) { - VCFCodec codec = new VCFCodec(); - VCFHeader header = (VCFHeader) codec.readActualHeader(new LineIteratorImpl(new SynchronousLineReader( - new StringReader(headerStr)))); - Assert.assertEquals(header.getMetaDataInInputOrder().size(), VCF4headerStringCount); - return header; - } - - @BeforeClass - private void createTemporaryDirectory() { - tempDir = TestUtil.getTempDirectory("VCFHeader", "VCFHeaderTest"); + @DataProvider(name="headerRoundTrip") + private Object[][] getHeaderRoundTrip() { + return new Object[][] { + { VCFHeaderUnitTestData.getVCFV42TestHeaderString() }, + { VCFHeaderUnitTestData.VCF42headerStrings_with_negativeOne } + }; } - @AfterClass - private void deleteTemporaryDirectory() { - for (File f : tempDir.listFiles()) { - f.delete(); - } - tempDir.delete(); + @Test(dataProvider = "headerRoundTrip") + public void test42HeaderRoundTrip(final String headerString) throws IOException { + final VCFHeader header = VCFHeaderUnitTestData.createHeaderFromString(headerString); + Assert.assertEquals(header.getMetaDataInSortedOrder(), getRoundTripEncoded(header)); } @Test - public void testVCF4ToVCF4() { - VCFHeader header = createHeader(VCF4headerStrings); - checkMD5ofHeaderFile(header, "91c33dadb92e01ea349bd4bcdd02d6be"); - } + public void test42FileRoundtrip() throws Exception { + // this test validates that source/version fields are round-tripped properly - @Test - public void testVCF4ToVCF4_alternate() { - VCFHeader header = createHeader(VCF4headerStrings_with_negativeOne); - checkMD5ofHeaderFile(header, "39318d9713897d55be5ee32a2119853f"); + // read an existing VCF + final File expectedFile = new File("src/test/resources/htsjdk/variant/Vcf4.2WithSourceVersionInfoFields.vcf"); + + // write the file out into a new copy + final File actualFile = File.createTempFile("testVcf4.2roundtrip.", FileExtensions.VCF); + 
actualFile.deleteOnExit(); + + try (final VCFFileReader originalFileReader = new VCFFileReader(expectedFile.toPath(), false, VCFVersionUpgradePolicy.DO_NOT_UPGRADE); + final VariantContextWriter copyWriter = new VariantContextWriterBuilder() + .setOutputFile(actualFile) + .setReferenceDictionary(createArtificialSequenceDictionary()) + .setOptions(EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Options.INDEX_ON_THE_FLY)) + .build() + ) { + final VCFHeader originalHeader = originalFileReader.getFileHeader(); + + copyWriter.writeHeader(originalHeader); + for (final VariantContext variantContext : originalFileReader) { + copyWriter.add(variantContext); + } + } + + final String actualContents = new String(Files.readAllBytes(actualFile.toPath()), StandardCharsets.UTF_8); + final String expectedContents = new String(Files.readAllBytes(expectedFile.toPath()), StandardCharsets.UTF_8); + Assert.assertEquals(actualContents, expectedContents); } @Test - public void testVCFHeaderSampleRenamingSingleSampleVCF() throws Exception { + public void testSampleRenamingSingleSample() throws Exception { final VCFCodec codec = new VCFCodec(); codec.setRemappedSampleName("FOOSAMPLE"); final AsciiLineReaderIterator vcfIterator = new AsciiLineReaderIterator(AsciiLineReader.from(new FileInputStream(variantTestDataRoot + "HiSeq.10000.vcf"))); @@ -120,57 +129,25 @@ public void testVCFHeaderSampleRenamingSingleSampleVCF() throws Exception { } } - @DataProvider - public Object[][] testVCFHeaderDictionaryMergingData() { + @DataProvider(name="testSampleRenamingFailsTests") + public Object[][] testSampleRenamingFailsTests() { return new Object[][]{ - {"diagnosis_targets_testfile.vcf"}, // numerically ordered contigs - {"dbsnp_135.b37.1000.vcf"} // lexicographically ordered contigs + {variantTestDataRoot + "ex2.vcf"}, // multi sample vcf + {variantTestDataRoot + "dbsnp_135.b37.1000.vcf"} // sites only vcf }; } - @Test(dataProvider = "testVCFHeaderDictionaryMergingData") - public void 
testVCFHeaderDictionaryMerging(final String vcfFileName) { - final VCFHeader headerOne = new VCFFileReader(new File(variantTestDataRoot + vcfFileName), false).getFileHeader(); - final VCFHeader headerTwo = new VCFHeader(headerOne); // deep copy - final List sampleList = new ArrayList(); - sampleList.addAll(headerOne.getSampleNamesInOrder()); - - // Check that the two dictionaries start out the same - headerOne.getSequenceDictionary().assertSameDictionary(headerTwo.getSequenceDictionary()); - - // Run the merge command - final VCFHeader mergedHeader = new VCFHeader(VCFUtils.smartMergeHeaders(Arrays.asList(headerOne, headerTwo), false), sampleList); - - // Check that the mergedHeader's sequence dictionary matches the first two - mergedHeader.getSequenceDictionary().assertSameDictionary(headerOne.getSequenceDictionary()); - } - - @Test(expectedExceptions = TribbleException.class) - public void testVCFHeaderSampleRenamingMultiSampleVCF() throws Exception { - final VCFCodec codec = new VCFCodec(); - codec.setRemappedSampleName("FOOSAMPLE"); - final AsciiLineReaderIterator vcfIterator = new AsciiLineReaderIterator(AsciiLineReader.from(new FileInputStream(variantTestDataRoot + "ex2.vcf"))); - final VCFHeader header = (VCFHeader) codec.readHeader(vcfIterator).getHeaderValue(); - } - - @Test(expectedExceptions = TribbleException.class) - public void testVCFHeaderSampleRenamingSitesOnlyVCF() throws Exception { + @Test(dataProvider = "testSampleRenamingFailsTests", expectedExceptions = TribbleException.class) + public void testSampleRenamingFails(final String fileName) throws IOException { final VCFCodec codec = new VCFCodec(); codec.setRemappedSampleName("FOOSAMPLE"); - final AsciiLineReaderIterator vcfIterator = new AsciiLineReaderIterator(AsciiLineReader.from(new FileInputStream(variantTestDataRoot + "dbsnp_135.b37.1000.vcf"))); - final VCFHeader header = (VCFHeader) codec.readHeader(vcfIterator).getHeaderValue(); - } - - private VCFHeader getHiSeqVCFHeader() { - final 
File vcf = new File("src/test/resources/htsjdk/variant/HiSeq.10000.vcf"); - final VCFFileReader reader = new VCFFileReader(vcf, false); - final VCFHeader header = reader.getFileHeader(); - reader.close(); - return header; + final AsciiLineReaderIterator vcfIterator = new AsciiLineReaderIterator( + AsciiLineReader.from(new FileInputStream(fileName))); + codec.readHeader(vcfIterator).getHeaderValue(); } @Test - public void testVCFHeaderAddInfoLine() { + public void testAddInfoLine() { final VCFHeader header = getHiSeqVCFHeader(); final VCFInfoHeaderLine infoLine = new VCFInfoHeaderLine("TestInfoLine", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "test info line"); header.addMetaDataLine(infoLine); @@ -185,13 +162,8 @@ public void testVCFHeaderAddInfoLine() { Assert.assertFalse(header.getOtherHeaderLines().contains(infoLine), "TestInfoLine present in other header lines"); } - private static Collection asCollectionOfVCFHeaderLine(Collection headers) { - // create a collection of VCFHeaderLine so that contains tests work correctly - return headers.stream().map(h -> (VCFHeaderLine) h).collect(Collectors.toList()); - } - @Test - public void testVCFHeaderAddFormatLine() { + public void testAddFormatLine() { final VCFHeader header = getHiSeqVCFHeader(); final VCFFormatHeaderLine formatLine = new VCFFormatHeaderLine("TestFormatLine", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "test format line"); header.addMetaDataLine(formatLine); @@ -207,11 +179,11 @@ public void testVCFHeaderAddFormatLine() { } @Test - public void testVCFHeaderAddFilterLine() { + public void testAddFilterLine() { final VCFHeader header = getHiSeqVCFHeader(); final String filterDesc = "TestFilterLine Description"; - final VCFFilterHeaderLine filterLine = new VCFFilterHeaderLine("TestFilterLine",filterDesc); - Assert.assertEquals(filterDesc,filterLine.getDescription()); + final VCFFilterHeaderLine filterLine = new VCFFilterHeaderLine("TestFilterLine", filterDesc); + 
Assert.assertEquals(filterDesc, filterLine.getDescription()); header.addMetaDataLine(filterLine); Assert.assertTrue(header.getFilterLines().contains(filterLine), "TestFilterLine not found in filter header lines"); @@ -225,10 +197,15 @@ public void testVCFHeaderAddFilterLine() { } @Test - public void testVCFHeaderAddContigLine() { + public void testAddContigLine() { final VCFHeader header = getHiSeqVCFHeader(); + // no contig lines in this header + Assert.assertTrue(header.getContigLines().isEmpty()); + final VCFContigHeaderLine contigLine = new VCFContigHeaderLine( - "", VCFHeaderVersion.VCF4_0, VCFHeader.CONTIG_KEY, 0); + "", VCFHeaderVersion.VCF4_0, 0); + Assert.assertEquals(contigLine.getKey(), VCFHeader.CONTIG_KEY); + Assert.assertEquals(contigLine.getID(), "chr1"); header.addMetaDataLine(contigLine); Assert.assertTrue(header.getContigLines().contains(contigLine), "Test contig line not found in contig header lines"); @@ -241,10 +218,70 @@ public void testVCFHeaderAddContigLine() { } @Test - public void testVCFHeaderContigLineMissingLength() { + public void testAddContigLineExactDuplicateSilentlyDropped() { + final File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); + + final VCFFileReader reader = new VCFFileReader(input, false); + final VCFHeader header = reader.getFileHeader(); + + final int numContigLinesBefore = header.getContigLines().size(); + // try to read the first contig line + header.addMetaDataLine(header.getContigLines().get(0)); + final int numContigLinesAfter = header.getContigLines().size(); + + // assert that we have the same number of contig lines before and after + Assert.assertEquals(numContigLinesBefore, numContigLinesAfter); + } + + @Test + public void testAddContigLineWithDifferentAttributesSilentlyDropped() { + final VCFContigHeaderLine contigOneNoAssembly = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "1"); + put("length", "123"); + }}, + 0); + final VCFContigHeaderLine contigOneWithAssembly = new 
VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "1"); + put("length", "123"); + put("assembly", "b37"); + }}, + 1); + Assert.assertNotEquals(contigOneNoAssembly.hashCode(), contigOneWithAssembly.hashCode()); + + final Set headerLineSet = VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION); + headerLineSet.add(contigOneNoAssembly); + headerLineSet.add(contigOneWithAssembly); + Assert.assertEquals(headerLineSet.size(), 3); // one fileformat line, plus 2 contig lines + + // silently drops contigOneNoAssembly since it has the same ID AND contig index as contigOneWithAssembly + final VCFHeader vcfHeader = new VCFHeader(headerLineSet); + final Set allMetaDataInput = vcfHeader.getMetaDataInInputOrder(); + Assert.assertEquals(allMetaDataInput.size(), 2); + final Set allMetaDataSorted = vcfHeader.getMetaDataInSortedOrder(); + Assert.assertEquals(allMetaDataSorted.size(), 2); + final List allContigLines = vcfHeader.getContigLines(); + Assert.assertEquals(allContigLines.size(), 1); // one contig + Assert.assertEquals(allContigLines.get(0).getGenericFieldValue("assembly"), "b37"); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddContigLineRejectDuplicateContigIndex() { + final VCFHeader header = new VCFHeader(); + // add two contig lines that share an index, but have different IDs and represetn different contifs + final VCFContigHeaderLine contigLine1 = new VCFContigHeaderLine("", VCFHeaderVersion.VCF4_2, 0); + final VCFContigHeaderLine contigLine2 = new VCFContigHeaderLine("", VCFHeaderVersion.VCF4_2, 0); + + header.addMetaDataLine(contigLine1); + header.addMetaDataLine(contigLine2); + } + + @Test + public void testAddContigLineMissingLength() { final VCFHeader header = getHiSeqVCFHeader(); final VCFContigHeaderLine contigLine = new VCFContigHeaderLine( - "", VCFHeaderVersion.VCF4_0, VCFHeader.CONTIG_KEY, 0); + "", VCFHeaderVersion.VCF4_0, 0); header.addMetaDataLine(contigLine); 
Assert.assertTrue(header.getContigLines().contains(contigLine), "Test contig line not found in contig header lines"); Assert.assertTrue(header.getMetaDataInInputOrder().contains(contigLine), "Test contig line not found in set of all header lines"); @@ -252,58 +289,68 @@ public void testVCFHeaderContigLineMissingLength() { final SAMSequenceDictionary sequenceDictionary = header.getSequenceDictionary(); Assert.assertNotNull(sequenceDictionary); Assert.assertEquals(sequenceDictionary.getSequence("chr1").getSequenceLength(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH); - } - @Test - public void testVCFHeaderHonorContigLineOrder() throws IOException { + @Test + public void testGetContigLinesHonorsSortOrder() { + // NOTE: this test file has *lexicographically* ordered contigs try (final VCFFileReader vcfReader = new VCFFileReader(new File(variantTestDataRoot + "dbsnp_135.b37.1000.vcf"), false)) { // start with a header with a bunch of contig lines final VCFHeader header = vcfReader.getFileHeader(); - final List originalHeaderList = header.getContigLines(); - Assert.assertTrue(originalHeaderList.size() > 0); - - // copy the contig lines to a new list, sticking an extra contig line in the middle - final List orderedList = new ArrayList<>(); - final int splitInTheMiddle = originalHeaderList.size() / 2; - orderedList.addAll(originalHeaderList.subList(0, splitInTheMiddle)); - final VCFContigHeaderLine outrageousContigLine = new VCFContigHeaderLine( - "", + final List originalContigsInSortedOrder = header.getContigLines(); + Assert.assertTrue(originalContigsInSortedOrder.size() > 0); + + // copy the contig lines to a new list + final int midPoint = originalContigsInSortedOrder.size() / 2; + final List confoundedList = new ArrayList<>(originalContigsInSortedOrder.subList( + 0, + midPoint + )); + + // deliberately stick an extra contig line in the middle of the list, but using a contig index + // that will cause the line to sort to the end + final String newContigID = 
"newContigID"; + final int newContigIndex = originalContigsInSortedOrder.size(); + final VCFContigHeaderLine newContigLine = new VCFContigHeaderLine( + String.format( + "", newContigID), VCFHeaderVersion.VCF4_2, - VCFHeader.CONTIG_KEY, - 0); - orderedList.add(outrageousContigLine); - // make sure the extra contig line is outrageous enough to not collide with a real contig ID - Assert.assertTrue(orderedList.contains(outrageousContigLine)); - orderedList.addAll(originalHeaderList.subList(splitInTheMiddle, originalHeaderList.size())); - Assert.assertEquals(originalHeaderList.size() + 1, orderedList.size()); - - // crete a new header from the ordered list, and test that getContigLines honors the input order - final VCFHeader orderedHeader = new VCFHeader(); - orderedList.forEach(hl -> orderedHeader.addMetaDataLine(hl)); - Assert.assertEquals(orderedList, orderedHeader.getContigLines()); + newContigIndex); + confoundedList.add(newContigLine); + confoundedList.addAll(originalContigsInSortedOrder.subList(midPoint, originalContigsInSortedOrder.size())); + + // make sure the new contig line was actually added + Assert.assertEquals(originalContigsInSortedOrder.size() + 1, confoundedList.size()); + Assert.assertTrue(confoundedList.contains(newContigLine)); + + // create a new header from the confounded list, call getContigLines() on the header, and validate + // that the new line is included in the resulting list, and is at the end + final VCFHeader newHeader = new VCFHeader(); + confoundedList.forEach(newHeader::addMetaDataLine); + final List roundTrippedLines = newHeader.getContigLines(); + Assert.assertEquals(roundTrippedLines.size(), originalContigsInSortedOrder.size() + 1); + Assert.assertEquals(roundTrippedLines.get(roundTrippedLines.size() - 1), newContigLine); + + // make sure the sequence dictionary has the contig with the correct contig index, and in + // the same relative location in the dictionary (at the end of the list) + final SAMSequenceDictionary 
orderedSeqDict = newHeader.getSequenceDictionary(); + Assert.assertEquals( + orderedSeqDict.getSequence(newContigID).getSequenceIndex(), + roundTrippedLines.size() - 1); + Assert.assertEquals( + orderedSeqDict.getSequences().get(newHeader.getContigLines().size() - 1).getSequenceName(), + newContigID); } } @Test - public void testVCFSimpleHeaderLineGenericFieldGetter() { - VCFHeader header = createHeader(VCF4headerStrings); - List filters = header.getFilterLines(); - VCFFilterHeaderLine filterHeaderLine = filters.get(0); - Map genericFields = filterHeaderLine.getGenericFields(); - Assert.assertEquals(genericFields.get("ID"),"NoQCALL"); - Assert.assertEquals(genericFields.get("Description"),"Variant called by Dindel but not confirmed by QCALL"); - } - - @Test - public void testVCFHeaderAddOtherLine() { + public void testAddOtherLine() { final VCFHeader header = getHiSeqVCFHeader(); final VCFHeaderLine otherLine = new VCFHeaderLine("TestOtherLine", "val"); header.addMetaDataLine(otherLine); Assert.assertTrue(header.getOtherHeaderLines().contains(otherLine), "TestOtherLine not found in other header lines"); Assert.assertTrue(header.getMetaDataInInputOrder().contains(otherLine), "TestOtherLine not found in set of all header lines"); - Assert.assertNotNull(header.getOtherHeaderLine("TestOtherLine"), "Lookup for TestOtherLine by key failed"); Assert.assertFalse(asCollectionOfVCFHeaderLine(header.getInfoHeaderLines()).contains(otherLine), "TestOtherLine present in info header lines"); Assert.assertFalse(asCollectionOfVCFHeaderLine(header.getFormatHeaderLines()).contains(otherLine), "TestOtherLine present in format header lines"); @@ -312,15 +359,16 @@ public void testVCFHeaderAddOtherLine() { } @Test - public void testVCFHeaderAddMetaDataLineDoesNotDuplicateContigs() { - File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); + public void testAddMetaDataLineDoesNotDuplicateContigs() { + final File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); 
- VCFFileReader reader = new VCFFileReader(input, false); - VCFHeader header = reader.getFileHeader(); + final VCFFileReader reader = new VCFFileReader(input, false); + final VCFHeader header = reader.getFileHeader(); final int numContigLinesBefore = header.getContigLines().size(); - VCFInfoHeaderLine newInfoField = new VCFInfoHeaderLine("test", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "test info field"); + final VCFInfoHeaderLine newInfoField = new VCFInfoHeaderLine( + "test", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "test info field"); header.addMetaDataLine(newInfoField); // getting the sequence dictionary was failing due to duplicating contigs in issue #214, @@ -333,109 +381,300 @@ public void testVCFHeaderAddMetaDataLineDoesNotDuplicateContigs() { } @Test - public void testVCFHeaderAddDuplicateContigLine() { - File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); - - VCFFileReader reader = new VCFFileReader(input, false); - VCFHeader header = reader.getFileHeader(); - - - final int numContigLinesBefore = header.getContigLines().size(); - // try to readd the first contig line - header.addMetaDataLine(header.getContigLines().get(0)); - final int numContigLinesAfter = header.getContigLines().size(); - - // assert that we have the same number of contig lines before and after - Assert.assertEquals(numContigLinesBefore, numContigLinesAfter); - } - - @Test - public void testVCFHeaderAddDuplicateHeaderLine() { - File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); + public void testAddDuplicateKeyValueHeaderLine() { + final File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); - VCFFileReader reader = new VCFFileReader(input, false); - VCFHeader header = reader.getFileHeader(); + final VCFFileReader reader = new VCFFileReader(input, false); + final VCFHeader header = reader.getFileHeader(); - VCFHeaderLine newHeaderLine = new VCFHeaderLine("key", "value"); + final VCFHeaderLine newHeaderLine = 
new VCFHeaderLine("key", "value"); // add this new header line header.addMetaDataLine(newHeaderLine); final int numHeaderLinesBefore = header.getOtherHeaderLines().size(); - // readd the same header line + // add the same header line again header.addMetaDataLine(newHeaderLine); final int numHeaderLinesAfter = header.getOtherHeaderLines().size(); - // assert that we have the same number of other header lines before and after + // Note: we don't allow duplicate unstructured lines with the same key unless they have + // different content + // assert that we have the one more other header line after Assert.assertEquals(numHeaderLinesBefore, numHeaderLinesAfter); } + @Test + public void testSimpleHeaderLineGenericFieldGetter() { + final VCFHeader header = VCFHeaderUnitTestData.createHeaderFromString(VCFHeaderUnitTestData.getVCFV42TestHeaderString()); + final List filters = header.getFilterLines(); + final VCFFilterHeaderLine filterHeaderLine = filters.get(0); + final Map genericFields = filterHeaderLine.getGenericFields(); + Assert.assertEquals(genericFields.get("ID"),"NoQCALL"); + Assert.assertEquals(genericFields.get("Description"),"Variant called by Dindel but not confirmed by QCALL"); + } + + @Test + public void testSerialization() throws Exception { + final VCFFileReader reader = new VCFFileReader(new File("src/test/resources/htsjdk/variant/HiSeq.10000.vcf"), false); + final VCFHeader originalHeader = reader.getFileHeader(); + reader.close(); + + final VCFHeader deserializedHeader = TestUtil.serializeAndDeserialize(originalHeader); + + Assert.assertEquals(deserializedHeader.getMetaDataInInputOrder(), originalHeader.getMetaDataInInputOrder(), "Header metadata does not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getContigLines(), originalHeader.getContigLines(), "Contig header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getFilterLines(), originalHeader.getFilterLines(), "Filter header 
lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getFormatHeaderLines(), originalHeader.getFormatHeaderLines(), "Format header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getIDHeaderLines(), originalHeader.getIDHeaderLines(), "ID header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getInfoHeaderLines(), originalHeader.getInfoHeaderLines(), "Info header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getOtherHeaderLines(), originalHeader.getOtherHeaderLines(), "Other header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getGenotypeSamples(), originalHeader.getGenotypeSamples(), "Genotype samples not the same before/after serialization"); + Assert.assertEquals(deserializedHeader.samplesWereAlreadySorted(), originalHeader.samplesWereAlreadySorted(), "Sortedness of samples not the same before/after serialization"); + Assert.assertEquals(deserializedHeader.getSampleNamesInOrder(), originalHeader.getSampleNamesInOrder(), "Sorted list of sample names in header not the same before/after serialization"); + Assert.assertEquals(deserializedHeader.getSampleNameToOffset(), originalHeader.getSampleNameToOffset(), "Sample name to offset map not the same before/after serialization"); + Assert.assertEquals(deserializedHeader.toString(), originalHeader.toString(), "String representation of header not the same before/after serialization"); + } + @DataProvider(name="validHeaderVersionTransitions") public Object[][] validHeaderVersionTransitions() { - // v4.3 can never transition, all other version transitions are allowed + // all (forward) version transitions are allowed return new Object[][] { + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_1}, + 
{VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_3}, + + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_0}, {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_1}, {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_1}, {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3} }; } @DataProvider(name="invalidHeaderVersionTransitions") public Object[][] invalidHeaderVersionTransitions() { - // v4.3 can never transition with, all other version transitions are allowed + // v4.3 can never be transitioned down to pre v4.3 + // Pre v4.3 might be able to be transitioned to 4.3, and this is tested in VCFCodec43FeaturesTest return new Object[][] { - {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_0}, - {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_1}, + //reject any attempt to go backwards in time {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2}, - {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_3}, - {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_3}, - {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF3_2}, + + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF3_3}, + 
{VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF3_2}, + + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF3_2}, + + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF3_2}, + + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF3_2}, }; } @Test(dataProvider="validHeaderVersionTransitions") - public void testValidHeaderVersionTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { - doHeaderTransition(fromVersion, toVersion); + public void testAddVersionLineValidTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { + final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(fromVersion), Collections.emptySet()); + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(toVersion)); + Assert.assertEquals(vcfHeader.getVCFHeaderVersion(), toVersion); } @Test(dataProvider="invalidHeaderVersionTransitions", expectedExceptions = TribbleException.class) - public void testInvalidHeaderVersionTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { - doHeaderTransition(fromVersion, toVersion); + public void testAddVersionInvalidTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { + new VCFHeader(VCFHeader.makeHeaderVersionLineSet(fromVersion), Collections.emptySet()) + .addMetaDataLine(VCFHeader.makeHeaderVersionLine(toVersion)); + } + + @DataProvider(name = "vcfVersions") + public Object[][] vcfVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; } - private void doHeaderTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { - final VCFHeader vcfHeader = - fromVersion == null ? 
- new VCFHeader() : - new VCFHeader(fromVersion, Collections.EMPTY_SET, Collections.EMPTY_SET); - vcfHeader.setVCFHeaderVersion(toVersion); + @Test(expectedExceptions = TribbleException.class) + public void testVersionUpgradeWithValidationFailure() { + // test mixing header versions where the old version header has a line that fails validation + // using the resulting (newer) version + + // create a 4.2 header with a 4.2 style pedigree line (one that has no ID) + final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_2)); + vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFConstants.PEDIGREE_HEADER_KEY, "")); + + // now try to force a version upgrade to 4.3, old style pedigree line should cause a failure + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_3)); + } + + @Test(expectedExceptions = TribbleException.class) + public void testConstructorRequiresFileFormatLine() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // 4.2 header is compatible with all 4.x versions + // create a new header from this set (containing no fileformat line), no requested version in constructor + new VCFHeader(metaDataSet, Collections.emptySet()); //defaults to v4.2 + } + + @Test(dataProvider = "vcfVersions") + public void testConstructorWithSingleFileFormatLine(final VCFHeaderVersion vcfVersion) { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // 4.2 header is compatible with all 4.x versions + + // add in the corresponding fileformat line; create a new versioned header + // since the version requested in the constructor and the format lines are in sync, there is + // no conflict, and the resulting header's version should always match the requested version + metaDataSet.add(VCFHeader.makeHeaderVersionLine(vcfVersion)); + final VCFHeader vcfHeader = new VCFHeader(metaDataSet, Collections.emptySet()); + 
Assert.assertEquals(vcfHeader.getVCFHeaderVersion(), vcfVersion); } @Test - public void testVCFHeaderSerialization() throws Exception { - final VCFFileReader reader = new VCFFileReader(new File("src/test/resources/htsjdk/variant/HiSeq.10000.vcf"), false); - final VCFHeader originalHeader = reader.getFileHeader(); - reader.close(); + public void testConstructorWithMultipleFileFormatLines() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // this (4.2) header is compatible with all 4.x versions + final int beforeSize = metaDataSet.size(); - final VCFHeader deserializedHeader = TestUtil.serializeAndDeserialize(originalHeader); + // multiple version lines will be ignored, with only the last one retained + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_1)); + Assert.assertEquals(metaDataSet.size(), beforeSize + 2); - Assert.assertEquals(deserializedHeader.getMetaDataInInputOrder(), originalHeader.getMetaDataInInputOrder(), "Header metadata does not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getContigLines(), originalHeader.getContigLines(), "Contig header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getFilterLines(), originalHeader.getFilterLines(), "Filter header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getFormatHeaderLines(), originalHeader.getFormatHeaderLines(), "Format header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getIDHeaderLines(), originalHeader.getIDHeaderLines(), "ID header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getInfoHeaderLines(), originalHeader.getInfoHeaderLines(), "Info header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getOtherHeaderLines(), 
originalHeader.getOtherHeaderLines(), "Other header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getGenotypeSamples(), originalHeader.getGenotypeSamples(), "Genotype samples not the same before/after serialization"); - Assert.assertEquals(deserializedHeader.samplesWereAlreadySorted(), originalHeader.samplesWereAlreadySorted(), "Sortedness of samples not the same before/after serialization"); - Assert.assertEquals(deserializedHeader.getSampleNamesInOrder(), originalHeader.getSampleNamesInOrder(), "Sorted list of sample names in header not the same before/after serialization"); - Assert.assertEquals(deserializedHeader.getSampleNameToOffset(), originalHeader.getSampleNameToOffset(), "Sample name to offset map not the same before/after serialization"); - Assert.assertEquals(deserializedHeader.toString(), originalHeader.toString(), "String representation of header not the same before/after serialization"); + // create a new versioned header from this set + final VCFHeader vcfHeader = new VCFHeader(metaDataSet, Collections.emptySet()); + Assert.assertEquals(vcfHeader.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_1); + } + + @Test(expectedExceptions = TribbleException.VersionValidationFailure.class) + public void testConstructorWithInvalidLineForVersion() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // this (4.2) header is compatible with all 4.x versions + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); + final Map attributes = new LinkedHashMap<>(); + attributes.put("ID", "id"); + metaDataSet.add(new VCFPedigreeHeaderLine(attributes)); + new VCFHeader(metaDataSet, Collections.emptySet()); + } + + @Test(expectedExceptions = TribbleException.VersionValidationFailure.class) + public void testAddMetaDataLineInvalidForVersion() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // this (4.2) header is compatible with all 
4.x versions + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); + final VCFHeader header = new VCFHeader(metaDataSet, Collections.emptySet()); + Assert.assertEquals(header.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_2); + final Map attributes = new LinkedHashMap<>(); + attributes.put("ID", "id"); + header.addMetaDataLine(new VCFPedigreeHeaderLine(attributes)); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddMetaDataLineWithValidationFailure() { + // create a 4.3 header, and then try to add an old-style pedigree line (one that has no ID) + // which should cause a failure + final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_3)); + vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFConstants.PEDIGREE_HEADER_KEY, "")); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddMetaDataLineFileFormat() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // this (4.2) header is compatible with all 4.x versions + final int beforeSize = metaDataSet.size(); + + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_1)); + Assert.assertEquals(metaDataSet.size(), beforeSize + 1); + + // create a new versioned header from this set + final VCFHeader vcfHeader = new VCFHeader(metaDataSet, Collections.emptySet()); + Assert.assertEquals(vcfHeader.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_1); + + // add a new line that uses the same header version already established + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_1)); + + // add a new line that tries to move the version forward + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); + + // now try to go backwards (throws) + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_1)); + } + + @Test + public void testFileFormatLineFirstInSet() { + final Set 
orderedLineSet = new LinkedHashSet<>(VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString()); + orderedLineSet.forEach(l -> Assert.assertFalse(VCFHeaderVersion.isFormatString(l.getKey()))); + // add the file format line last + orderedLineSet.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + final VCFHeader vcfHeader = new VCFHeader(orderedLineSet, Collections.emptySet()); + + final Collection inputOrderLines = vcfHeader.getMetaDataInInputOrder(); + final Optional optFirstInputOrderLine = inputOrderLines.stream().findFirst(); + Assert.assertTrue(optFirstInputOrderLine.isPresent()); + Assert.assertTrue(VCFHeaderVersion.isFormatString(optFirstInputOrderLine.get().getKey())); + + final Collection sortedOrderLines = vcfHeader.getMetaDataInSortedOrder(); + final Optional optFirstSortedOrderLine = sortedOrderLines.stream().findFirst(); + Assert.assertTrue(optFirstSortedOrderLine.isPresent()); + Assert.assertTrue(VCFHeaderVersion.isFormatString(optFirstSortedOrderLine.get().getKey())); } + @Test + public void testPreserveSequenceDictionaryAttributes() { + // Round trip a SAMSequenceDictionary with attributes, through a VCFHeader, and back + // to a SAMSequenceDictionary with the same attributes. 
+ // https://github.com/samtools/htsjdk/issues/730 + + final String assemblyString = "hg37"; + final String md5String = "68b329da9893e34099c7d8ad5cb9c940"; + final String speciesString = "Home Sapiens"; + final String urlString = "http://www.refserve.org:8080/path/"; + + final SAMSequenceDictionary samDict = new SAMSequenceDictionary(); + + final SAMSequenceRecord seqRec1 = new SAMSequenceRecord("1", 1); + seqRec1.setAssembly(assemblyString); + seqRec1.setMd5(md5String); + seqRec1.setAttribute(SAMSequenceRecord.URI_TAG, urlString); + seqRec1.setSpecies(speciesString); + final SAMSequenceRecord seqRec2 = new SAMSequenceRecord("2", 1); + samDict.addSequence(seqRec1); + samDict.addSequence(seqRec2); + + final VCFHeader vcfHeader = new VCFHeader(); + vcfHeader.setSequenceDictionary(samDict); + final SAMSequenceDictionary roundTrippedDict = vcfHeader.getSequenceDictionary(); + + final SAMSequenceRecord rtRec1 = roundTrippedDict.getSequence("1"); + Assert.assertEquals(assemblyString, rtRec1.getAssembly()); + Assert.assertEquals(md5String, rtRec1.getMd5()); + Assert.assertEquals(urlString, rtRec1.getAttribute(SAMSequenceRecord.URI_TAG)); + Assert.assertEquals(speciesString, rtRec1.getSpecies()); + + Assert.assertEquals(seqRec1, roundTrippedDict.getSequence("1")); // somewhat redundant check on full record + Assert.assertEquals(seqRec2, roundTrippedDict.getSequence("2")); + } + + ///////////////////////////////////////////////////////////////// + ////////////////************************* End new tests block... 
+ ///////////////////////////////////////////////////////////////// + @Test public void testVCFHeaderQuoteEscaping() throws Exception { // this test ensures that the end-to-end process of quote escaping is stable when headers are @@ -449,10 +688,9 @@ public void testVCFHeaderQuoteEscaping() throws Exception { final VCFHeader originalHeader = originalFileReader.getFileHeader(); // add a header line with quotes to the header - final Map attributes = new LinkedHashMap<>(); - attributes.put("ID", "VariantFiltration"); - attributes.put("CommandLineOptions", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); - final VCFSimpleHeaderLine addedHeaderLine = new VCFSimpleHeaderLine("GATKCommandLine.Test", attributes); + final VCFSimpleHeaderLine addedHeaderLine = new VCFFilterHeaderLine( + "FakeFilter", + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); originalHeader.addMetaDataLine(addedHeaderLine); final VCFFilterHeaderLine originalCopyAnnotationLine1 = originalHeader.getFilterHeaderLine("ANNOTATION"); @@ -485,9 +723,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { .setOptions(EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Options.INDEX_ON_THE_FLY)) .build(); firstCopyWriter.writeHeader(originalHeader); - final CloseableIterator firstCopyVariantIterator = originalFileReader.iterator(); - while (firstCopyVariantIterator.hasNext()) { - VariantContext variantContext = firstCopyVariantIterator.next(); + for (final VariantContext variantContext : originalFileReader) { firstCopyWriter.add(variantContext); } originalFileReader.close(); @@ -496,7 +732,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { // read the copied file back in final VCFFileReader firstCopyReader = new VCFFileReader(firstCopyVCFFile, false); final VCFHeader firstCopyHeader = firstCopyReader.getFileHeader(); - final VCFHeaderLine firstCopyNewHeaderLine = 
firstCopyHeader.getOtherHeaderLine("GATKCommandLine.Test"); + final VCFFilterHeaderLine firstCopyNewHeaderLine = firstCopyHeader.getFilterHeaderLine("FakeFilter"); Assert.assertNotNull(firstCopyNewHeaderLine); final VCFFilterHeaderLine firstCopyAnnotationLine1 = firstCopyHeader.getFilterHeaderLine("ANNOTATION"); @@ -528,9 +764,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { .setOptions(EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Options.INDEX_ON_THE_FLY)) .build(); secondCopyWriter.writeHeader(firstCopyHeader); - final CloseableIterator secondCopyVariantIterator = firstCopyReader.iterator(); - while (secondCopyVariantIterator.hasNext()) { - VariantContext variantContext = secondCopyVariantIterator.next(); + for (final VariantContext variantContext : firstCopyReader) { secondCopyWriter.add(variantContext); } secondCopyWriter.close(); @@ -539,7 +773,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { final VCFFileReader secondCopyReader = new VCFFileReader(secondCopyVCFFile, false); final VCFHeader secondCopyHeader = secondCopyReader.getFileHeader(); - final VCFHeaderLine secondCopyNewHeaderLine = secondCopyHeader.getOtherHeaderLine("GATKCommandLine.Test"); + final VCFFilterHeaderLine secondCopyNewHeaderLine = secondCopyHeader.getFilterHeaderLine("FakeFilter"); Assert.assertNotNull(secondCopyNewHeaderLine); final VCFFilterHeaderLine secondCopyAnnotationLine1 = secondCopyHeader.getFilterHeaderLine("ANNOTATION"); @@ -549,8 +783,8 @@ public void testVCFHeaderQuoteEscaping() throws Exception { Assert.assertNotNull(secondCopyAnnotationLine2); Assert.assertEquals(firstCopyNewHeaderLine, secondCopyNewHeaderLine); - Assert.assertEquals(firstCopyNewHeaderLine.toStringEncoding(), "GATKCommandLine.Test="); - Assert.assertEquals(secondCopyNewHeaderLine.toStringEncoding(), "GATKCommandLine.Test="); + Assert.assertEquals(firstCopyNewHeaderLine.toStringEncoding(), "FILTER="); + Assert.assertEquals(secondCopyNewHeaderLine.toStringEncoding(), 
"FILTER="); Assert.assertEquals(firstCopyAnnotationLine1, secondCopyAnnotationLine1); Assert.assertEquals(secondCopyAnnotationLine1.getGenericFieldValue("Description"), "ANNOTATION != \"NA\" || ANNOTATION <= 0.01"); @@ -574,12 +808,34 @@ public void testVCFHeaderQuoteEscaping() throws Exception { } + ///////////////////////////////////////////////////////////////////// + // Private helper methods + ///////////////////////////////////////////////////////////////////// + + // Serialize/encode the header to a file, read metaData back in + private static Set getRoundTripEncoded(final VCFHeader header) throws IOException { + final File myTempFile = File.createTempFile("VCFHeader", "vcf"); + try (final VariantContextWriter vcfWriter = new VariantContextWriterBuilder() + .setOutputFile(myTempFile) + .setOutputFileType(VariantContextWriterBuilder.OutputType.VCF) + .setOptions(VariantContextWriterBuilder.NO_OPTIONS) + .build() + ) { + vcfWriter.writeHeader(header); + } + final VCFCodec codec = new VCFCodec(); + codec.setVersionUpgradePolicy(VCFVersionUpgradePolicy.DO_NOT_UPGRADE); + final VCFHeader vcfHeader = (VCFHeader) codec.readActualHeader(new LineIteratorImpl( + new SynchronousLineReader(new FileReader(myTempFile.getAbsolutePath())))); + return vcfHeader.getMetaDataInSortedOrder(); + } + @Test public void testVcf42Roundtrip() throws Exception { // this test ensures that source/version fields are round-tripped properly // read an existing VCF - File expectedFile = new File("src/test/resources/htsjdk/variant/Vcf4.2WithSourceVersionInfoFields.vcf"); + final File expectedFile = new File("src/test/resources/htsjdk/variant/Vcf4.2WithSourceVersionInfoFields.vcf"); // write the file out into a new copy final File actualFile = File.createTempFile("testVcf4.2roundtrip.", FileExtensions.VCF); @@ -593,7 +849,7 @@ public void testVcf42Roundtrip() throws Exception { .build() ) { final VCFHeader originalHeader = originalFileReader.getFileHeader(); - + 
copyWriter.writeHeader(originalHeader); for (final VariantContext variantContext : originalFileReader) { copyWriter.add(variantContext); @@ -602,108 +858,138 @@ public void testVcf42Roundtrip() throws Exception { final String actualContents = new String(Files.readAllBytes(actualFile.toPath()), StandardCharsets.UTF_8); final String expectedContents = new String(Files.readAllBytes(expectedFile.toPath()), StandardCharsets.UTF_8); - Assert.assertEquals(actualContents, expectedContents); + Assert.assertEquals(actualContents.substring(actualContents.indexOf('\n')), expectedContents.substring(actualContents.indexOf('\n'))); } + private static final int VCF4headerStringCount = 16; // 17 -1 for the #CHROM... line - /** - * a little utility function for all tests to md5sum a file - * Shameless taken from: - *

    - * http://www.javalobby.org/java/forums/t84420.html - * - * @param file the file - * @return a string - */ - private static String md5SumFile(File file) { - MessageDigest digest; - try { - digest = MessageDigest.getInstance("MD5"); - } catch (NoSuchAlgorithmException e) { - throw new RuntimeException("Unable to find MD5 digest"); - } - InputStream is; - try { - is = new FileInputStream(file); - } catch (FileNotFoundException e) { - throw new RuntimeException("Unable to open file " + file); - } - byte[] buffer = new byte[8192]; - int read; - try { - while ((read = is.read(buffer)) > 0) { - digest.update(buffer, 0, read); - } - byte[] md5sum = digest.digest(); - BigInteger bigInt = new BigInteger(1, md5sum); - return bigInt.toString(16); - - } catch (IOException e) { - throw new RuntimeException("Unable to process file for MD5", e); - } finally { - try { - is.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close input stream for MD5 calculation", e); - } - } + + private static VCFHeader getHiSeqVCFHeader() { + final File vcf = new File("src/test/resources/htsjdk/variant/HiSeq.10000.vcf"); + final VCFFileReader reader = new VCFFileReader(vcf, false); + final VCFHeader header = reader.getFileHeader(); + reader.close(); + return header; } - private void checkMD5ofHeaderFile(VCFHeader header, String md5sum) { - File myTempFile = null; - PrintWriter pw = null; - try { - myTempFile = File.createTempFile("VCFHeader", "vcf"); - myTempFile.deleteOnExit(); - pw = new PrintWriter(myTempFile); - } catch (IOException e) { - Assert.fail("Unable to make a temp file!"); - } - for (VCFHeaderLine line : header.getMetaDataInSortedOrder()) - pw.println(line); - pw.close(); - Assert.assertEquals(md5SumFile(myTempFile), md5sum); - } - - public static final int VCF4headerStringCount = 16; - - public static final String VCF4headerStrings = - "##fileformat=VCFv4.2\n" + - "##filedate=2010-06-21\n" + - "##reference=NCBI36\n" + - "##INFO=\n" + - "##INFO=\n" + - 
"##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##FILTER=\n" + - "##FORMAT=\n" + - "##FORMAT=\n" + - "##FORMAT=\n" + - "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; - - - public static final String VCF4headerStrings_with_negativeOne = - "##fileformat=VCFv4.2\n" + - "##filedate=2010-06-21\n" + - "##reference=NCBI36\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##FILTER=\n" + - "##FORMAT=\n" + - "##FORMAT=\n" + - "##FORMAT=\n" + - "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + private static Collection asCollectionOfVCFHeaderLine(final Collection headers) { + // create a collection of VCFHeaderLine so that contains tests work correctly + return headers.stream().map(h -> (VCFHeaderLine) h).collect(Collectors.toList()); + } + + @DataProvider(name="duplicateHeaderLineCases") + private Object[][] getDuplicateHeaderLineCases() { + return new Object[][] { + + // these tests use VCFAltHeaderLine to test structured/ID lines, but the behavior should be the same + // for any header ID line + + // duplicate IDs, duplicate description; line is dropped due to duplicate ID + { new VCFAltHeaderLine("X", "description1"), + new VCFAltHeaderLine("X", "description1"), false }, + // duplicate IDs, different descriptions; line is dropped due to duplicate ID + { new VCFAltHeaderLine("X", "description1"), + new VCFAltHeaderLine("X", "description2"), false }, + // different IDs, different descriptions; line is retained + { new VCFAltHeaderLine("X", "description1"), + new VCFAltHeaderLine("Y", "description2"), true }, + // different IDs, duplicate descriptions; line is retained + { new VCFAltHeaderLine("X", "description"), + new VCFAltHeaderLine("Y", "description"), true }, + + // .......unstructured header lines........ 
+ + // duplicate key, duplicate value, line is dropped + { new VCFHeaderLine("CommandLine", "command"), new VCFHeaderLine("CommandLine", "command"), false }, + // duplicate key, different value, line is retained + { new VCFHeaderLine("CommandLine", "command1"), new VCFHeaderLine("CommandLine", "command2"), true }, + + /////////////////////////////////////////////////////////////////////////////////////////// + // since the VCFHeaderLine constructor is public, it can be used erroneously to model header + // lines that have structured syntax, but which will not obey structured header line rules, + // since those are enabled via VCFSimpleHeaderLine, and VCFHeaderLine is intended to be used + // for non-structured lines. so include some tests that simulate this + + // duplicate key, duplicate value (...duplicate ID), line is dropped + { new VCFHeaderLine("KEY", ""), new VCFHeaderLine("KEY", ""), false }, + // duplicate key, different value (different ID), line is retained + { new VCFHeaderLine("KEY", ""), new VCFHeaderLine("KEY", ""), true }, + + //NOTE: this case illustrates how its possible to use the API to cause two structured lines + // with duplicate IDs to be retained if they are not modeled as VCFStructuredHeaderLines + // duplicate key, different value (but IDENTICAL ID), line is RETAINED + { new VCFHeaderLine("KEY", ""), new VCFHeaderLine("KEY", ""), true }, + + // different key, duplicate value, line is retained + { new VCFHeaderLine("KEY1", ""), new VCFHeaderLine("KEY2", ""), true }, + // different key, different value, line is retained + { new VCFHeaderLine("KEY1", ""), new VCFHeaderLine("KEY2", ""), true }, + }; + } + + @Test(dataProvider = "duplicateHeaderLineCases") + private void testDuplicateHeaderLine(final VCFHeaderLine hl1, final VCFHeaderLine hl2, final boolean expectHL2Retained) { + final Set lineSet = VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_2); + lineSet.add(hl1); + lineSet.add(hl2); + final VCFHeader vcfHeader = new 
VCFHeader(lineSet); + + Assert.assertEquals(vcfHeader.getMetaDataInInputOrder().size(), expectHL2Retained ? 3 : 2); + } + + @Test + public void testAddOtherHeaderLineUnique() { + final String TEST_KEY = "testKey"; + final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION)); + final VCFHeaderLine otherLine1 = new VCFHeaderLine(TEST_KEY, "Test Value 1"); + vcfHeader.addMetaDataLine(otherLine1); + final List otherLines1 = vcfHeader.getOtherHeaderLines(TEST_KEY); + Assert.assertEquals(otherLines1.size(), 1); + Assert.assertTrue(otherLines1.contains(otherLine1)); + + // now add a second line + final VCFHeaderLine otherLine2 = new VCFHeaderLine(TEST_KEY, "Test Value 2"); + vcfHeader.addMetaDataLine(otherLine2); + final List otherLines2 = vcfHeader.getOtherHeaderLines(TEST_KEY); + Assert.assertEquals(otherLines2.size(), 2); + Assert.assertTrue(otherLines2.contains(otherLine1)); + Assert.assertTrue(otherLines2.contains(otherLine2)); + + // now call addOtherHeaderLineUnique with a 3rd line, the first two should be removed + final VCFHeaderLine otherLine3= new VCFHeaderLine(TEST_KEY, "Test Value 3"); + vcfHeader.addOtherHeaderLineUnique(otherLine3); + final List otherLines3 = vcfHeader.getOtherHeaderLines(TEST_KEY); + Assert.assertEquals(otherLines3.size(), 1); + Assert.assertFalse(otherLines3.contains(otherLine1)); + Assert.assertFalse(otherLines3.contains(otherLine2)); + Assert.assertTrue(otherLines3.contains(otherLine3)); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddOtherHeaderLineUniqueRejectsIDLines() { + final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION)); + final VCFSimpleHeaderLine simpleHeaderLine = new VCFSimpleHeaderLine("testKey", "testID","test description"); + vcfHeader.addOtherHeaderLineUnique(simpleHeaderLine); + } + + @Test(expectedExceptions = TribbleException.class) + public void testGetOtherHeaderLineUnique() { + 
final String TEST_KEY = "testKey"; + final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION)); + + // now add two lines with the same key + final VCFHeaderLine otherLine1 = new VCFHeaderLine(TEST_KEY, "Test Value 1"); + vcfHeader.addMetaDataLine(otherLine1); + final VCFHeaderLine otherLine2 = new VCFHeaderLine(TEST_KEY, "Test Value 2"); + vcfHeader.addMetaDataLine(otherLine2); + + final List otherLines = vcfHeader.getOtherHeaderLines(TEST_KEY); + Assert.assertEquals(otherLines.size(), 2); + Assert.assertTrue(otherLines.contains(otherLine1)); + Assert.assertTrue(otherLines.contains(otherLine2)); + + // now call getOtherHeaderLineUnique, should throw + vcfHeader.getOtherHeaderLineUnique(TEST_KEY); + } } diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java new file mode 100644 index 0000000000..6c197f1c30 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java @@ -0,0 +1,205 @@ +package htsjdk.variant.vcf; + +import htsjdk.tribble.TribbleException; +import htsjdk.tribble.readers.LineIteratorImpl; +import htsjdk.tribble.readers.SynchronousLineReader; +import htsjdk.variant.variantcontext.writer.VCFVersionUpgradePolicy; +import org.testng.Assert; + +import java.io.StringReader; +import java.util.*; + +// Unit test data used by unit tests for VCFHeader, VCFMetaDataLines, and VCFHeaderLine hierarchy. 
+public class VCFHeaderUnitTestData { + public final static VCFHeaderVersion TEST_VERSION = VCFHeader.DEFAULT_VCF_VERSION; + + // fileformat line + public static List getTestDefaultFileFormatLine() { + return new ArrayList() {{ + add(VCFHeader.makeHeaderVersionLine(TEST_VERSION)); + }}; + } + + // FILTER lines + public static List getTestFilterLines() { + return new ArrayList() {{ + add(new VCFFilterHeaderLine("LowQual", "Description=\"Low quality\"")); + add(new VCFFilterHeaderLine("highDP", "Description=\"DP < 8\"")); + add(new VCFFilterHeaderLine("TruthSensitivityTranche98.50to98.80", "Truth sensitivity tranche level at VSQ Lod: -0.1106 <= x < 0.6654")); + }}; + } + + // FORMAT lines + public static List getTestFormatLines() { + return new ArrayList() {{ + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality")); + add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); + add(new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); + add(new VCFFormatHeaderLine("MLPSAF", VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the alternate allele fraction")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter")); + }}; + } + + // INFO lines + 
public static List getTestInfoLines() { + return new ArrayList() {{ + add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); + add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); + add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); + add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); + add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); + add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed")); + add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); + add(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads")); + add(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); + add(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); + }}; + } + + // CONTIG lines + public static List getTestContigLines() { + return new ArrayList() {{ + add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0)); + add(new VCFContigHeaderLine(Collections.singletonMap("ID", "2"), 1)); + add(new VCFContigHeaderLine(Collections.singletonMap("ID", "3"), 2)); + }}; + } + + //misc lines + public static List getTestMiscellaneousLines() { + return new ArrayList() {{ + add(new VCFHeaderLine("reference", "g37")); + add(new VCFHeaderLine("GATKCommandLine", "SelectVariants and such.")); + }}; + } + + //Return a full set of metadata lines, 
retaining order in a LinkedHashSet. + public static LinkedHashSet getTestMetaDataLinesSet() { + final LinkedHashSet allHeaderLines = new LinkedHashSet() {{ //preserve order + addAll(getTestDefaultFileFormatLine()); + addAll(getTestFilterLines()); + addAll(getTestFormatLines()); + addAll(getTestInfoLines()); + addAll(getTestContigLines()); + addAll(getTestMiscellaneousLines()); + }}; + Assert.assertEquals(allHeaderLines.size(), + 1 + // file format line + getTestFilterLines().size() + getTestFormatLines().size() + + getTestInfoLines().size() + getTestContigLines().size() + getTestMiscellaneousLines().size()); + return allHeaderLines; + } + + //Return a full set of metadata lines as a VCFMetaDataLines. + public static VCFMetaDataLines getTestMetaDataLines() { + final VCFMetaDataLines md = new VCFMetaDataLines(); + md.addMetaDataLines(getTestMetaDataLinesSet()); + return md; + } + + private static final int VCF_4_HEADER_STRING_COUNT = 16; // 17 -1 for the #CHROM... line + + public static String getVCFV42TestHeaderString() { + return "##fileformat=VCFv4.2\n" + + "##filedate=2010-06-21\n" + + "##reference=NCBI36\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##FILTER=\n" + + "##FORMAT=\n" + + "##FORMAT=\n" + + "##FORMAT=\n" + + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + } + + public static final String VCF42headerStrings_with_negativeOne = + "##fileformat=VCFv4.2\n" + + "##filedate=2010-06-21\n" + + "##reference=NCBI36\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##FILTER=\n" + + "##FORMAT=\n" + + "##FORMAT=\n" + + "##FORMAT=\n" + + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + + public static Set getV42HeaderLinesWITHOUTFormatString() { + // precondition - create a v42 VCFMetaDataLines and make sure its v42 + final Set metaDataSet = 
getV42HeaderLinesWITHFormatString(); + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + metaDataLines.addMetaDataLines(metaDataSet); + final VCFHeaderLine versionLine = metaDataLines.getFileFormatLine(); + Assert.assertEquals( + metaDataLines.getVCFVersion(), + VCFHeaderVersion.VCF4_2); + + // remove the 4.2 version line from the original set, verify, and return the set with no fileformat string + metaDataSet.remove(versionLine); + Assert.assertNull(getVersionLineFromHeaderLineSet(metaDataSet)); + return metaDataSet; + } + + public static Set getV42HeaderLinesWITHFormatString() { + // precondition - create a v42 header and make sure its v42 + final VCFHeader header = createHeaderFromString(getVCFV42TestHeaderString()); + Assert.assertEquals( + header.getVCFHeaderVersion(), + VCFHeaderVersion.VCF4_2); + + // return a mutable set for test use + return new LinkedHashSet<>(header.getMetaDataInInputOrder()); + } + + public static VCFHeader createHeaderFromString(final String headerStr) { + final VCFCodec codec = new VCFCodec(); + codec.setVersionUpgradePolicy(VCFVersionUpgradePolicy.DO_NOT_UPGRADE); + final VCFHeader header = (VCFHeader) codec.readActualHeader( + new LineIteratorImpl(new SynchronousLineReader(new StringReader(headerStr)))); + Assert.assertEquals(header.getMetaDataInInputOrder().size(), VCF_4_HEADER_STRING_COUNT); + return header; + } + + /** + * Find and return the VCF fileformat/version line + * + * Return null if no fileformat/version lines are found + */ + private static VCFHeaderLine getVersionLineFromHeaderLineSet(final Set metaDataLines) { + VCFHeaderLine versionLine = null; + final List formatLines = new ArrayList<>(); + for (final VCFHeaderLine headerLine : metaDataLines) { + if (VCFHeaderVersion.isFormatString(headerLine.getKey())) { + formatLines.add(headerLine); + } + } + + if (!formatLines.isEmpty()) { + if (formatLines.size() > 1) { + //throw if there are duplicate version lines + throw new TribbleException("Multiple version 
header lines found in header line list"); + } + return formatLines.get(0); + } + + return versionLine; + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java new file mode 100644 index 0000000000..0ea2c8f1e8 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java @@ -0,0 +1,85 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +/** + * Test conditions that are unique to INFO lines (not covered by VCFCompoundHeaderLineUnitTest). + */ +public class VCFInfoHeaderLineUnitTest extends HtsjdkTest { + + @Test + public void testRepairInfoLineFlagTypeWithNonzeroCount() { + final VCFInfoHeaderLine infoLine = new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(0, infoLine.getCount()); + } + + @DataProvider(name = "mergeCompatibleInfoLines") + public Object[][] getMergeCompatibleInfoLines() { + return new Object[][]{ + { + new VCFInfoHeaderLine("INFO=", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", VCFHeader.DEFAULT_VCF_VERSION) + } + }; + } + + @Test(dataProvider = "mergeCompatibleInfoLines") + public void testMergeCompatibleInfoLines( + final VCFInfoHeaderLine infoHeaderLine1, + final VCFInfoHeaderLine infoHeaderLine2, + final VCFInfoHeaderLine expectedHeaderLine) { + Assert.assertEquals( + VCFInfoHeaderLine.getMergedInfoHeaderLine( + infoHeaderLine1, + infoHeaderLine2, + new VCFHeaderMerger.HeaderMergeConflictWarnings(true)), + expectedHeaderLine); + } + + @DataProvider(name = "mergeIncompatibleInfoLines") + public Object[][] getMergeIncompatibleInfoLines() { + return new Object[][]{ + // 2 lines to merge, expected result + { + // mixed number AND number type (multiple different 
attributes) + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION) + }, + { + // mixed number AND number type (multiple different attributes), reverse direction + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION) + } + }; + } + + @Test + public void testAllow1000GKey() { + final VCFInfoHeaderLine line = new VCFInfoHeaderLine( + "INFO=", + VCFHeader.DEFAULT_VCF_VERSION + ); + + Assert.assertFalse(line.getValidationFailure(VCFHeader.DEFAULT_VCF_VERSION).isPresent()); + } + + @Test(dataProvider = "mergeIncompatibleInfoLines", expectedExceptions= TribbleException.class) + public void testMergeIncompatibleInfoLines( + final VCFInfoHeaderLine infoHeaderLine1, + final VCFInfoHeaderLine infoHeaderLine2) { + VCFInfoHeaderLine.getMergedInfoHeaderLine( + infoHeaderLine1, + infoHeaderLine2, + new VCFHeaderMerger.HeaderMergeConflictWarnings(true)); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFIteratorTest.java b/src/test/java/htsjdk/variant/vcf/VCFIteratorTest.java index 02ccdb2a33..4030c180e2 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFIteratorTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFIteratorTest.java @@ -50,21 +50,13 @@ public class VCFIteratorTest extends VariantBaseTest { @DataProvider(name = "VariantFiles") public Object[][] getVariantFiles() { - return new Object[][] { + return new Object[][] { new Object[] { "src/test/resources/htsjdk/tribble/tabix/testTabixIndex.vcf", 25 }, new Object[] { "src/test/resources/htsjdk/tribble/tabix/testTabixIndex.vcf.gz", 25 }, new Object[] { "src/test/resources/htsjdk/variant/serialization_test.bcf", 12 } }; } - @DataProvider(name = "VcfFiles") - public Object[][] getVcfFiles() { - return new Object[][] { - new Object[] { "src/test/resources/htsjdk/tribble/tabix/testTabixIndex.vcf", 25 }, - new Object[] { 
"src/test/resources/htsjdk/tribble/tabix/testTabixIndex.vcf.gz", 25 } - }; - } - private void assertExpectedNumberOfVariants(final VCFIterator r, final int expectVariants) { try { Assert.assertNotNull(r.getHeader()); @@ -91,32 +83,34 @@ public void testUsingFile(final String file, final int nVariants) throws IOExcep private void testUsingZippedInput(final String filepath, final int nVariants, final Function outputStreamProvider) throws IOException { File tmp = new File(filepath); + // TODO I don't understand what problem the comment below is referencing + // Does it mean the code paths for handling zipped/unzipped BCFs should be unified + // under VCFFileReader once VCFFileReader supports zipped BCF? + /* TODO fix this when VCFFileReader will support BCF see * https://github.com/samtools/htsjdk/pull/837#discussion_r139490218 * https://github.com/samtools/htsjdk/issues/946 */ - if( tmp.getName().endsWith(FileExtensions.VCF)) { + if(!(tmp.getName().endsWith(FileExtensions.COMPRESSED_VCF) || tmp.getName().endsWith(FileExtensions.BCF))) { tmp = File.createTempFile("tmp",FileExtensions.COMPRESSED_VCF); tmp.deleteOnExit(); try( FileInputStream in = new FileInputStream(filepath); OutputStream out = outputStreamProvider.apply(tmp); ) { IOUtil.copyStream(in, out); out.flush(); - } catch(final IOException err) { - throw err; - } } + } try (final VCFIterator r = new VCFIteratorBuilder().open(tmp) ) { assertExpectedNumberOfVariants(r, nVariants); } } - @Test(dataProvider = "VcfFiles") + @Test(dataProvider = "VariantFiles") public void testUsingBGZippedInput(final String filepath, final int nVariants) throws IOException { testUsingZippedInput(filepath, nVariants, (F)-> new BlockCompressedOutputStream(F)); } - @Test(dataProvider = "VcfFiles") + @Test(dataProvider = "VariantFiles") public void testUsingGZippedInput(final String filepath, final int nVariants) throws IOException { testUsingZippedInput(filepath, nVariants, (F)-> { try { diff --git 
a/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java new file mode 100644 index 0000000000..f88f0fd0ba --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java @@ -0,0 +1,389 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class VCFMetaDataLinesUnitTest extends HtsjdkTest { + + @DataProvider(name="keyCollisions") + public Object[][] keyCollisions() { + return new Object[][] { + // line 1, line 2, expected to collide + + // Unstructured key collisions + { // same key, same value + new VCFHeaderLine("key", "value"), + new VCFHeaderLine("key", "value"), true + }, + { // same key, different value + new VCFHeaderLine("key", "value"), + new VCFHeaderLine("key", "value1"), false + }, + { // different key, same value + new VCFHeaderLine("key1", "value"), + new VCFHeaderLine("key2", "value"), false + }, + { // different key, different value + new VCFHeaderLine("key1", "value1"), + new VCFHeaderLine("key2", "value2"), false + }, + + // Structured key collisions + { // same key, same ID, same (base VCFSimpleHeaderLine) class + new VCFSimpleHeaderLine("FILTER", Collections.singletonMap("ID", "id")), + new VCFSimpleHeaderLine("FILTER", Collections.singletonMap("ID", "id")), true + }, + { // same key, same ID, same (derived-VCFSimpleHeaderLine) class, same attributes + new VCFFilterHeaderLine("filterName", "unused description"), + new VCFFilterHeaderLine("filterName", "unused description"), true + }, + { // same key, same ID, same class, different attributes + new VCFFilterHeaderLine("filterName", "unused description"), + new VCFFilterHeaderLine("filterName", "different unused description"), true + }, + { // same key, different ID + new VCFFilterHeaderLine("filterName", "unused 
description"), + new VCFFilterHeaderLine("filterName2", "unused description"), false + }, + { // This is an unfortunate case that is allowed by the existing permissive VCFHeader + // APIs; two header lines that have identical content, one of which is modeled by the + // VCFSimpleHeaderLine base class, and one of which is modeled by the specialized , + // derived VCFFilterHeaderLine class + new VCFFilterHeaderLine("id", "unused description"), + new VCFSimpleHeaderLine("FILTER", new LinkedHashMap() {{ + put("ID", "id"); + put("Description", "unused description"); + }}), true } + }; + } + + @Test(dataProvider="keyCollisions") + public void testKeyCollisions(final VCFHeaderLine line1, final VCFHeaderLine line2, final boolean expectCollision) { + final VCFMetaDataLines mdLines = new VCFMetaDataLines(); + mdLines.addMetaDataLine(line1); + mdLines.addMetaDataLine(line2); + Assert.assertEquals(mdLines.getMetaDataInInputOrder().size(), expectCollision ? 1 : 2); + } + + @DataProvider(name = "contigALTCollisions") + public Object[][] contigALTCollisions() { + return new Object[][] { + { + new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 0), new VCFAltHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION) + }, + { + new VCFAltHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 0) + }, + }; + } + + @Test(dataProvider = "contigALTCollisions", expectedExceptions = IllegalStateException.class) + public void testContigALTCollision(final VCFHeaderLine line1, final VCFHeaderLine line2) { + final VCFMetaDataLines mdLines = new VCFMetaDataLines(); + mdLines.addMetaDataLine(line1); + mdLines.addMetaDataLine(line2); + } + + @Test + public void testRetainFullHeaderLines() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + + Assert.assertEquals(md.getMetaDataInInputOrder().size(), unitTestData.getTestMetaDataLinesSet().size()); + 
Assert.assertEquals(md.getMetaDataInSortedOrder().size(), unitTestData.getTestMetaDataLinesSet().size()); + + Assert.assertEquals(unitTestData.getTestFormatLines(), md.getFormatHeaderLines()); + Assert.assertEquals(unitTestData.getTestFilterLines(), md.getFilterLines()); + Assert.assertEquals(unitTestData.getTestInfoLines(), md.getInfoHeaderLines()); + Assert.assertEquals(unitTestData.getTestContigLines(), md.getContigLines()); + Assert.assertEquals(unitTestData.getTestFilterLines(), md.getFilterLines()); + + final Set otherLines = new LinkedHashSet<>(); + otherLines.addAll(unitTestData.getTestDefaultFileFormatLine()); + otherLines.addAll(unitTestData.getTestMiscellaneousLines()); + Assert.assertEquals(otherLines, md.getOtherHeaderLines()); + } + + @Test + public void testAddRemoveOtherMetaDataLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + + int beforeAllSize = md.getMetaDataInInputOrder().size(); + int beforeStructuredSize = md.getIDHeaderLines().size(); + int beforeOtherSize = md.getOtherHeaderLines().size(); + + final VCFHeaderLine newLine = new VCFHeaderLine("foo", "bar"); + + // add one other line + md.addMetaDataLine(newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize); // remains the same + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize + 1); + + // remove the other line and we're back to original size + Assert.assertEquals(md.removeMetaDataLine(newLine), newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize); // still remains the same + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize); + } + + @Test + public void testAddRemoveUniqueStructuredLine() { + final VCFHeaderUnitTestData unitTestData = new 
VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + + final int beforeAllSize = md.getMetaDataInInputOrder().size(); + final int beforeStructuredSize = md.getIDHeaderLines().size(); + final int beforeFilterSize = md.getFilterLines().size(); + final int beforeOtherSize = md.getOtherHeaderLines().size(); + + // add a new, unique, structured line + final VCFFilterHeaderLine newLine = new VCFFilterHeaderLine("filterID", "unused desc"); + md.addMetaDataLine(newLine); + + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize + 1); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize + 1); + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize); // remains the same + + // remove the new line and we're back to original size + Assert.assertEquals(md.removeMetaDataLine(newLine), newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize); + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize); // still remains the same + } + + @Test + public void testAddRemoveDuplicateStructuredLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + + final int beforeAllSize = md.getMetaDataInInputOrder().size(); + final int beforeStructuredSize = md.getIDHeaderLines().size(); + final int beforeFilterSize = md.getFilterLines().size(); + + // add a new, unique, structured (filter) line + final VCFFilterHeaderLine newLine = new VCFFilterHeaderLine("filterID", "unused desc"); + md.addMetaDataLine(newLine); + + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getIDHeaderLines().size(), 
beforeStructuredSize + 1); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize + 1); + + // now try to re-add the same structured filter line again, this second one is rejected, count remains the same + md.addMetaDataLine(newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize + 1); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize + 1); + Assert.assertEquals(md.getFilterHeaderLine("filterID"), newLine); + + // remove the first structured line and we're back to the original size + Assert.assertEquals(md.removeMetaDataLine(newLine), newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize); + } + + @Test + public void testHasEquivalentHeaderLinePositive() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines sourceMetaDataLines = unitTestData.getTestMetaDataLines(); + + // for each headerLine in the set, make sure findEquivalentHeaderLine returns it + for (final VCFHeaderLine headerLine : sourceMetaDataLines.getMetaDataInInputOrder()) { + final VCFHeaderLine equivalentLine = sourceMetaDataLines.findEquivalentHeaderLine(headerLine); + Assert.assertTrue(equivalentLine.equals(headerLine)); + } + } + + @Test + public void testHasEquivalentHeaderLineNegative() { + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + // add a few test lines + metaDataLines.addMetaDataLine(new VCFHeaderLine("testkey1", "test value")); + metaDataLines.addMetaDataLine(new VCFHeaderLine("testkey1", "other value")); + metaDataLines.addMetaDataLine(new VCFHeaderLine("reference", "assembly37")); + + // for each other headerLine in the starting set, make another header line with the same key but a different + // value, and ensure 
findEquivalentHeaderLine does NOT return it + for (final VCFHeaderLine headerLine : metaDataLines.getMetaDataInInputOrder()) { + final VCFHeaderLine equivalentLine = metaDataLines.findEquivalentHeaderLine(headerLine); + Assert.assertTrue(equivalentLine.equals(headerLine)); + + final VCFHeaderLine modifiedHeaderLine = new VCFHeaderLine(headerLine.getKey(), headerLine.getValue() + "zzz"); + Assert.assertNull(metaDataLines.findEquivalentHeaderLine(modifiedHeaderLine)); + } + } + + @Test + public void testGetFilterHeaderLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + Assert.assertEquals(md.getFilterHeaderLine(unitTestData.getTestFilterLines().get(0).getID()), unitTestData.getTestFilterLines().get(0)); + } + + @Test + public void testGetInfoHeaderLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + Assert.assertEquals(md.getInfoHeaderLine(unitTestData.getTestInfoLines().get(0).getID()), unitTestData.getTestInfoLines().get(0)); + } + + @Test + public void testGetFormatHeaderLine() { + final VCFHeaderUnitTestData testData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = testData.getTestMetaDataLines(); + Assert.assertEquals(md.getFormatHeaderLine(testData.getTestFormatLines().get(0).getID()), testData.getTestFormatLines().get(0)); + } + + @Test + public void testAddRemoveVersionLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + Assert.assertEquals(md.getVCFVersion(), unitTestData.TEST_VERSION); + + final int originalMetaDataLineCount = md.getMetaDataInInputOrder().size(); + + // now, remove the version line, make sure the removed line is actually the version line, that the + // resulting metadataLines version is now null, and the line count drops by 1 + final 
VCFHeaderLine queryVersionLine = VCFHeader.makeHeaderVersionLine(unitTestData.TEST_VERSION); + final VCFHeaderLine oldVersionLine = md.removeMetaDataLine(queryVersionLine); + Assert.assertEquals(oldVersionLine, queryVersionLine); + Assert.assertNull(md.getVCFVersion()); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), originalMetaDataLineCount - 1); + + // now put it back... + md.addMetaDataLine(oldVersionLine); + Assert.assertEquals(md.getVCFVersion(), unitTestData.TEST_VERSION); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), originalMetaDataLineCount); + } + + @Test + public void testAddContigLineExactDuplicate() { + final VCFMetaDataLines md = new VCFMetaDataLines(); + final Set contigLines = new LinkedHashSet<>(); + + final VCFContigHeaderLine vcfContigLine1 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig1"); + }}, 0); + final VCFContigHeaderLine vcfContigLine2 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig2"); + }}, 1); + + contigLines.add(vcfContigLine1); + contigLines.add(vcfContigLine2); + md.addMetaDataLines(contigLines); + Assert.assertEquals(md.getContigLines(), contigLines); + + // add in the duplicate line + md.addMetaDataLine(vcfContigLine1); + Assert.assertEquals(md.getContigLines(), contigLines); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddContigLineConflicting() { + final VCFMetaDataLines md = new VCFMetaDataLines(); + + final Set contigLines = new LinkedHashSet<>(); + contigLines.add(new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig1"); + }}, 0)); + contigLines.add(new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig2"); + }}, 1)); + + md.addMetaDataLines(contigLines); + Assert.assertEquals(md.getContigLines(), contigLines); + + // try to add a contg line with a duplicate index, but with a different name than the existing line with that index + md.addMetaDataLine(new VCFContigHeaderLine( + new 
LinkedHashMap() {{ + put("ID", "contig3"); + }}, 0)); + } + + @Test + public void testRemoveAndReplaceContigLines() { + final VCFMetaDataLines md = new VCFMetaDataLines(); + final Set contigLines = new LinkedHashSet<>(); + + final VCFContigHeaderLine vcfContigLine1 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig1"); + }}, 1); + final VCFContigHeaderLine vcfContigLine2 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig2"); + }}, 2); + + contigLines.add(vcfContigLine1); + contigLines.add(vcfContigLine2); + md.addMetaDataLines(contigLines); + Assert.assertEquals(md.getContigLines(), contigLines); + + //make sure the initial contig index order is honored; it happens to be the same as the input + // order a this point, but check anyway + final List sortedLines1 = md.getContigLines(); + Assert.assertEquals(sortedLines1.get(0), vcfContigLine1); + Assert.assertEquals(sortedLines1.get(1), vcfContigLine2); + + // now remove the first contig line; only one should remain + final VCFHeaderLine removedContigLine = md.removeMetaDataLine(vcfContigLine1); + Assert.assertEquals(removedContigLine, vcfContigLine1); + final List sortedContigHeaderLines = md.getContigLines(); + Assert.assertEquals(sortedContigHeaderLines.size(), 1); + + // now add the first line back in, so the input order is different than the sorted order, + // and make sure the order is honored + md.addMetaDataLine(vcfContigLine1); + final List sortedLines2 = md.getContigLines(); + Assert.assertEquals(sortedLines2.get(0), vcfContigLine1); + Assert.assertEquals(sortedLines2.get(1), vcfContigLine2); + + // now add in ANOTHER contig line at the end that has an index that puts it BEFORE the existing lines + final VCFContigHeaderLine vcfContigLine3 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig3"); + }}, 0); + md.addMetaDataLine(vcfContigLine3); + final List sortedLines3 = md.getContigLines(); + Assert.assertEquals(sortedLines3.size(), 3); + 
Assert.assertEquals(sortedLines3.get(0), vcfContigLine3); + Assert.assertEquals(sortedLines3.get(1), vcfContigLine1); + Assert.assertEquals(sortedLines3.get(2), vcfContigLine2); + } + + @Test + public void testFileFormatLineFirstInSet() { + final Set orderedLineSet = new LinkedHashSet<>(); + orderedLineSet.addAll(VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString()); + orderedLineSet.stream().forEach(l -> Assert.assertFalse(VCFHeaderVersion.isFormatString(l.getKey()))); + // add the file format line last + orderedLineSet.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + metaDataLines.addMetaDataLines(orderedLineSet); + + final Collection inputOrderLines = metaDataLines.getMetaDataInInputOrder(); + final Optional optFirstInputOrderLine = inputOrderLines.stream().findFirst(); + Assert.assertTrue(optFirstInputOrderLine.isPresent()); + Assert.assertTrue(VCFHeaderVersion.isFormatString(optFirstInputOrderLine.get().getKey())); + + final Collection sortedOrderLines = metaDataLines.getMetaDataInInputOrder(); + final Optional optFirstSortedOrderLine = sortedOrderLines.stream().findFirst(); + Assert.assertTrue(optFirstSortedOrderLine.isPresent()); + Assert.assertTrue(VCFHeaderVersion.isFormatString(optFirstSortedOrderLine.get().getKey())); + } + +} + diff --git a/src/test/java/htsjdk/variant/vcf/VCFMetaHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFMetaHeaderLineUnitTest.java new file mode 100644 index 0000000000..518f6a6928 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFMetaHeaderLineUnitTest.java @@ -0,0 +1,44 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFMetaHeaderLineUnitTest extends HtsjdkTest { + + @DataProvider(name = "allowedVCFVersions") + public Object[][] 
allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + }; + } + + private static final String META_STRING = ""; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + final VCFMetaHeaderLine vcfLine = new VCFMetaHeaderLine(META_STRING, vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFMetaHeaderLine(META_STRING, vcfAllowedVersion); + } + + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFPedigreeHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFPedigreeHeaderLineUnitTest.java new file mode 100644 index 0000000000..43179c6862 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFPedigreeHeaderLineUnitTest.java @@ -0,0 +1,50 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFPedigreeHeaderLineUnitTest extends HtsjdkTest { + + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF3_2}, + 
{VCFHeaderVersion.VCF3_3}, + }; + } + + private static final String PEDIGREE_STRING_4_2 = "PEDIGREE="; + private static final String PEDIGREE_STRING_4_3 = "PEDIGREE="; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + final VCFPedigreeHeaderLine vcfLine = new VCFPedigreeHeaderLine( + vcfAllowedVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3) ? + PEDIGREE_STRING_4_3 : + PEDIGREE_STRING_4_2, + vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFPedigreeHeaderLine(PEDIGREE_STRING_4_2, vcfAllowedVersion); + } + + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFSampleHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFSampleHeaderLineUnitTest.java new file mode 100644 index 0000000000..355827e27b --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFSampleHeaderLineUnitTest.java @@ -0,0 +1,43 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFSampleHeaderLineUnitTest extends HtsjdkTest { + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + }; + } + + private static final String SAMPLE_STRING = "SAMPLE="; + + @Test(dataProvider="allowedVCFVersions") + 
public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + final VCFSampleHeaderLine vcfLine = new VCFSampleHeaderLine(SAMPLE_STRING, vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFSampleHeaderLine(SAMPLE_STRING, vcfAllowedVersion); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFSimpleHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFSimpleHeaderLineUnitTest.java new file mode 100644 index 0000000000..c9f8841d3d --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFSimpleHeaderLineUnitTest.java @@ -0,0 +1,151 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import java.util.LinkedHashMap; + +public class VCFSimpleHeaderLineUnitTest extends HtsjdkTest { + + private VCFSimpleHeaderLine getStructuredHeaderLine() { + return new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put("attr1", "value1"); + put("attr2", "value2"); + }} + ); + } + + @Test + public void testConstructorFromStrings() { + final VCFSimpleHeaderLine hl = new VCFSimpleHeaderLine("testKey", "testId", "test description"); + Assert.assertEquals("testKey", hl.getKey()); + Assert.assertEquals("testId", hl.getID()); + Assert.assertEquals("test description", hl.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + Assert.assertEquals("testKey=", hl.toStringEncoding()); + } + + @Test + public void testConstructorFromEncodedLine() { + final VCFSimpleHeaderLine hLine = new VCFSimpleHeaderLine("key", "", VCFHeader.DEFAULT_VCF_VERSION); + 
Assert.assertEquals(hLine.getKey(), "key"); + Assert.assertEquals(hLine.getID(), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("ID"), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("attr1"), "value1"); + } + + @Test + public void testConstructorFromAttributeMap() { + final VCFSimpleHeaderLine hLine = new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put("attr1", "value1"); + put("attr2", "value2"); + }}); + + Assert.assertEquals(hLine.getKey(), "key"); + Assert.assertEquals(hLine.getID(), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("ID"), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("attr1"), "value1"); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectIdMissingFromEncodedLine() { + new VCFSimpleHeaderLine("key", "", VCFHeader.DEFAULT_VCF_VERSION); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectIdMissingFromAttributeMap() { + new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("attr1", "value1"); + put("attr2", "value2"); + }}); + } + + @DataProvider(name = "violateIDRequirements") + public Object[][] getViolateIDRequirements() { + return new Object[][]{ + {""}, + {""}, + {""}, + {""} + }; + } + + @Test(dataProvider="violateIDRequirements",expectedExceptions=TribbleException.class) + public void testViolateIDRequirements(final String headerLine) { + new VCFSimpleHeaderLine("key", headerLine, VCFHeader.DEFAULT_VCF_VERSION); + } + + @Test + public void testGetID() { + Assert.assertEquals(getStructuredHeaderLine().getID(), "id"); + } + + @Test + public void testIsIDLine() { + Assert.assertTrue(getStructuredHeaderLine().isIDHeaderLine()); + } + + @Test + public void testGetGenericFieldValue() { + Assert.assertEquals(getStructuredHeaderLine().getGenericFieldValue("attr1"), "value1"); + } + + @Test + public void testStringEncoding() { + final VCFSimpleHeaderLine structuredHL = getStructuredHeaderLine(); + 
Assert.assertEquals(structuredHL.toStringEncoding(),"key="); + } + + @Test + public void testUnescapedQuotedStringEncoding() { + final VCFSimpleHeaderLine unescapedHeaderLine = new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE, + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); + put(VCFSimpleHeaderLine.SOURCE_ATTRIBUTE, + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); + }} + ); + + final String encodedAttributes = unescapedHeaderLine.toStringEncoding(); + Assert.assertNotNull(encodedAttributes); + + final String expectedEncoding = "key="; + Assert.assertEquals(encodedAttributes, expectedEncoding); + } + + @Test + public void testEscapedQuotedStringEncoding() { + // test Source and Version attributes + final VCFSimpleHeaderLine unescapedHeaderLine = new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE, + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || ANNOTATION <= 2.0]"); + put(VCFSimpleHeaderLine.SOURCE_ATTRIBUTE, + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || ANNOTATION <= 2.0]"); + }} + ); + + final String encodedAttributes = unescapedHeaderLine.toStringEncoding(); + Assert.assertNotNull(encodedAttributes); + + final String expectedEncoding = "key="; + Assert.assertEquals(encodedAttributes, expectedEncoding); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java index c9efaa59ef..c17360a770 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java @@ -31,8 +31,9 @@ import org.testng.annotations.Test; import java.util.ArrayList; -import java.util.Collections; +import 
java.util.LinkedHashSet; import java.util.List; +import java.util.Set; /** * Created by IntelliJ IDEA. @@ -188,7 +189,13 @@ public Object[][] makeRepairHeaderTest() { @Test(dataProvider = "RepairHeaderTest") public void testRepairHeaderTest(final RepairHeaderTest cfg) { - final VCFHeader toRepair = new VCFHeader(Collections.singleton((VCFHeaderLine)cfg.original)); + final Set headerLines = new LinkedHashSet<>(); + // The standard header line repair facility is not sufficiently powerful to fix broken lines + // starting from version 4.3, so it is only used for versions <= 4.2, and we use version 4.2 for this test + headerLines.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); + headerLines.add(cfg.original); + + final VCFHeader toRepair = new VCFHeader(headerLines); final VCFHeader repaired = VCFStandardHeaderLines.repairStandardHeaderLines(toRepair); VCFCompoundHeaderLine repairedLine = (VCFCompoundHeaderLine)repaired.getFormatHeaderLine(cfg.original.getID()); diff --git a/src/test/java/htsjdk/variant/vcf/VCFTextTransformerTest.java b/src/test/java/htsjdk/variant/vcf/VCFTextTransformerTest.java index 8bb9927de0..0f7d9f5963 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFTextTransformerTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFTextTransformerTest.java @@ -1,56 +1,75 @@ package htsjdk.variant.vcf; import htsjdk.HtsjdkTest; -import htsjdk.tribble.TribbleException; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.Arrays; +import java.util.stream.Stream; + public class VCFTextTransformerTest extends HtsjdkTest { - @DataProvider(name="validPercentEncodings") + @DataProvider(name = "validPercentEncodings") public Object[][] validPercentEncodings() { - return new Object[][] { - { "", ""}, - { "%3A", ":"}, - { "%3B", ";"}, - { "%3D", "="}, - { "%25", "%"}, - { "%2C", ","}, - { "%0D", "\r"}, - { "%0A", "\n"}, - { "%09", "\t"}, - { "%3AA", ":A"}, - { "abc%3A", "abc:"}, - { 
"%3Aabc", ":abc"}, - { "%3Aabc%3A", ":abc:"}, - - // valid text containing % encodings that are not valid, and are passed through in raw form (no decoding) - { "%3", "%3"}, - { "%d", "%d"}, - { "%a", "%a"}, - { "abcdefg%", "abcdefg%"}, - { "%3Aabcdefg%", ":abcdefg%"}, - { "abcdefg%0", "abcdefg%0"}, - { "abcdefg%1", "abcdefg%1"}, - { "abcdefg%a", "abcdefg%a"}, - { "abcdefg%d", "abcdefg%d"}, - { "abcdefg%g", "abcdefg%g"}, - { "abcdefg%gg", "abcdefg%gg"}, - { "abcdefg%-1", "abcdefg%-1"}, + return new Object[][]{ + {"", ""}, + {"%3A", ":"}, + {"%3B", ";"}, + {"%3D", "="}, + {"%25", "%"}, + {"%2C", ","}, + {"%0D", "\r"}, + {"%0A", "\n"}, + {"%09", "\t"}, + {"%3AA", ":A"}, + {"abc%3A", "abc:"}, + {"%3Aabc", ":abc"}, + {"%3Aabc%3A", ":abc:"}, }; } - @Test(dataProvider="validPercentEncodings") - public void testDecodeValidEncodings(final String rawText, final String decodedText) { + @DataProvider(name = "truncatedPercentEncodings") + public Object[][] truncatedPercentEncodings() { + return new Object[][]{ + // valid text containing % encodings that are not valid, and are passed through in raw form (no decoding) + {"%3", "%3"}, + {"%d", "%d"}, + {"%a", "%a"}, + {"abcdefg%", "abcdefg%"}, + {"%3Aabcdefg%", ":abcdefg%"}, + {"abcdefg%0", "abcdefg%0"}, + {"abcdefg%1", "abcdefg%1"}, + {"abcdefg%a", "abcdefg%a"}, + {"abcdefg%d", "abcdefg%d"}, + {"abcdefg%g", "abcdefg%g"}, + {"abcdefg%gg", "abcdefg%gg"}, + {"abcdefg%-1", "abcdefg%-1"}, + }; + } + + @DataProvider(name = "allPercentEncodings") + public Object[][] allPercentEncodings() { + return Stream.concat(Arrays.stream(validPercentEncodings()), Arrays.stream(truncatedPercentEncodings())) + .toArray(Object[][]::new); + } + + @Test(dataProvider = "allPercentEncodings") + public void testDecodeValidEncodings(final String encodedText, final String decodedText) { final VCFTextTransformer vcfTextTransformer = new VCFPercentEncodedTextTransformer(); - Assert.assertEquals(vcfTextTransformer.decodeText(rawText), decodedText); + 
Assert.assertEquals(vcfTextTransformer.decodeText(encodedText), decodedText); } - @Test(dataProvider = "validPercentEncodings") - public void testPassThruValidEncodings(final String rawText, final String unused) { + @Test(dataProvider = "allPercentEncodings") + public void testPassThruValidEncodings(final String encodedText, final String unused) { final VCFPassThruTextTransformer vcfPassThruTransformer = new VCFPassThruTextTransformer(); - Assert.assertEquals(vcfPassThruTransformer.decodeText(rawText), rawText); + Assert.assertEquals(vcfPassThruTransformer.decodeText(encodedText), encodedText); } + @Test(dataProvider = "validPercentEncodings") + public void testInverseComposition(final String encodedText, final String decodedText) { + final VCFTextTransformer vcfTextTransformer = new VCFPercentEncodedTextTransformer(); + Assert.assertEquals(vcfTextTransformer.encodeText(vcfTextTransformer.decodeText(encodedText)), encodedText); + Assert.assertEquals(vcfTextTransformer.decodeText(vcfTextTransformer.encodeText(decodedText)), decodedText); + } } diff --git a/src/test/java/htsjdk/variant/vcf/VCFUtilsTest.java b/src/test/java/htsjdk/variant/vcf/VCFUtilsTest.java index ed943feac1..5629798c61 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFUtilsTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFUtilsTest.java @@ -1,6 +1,7 @@ package htsjdk.variant.vcf; import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -11,45 +12,55 @@ public class VCFUtilsTest extends HtsjdkTest { @DataProvider(name="validHeaderVersionMerger") public Object[][] validHeaderMergerVersions() { - // v4.3 can only merge with v4.3, all other version mergers are allowed + + // header version must be at least v4.2 to merge, result is always highest version return new Object[][] { - {Arrays.asList("VCFv4.0", "VCFv4.0")}, - {Arrays.asList("VCFv4.1", "VCFv4.1")}, - {Arrays.asList("VCFv4.2", 
"VCFv4.2")}, - {Arrays.asList("VCFv4.3", "VCFv4.3")}, - {Arrays.asList("VCFv4.2", "VCFv4.2")}, - {Arrays.asList("VCFv4.2", "VCFv4.2", "VCFv4.2")}, + // headers to merge, expected result version + {Arrays.asList("VCFv4.2", "VCFv4.2"), VCFHeaderVersion.VCF4_2}, + {Arrays.asList("VCFv4.3", "VCFv4.3"), VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.2", "VCFv4.3"), VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.3", "VCFv4.2"), VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.2", "VCFv4.2"), VCFHeaderVersion.VCF4_2 }, + {Arrays.asList("VCFv4.2", "VCFv4.2", "VCFv4.3"), VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.3", "VCFv4.3", "VCFv4.2"), VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.3", "VCFv4.2", "VCFv4.3"), VCFHeaderVersion.VCF4_3}, }; } @DataProvider(name="invalidHeaderVersionMerger") public Object[][] invalidHeaderVersionMerger() { - // v4.3 can only merge with v4.3, all other version mergers are allowed + // header version must be at least v4.2 to merge return new Object[][] { - {Arrays.asList("VCFv4.0", "VCFv4.3")}, - {Arrays.asList("VCFv4.1", "VCFv4.3")}, - {Arrays.asList("VCFv4.2", "VCFv4.3")}, - {Arrays.asList("VCFv4.0", "VCFv4.0", "VCFv4.2", "VCFv4.3")}, - {Arrays.asList("VCFv4.3", "VCFv4.0", "VCFv4.1", "VCFv4.2")}, + {Arrays.asList("VCFv4.0", "VCFv4.2")}, + {Arrays.asList("VCFv4.1", "VCFv4.2")}, + {Arrays.asList("VCFv4.0", "VCFv4.1", "VCFv4.2", "VCFv4.3")}, + {Arrays.asList("VCFv4.3", "VCFv4.2", "VCFv4.1", "VCFv4.0")}, }; } @Test(dataProvider="validHeaderVersionMerger") - public void testValidHeaderVersionMerger(final List headerVersions) { - final List headersToMerge = new ArrayList<>(headerVersions.size()); - headerVersions.forEach(hv -> headersToMerge.add( - new VCFHeader(VCFHeaderVersion.toHeaderVersion(hv), Collections.emptySet(), Collections.emptySet())) - ); - final Set resultHeaders = VCFUtils.smartMergeHeaders(headersToMerge, true); + public void testValidHeaderVersionMerger(final List headerVersions, final VCFHeaderVersion 
expectedVersion) { + final Set mergedHeaderLines = doHeaderMerge(headerVersions); + + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + metaDataLines.addMetaDataLines(mergedHeaderLines); + final VCFHeaderLine versionLine = metaDataLines.getFileFormatLine(); + Assert.assertEquals(VCFHeaderVersion.toHeaderVersion(versionLine.getValue()), expectedVersion); } - @Test(dataProvider="invalidHeaderVersionMerger", expectedExceptions = IllegalArgumentException.class) + @Test(dataProvider="invalidHeaderVersionMerger", expectedExceptions = TribbleException.class) public void testInvalidHeaderVersionMerger(final List headerVersions) { + doHeaderMerge(headerVersions); + } + + private Set doHeaderMerge(final List headerVersions) { final List headersToMerge = new ArrayList<>(headerVersions.size()); headerVersions.forEach(hv -> headersToMerge.add( - new VCFHeader(VCFHeaderVersion.toHeaderVersion(hv), Collections.emptySet(), Collections.emptySet())) + new VCFHeader( + VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.toHeaderVersion(hv)), + Collections.emptySet())) ); - VCFUtils.smartMergeHeaders(headersToMerge, true); + return VCFUtils.smartMergeHeaders(headersToMerge, true); } @DataProvider(name = "caseIntolerantDoubles") diff --git a/src/test/resources/htsjdk/variant/HiSeq.10000.vcf b/src/test/resources/htsjdk/variant/HiSeq.10000.vcf index a304ba24da..75c9f9b537 100644 --- a/src/test/resources/htsjdk/variant/HiSeq.10000.vcf +++ b/src/test/resources/htsjdk/variant/HiSeq.10000.vcf @@ -9,7 +9,6 @@ ##FILTER= ##FILTER= ##FILTER= -##FILTER= ##FILTER= ##FORMAT= ##FORMAT= diff --git a/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf b/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf index 9af0cb3e64..097d0b034f 100644 --- a/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf +++ b/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf @@ -9,7 +9,6 @@ ##FILTER= ##FILTER= ##FILTER= -##FILTER= ##FILTER= ##FILTER= ##FILTER= diff --git 
a/src/test/resources/htsjdk/variant/VcfThatLacksAnIndex.bcf b/src/test/resources/htsjdk/variant/VcfThatLacksAnIndex.bcf index 8c84efb399..8f8630d8fb 100644 Binary files a/src/test/resources/htsjdk/variant/VcfThatLacksAnIndex.bcf and b/src/test/resources/htsjdk/variant/VcfThatLacksAnIndex.bcf differ diff --git a/src/test/resources/htsjdk/variant/bcfV22.bcf b/src/test/resources/htsjdk/variant/bcfV22.bcf new file mode 100644 index 0000000000..8ded3b5103 Binary files /dev/null and b/src/test/resources/htsjdk/variant/bcfV22.bcf differ diff --git a/src/test/resources/htsjdk/variant/bcfV22.bcf.gz b/src/test/resources/htsjdk/variant/bcfV22.bcf.gz new file mode 100644 index 0000000000..8ded3b5103 Binary files /dev/null and b/src/test/resources/htsjdk/variant/bcfV22.bcf.gz differ diff --git a/src/test/resources/htsjdk/variant/diagnosis_targets_testfile.vcf b/src/test/resources/htsjdk/variant/diagnosis_targets_testfile.vcf index fbe8d1e405..9f96ce09ed 100644 --- a/src/test/resources/htsjdk/variant/diagnosis_targets_testfile.vcf +++ b/src/test/resources/htsjdk/variant/diagnosis_targets_testfile.vcf @@ -14,7 +14,7 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##INFO= +##INFO= ##INFO= ##INFO= ##contig= diff --git a/src/test/resources/htsjdk/variant/missingStringAndCharacterTest.vcf b/src/test/resources/htsjdk/variant/missingStringAndCharacterTest.vcf new file mode 100644 index 0000000000..eb9b8d0b7e --- /dev/null +++ b/src/test/resources/htsjdk/variant/missingStringAndCharacterTest.vcf @@ -0,0 +1,17 @@ +##fileformat=VCFv4.3 +##contig= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +20 17330 . T A . PASS NS=3;DP=11;AF=0.017;CHAR=.,b,c;STR=. GT:GQ:DP:HQ:CHAR:STR 0|0:49:3:58,50:a,b,c:abc 0|1:3:5:65,3:.,.,c:c 0/0:41:3:4,5:.:. +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;CHAR=.;STR=. 
GT:GQ:DP:HQ:CHAR:STR 1|2:21:6:23,27:.:a 2|1:2:0:18,2:.:ab 2/2:35:4:10,20:.:abc diff --git a/src/test/resources/htsjdk/variant/serialization_test.bcf b/src/test/resources/htsjdk/variant/serialization_test.bcf index 8c84efb399..8f8630d8fb 100644 Binary files a/src/test/resources/htsjdk/variant/serialization_test.bcf and b/src/test/resources/htsjdk/variant/serialization_test.bcf differ diff --git a/src/test/resources/htsjdk/variant/serialization_test_unzipped.bcf b/src/test/resources/htsjdk/variant/serialization_test_unzipped.bcf new file mode 100644 index 0000000000..a19ea048f1 Binary files /dev/null and b/src/test/resources/htsjdk/variant/serialization_test_unzipped.bcf differ diff --git a/src/test/resources/htsjdk/variant/structuralvariants.vcf b/src/test/resources/htsjdk/variant/structuralvariants.vcf index 5ffad2f94c..4de882ea49 100644 --- a/src/test/resources/htsjdk/variant/structuralvariants.vcf +++ b/src/test/resources/htsjdk/variant/structuralvariants.vcf @@ -7,7 +7,7 @@ ##INFO= ##INFO= ##INFO= -##INFO= +##INFO= ##ALT= ##ALT= ##ALT= diff --git a/src/test/resources/htsjdk/variant/test1.vcf b/src/test/resources/htsjdk/variant/test1.vcf index 39bed22e75..55566f3365 100644 --- a/src/test/resources/htsjdk/variant/test1.vcf +++ b/src/test/resources/htsjdk/variant/test1.vcf @@ -48,6 +48,6 @@ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 1 8216712 rs11121115 A G 1540.26 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=0.917;DB;DP=131;Dels=0.00;FS=11.67;HaplotypeScore=3.35;MLEAC=3;MLEAF=0.500;MQ=57.74;MQ0=1;MQRankSum=0.427;QD=11.76;ReadPosRankSum=-2.190e-01;SB=-9.390e+02;VQSLOD=5.53;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:23,28:51:99:681,0,668:127 0/1:16,18:34:99:338,0,244:127 0/1:24,22:46:99:560,0,323:127 1 17032814 rs2773183 T C 2828.26 PASS 
AC=3;AF=0.500;AN=6;BaseQRankSum=-3.879e+00;DB;DP=322;Dels=0.00;FS=2.43;HaplotypeScore=15.45;MLEAC=3;MLEAF=0.500;MQ=56.86;MQ0=0;MQRankSum=2.92;QD=8.78;ReadPosRankSum=-1.245e+00;SB=-1.943e+03;VQSLOD=-1.421e+00;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:63,59:122:99:1434,0,1831:127 0/1:53,56:109:99:910,0,871:127 0/1:61,30:91:99:523,0,1257:127 -1 17032818 rs2773183 T C 2828.26 FILTER AC=3;AF=0.500;AN=6;BaseQRankSum=-3.879e+00;DB;DP=322;Dels=0.00;FS=2.43;HaplotypeScore=15.45;MLEAC=3;MLEAF=0.500;MQ=56.86;MQ0=0;MQRankSum=2.92;QD=8.78;ReadPosRankSum=-1.245e+00;SB=-1.943e+03;VQSLOD=-1.421e+00;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:63,59:122:99:1434,0,1831:127 0/1:53,56:109:99:910,0,871:127 0/1:61,30:91:99:523,0,1257:127 +1 17032818 rs2773183 T C 2828.26 LowQual AC=3;AF=0.500;AN=6;BaseQRankSum=-3.879e+00;DB;DP=322;Dels=0.00;FS=2.43;HaplotypeScore=15.45;MLEAC=3;MLEAF=0.500;MQ=56.86;MQ0=0;MQRankSum=2.92;QD=8.78;ReadPosRankSum=-1.245e+00;SB=-1.943e+03;VQSLOD=-1.421e+00;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:63,59:122:99:1434,0,1831:127 0/1:53,56:109:99:910,0,871:127 0/1:61,30:91:99:523,0,1257:127 2 1143476 rs4998209 C T 1483.26 PASS AC=2;AF=0.333;AN=6;BaseQRankSum=-4.814e+00;DB;DP=189;Dels=0.00;FS=5.61;HaplotypeScore=0.324;MLEAC=2;MLEAF=0.333;MQ=58.36;MQ0=0;MQRankSum=1.58;QD=12.06;ReadPosRankSum=0.326;SB=-9.320e+02;VQSLOD=6.81;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0|0:66,0:66:99:0,178,2264:127 0|1:33,38:71:99:844,0,1024:127 0|1:26,26:52:99:678,0,719:127 2 9240279 rs56249990 A G 3978.01 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=1.70;DB;DP=213;Dels=0.00;FS=7.83;HaplotypeScore=1.19;MLEAC=3;MLEAF=0.500;MQ=59.40;MQ0=0;MQRankSum=0.143;QD=27.25;ReadPosRankSum=-9.700e-02;SB=-1.991e+03;VQSLOD=9.14;culprit=FS GT:AD:DP:GQ:PL:TP 0|1:33,42:75:99:1400,0,1031:127 0|0:67,0:67:99:0,178,2277:127 1|1:0,71:71:99:2578,199,0:127 diff --git a/src/test/resources/htsjdk/variant/test2.vcf b/src/test/resources/htsjdk/variant/test2.vcf index 39bed22e75..55566f3365 100644 --- 
a/src/test/resources/htsjdk/variant/test2.vcf +++ b/src/test/resources/htsjdk/variant/test2.vcf @@ -48,6 +48,6 @@ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 1 8216712 rs11121115 A G 1540.26 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=0.917;DB;DP=131;Dels=0.00;FS=11.67;HaplotypeScore=3.35;MLEAC=3;MLEAF=0.500;MQ=57.74;MQ0=1;MQRankSum=0.427;QD=11.76;ReadPosRankSum=-2.190e-01;SB=-9.390e+02;VQSLOD=5.53;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:23,28:51:99:681,0,668:127 0/1:16,18:34:99:338,0,244:127 0/1:24,22:46:99:560,0,323:127 1 17032814 rs2773183 T C 2828.26 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=-3.879e+00;DB;DP=322;Dels=0.00;FS=2.43;HaplotypeScore=15.45;MLEAC=3;MLEAF=0.500;MQ=56.86;MQ0=0;MQRankSum=2.92;QD=8.78;ReadPosRankSum=-1.245e+00;SB=-1.943e+03;VQSLOD=-1.421e+00;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:63,59:122:99:1434,0,1831:127 0/1:53,56:109:99:910,0,871:127 0/1:61,30:91:99:523,0,1257:127 -1 17032818 rs2773183 T C 2828.26 FILTER AC=3;AF=0.500;AN=6;BaseQRankSum=-3.879e+00;DB;DP=322;Dels=0.00;FS=2.43;HaplotypeScore=15.45;MLEAC=3;MLEAF=0.500;MQ=56.86;MQ0=0;MQRankSum=2.92;QD=8.78;ReadPosRankSum=-1.245e+00;SB=-1.943e+03;VQSLOD=-1.421e+00;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:63,59:122:99:1434,0,1831:127 0/1:53,56:109:99:910,0,871:127 0/1:61,30:91:99:523,0,1257:127 +1 17032818 rs2773183 T C 2828.26 LowQual AC=3;AF=0.500;AN=6;BaseQRankSum=-3.879e+00;DB;DP=322;Dels=0.00;FS=2.43;HaplotypeScore=15.45;MLEAC=3;MLEAF=0.500;MQ=56.86;MQ0=0;MQRankSum=2.92;QD=8.78;ReadPosRankSum=-1.245e+00;SB=-1.943e+03;VQSLOD=-1.421e+00;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 0/1:63,59:122:99:1434,0,1831:127 0/1:53,56:109:99:910,0,871:127 0/1:61,30:91:99:523,0,1257:127 2 1143476 rs4998209 C T 1483.26 PASS AC=2;AF=0.333;AN=6;BaseQRankSum=-4.814e+00;DB;DP=189;Dels=0.00;FS=5.61;HaplotypeScore=0.324;MLEAC=2;MLEAF=0.333;MQ=58.36;MQ0=0;MQRankSum=1.58;QD=12.06;ReadPosRankSum=0.326;SB=-9.320e+02;VQSLOD=6.81;culprit=HaplotypeScore GT:AD:DP:GQ:PL:TP 
0|0:66,0:66:99:0,178,2264:127 0|1:33,38:71:99:844,0,1024:127 0|1:26,26:52:99:678,0,719:127 2 9240279 rs56249990 A G 3978.01 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=1.70;DB;DP=213;Dels=0.00;FS=7.83;HaplotypeScore=1.19;MLEAC=3;MLEAF=0.500;MQ=59.40;MQ0=0;MQRankSum=0.143;QD=27.25;ReadPosRankSum=-9.700e-02;SB=-1.991e+03;VQSLOD=9.14;culprit=FS GT:AD:DP:GQ:PL:TP 0|1:33,42:75:99:1400,0,1031:127 0|0:67,0:67:99:0,178,2277:127 1|1:0,71:71:99:2578,199,0:127 diff --git a/src/test/resources/htsjdk/variant/vcf43/42AutomaticallyConvertible.vcf b/src/test/resources/htsjdk/variant/vcf43/42AutomaticallyConvertible.vcf new file mode 100644 index 0000000000..1d248d2ae9 --- /dev/null +++ b/src/test/resources/htsjdk/variant/vcf43/42AutomaticallyConvertible.vcf @@ -0,0 +1,90 @@ +##fileformat=VCFv4.2 +##COMMENT=This file has 0 embedded UTF8 characters, but we need this fake comment line to keep the file aligned with it's utf8 companion test file. +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##SAMPLE= +##SAMPLE= +##SAMPLE= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 NA19239 NA19240 +1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=fil%3AteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +2 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +3 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +5 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +6 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +7 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +8 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +9 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +10 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +11 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +12 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +13 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +14 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +15 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +16 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +17 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +18 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +19 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +20 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +21 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +22 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +X 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +Y 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 diff --git a/src/test/resources/htsjdk/variant/vcf43/42Pedigree.vcf b/src/test/resources/htsjdk/variant/vcf43/42Pedigree.vcf new file mode 100644 index 0000000000..c9689a922c --- /dev/null +++ b/src/test/resources/htsjdk/variant/vcf43/42Pedigree.vcf @@ -0,0 +1,91 @@ +##fileformat=VCFv4.2 +##COMMENT=This file has 0 embedded UTF8 characters, but we need this fake comment line to keep the file aligned with it's utf8 companion test file. 
+##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##PEDIGREE= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##SAMPLE= +##SAMPLE= +##SAMPLE= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 NA19239 NA19240 +1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=fil%3AteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +2 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +3 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +5 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +6 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +7 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +8 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +9 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +10 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +11 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +12 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +13 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +14 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +15 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +16 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +17 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +18 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +19 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +20 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +21 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +22 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +X 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +Y 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 diff --git a/src/test/resources/htsjdk/variant/vcf43/invalid43ContigName.vcf b/src/test/resources/htsjdk/variant/vcf43/invalid43ContigName.vcf new file mode 100644 index 0000000000..a9aac29ed0 --- /dev/null +++ b/src/test/resources/htsjdk/variant/vcf43/invalid43ContigName.vcf @@ -0,0 +1,90 @@ +##fileformat=VCFv4.2 +##COMMENT=This file has 0 embedded UTF8 characters, but we need this fake comment line to keep the file aligned with it's utf8 companion test file. +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##SAMPLE= +##SAMPLE= +##SAMPLE= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 NA19239 NA19240 +1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=fil%3AteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +2 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +3 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +5 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +6 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +7 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +8 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +9 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +10 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +11 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +12 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +13 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +14 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +15 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +16 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +17 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +18 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +19 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +20 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +21 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +22 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +X 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +Y 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 diff --git a/src/test/resources/htsjdk/variant/vcf43/valid43ContigName.vcf b/src/test/resources/htsjdk/variant/vcf43/valid43ContigName.vcf new file mode 100644 index 0000000000..e2c2945beb --- /dev/null +++ b/src/test/resources/htsjdk/variant/vcf43/valid43ContigName.vcf @@ -0,0 +1,90 @@ +##fileformat=VCFv4.2 +##COMMENT=This file has 0 embedded UTF8 characters, but we need this fake comment line to keep the file aligned with it's utf8 companion test file. +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##SAMPLE= +##SAMPLE= +##SAMPLE= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 NA19239 NA19240 +1 327 . 
T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=fil%3AteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +2 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +3 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +4 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +5 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +6 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +7 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +8 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +9 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +10 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +11 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +12 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +13 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +14 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +15 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +16 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +17 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +18 327 . 
T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +19 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +20 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +21 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +22 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +X 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 +Y 327 . T C 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredInBoth GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99