diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java index 86ffa6c9f7..3be179f784 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java @@ -86,6 +86,8 @@ public void setSequences(final List list) { if (mSequenceMap.put(record.getSequenceName(), record) != null) { throw new IllegalArgumentException("Cannot add sequence that already exists in SAMSequenceDictionary: " + record.getSequenceName()); + } else { + record.getAlternativeSequenceNames().forEach(an -> addSequenceAlias(record.getSequenceName(), an)); } } } @@ -98,6 +100,7 @@ public void addSequence(final SAMSequenceRecord sequenceRecord) { sequenceRecord.setSequenceIndex(mSequences.size()); mSequences.add(sequenceRecord); mSequenceMap.put(sequenceRecord.getSequenceName(), sequenceRecord); + sequenceRecord.getAlternativeSequenceNames().forEach(an -> addSequenceAlias(sequenceRecord.getSequenceName(), an)); } /** @@ -203,7 +206,11 @@ public boolean isSameDictionary(final SAMSequenceDictionary that) { return !thatSequences.hasNext(); } - /** returns true if the two dictionaries are the same, aliases are NOT considered */ + /** + * Returns {@code true} if the two dictionaries are the same. + * + *

NOTE: Aliases are NOT considered, but alternative sequence names (AN tag) names ARE. + */ @Override public boolean equals(Object o) { if (this == o) return true; @@ -220,10 +227,11 @@ public boolean equals(Object o) { * 1,chr1,chr01,01,CM000663,NC_000001.10 e.g: * MT,chrM * - * @param originalName - * existing contig name - * @param altName - * new contig name + *

NOTE: this method does not add the alias to the alternative sequence name tag (AN) in the SAMSequenceRecord. + * If you would like to add it to the AN tag, use {@link #addAlternativeSequenceName(String, String)} instead. + * + * @param originalName existing contig name + * @param altName new contig name * @return the contig associated to the 'originalName/altName' */ public SAMSequenceRecord addSequenceAlias(final String originalName, @@ -246,6 +254,25 @@ public SAMSequenceRecord addSequenceAlias(final String originalName, return originalSeqRecord; } + /** + * Add an alternative sequence name (AN tag) to a SAMSequenceRecord, including it into the aliases + * to retrieve the contigs (as with {@link #addSequenceAlias(String, String)}. + * + *

This can be use to provide some alternate names fo a given contig. e.g: + * 1,chr1,chr01,01,CM000663 or + * MT,chrM. + * + * @param originalName existing contig name + * @param altName new contig name + * @return the contig associated to the 'originalName/altName', with the AN tag including the altName + */ + public SAMSequenceRecord addAlternativeSequenceName(final String originalName, + final String altName) { + final SAMSequenceRecord record = addSequenceAlias(originalName, altName); + record.addAlternativeSequenceName(altName); + return record; + } + /** * return a MD5 sum for ths dictionary, the checksum is re-computed each * time this method is called. diff --git a/src/main/java/htsjdk/samtools/SAMSequenceRecord.java b/src/main/java/htsjdk/samtools/SAMSequenceRecord.java index a4b4df2369..8330a675bf 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceRecord.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceRecord.java @@ -23,14 +23,22 @@ */ package htsjdk.samtools; +import htsjdk.samtools.util.StringUtil; + import javax.xml.bind.annotation.XmlAttribute; import javax.xml.bind.annotation.XmlRootElement; +import javax.xml.bind.annotation.XmlTransient; import javax.xml.bind.annotation.XmlValue; import java.math.BigInteger; import java.net.URI; import java.net.URISyntaxException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; @@ -46,6 +54,7 @@ public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Clonea private int mSequenceIndex = -1; private int mSequenceLength = 0; public static final String SEQUENCE_NAME_TAG = "SN"; + public static final String ALTERNATIVE_SEQUENCE_NAME_TAG = "AN"; public static final String SEQUENCE_LENGTH_TAG = "LN"; public static final String MD5_TAG = "M5"; public static final String ASSEMBLY_TAG = "AS"; @@ -67,7 +76,7 @@ public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Clonea * The standard tags are stored in text header without type information, because the type of these tags is known. */ public static final Set STANDARD_TAGS = - new HashSet(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, MD5_TAG, URI_TAG, + new HashSet(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, ALTERNATIVE_SEQUENCE_NAME_TAG, MD5_TAG, URI_TAG, SPECIES_TAG)); // Split on any whitespace @@ -75,6 +84,10 @@ public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Clonea // These are the chars matched by \\s. private static char[] WHITESPACE_CHARS = {' ', '\t', '\n', '\013', '\f', '\r'}; // \013 is vertical tab + // alternative sequence name regexp + private static final Pattern ALTERNATIVE_SEQUENCE_NAME_REGEXP = Pattern.compile("[0-9A-Za-z][0-9A-Za-z*+.@_|-]*"); + private static final String ALTERNATIVE_SEQUENCE_NAME_SEPARATOR = ","; + /** a (private) empty constructor is required for JAXB.XML-serialisation */ @SuppressWarnings("unused") private SAMSequenceRecord() { @@ -139,6 +152,49 @@ private void setSequenceName(final String name) { // Private state used only by SAM implementation. public void setSequenceIndex(final int value) { mSequenceIndex = value; } + /** Returns a set with alternative sequence names. Modifications will not affect the record instance. */ + @XmlAttribute(name = "alternative_sequence_names") + public Set getAlternativeSequenceNames() { + final String anTag = getAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG); + return (anTag == null) ? new LinkedHashSet<>() : new LinkedHashSet<>(Arrays.asList(anTag.split(ALTERNATIVE_SEQUENCE_NAME_SEPARATOR))); + } + + /** Adds an alternative sequence name if it is not the same as the sequence name or it is not present already. */ + public void addAlternativeSequenceName(final String name) { + validateAltRegExp(name); + final Set altSequences = getAlternativeSequenceNames(); + if (!mSequenceName.equals(name) && ! altSequences.contains(name) ) { + altSequences.add(name); + } + encodeAltSequences(altSequences); + } + + /** Sets the alternative sequence names in the order provided by iteration, removing the previous values. */ + public void setAlternativeSequenceName(final Collection alternativeSequences) { + if (alternativeSequences == null) { + setAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG, null); + } else { + // validate all the names and encode afterwards + alternativeSequences.forEach(SAMSequenceRecord::validateAltRegExp); + encodeAltSequences(alternativeSequences); + } + } + + private static void validateAltRegExp(final String name) { + if (!ALTERNATIVE_SEQUENCE_NAME_REGEXP.matcher(name).matches()) { + throw new IllegalArgumentException(String.format("Invalid alternative sequence name '%s': do not match the pattern %s", name, ALTERNATIVE_SEQUENCE_NAME_REGEXP)); + } + } + + private void encodeAltSequences(final Collection alternativeSequences) { + setAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG, StringUtil.join(ALTERNATIVE_SEQUENCE_NAME_SEPARATOR, alternativeSequences)); + } + + /** Returns {@code true} if there are alternative sequence names; {@code false} otherwise. */ + public boolean hasAlternativeSequenceNames() { + return getAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG) != null; + } + /** * Looser comparison than equals(). We look only at sequence index, sequence length, and MD5 tag value * (or sequence names, if there is no MD5 tag in either record. @@ -159,7 +215,16 @@ public boolean isSameSequence(final SAMSequenceRecord that) { } } else { - if (mSequenceName != that.mSequenceName) return false; // Compare using == since we intern() the Strings + // Compare using == since we intern() the Strings + if (mSequenceName != that.mSequenceName) { + // if they are different, they could still be the same based on the alternative sequences + if (getAlternativeSequenceNames().contains(that.mSequenceName) || + that.getAlternativeSequenceNames().contains(mSequenceName)) { + return true; + } + return false; + } + } return true; @@ -184,6 +249,7 @@ public boolean equals(final Object o) { if (mSequenceLength != that.mSequenceLength) return false; if (!attributesEqual(that)) return false; if (mSequenceName != that.mSequenceName) return false; // Compare using == since we intern() the name + if (!getAlternativeSequenceNames().equals(that.getAlternativeSequenceNames())) return false; return true; } diff --git a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java index a8e60ed501..c964e95a8b 100644 --- a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java +++ b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java @@ -59,6 +59,24 @@ public void testAliases() { Assert.assertNull(dict.getSequence("chr2")); } + @Test + public void testAlternativeSequences() { + final SAMSequenceRecord ssr1 = new SAMSequenceRecord("1", 1); + final SAMSequenceRecord ssr2 = new SAMSequenceRecord("2", 1); + + final SAMSequenceDictionary dict = new SAMSequenceDictionary( + Arrays.asList(ssr1, ssr2)); + Assert.assertEquals(dict.size(), 2); + dict.addAlternativeSequenceName("1", "chr1"); + dict.addAlternativeSequenceName("1", "01"); + dict.addAlternativeSequenceName("1", "1"); + dict.addAlternativeSequenceName("01", "chr01"); + Assert.assertEquals(dict.size(), 2); + Assert.assertTrue(dict.getSequence("chr1").hasAlternativeSequenceNames()); + Assert.assertEquals(dict.getSequence("1").getAlternativeSequenceNames().size(), 3); + Assert.assertNull(dict.getSequence("chr2")); + } + /** * should be saved as XML * diff --git a/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java b/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java index 89e6121d27..33d0f91506 100644 --- a/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java +++ b/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java @@ -29,6 +29,8 @@ import org.testng.annotations.Test; import java.util.Arrays; +import java.util.Collections; +import java.util.List; /** * Test for SAMReadGroupRecordTest @@ -83,4 +85,72 @@ public void testIsSameSequence(final SAMSequenceRecord rec1 , final SAMSequenceR Assert.assertEquals(rec1.isSameSequence(rec2), isSame); } } + + @Test + public void testAlternativeSequences() { + final SAMSequenceRecord chr1 = new SAMSequenceRecord("1", 100); + + // no AN tag yet + Assert.assertFalse(chr1.hasAlternativeSequenceNames()); + Assert.assertTrue(chr1.getAlternativeSequenceNames().isEmpty()); + + // set to a random alias + chr1.addAlternativeSequenceName("my-chromosome"); + Assert.assertNotEquals(chr1, new SAMSequenceRecord(chr1.getSequenceName(), chr1.getSequenceLength())); + Assert.assertTrue(chr1.hasAlternativeSequenceNames()); + Assert.assertEquals(chr1.getAlternativeSequenceNames(), Collections.singleton("my-chromosome")); + Assert.assertEquals("@SQ\tSN:1\tLN:100\tAN:my-chromosome", chr1.getSAMString()); + + // set to new chromosome aliases (removing previous) + final List chr1AltNames = Arrays.asList("chr1","chr01","01","CM000663"); + chr1.setAlternativeSequenceName(chr1AltNames); + Assert.assertTrue(chr1.hasAlternativeSequenceNames()); + Assert.assertEquals(chr1.getAlternativeSequenceNames(), chr1AltNames); + Assert.assertEquals("@SQ\tSN:1\tLN:100\tAN:chr1,chr01,01,CM000663", chr1.getSAMString()); + } + + @DataProvider + public Object[][] validAlternativeSequences() { + return new Object[][] { + // only characters + {"a"}, + {"alias"}, + // valid symbols after first character ('*', '+', '.', '@', '_', '|', '-') + {"my*contig"}, + {"my+contig"}, + {"my.contig"}, + {"my@contig"}, + {"my_contig"}, + {"my|contig"}, + {"my-contig"} + }; + } + + @Test(dataProvider = "validAlternativeSequences") + public void testValidAlternativeSequences(final String altName) { + final SAMSequenceRecord contig = new SAMSequenceRecord("contig", 100); + // should not throw + contig.setAlternativeSequenceName(Collections.singleton(altName)); + Assert.assertTrue(contig.hasAlternativeSequenceNames()); + Assert.assertEquals(contig.getAlternativeSequenceNames(), Collections.singleton(altName)); + } + + @DataProvider + public Object[][] invalidAlternativeSequences() { + return new Object[][] { + // invalid start + {"@chr1"}, {",chr1"}, + // comma-separated + {"chr1,alt"}, + // coordinate-like + {"chr1:1000"}, {"chr1:100-200"} + }; + } + + @Test(dataProvider = "invalidAlternativeSequences") + public void testInvalidAlternativeSequences(final String altName) { + final SAMSequenceRecord chr1 = new SAMSequenceRecord("1", 100); + Assert.assertThrows(IllegalArgumentException.class, () -> chr1.addAlternativeSequenceName(altName)); + Assert.assertThrows(IllegalArgumentException.class, () -> chr1.setAlternativeSequenceName(Collections.singleton(altName))); + } }