-
Notifications
You must be signed in to change notification settings - Fork 242
Add support for SQ-AN in SAMSequenceDictionary #956
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f3f5996
f4d4bfa
e437a3f
a540558
a8685ee
bfe2206
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -86,6 +86,8 @@ public void setSequences(final List<SAMSequenceRecord> list) { | |
| if (mSequenceMap.put(record.getSequenceName(), record) != null) { | ||
| throw new IllegalArgumentException("Cannot add sequence that already exists in SAMSequenceDictionary: " + | ||
| record.getSequenceName()); | ||
| } else { | ||
| record.getAlternativeSequenceNames().forEach(an -> addSequenceAlias(record.getSequenceName(), an)); | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -98,6 +100,7 @@ public void addSequence(final SAMSequenceRecord sequenceRecord) { | |
| sequenceRecord.setSequenceIndex(mSequences.size()); | ||
| mSequences.add(sequenceRecord); | ||
| mSequenceMap.put(sequenceRecord.getSequenceName(), sequenceRecord); | ||
| sequenceRecord.getAlternativeSequenceNames().forEach(an -> addSequenceAlias(sequenceRecord.getSequenceName(), an)); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -203,7 +206,11 @@ public boolean isSameDictionary(final SAMSequenceDictionary that) { | |
| return !thatSequences.hasNext(); | ||
| } | ||
|
|
||
| /** returns true if the two dictionaries are the same, aliases are NOT considered */ | ||
| /** | ||
| * Returns {@code true} if the two dictionaries are the same. | ||
| * | ||
| * <p>NOTE: Aliases are NOT considered, but alternative sequence names (AN tag) names ARE. | ||
| */ | ||
| @Override | ||
| public boolean equals(Object o) { | ||
| if (this == o) return true; | ||
|
|
@@ -220,10 +227,11 @@ public boolean equals(Object o) { | |
| * <code>1,chr1,chr01,01,CM000663,NC_000001.10</code> e.g: | ||
| * <code>MT,chrM</code> | ||
| * | ||
| * @param originalName | ||
| * existing contig name | ||
| * @param altName | ||
| * new contig name | ||
| * <p>NOTE: this method does not add the alias to the alternative sequence name tag (AN) in the SAMSequenceRecord. | ||
| * If you would like to add it to the AN tag, use {@link #addAlternativeSequenceName(String, String)} instead. | ||
| * | ||
| * @param originalName existing contig name | ||
| * @param altName new contig name | ||
| * @return the contig associated to the 'originalName/altName' | ||
| */ | ||
| public SAMSequenceRecord addSequenceAlias(final String originalName, | ||
|
|
@@ -246,6 +254,25 @@ public SAMSequenceRecord addSequenceAlias(final String originalName, | |
| return originalSeqRecord; | ||
| } | ||
|
|
||
| /** | ||
| * Add an alternative sequence name (AN tag) to a SAMSequenceRecord, including it into the aliases | ||
| * to retrieve the contigs (as with {@link #addSequenceAlias(String, String)}. | ||
| * | ||
| * <p>This can be use to provide some alternate names fo a given contig. e.g: | ||
| * <code>1,chr1,chr01,01,CM000663</code> or | ||
| * <code>MT,chrM</code>. | ||
| * | ||
| * @param originalName existing contig name | ||
| * @param altName new contig name | ||
| * @return the contig associated to the 'originalName/altName', with the AN tag including the altName | ||
| */ | ||
| public SAMSequenceRecord addAlternativeSequenceName(final String originalName, | ||
| final String altName) { | ||
| final SAMSequenceRecord record = addSequenceAlias(originalName, altName); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to codecov this new code isn't covered by a test.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
| record.addAlternativeSequenceName(altName); | ||
| return record; | ||
| } | ||
|
|
||
| /** | ||
| * return a MD5 sum for ths dictionary, the checksum is re-computed each | ||
| * time this method is called. | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -23,14 +23,22 @@ | |||||
| */ | ||||||
| package htsjdk.samtools; | ||||||
|
|
||||||
| import htsjdk.samtools.util.StringUtil; | ||||||
|
|
||||||
| import javax.xml.bind.annotation.XmlAttribute; | ||||||
| import javax.xml.bind.annotation.XmlRootElement; | ||||||
| import javax.xml.bind.annotation.XmlTransient; | ||||||
| import javax.xml.bind.annotation.XmlValue; | ||||||
| import java.math.BigInteger; | ||||||
| import java.net.URI; | ||||||
| import java.net.URISyntaxException; | ||||||
| import java.util.ArrayList; | ||||||
| import java.util.Arrays; | ||||||
| import java.util.Collection; | ||||||
| import java.util.Collections; | ||||||
| import java.util.HashSet; | ||||||
| import java.util.LinkedHashSet; | ||||||
| import java.util.List; | ||||||
| import java.util.Map; | ||||||
| import java.util.Set; | ||||||
| import java.util.regex.Pattern; | ||||||
|
|
@@ -46,6 +54,7 @@ public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Clonea | |||||
| private int mSequenceIndex = -1; | ||||||
| private int mSequenceLength = 0; | ||||||
| public static final String SEQUENCE_NAME_TAG = "SN"; | ||||||
| public static final String ALTERNATIVE_SEQUENCE_NAME_TAG = "AN"; | ||||||
| public static final String SEQUENCE_LENGTH_TAG = "LN"; | ||||||
| public static final String MD5_TAG = "M5"; | ||||||
| public static final String ASSEMBLY_TAG = "AS"; | ||||||
|
|
@@ -67,14 +76,18 @@ public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Clonea | |||||
| * The standard tags are stored in text header without type information, because the type of these tags is known. | ||||||
| */ | ||||||
| public static final Set<String> STANDARD_TAGS = | ||||||
| new HashSet<String>(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, MD5_TAG, URI_TAG, | ||||||
| new HashSet<String>(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, ALTERNATIVE_SEQUENCE_NAME_TAG, MD5_TAG, URI_TAG, | ||||||
| SPECIES_TAG)); | ||||||
|
|
||||||
| // Split on any whitespace | ||||||
| private static Pattern SEQUENCE_NAME_SPLITTER = Pattern.compile("\\s"); | ||||||
| // These are the chars matched by \\s. | ||||||
| private static char[] WHITESPACE_CHARS = {' ', '\t', '\n', '\013', '\f', '\r'}; // \013 is vertical tab | ||||||
|
|
||||||
| // alternative sequence name regexp | ||||||
| private static final Pattern ALTERNATIVE_SEQUENCE_NAME_REGEXP = Pattern.compile("[0-9A-Za-z][0-9A-Za-z*+.@_|-]*"); | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no longer needed, use |
||||||
| private static final String ALTERNATIVE_SEQUENCE_NAME_SEPARATOR = ","; | ||||||
|
|
||||||
| /** a (private) empty constructor is required for JAXB.XML-serialisation */ | ||||||
| @SuppressWarnings("unused") | ||||||
| private SAMSequenceRecord() { | ||||||
|
|
@@ -139,6 +152,49 @@ private void setSequenceName(final String name) { | |||||
| // Private state used only by SAM implementation. | ||||||
| public void setSequenceIndex(final int value) { mSequenceIndex = value; } | ||||||
|
|
||||||
| /** Returns a set with alternative sequence names. Modifications will not affect the record instance. */ | ||||||
| @XmlAttribute(name = "alternative_sequence_names") | ||||||
|
yfarjoun marked this conversation as resolved.
|
||||||
| public Set<String> getAlternativeSequenceNames() { | ||||||
| final String anTag = getAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG); | ||||||
| return (anTag == null) ? new LinkedHashSet<>() : new LinkedHashSet<>(Arrays.asList(anTag.split(ALTERNATIVE_SEQUENCE_NAME_SEPARATOR))); | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This shouldn't return a modifiable set, should it? E.g. I would use |
||||||
| } | ||||||
|
|
||||||
| /** Adds an alternative sequence name if it is not the same as the sequence name or it is not present already. */ | ||||||
| public void addAlternativeSequenceName(final String name) { | ||||||
| validateAltRegExp(name); | ||||||
| final Set<String> altSequences = getAlternativeSequenceNames(); | ||||||
| if (!mSequenceName.equals(name) && ! altSequences.contains(name) ) { | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. inconsistent whitespace
Suggested change
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||||||
| altSequences.add(name); | ||||||
| } | ||||||
| encodeAltSequences(altSequences); | ||||||
| } | ||||||
|
|
||||||
| /** Sets the alternative sequence names in the order provided by iteration, removing the previous values. */ | ||||||
| public void setAlternativeSequenceName(final Collection<String> alternativeSequences) { | ||||||
| if (alternativeSequences == null) { | ||||||
| setAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG, null); | ||||||
| } else { | ||||||
| // validate all the names and encode afterwards | ||||||
| alternativeSequences.forEach(SAMSequenceRecord::validateAltRegExp); | ||||||
| encodeAltSequences(alternativeSequences); | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| private static void validateAltRegExp(final String name) { | ||||||
| if (!ALTERNATIVE_SEQUENCE_NAME_REGEXP.matcher(name).matches()) { | ||||||
| throw new IllegalArgumentException(String.format("Invalid alternative sequence name '%s': do not match the pattern %s", name, ALTERNATIVE_SEQUENCE_NAME_REGEXP)); | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| private void encodeAltSequences(final Collection<String> alternativeSequences) { | ||||||
| setAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG, StringUtil.join(ALTERNATIVE_SEQUENCE_NAME_SEPARATOR, alternativeSequences)); | ||||||
| } | ||||||
|
|
||||||
| /** Returns {@code true} if there are alternative sequence names; {@code false} otherwise. */ | ||||||
| public boolean hasAlternativeSequenceNames() { | ||||||
| return getAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG) != null; | ||||||
| } | ||||||
|
|
||||||
| /** | ||||||
| * Looser comparison than equals(). We look only at sequence index, sequence length, and MD5 tag value | ||||||
| * (or sequence names, if there is no MD5 tag in either record. | ||||||
|
|
@@ -159,7 +215,16 @@ public boolean isSameSequence(final SAMSequenceRecord that) { | |||||
| } | ||||||
| } | ||||||
| else { | ||||||
| if (mSequenceName != that.mSequenceName) return false; // Compare using == since we intern() the Strings | ||||||
| // Compare using == since we intern() the Strings | ||||||
| if (mSequenceName != that.mSequenceName) { | ||||||
| // if they are different, they could still be the same based on the alternative sequences | ||||||
| if (getAlternativeSequenceNames().contains(that.mSequenceName) || | ||||||
| that.getAlternativeSequenceNames().contains(mSequenceName)) { | ||||||
| return true; | ||||||
| } | ||||||
| return false; | ||||||
| } | ||||||
|
|
||||||
| } | ||||||
|
|
||||||
| return true; | ||||||
|
|
@@ -184,6 +249,7 @@ public boolean equals(final Object o) { | |||||
| if (mSequenceLength != that.mSequenceLength) return false; | ||||||
| if (!attributesEqual(that)) return false; | ||||||
| if (mSequenceName != that.mSequenceName) return false; // Compare using == since we intern() the name | ||||||
| if (!getAlternativeSequenceNames().equals(that.getAlternativeSequenceNames())) return false; | ||||||
|
|
||||||
| return true; | ||||||
| } | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,6 +29,8 @@ | |
| import org.testng.annotations.Test; | ||
|
|
||
| import java.util.Arrays; | ||
| import java.util.Collections; | ||
| import java.util.List; | ||
|
|
||
| /** | ||
| * Test for SAMReadGroupRecordTest | ||
|
|
@@ -83,4 +85,72 @@ public void testIsSameSequence(final SAMSequenceRecord rec1 , final SAMSequenceR | |
| Assert.assertEquals(rec1.isSameSequence(rec2), isSame); | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| public void testAlternativeSequences() { | ||
| final SAMSequenceRecord chr1 = new SAMSequenceRecord("1", 100); | ||
|
|
||
| // no AN tag yet | ||
| Assert.assertFalse(chr1.hasAlternativeSequenceNames()); | ||
| Assert.assertTrue(chr1.getAlternativeSequenceNames().isEmpty()); | ||
|
|
||
| // set to a random alias | ||
| chr1.addAlternativeSequenceName("my-chromosome"); | ||
| Assert.assertNotEquals(chr1, new SAMSequenceRecord(chr1.getSequenceName(), chr1.getSequenceLength())); | ||
| Assert.assertTrue(chr1.hasAlternativeSequenceNames()); | ||
| Assert.assertEquals(chr1.getAlternativeSequenceNames(), Collections.singleton("my-chromosome")); | ||
| Assert.assertEquals("@SQ\tSN:1\tLN:100\tAN:my-chromosome", chr1.getSAMString()); | ||
|
|
||
| // set to new chromosome aliases (removing previous) | ||
| final List<String> chr1AltNames = Arrays.asList("chr1","chr01","01","CM000663"); | ||
| chr1.setAlternativeSequenceName(chr1AltNames); | ||
| Assert.assertTrue(chr1.hasAlternativeSequenceNames()); | ||
| Assert.assertEquals(chr1.getAlternativeSequenceNames(), chr1AltNames); | ||
| Assert.assertEquals("@SQ\tSN:1\tLN:100\tAN:chr1,chr01,01,CM000663", chr1.getSAMString()); | ||
| } | ||
|
|
||
| @DataProvider | ||
| public Object[][] validAlternativeSequences() { | ||
| return new Object[][] { | ||
| // only characters | ||
| {"a"}, | ||
| {"alias"}, | ||
| // valid symbols after first character ('*', '+', '.', '@', '_', '|', '-') | ||
| {"my*contig"}, | ||
| {"my+contig"}, | ||
| {"my.contig"}, | ||
| {"my@contig"}, | ||
| {"my_contig"}, | ||
| {"my|contig"}, | ||
| {"my-contig"} | ||
| }; | ||
| } | ||
|
|
||
| @Test(dataProvider = "validAlternativeSequences") | ||
| public void testValidAlternativeSequences(final String altName) { | ||
| final SAMSequenceRecord contig = new SAMSequenceRecord("contig", 100); | ||
| // should not throw | ||
| contig.setAlternativeSequenceName(Collections.singleton(altName)); | ||
| Assert.assertTrue(contig.hasAlternativeSequenceNames()); | ||
| Assert.assertEquals(contig.getAlternativeSequenceNames(), Collections.singleton(altName)); | ||
| } | ||
|
|
||
| @DataProvider | ||
| public Object[][] invalidAlternativeSequences() { | ||
| return new Object[][] { | ||
| // invalid start | ||
| {"@chr1"}, {",chr1"}, | ||
| // comma-separated | ||
| {"chr1,alt"}, | ||
| // coordinate-like | ||
| {"chr1:1000"}, {"chr1:100-200"} | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a testcase with "{" and "}" |
||
| }; | ||
| } | ||
|
|
||
| @Test(dataProvider = "invalidAlternativeSequences") | ||
| public void testInvalidAlternativeSequences(final String altName) { | ||
| final SAMSequenceRecord chr1 = new SAMSequenceRecord("1", 100); | ||
| Assert.assertThrows(IllegalArgumentException.class, () -> chr1.addAlternativeSequenceName(altName)); | ||
| Assert.assertThrows(IllegalArgumentException.class, () -> chr1.setAlternativeSequenceName(Collections.singleton(altName))); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the comment on the equals method above needs updating to say that alias's are not considered but Alternative Sequences are.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.