Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 32 additions & 5 deletions src/main/java/htsjdk/samtools/SAMSequenceDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ public void setSequences(final List<SAMSequenceRecord> list) {
if (mSequenceMap.put(record.getSequenceName(), record) != null) {
throw new IllegalArgumentException("Cannot add sequence that already exists in SAMSequenceDictionary: " +
record.getSequenceName());
} else {
record.getAlternativeSequenceNames().forEach(an -> addSequenceAlias(record.getSequenceName(), an));
}
}
}
Expand All @@ -98,6 +100,7 @@ public void addSequence(final SAMSequenceRecord sequenceRecord) {
sequenceRecord.setSequenceIndex(mSequences.size());
mSequences.add(sequenceRecord);
mSequenceMap.put(sequenceRecord.getSequenceName(), sequenceRecord);
sequenceRecord.getAlternativeSequenceNames().forEach(an -> addSequenceAlias(sequenceRecord.getSequenceName(), an));
}

/**
Expand Down Expand Up @@ -203,7 +206,11 @@ public boolean isSameDictionary(final SAMSequenceDictionary that) {
return !thatSequences.hasNext();
}

/** returns true if the two dictionaries are the same, aliases are NOT considered */
/**
* Returns {@code true} if the two dictionaries are the same.
*
* <p>NOTE: Aliases are NOT considered, but alternative sequence names (AN tag) names ARE.
*/
@Override
public boolean equals(Object o) {
if (this == o) return true;
Expand All @@ -220,10 +227,11 @@ public boolean equals(Object o) {
* <code>1,chr1,chr01,01,CM000663,NC_000001.10</code> e.g:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the comment on the equals method above needs updating to say that alias's are not considered but Alternative Sequences are.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

* <code>MT,chrM</code>
*
* @param originalName
* existing contig name
* @param altName
* new contig name
* <p>NOTE: this method does not add the alias to the alternative sequence name tag (AN) in the SAMSequenceRecord.
* If you would like to add it to the AN tag, use {@link #addAlternativeSequenceName(String, String)} instead.
*
* @param originalName existing contig name
* @param altName new contig name
* @return the contig associated to the 'originalName/altName'
*/
public SAMSequenceRecord addSequenceAlias(final String originalName,
Expand All @@ -246,6 +254,25 @@ public SAMSequenceRecord addSequenceAlias(final String originalName,
return originalSeqRecord;
}

/**
* Add an alternative sequence name (AN tag) to a SAMSequenceRecord, including it into the aliases
* to retrieve the contigs (as with {@link #addSequenceAlias(String, String)}.
*
* <p>This can be use to provide some alternate names fo a given contig. e.g:
* <code>1,chr1,chr01,01,CM000663</code> or
* <code>MT,chrM</code>.
*
* @param originalName existing contig name
* @param altName new contig name
* @return the contig associated to the 'originalName/altName', with the AN tag including the altName
*/
public SAMSequenceRecord addAlternativeSequenceName(final String originalName,
final String altName) {
final SAMSequenceRecord record = addSequenceAlias(originalName, altName);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to codecov this new code isn't covered by a test.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

record.addAlternativeSequenceName(altName);
return record;
}

/**
* return a MD5 sum for ths dictionary, the checksum is re-computed each
* time this method is called.
Expand Down
70 changes: 68 additions & 2 deletions src/main/java/htsjdk/samtools/SAMSequenceRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,22 @@
*/
package htsjdk.samtools;

import htsjdk.samtools.util.StringUtil;

import javax.xml.bind.annotation.XmlAttribute;
import javax.xml.bind.annotation.XmlRootElement;
import javax.xml.bind.annotation.XmlTransient;
import javax.xml.bind.annotation.XmlValue;
import java.math.BigInteger;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
Expand All @@ -46,6 +54,7 @@ public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Clonea
private int mSequenceIndex = -1;
private int mSequenceLength = 0;
public static final String SEQUENCE_NAME_TAG = "SN";
public static final String ALTERNATIVE_SEQUENCE_NAME_TAG = "AN";
public static final String SEQUENCE_LENGTH_TAG = "LN";
public static final String MD5_TAG = "M5";
public static final String ASSEMBLY_TAG = "AS";
Expand All @@ -67,14 +76,18 @@ public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Clonea
* The standard tags are stored in text header without type information, because the type of these tags is known.
*/
public static final Set<String> STANDARD_TAGS =
new HashSet<String>(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, MD5_TAG, URI_TAG,
new HashSet<String>(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, ALTERNATIVE_SEQUENCE_NAME_TAG, MD5_TAG, URI_TAG,
SPECIES_TAG));

// Split on any whitespace
private static Pattern SEQUENCE_NAME_SPLITTER = Pattern.compile("\\s");
// These are the chars matched by \\s.
private static char[] WHITESPACE_CHARS = {' ', '\t', '\n', '\013', '\f', '\r'}; // \013 is vertical tab

// alternative sequence name regexp
private static final Pattern ALTERNATIVE_SEQUENCE_NAME_REGEXP = Pattern.compile("[0-9A-Za-z][0-9A-Za-z*+.@_|-]*");
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no longer needed, use SAMSequenceRecord.LEGAL_RNAME_PATTERN

private static final String ALTERNATIVE_SEQUENCE_NAME_SEPARATOR = ",";

/** a (private) empty constructor is required for JAXB.XML-serialisation */
@SuppressWarnings("unused")
private SAMSequenceRecord() {
Expand Down Expand Up @@ -139,6 +152,49 @@ private void setSequenceName(final String name) {
// Private state used only by SAM implementation.
public void setSequenceIndex(final int value) { mSequenceIndex = value; }

/** Returns a set with alternative sequence names. Modifications will not affect the record instance. */
@XmlAttribute(name = "alternative_sequence_names")
Comment thread
yfarjoun marked this conversation as resolved.
public Set<String> getAlternativeSequenceNames() {
final String anTag = getAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG);
return (anTag == null) ? new LinkedHashSet<>() : new LinkedHashSet<>(Arrays.asList(anTag.split(ALTERNATIVE_SEQUENCE_NAME_SEPARATOR)));
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This shouldn't return a modifiable set, should it? E.g. I would use Collections.emptySet() for the null case.

}

/** Adds an alternative sequence name if it is not the same as the sequence name or it is not present already. */
public void addAlternativeSequenceName(final String name) {
validateAltRegExp(name);
final Set<String> altSequences = getAlternativeSequenceNames();
if (!mSequenceName.equals(name) && ! altSequences.contains(name) ) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

inconsistent whitespace

Suggested change
if (!mSequenceName.equals(name) && ! altSequences.contains(name) ) {
if (!mSequenceName.equals(name) && !altSequences.contains(name)) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

altSequences.add(name);
}
encodeAltSequences(altSequences);
}

/** Sets the alternative sequence names in the order provided by iteration, removing the previous values. */
public void setAlternativeSequenceName(final Collection<String> alternativeSequences) {
if (alternativeSequences == null) {
setAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG, null);
} else {
// validate all the names and encode afterwards
alternativeSequences.forEach(SAMSequenceRecord::validateAltRegExp);
encodeAltSequences(alternativeSequences);
}
}

private static void validateAltRegExp(final String name) {
if (!ALTERNATIVE_SEQUENCE_NAME_REGEXP.matcher(name).matches()) {
throw new IllegalArgumentException(String.format("Invalid alternative sequence name '%s': do not match the pattern %s", name, ALTERNATIVE_SEQUENCE_NAME_REGEXP));
}
}

private void encodeAltSequences(final Collection<String> alternativeSequences) {
setAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG, StringUtil.join(ALTERNATIVE_SEQUENCE_NAME_SEPARATOR, alternativeSequences));
}

/** Returns {@code true} if there are alternative sequence names; {@code false} otherwise. */
public boolean hasAlternativeSequenceNames() {
return getAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG) != null;
}

/**
* Looser comparison than equals(). We look only at sequence index, sequence length, and MD5 tag value
* (or sequence names, if there is no MD5 tag in either record.
Expand All @@ -159,7 +215,16 @@ public boolean isSameSequence(final SAMSequenceRecord that) {
}
}
else {
if (mSequenceName != that.mSequenceName) return false; // Compare using == since we intern() the Strings
// Compare using == since we intern() the Strings
if (mSequenceName != that.mSequenceName) {
// if they are different, they could still be the same based on the alternative sequences
if (getAlternativeSequenceNames().contains(that.mSequenceName) ||
that.getAlternativeSequenceNames().contains(mSequenceName)) {
return true;
}
return false;
}

}

return true;
Expand All @@ -184,6 +249,7 @@ public boolean equals(final Object o) {
if (mSequenceLength != that.mSequenceLength) return false;
if (!attributesEqual(that)) return false;
if (mSequenceName != that.mSequenceName) return false; // Compare using == since we intern() the name
if (!getAlternativeSequenceNames().equals(that.getAlternativeSequenceNames())) return false;

return true;
}
Expand Down
18 changes: 18 additions & 0 deletions src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,24 @@ public void testAliases() {
Assert.assertNull(dict.getSequence("chr2"));
}

@Test
public void testAlternativeSequences() {
final SAMSequenceRecord ssr1 = new SAMSequenceRecord("1", 1);
final SAMSequenceRecord ssr2 = new SAMSequenceRecord("2", 1);

final SAMSequenceDictionary dict = new SAMSequenceDictionary(
Arrays.asList(ssr1, ssr2));
Assert.assertEquals(dict.size(), 2);
dict.addAlternativeSequenceName("1", "chr1");
dict.addAlternativeSequenceName("1", "01");
dict.addAlternativeSequenceName("1", "1");
dict.addAlternativeSequenceName("01", "chr01");
Assert.assertEquals(dict.size(), 2);
Assert.assertTrue(dict.getSequence("chr1").hasAlternativeSequenceNames());
Assert.assertEquals(dict.getSequence("1").getAlternativeSequenceNames().size(), 3);
Assert.assertNull(dict.getSequence("chr2"));
}

/**
* should be saved as XML
*
Expand Down
70 changes: 70 additions & 0 deletions src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
import org.testng.annotations.Test;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
* Test for SAMReadGroupRecordTest
Expand Down Expand Up @@ -83,4 +85,72 @@ public void testIsSameSequence(final SAMSequenceRecord rec1 , final SAMSequenceR
Assert.assertEquals(rec1.isSameSequence(rec2), isSame);
}
}

@Test
public void testAlternativeSequences() {
final SAMSequenceRecord chr1 = new SAMSequenceRecord("1", 100);

// no AN tag yet
Assert.assertFalse(chr1.hasAlternativeSequenceNames());
Assert.assertTrue(chr1.getAlternativeSequenceNames().isEmpty());

// set to a random alias
chr1.addAlternativeSequenceName("my-chromosome");
Assert.assertNotEquals(chr1, new SAMSequenceRecord(chr1.getSequenceName(), chr1.getSequenceLength()));
Assert.assertTrue(chr1.hasAlternativeSequenceNames());
Assert.assertEquals(chr1.getAlternativeSequenceNames(), Collections.singleton("my-chromosome"));
Assert.assertEquals("@SQ\tSN:1\tLN:100\tAN:my-chromosome", chr1.getSAMString());

// set to new chromosome aliases (removing previous)
final List<String> chr1AltNames = Arrays.asList("chr1","chr01","01","CM000663");
chr1.setAlternativeSequenceName(chr1AltNames);
Assert.assertTrue(chr1.hasAlternativeSequenceNames());
Assert.assertEquals(chr1.getAlternativeSequenceNames(), chr1AltNames);
Assert.assertEquals("@SQ\tSN:1\tLN:100\tAN:chr1,chr01,01,CM000663", chr1.getSAMString());
}

@DataProvider
public Object[][] validAlternativeSequences() {
return new Object[][] {
// only characters
{"a"},
{"alias"},
// valid symbols after first character ('*', '+', '.', '@', '_', '|', '-')
{"my*contig"},
{"my+contig"},
{"my.contig"},
{"my@contig"},
{"my_contig"},
{"my|contig"},
{"my-contig"}
};
}

@Test(dataProvider = "validAlternativeSequences")
public void testValidAlternativeSequences(final String altName) {
final SAMSequenceRecord contig = new SAMSequenceRecord("contig", 100);
// should not throw
contig.setAlternativeSequenceName(Collections.singleton(altName));
Assert.assertTrue(contig.hasAlternativeSequenceNames());
Assert.assertEquals(contig.getAlternativeSequenceNames(), Collections.singleton(altName));
}

@DataProvider
public Object[][] invalidAlternativeSequences() {
return new Object[][] {
// invalid start
{"@chr1"}, {",chr1"},
// comma-separated
{"chr1,alt"},
// coordinate-like
{"chr1:1000"}, {"chr1:100-200"}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a testcase with "{" and "}"

};
}

@Test(dataProvider = "invalidAlternativeSequences")
public void testInvalidAlternativeSequences(final String altName) {
final SAMSequenceRecord chr1 = new SAMSequenceRecord("1", 100);
Assert.assertThrows(IllegalArgumentException.class, () -> chr1.addAlternativeSequenceName(altName));
Assert.assertThrows(IllegalArgumentException.class, () -> chr1.setAlternativeSequenceName(Collections.singleton(altName)));
}
}