Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 100 additions & 1 deletion src/main/java/picard/sam/CreateSequenceDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,13 @@
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/**
* Create a SAM/BAM file from a fasta containing reference sequence. The output SAM file contains a header but no
Expand Down Expand Up @@ -106,7 +110,28 @@ public class CreateSequenceDictionary extends CommandLineProgram {
@Argument(doc = "Stop after writing this many sequences. For testing.")
public int NUM_SEQUENCES = Integer.MAX_VALUE;

@Argument(shortName = "AN", doc = "Optional file containing the alternative names for the contigs. "
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Explain how this input will be used.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be clear, I mean in the documentation...something like: "The alternative names will be put into the appropriate @AN annotation for each contig". Also, please state how the "columns" need to be separated ("tab", I presume) and whether the columns should have a "header" (I presume that they do not)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yfarjoun I added the sentences.

+ "Tools may use this information to consider different contig notations as identical (e.g: 'chr1' and '1'). "
+ "The alternative names will be put into the appropriate @AN annotation for each contig. "
+ "No header. "
+ "First column is the original name, the second column is an alternative name. "
+ "One contig may have more than one alternative name." ,
optional=true)
public File ALT_NAMES = null;

private final MessageDigest md5;

/**
* Regular expression defined in the sam spec. Any alternative contig should match this regular expression
* TODO: replace the pattern with a constant : see https://github.com/samtools/htsjdk/pull/956/files
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this supposed to be the same as SAMSequenceRecord.ALTERNATIVE_SEQUENCE_NAME_REGEXP? It's not the same, it has a | character in it.

If it is supposed to be the same, you should make the one in SamSequenceRecord public and use it instead of defining another copy here.

*/
private static final Pattern ALTERNATIVE_CONTIG_NAME_PATTERN = Pattern.compile("[0-9A-Za-z][0-9A-Za-z\\*\\+@\\|\\-]*");
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The \\ characters in this string aren't necessary since the characters * + and | do not require quoting. - would require quoting but doesn't since it occurs last in the character class.


/**
* 'AN' attribute in the dictionary
* TODO: replace "AN" with a constant : see https://github.com/samtools/htsjdk/pull/956/files
*/
private static final String AN_ATTRIBUTE = "AN";

public CreateSequenceDictionary() {
try {
Expand Down Expand Up @@ -185,6 +210,9 @@ protected int doWork() {
" already exists. Delete this file and try again, or specify a different output file.");
}

// map for aliases mapping a contig to its aliases
final Map<String, Set<String>> aliasesByContig = loadContigAliasesMap();

// SortingCollection is used to check uniqueness of sequence names
final SortingCollection<String> sequenceNames = makeSortingCollection();
try (BufferedWriter writer = makeWriter()) {
Expand All @@ -196,6 +224,12 @@ protected int doWork() {
// read reference sequence one by one and write its metadata
for (ReferenceSequence refSeq = refSeqFile.nextSequence(); refSeq != null; refSeq = refSeqFile.nextSequence()) {
final SAMSequenceRecord samSequenceRecord = makeSequenceRecord(refSeq);
// retrieve aliases, if any
final Set<String> aliases = aliasesByContig.get(samSequenceRecord.getSequenceName());
if (aliases != null) {
// "Alternative names is a comma separated list of alternative names"
samSequenceRecord.setAttribute(AN_ATTRIBUTE, String.join(",", aliases));
}
samDictCodec.encodeSequenceRecord(samSequenceRecord);
sequenceNames.add(refSeq.getName());
}
Expand All @@ -208,7 +242,7 @@ protected int doWork() {
// check uniqueness of sequences names
final CloseableIterator<String> iterator = sequenceNames.iterator();

if(!iterator.hasNext()) return 0;
if (!iterator.hasNext()) return 0;

String current = iterator.next();
while (iterator.hasNext()) {
Expand Down Expand Up @@ -284,6 +318,71 @@ private SortingCollection<String> makeSortingCollection() {
);
}

/**
* Load the file ALT_NAMES containing the alternative contig names
* @author Pierre Lindenbaum
* @return a <code>Map&lt;src_contig,Set&lt;new_names&gt;&gt;</code>. Never null. May be empty if ALT_NAMES is null.
* @throws PicardException if there is any error in the file ALT_NAMES
*/
private Map<String, Set<String>> loadContigAliasesMap() throws PicardException {
// return an empty map if no mapping file was provided
if (this.ALT_NAMES == null) {
return Collections.emptyMap();
}
// the map returned by the function
final Map<String, Set<String>> aliasesByContig = new HashMap<>();
try {
for (final String line : IOUtil.slurpLines(this.ALT_NAMES)) {
if (StringUtil.isBlank(line)) {
continue;
}
final int tab = line.indexOf('\t');
if (tab == -1) {
throw new IOException("tabulation missing in " + line);
}
final String contigName = line.substring(0, tab);
final String altName = line.substring(tab + 1);
// check for empty values
if (StringUtil.isBlank(contigName)) {
throw new IOException("empty contig in " + line);
}
if (StringUtil.isBlank(altName)) {
throw new IOException("empty alternative name in " + line);
}
if (altName.equals(contigName)) {
continue;
}
if (!ALTERNATIVE_CONTIG_NAME_PATTERN.matcher(altName).matches()) {
throw new IOException("alternative name in " + line +
" doesn't match the regular expression : " +
ALTERNATIVE_CONTIG_NAME_PATTERN.pattern());
}
// check alias not previously defined as contig
if (aliasesByContig.containsKey(altName)) {
throw new IOException("alternate name " + altName +
" previously defined as a contig in " + line);
}
// check contig not previously defined as alias
if (aliasesByContig.keySet().stream().
// not an error if defined twice for same contig
filter(K -> !K.equals(contigName)).
anyMatch(K -> aliasesByContig.get(K).contains(contigName))) {
throw new IOException("contig " + contigName +
" previously defined as an alternate name in " + line);
}
// add alias
if (!aliasesByContig.containsKey(contigName)) {
aliasesByContig.put(contigName, new HashSet<>());
}
aliasesByContig.get(contigName).add(altName);
}
return aliasesByContig;
} catch (final IOException e) {
throw new PicardException("Can't read alias file " + ALT_NAMES, e);
}

}

private static class StringCodec implements SortingCollection.Codec<String> {
private DataInputStream dis;
private DataOutputStream dos;
Expand Down
57 changes: 57 additions & 0 deletions src/test/java/picard/sam/CreateSequenceDictionaryTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,21 @@

import org.testng.Assert;
import org.testng.annotations.Test;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
import picard.cmdline.CommandLineProgramTest;
import picard.PicardException;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
Expand Down Expand Up @@ -114,4 +122,53 @@ public void testNonUniqueSequenceName() throws Exception {
Assert.assertEquals(runPicardCommandLine(argv), 0);
Assert.fail("Exception should have been thrown.");
}


@Test
public void testAltNames() throws Exception {
final File altFile = File.createTempFile("CreateSequenceDictionaryTest", ".alt");
final PrintWriter pw = new PrintWriter(altFile);
pw.println("chr1\t1");
pw.println("chr1\t01");
pw.println("chr1\tk1");
pw.println("chrMT\tM");
pw.flush();
pw.close();
altFile.deleteOnExit();

final File outputDict = File.createTempFile("CreateSequenceDictionaryTest.", ".dict");
outputDict.delete();
outputDict.deleteOnExit();
final String[] argv = {
"REFERENCE=" + BASIC_FASTA,
"AN=" + altFile,
"OUTPUT=" + outputDict,
"TRUNCATE_NAMES_AT_WHITESPACE=true"
};
Assert.assertEquals(runPicardCommandLine(argv), 0);
final SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor.extractDictionary(outputDict);
Assert.assertNotNull(dict, "dictionary is null");

// check chr1
SAMSequenceRecord ssr = dict.getSequence("chr1");
Assert.assertNotNull(ssr, "chr1 missing in dictionary");
String an = ssr.getAttribute("AN");
Assert.assertNotNull(ssr, "AN Missing");
Set<String> anSet = new HashSet<>(Arrays.asList(an.split("[,]")));
Assert.assertTrue(anSet.contains("1"));
Assert.assertTrue(anSet.contains("01"));
Assert.assertTrue(anSet.contains("k1"));
Assert.assertFalse(anSet.contains("M"));

// check chr2
ssr = dict.getSequence("chr2");
Assert.assertNotNull(ssr, "chr2 missing in dictionary");
an = ssr.getAttribute("AN");
Assert.assertNull(an, "AN Present");

// check chrM
ssr = dict.getSequence("chrM");
Assert.assertNull(ssr, "chrM present in dictionary");
}

}