diff --git a/src/main/java/picard/sam/CreateSequenceDictionary.java b/src/main/java/picard/sam/CreateSequenceDictionary.java index 0c071b2b47..d3cfcf400b 100644 --- a/src/main/java/picard/sam/CreateSequenceDictionary.java +++ b/src/main/java/picard/sam/CreateSequenceDictionary.java @@ -51,9 +51,13 @@ import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; +import java.util.regex.Pattern; /** * Create a SAM/BAM file from a fasta containing reference sequence. The output SAM file contains a header but no @@ -106,7 +110,28 @@ public class CreateSequenceDictionary extends CommandLineProgram { @Argument(doc = "Stop after writing this many sequences. For testing.") public int NUM_SEQUENCES = Integer.MAX_VALUE; + @Argument(shortName = "AN", doc = "Optional file containing the alternative names for the contigs. " + + "Tools may use this information to consider different contig notations as identical (e.g: 'chr1' and '1'). " + + "The alternative names will be put into the appropriate @AN annotation for each contig. " + + "No header. " + + "First column is the original name, the second column is an alternative name. " + + "One contig may have more than one alternative name." , + optional=true) + public File ALT_NAMES = null; + private final MessageDigest md5; + + /** + * Regular expression defined in the sam spec. Any alternative contig should match this regular expression + * TODO: replace the pattern with a constant : see https://github.com/samtools/htsjdk/pull/956/files + */ + private static final Pattern ALTERNATIVE_CONTIG_NAME_PATTERN = Pattern.compile("[0-9A-Za-z][0-9A-Za-z\\*\\+@\\|\\-]*"); + + /** + * 'AN' attribute in the dictionary + * TODO: replace "AN" with a constant : see https://github.com/samtools/htsjdk/pull/956/files + */ + private static final String AN_ATTRIBUTE = "AN"; public CreateSequenceDictionary() { try { @@ -185,6 +210,9 @@ protected int doWork() { " already exists. Delete this file and try again, or specify a different output file."); } + // map for aliases mapping a contig to its aliases + final Map> aliasesByContig = loadContigAliasesMap(); + // SortingCollection is used to check uniqueness of sequence names final SortingCollection sequenceNames = makeSortingCollection(); try (BufferedWriter writer = makeWriter()) { @@ -196,6 +224,12 @@ protected int doWork() { // read reference sequence one by one and write its metadata for (ReferenceSequence refSeq = refSeqFile.nextSequence(); refSeq != null; refSeq = refSeqFile.nextSequence()) { final SAMSequenceRecord samSequenceRecord = makeSequenceRecord(refSeq); + // retrieve aliases, if any + final Set aliases = aliasesByContig.get(samSequenceRecord.getSequenceName()); + if (aliases != null) { + // "Alternative names is a comma separated list of alternative names" + samSequenceRecord.setAttribute(AN_ATTRIBUTE, String.join(",", aliases)); + } samDictCodec.encodeSequenceRecord(samSequenceRecord); sequenceNames.add(refSeq.getName()); } @@ -208,7 +242,7 @@ protected int doWork() { // check uniqueness of sequences names final CloseableIterator iterator = sequenceNames.iterator(); - if(!iterator.hasNext()) return 0; + if (!iterator.hasNext()) return 0; String current = iterator.next(); while (iterator.hasNext()) { @@ -284,6 +318,71 @@ private SortingCollection makeSortingCollection() { ); } + /** + * Load the file ALT_NAMES containing the alternative contig names + * @author Pierre Lindenbaum + * @return a Map<src_contig,Set<new_names>>. Never null. May be empty if ALT_NAMES is null. + * @throws PicardException if there is any error in the file ALT_NAMES + */ + private Map> loadContigAliasesMap() throws PicardException { + // return an empty map if no mapping file was provided + if (this.ALT_NAMES == null) { + return Collections.emptyMap(); + } + // the map returned by the function + final Map> aliasesByContig = new HashMap<>(); + try { + for (final String line : IOUtil.slurpLines(this.ALT_NAMES)) { + if (StringUtil.isBlank(line)) { + continue; + } + final int tab = line.indexOf('\t'); + if (tab == -1) { + throw new IOException("tabulation missing in " + line); + } + final String contigName = line.substring(0, tab); + final String altName = line.substring(tab + 1); + // check for empty values + if (StringUtil.isBlank(contigName)) { + throw new IOException("empty contig in " + line); + } + if (StringUtil.isBlank(altName)) { + throw new IOException("empty alternative name in " + line); + } + if (altName.equals(contigName)) { + continue; + } + if (!ALTERNATIVE_CONTIG_NAME_PATTERN.matcher(altName).matches()) { + throw new IOException("alternative name in " + line + + " doesn't match the regular expression : " + + ALTERNATIVE_CONTIG_NAME_PATTERN.pattern()); + } + // check alias not previously defined as contig + if (aliasesByContig.containsKey(altName)) { + throw new IOException("alternate name " + altName + + " previously defined as a contig in " + line); + } + // check contig not previously defined as alias + if (aliasesByContig.keySet().stream(). + // not an error if defined twice for same contig + filter(K -> !K.equals(contigName)). + anyMatch(K -> aliasesByContig.get(K).contains(contigName))) { + throw new IOException("contig " + contigName + + " previously defined as an alternate name in " + line); + } + // add alias + if (!aliasesByContig.containsKey(contigName)) { + aliasesByContig.put(contigName, new HashSet<>()); + } + aliasesByContig.get(contigName).add(altName); + } + return aliasesByContig; + } catch (final IOException e) { + throw new PicardException("Can't read alias file " + ALT_NAMES, e); + } + + } + private static class StringCodec implements SortingCollection.Codec { private DataInputStream dis; private DataOutputStream dos; diff --git a/src/test/java/picard/sam/CreateSequenceDictionaryTest.java b/src/test/java/picard/sam/CreateSequenceDictionaryTest.java index e371434f45..bef74f2337 100644 --- a/src/test/java/picard/sam/CreateSequenceDictionaryTest.java +++ b/src/test/java/picard/sam/CreateSequenceDictionaryTest.java @@ -25,13 +25,21 @@ import org.testng.Assert; import org.testng.annotations.Test; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.variant.utils.SAMSequenceDictionaryExtractor; import picard.cmdline.CommandLineProgramTest; import picard.PicardException; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; +import java.io.PrintWriter; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; /** @@ -114,4 +122,53 @@ public void testNonUniqueSequenceName() throws Exception { Assert.assertEquals(runPicardCommandLine(argv), 0); Assert.fail("Exception should have been thrown."); } + + + @Test + public void testAltNames() throws Exception { + final File altFile = File.createTempFile("CreateSequenceDictionaryTest", ".alt"); + final PrintWriter pw = new PrintWriter(altFile); + pw.println("chr1\t1"); + pw.println("chr1\t01"); + pw.println("chr1\tk1"); + pw.println("chrMT\tM"); + pw.flush(); + pw.close(); + altFile.deleteOnExit(); + + final File outputDict = File.createTempFile("CreateSequenceDictionaryTest.", ".dict"); + outputDict.delete(); + outputDict.deleteOnExit(); + final String[] argv = { + "REFERENCE=" + BASIC_FASTA, + "AN=" + altFile, + "OUTPUT=" + outputDict, + "TRUNCATE_NAMES_AT_WHITESPACE=true" + }; + Assert.assertEquals(runPicardCommandLine(argv), 0); + final SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor.extractDictionary(outputDict); + Assert.assertNotNull(dict, "dictionary is null"); + + // check chr1 + SAMSequenceRecord ssr = dict.getSequence("chr1"); + Assert.assertNotNull(ssr, "chr1 missing in dictionary"); + String an = ssr.getAttribute("AN"); + Assert.assertNotNull(ssr, "AN Missing"); + Set anSet = new HashSet<>(Arrays.asList(an.split("[,]"))); + Assert.assertTrue(anSet.contains("1")); + Assert.assertTrue(anSet.contains("01")); + Assert.assertTrue(anSet.contains("k1")); + Assert.assertFalse(anSet.contains("M")); + + // check chr2 + ssr = dict.getSequence("chr2"); + Assert.assertNotNull(ssr, "chr2 missing in dictionary"); + an = ssr.getAttribute("AN"); + Assert.assertNull(an, "AN Present"); + + // check chrM + ssr = dict.getSequence("chrM"); + Assert.assertNull(ssr, "chrM present in dictionary"); + } + }