-
Notifications
You must be signed in to change notification settings - Fork 381
CreateSequenceDictionary support for alternative names (@SQ-AN) #1127
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
ea536c6
6e93f07
bfce5f7
0f45066
82fcb0b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -48,12 +48,17 @@ | |
|
|
||
| import java.io.*; | ||
| import java.math.BigInteger; | ||
| import java.nio.file.Files; | ||
| import java.security.MessageDigest; | ||
| import java.security.NoSuchAlgorithmException; | ||
| import java.util.ArrayList; | ||
| import java.util.HashMap; | ||
| import java.util.HashSet; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Set; | ||
| import java.util.regex.Pattern; | ||
| import java.util.stream.Collectors; | ||
|
|
||
| /** | ||
| * Create a SAM/BAM file from a fasta containing reference sequence. The output SAM file contains a header but no | ||
|
|
@@ -106,6 +111,12 @@ public class CreateSequenceDictionary extends CommandLineProgram { | |
| @Argument(doc = "Stop after writing this many sequences. For testing.") | ||
| public int NUM_SEQUENCES = Integer.MAX_VALUE; | ||
|
|
||
| @Argument(shortName = "AN", doc = "Optional file containing the alternative names for the contigs. " | ||
| + "First column is the original name, the second column is an alternative name. " | ||
| + "One contig may have more than one alternative name. " , | ||
| optional=true) | ||
| public File ALT_NAMES = null; | ||
|
|
||
| private final MessageDigest md5; | ||
|
|
||
| public CreateSequenceDictionary() { | ||
|
|
@@ -185,6 +196,58 @@ protected int doWork() { | |
| " already exists. Delete this file and try again, or specify a different output file."); | ||
| } | ||
|
|
||
| // map for aliases mapping a contig to its' aliases | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By |
||
| final Map<String, Set<String>> contig2aliases = new HashMap<>(); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In general I wouldn't use numbers in variable names like this, in the place of the word "to". Looking at other Maps in picard, such maps are typically named by the value, not the key, such as |
||
|
|
||
| // fill the alias map | ||
| if(this.ALT_NAMES != null) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There should be a space between |
||
| try { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a lot of code to insert directly into a method. If the output of this block is the |
||
| // regex defined in the sam spec | ||
| final Pattern altNameRegex = Pattern.compile("[0-9A-Za-z][0-9A-Za-z\\*\\+@\\|\\-]*"); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Compiled patterns are thread safe so the usual convention is to store them in a |
||
| for(final String line :IOUtil.slurpLines(this.ALT_NAMES)) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. need a space after the
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of |
||
| if(StringUtil.isBlank(line)) continue; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. each |
||
| final int tab = line.indexOf('\t'); | ||
| if(tab == -1 ) throw new IOException("tabulation missing in " + line); | ||
| final String contigName = line.substring(0,tab); | ||
| final String altName = line.substring(tab + 1); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think |
||
| //check for empty values | ||
| if(StringUtil.isBlank(contigName)) { | ||
| throw new IOException("empty contig in " + line); | ||
| } | ||
| if(StringUtil.isBlank(altName)) { | ||
| throw new IOException("empty alternative name in " + line); | ||
| } | ||
| if(altName.equals(contigName)) continue; | ||
| if(!altNameRegex.matcher(altName).matches()) { | ||
| throw new IOException("alternative name in " + line + | ||
| " doesn't match the regular expression : " + | ||
| altNameRegex.pattern()); | ||
| } | ||
| //check alias not previously defined as contig | ||
| if(contig2aliases.containsKey(altName)) { | ||
| throw new IOException("alternate name " + altName + | ||
| "previously defined as a contig in " + line); | ||
| } | ||
| //check contig not previously defined as alias | ||
| if(contig2aliases.keySet().stream(). | ||
| filter(K->!K.equals(contigName)). // not an error is defined twice for same contig | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Need spaces around |
||
| flatMap(K->contig2aliases.get(K).stream()). | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you need to iterate over both keys and values, it's better to stream over |
||
| anyMatch(S->S.contains(contigName))) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you could avoid the |
||
| throw new IOException("contig " + contigName + | ||
| "previously defined as an alternate name in " + line); | ||
| } | ||
| //add alias | ||
| if(!contig2aliases.containsKey(contigName)) { | ||
| contig2aliases.put(contigName, new HashSet<>()); | ||
| } | ||
| contig2aliases.get(contigName).add(altName); | ||
| } | ||
| } | ||
| catch (final IOException e) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| throw new PicardException("Can't read alias file " + ALT_NAMES, e); | ||
| } | ||
| } | ||
|
|
||
| // SortingCollection is used to check uniqueness of sequence names | ||
| final SortingCollection<String> sequenceNames = makeSortingCollection(); | ||
| try (BufferedWriter writer = makeWriter()) { | ||
|
|
@@ -196,6 +259,12 @@ protected int doWork() { | |
| // read reference sequence one by one and write its metadata | ||
| for (ReferenceSequence refSeq = refSeqFile.nextSequence(); refSeq != null; refSeq = refSeqFile.nextSequence()) { | ||
| final SAMSequenceRecord samSequenceRecord = makeSequenceRecord(refSeq); | ||
| //add aliases | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. comments should have a space after the |
||
| if(contig2aliases.containsKey(samSequenceRecord.getSequenceName())) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To avoid hashing twice you could instead write |
||
| final Set<String> aliases = contig2aliases.get(samSequenceRecord.getSequenceName()); | ||
| // "Alternative names is a comma separated list of alternative names" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure why this is in double quotes? Even if this is quoted from a spec, quotes are unnecessary in a comment like this. |
||
| samSequenceRecord.setAttribute("AN",String.join(",",aliases)); //TODO replace "AN" with constants/methods: https://github.com/samtools/htsjdk/pull/956/files | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In general, end-of-line comments are hard to read/maintain, it's better to put them on their own line. If you had a static final for the magic string
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Need to have spaces after |
||
| } | ||
| samDictCodec.encodeSequenceRecord(samSequenceRecord); | ||
| sequenceNames.add(refSeq.getName()); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,13 +25,21 @@ | |
|
|
||
| import org.testng.Assert; | ||
| import org.testng.annotations.Test; | ||
|
|
||
| import htsjdk.samtools.SAMSequenceDictionary; | ||
| import htsjdk.samtools.SAMSequenceRecord; | ||
| import htsjdk.variant.utils.SAMSequenceDictionaryExtractor; | ||
| import picard.cmdline.CommandLineProgramTest; | ||
| import picard.PicardException; | ||
|
|
||
| import java.io.BufferedReader; | ||
| import java.io.File; | ||
| import java.io.FileReader; | ||
| import java.io.PrintWriter; | ||
| import java.util.Arrays; | ||
| import java.util.HashSet; | ||
| import java.util.List; | ||
| import java.util.Set; | ||
| import java.util.stream.Collectors; | ||
|
|
||
| /** | ||
|
|
@@ -114,4 +122,53 @@ public void testNonUniqueSequenceName() throws Exception { | |
| Assert.assertEquals(runPicardCommandLine(argv), 0); | ||
| Assert.fail("Exception should have been thrown."); | ||
| } | ||
|
|
||
|
|
||
| @Test | ||
| public void testAltNames() throws Exception { | ||
| final File altFile = File.createTempFile("CreateSequenceDictionaryTest.", ".alt"); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. first arg to |
||
| final PrintWriter pw = new PrintWriter(altFile); | ||
| pw.println("chr1\t1"); | ||
| pw.println("chr1\t01"); | ||
| pw.println("chr1\tk1"); | ||
| pw.println("chrMT\tM"); | ||
| pw.flush(); | ||
| pw.close(); | ||
| altFile.deleteOnExit(); | ||
|
|
||
| final File outputDict = File.createTempFile("CreateSequenceDictionaryTest.", ".dict"); | ||
| outputDict.delete(); | ||
| outputDict.deleteOnExit(); | ||
| final String[] argv = { | ||
| "REFERENCE=" + BASIC_FASTA, | ||
| "AN=" + altFile, | ||
| "OUTPUT=" + outputDict, | ||
| "TRUNCATE_NAMES_AT_WHITESPACE=true" | ||
| }; | ||
| Assert.assertEquals(runPicardCommandLine(argv), 0); | ||
| final SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor.extractDictionary(outputDict); | ||
| Assert.assertNotNull(dict, "dictionary is null"); | ||
|
|
||
| // check chr1 | ||
| SAMSequenceRecord ssr = dict.getSequence("chr1"); | ||
| Assert.assertNotNull(ssr, "chr1 missing in dictionary"); | ||
| String an = ssr.getAttribute("AN"); | ||
| Assert.assertNotNull(ssr, "AN Missing"); | ||
| Set<String> anSet = new HashSet<>(Arrays.asList(an.split("[,]"))); | ||
| Assert.assertTrue(anSet.contains("1")); | ||
| Assert.assertTrue(anSet.contains("01")); | ||
| Assert.assertTrue(anSet.contains("k1")); | ||
| Assert.assertFalse(anSet.contains("M")); | ||
|
|
||
| // check chr2 | ||
| ssr = dict.getSequence("chr2"); | ||
| Assert.assertNotNull(ssr, "chr2 missing in dictionary"); | ||
| an = ssr.getAttribute("AN"); | ||
| Assert.assertNull(an, "AN Present"); | ||
|
|
||
| // check chrM | ||
| ssr = dict.getSequence("chrM"); | ||
| Assert.assertNull(ssr, "chrM presnt in dictionary"); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo |
||
| } | ||
|
|
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Explain how this input will be used.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To be clear, I mean in the documentation...something like: "The alternative names will be put into the appropriate
@ANannotation for each contig". Also, please state how the "columns" need to be separated ("tab", I presume) and whether the columns should have a "header" (I presume that they do not)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@yfarjoun I added the sentences.