-
Notifications
You must be signed in to change notification settings - Fork 381
CreateSequenceDictionary support for alternative names (@SQ-AN) #1127
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ea536c6
6e93f07
bfce5f7
0f45066
82fcb0b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -51,9 +51,13 @@ | |
| import java.security.MessageDigest; | ||
| import java.security.NoSuchAlgorithmException; | ||
| import java.util.ArrayList; | ||
| import java.util.Collections; | ||
| import java.util.HashMap; | ||
| import java.util.HashSet; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Set; | ||
| import java.util.regex.Pattern; | ||
|
|
||
| /** | ||
| * Create a SAM/BAM file from a fasta containing reference sequence. The output SAM file contains a header but no | ||
|
|
@@ -106,7 +110,28 @@ public class CreateSequenceDictionary extends CommandLineProgram { | |
| @Argument(doc = "Stop after writing this many sequences. For testing.") | ||
| public int NUM_SEQUENCES = Integer.MAX_VALUE; | ||
|
|
||
| @Argument(shortName = "AN", doc = "Optional file containing the alternative names for the contigs. " | ||
| + "Tools may use this information to consider different contig notations as identical (e.g: 'chr1' and '1'). " | ||
| + "The alternative names will be put into the appropriate @AN annotation for each contig. " | ||
| + "No header. " | ||
| + "First column is the original name, the second column is an alternative name. " | ||
| + "One contig may have more than one alternative name." , | ||
| optional=true) | ||
| public File ALT_NAMES = null; | ||
|
|
||
| private final MessageDigest md5; | ||
|
|
||
| /** | ||
| * Regular expression defined in the sam spec. Any alternative contig should match this regular expression | ||
| * TODO: replace the pattern with a constant : see https://github.com/samtools/htsjdk/pull/956/files | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this supposed to be the same as If it is supposed to be the same, you should make the one in |
||
| */ | ||
| private static final Pattern ALTERNATIVE_CONTIG_NAME_PATTERN = Pattern.compile("[0-9A-Za-z][0-9A-Za-z\\*\\+@\\|\\-]*"); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
|
|
||
| /** | ||
| * 'AN' attribute in the dictionary | ||
| * TODO: replace "AN" with a constant : see https://github.com/samtools/htsjdk/pull/956/files | ||
| */ | ||
| private static final String AN_ATTRIBUTE = "AN"; | ||
|
|
||
| public CreateSequenceDictionary() { | ||
| try { | ||
|
|
@@ -185,6 +210,9 @@ protected int doWork() { | |
| " already exists. Delete this file and try again, or specify a different output file."); | ||
| } | ||
|
|
||
| // map for aliases mapping a contig to its aliases | ||
| final Map<String, Set<String>> aliasesByContig = loadContigAliasesMap(); | ||
|
|
||
| // SortingCollection is used to check uniqueness of sequence names | ||
| final SortingCollection<String> sequenceNames = makeSortingCollection(); | ||
| try (BufferedWriter writer = makeWriter()) { | ||
|
|
@@ -196,6 +224,12 @@ protected int doWork() { | |
| // read reference sequence one by one and write its metadata | ||
| for (ReferenceSequence refSeq = refSeqFile.nextSequence(); refSeq != null; refSeq = refSeqFile.nextSequence()) { | ||
| final SAMSequenceRecord samSequenceRecord = makeSequenceRecord(refSeq); | ||
| // retrieve aliases, if any | ||
| final Set<String> aliases = aliasesByContig.get(samSequenceRecord.getSequenceName()); | ||
| if (aliases != null) { | ||
| // "Alternative names is a comma separated list of alternative names" | ||
| samSequenceRecord.setAttribute(AN_ATTRIBUTE, String.join(",", aliases)); | ||
| } | ||
| samDictCodec.encodeSequenceRecord(samSequenceRecord); | ||
| sequenceNames.add(refSeq.getName()); | ||
| } | ||
|
|
@@ -208,7 +242,7 @@ protected int doWork() { | |
| // check uniqueness of sequences names | ||
| final CloseableIterator<String> iterator = sequenceNames.iterator(); | ||
|
|
||
| if(!iterator.hasNext()) return 0; | ||
| if (!iterator.hasNext()) return 0; | ||
|
|
||
| String current = iterator.next(); | ||
| while (iterator.hasNext()) { | ||
|
|
@@ -284,6 +318,71 @@ private SortingCollection<String> makeSortingCollection() { | |
| ); | ||
| } | ||
|
|
||
| /** | ||
| * Load the file ALT_NAMES containing the alternative contig names | ||
| * @author Pierre Lindenbaum | ||
| * @return a <code>Map<src_contig,Set<new_names>></code>. Never null. May be empty if ALT_NAMES is null. | ||
| * @throws PicardException if there is any error in the file ALT_NAMES | ||
| */ | ||
| private Map<String, Set<String>> loadContigAliasesMap() throws PicardException { | ||
| // return an empty map if no mapping file was provided | ||
| if (this.ALT_NAMES == null) { | ||
| return Collections.emptyMap(); | ||
| } | ||
| // the map returned by the function | ||
| final Map<String, Set<String>> aliasesByContig = new HashMap<>(); | ||
| try { | ||
| for (final String line : IOUtil.slurpLines(this.ALT_NAMES)) { | ||
| if (StringUtil.isBlank(line)) { | ||
| continue; | ||
| } | ||
| final int tab = line.indexOf('\t'); | ||
| if (tab == -1) { | ||
| throw new IOException("tabulation missing in " + line); | ||
| } | ||
| final String contigName = line.substring(0, tab); | ||
| final String altName = line.substring(tab + 1); | ||
| // check for empty values | ||
| if (StringUtil.isBlank(contigName)) { | ||
| throw new IOException("empty contig in " + line); | ||
| } | ||
| if (StringUtil.isBlank(altName)) { | ||
| throw new IOException("empty alternative name in " + line); | ||
| } | ||
| if (altName.equals(contigName)) { | ||
| continue; | ||
| } | ||
| if (!ALTERNATIVE_CONTIG_NAME_PATTERN.matcher(altName).matches()) { | ||
| throw new IOException("alternative name in " + line + | ||
| " doesn't match the regular expression : " + | ||
| ALTERNATIVE_CONTIG_NAME_PATTERN.pattern()); | ||
| } | ||
| // check alias not previously defined as contig | ||
| if (aliasesByContig.containsKey(altName)) { | ||
| throw new IOException("alternate name " + altName + | ||
| " previously defined as a contig in " + line); | ||
| } | ||
| // check contig not previously defined as alias | ||
| if (aliasesByContig.keySet().stream(). | ||
| // not an error if defined twice for same contig | ||
| filter(K -> !K.equals(contigName)). | ||
| anyMatch(K -> aliasesByContig.get(K).contains(contigName))) { | ||
| throw new IOException("contig " + contigName + | ||
| " previously defined as an alternate name in " + line); | ||
| } | ||
| // add alias | ||
| if (!aliasesByContig.containsKey(contigName)) { | ||
| aliasesByContig.put(contigName, new HashSet<>()); | ||
| } | ||
| aliasesByContig.get(contigName).add(altName); | ||
| } | ||
| return aliasesByContig; | ||
| } catch (final IOException e) { | ||
| throw new PicardException("Can't read alias file " + ALT_NAMES, e); | ||
| } | ||
|
|
||
| } | ||
|
|
||
| private static class StringCodec implements SortingCollection.Codec<String> { | ||
| private DataInputStream dis; | ||
| private DataOutputStream dos; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Explain how this input will be used.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To be clear, I mean in the documentation...something like: "The alternative names will be put into the appropriate
@ANannotation for each contig". Also, please state how the "columns" need to be separated ("tab", I presume) and whether the columns should have a "header" (I presume that they do not)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@yfarjoun I added the sentences.