Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/main/java/htsjdk/samtools/BAMRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -107,14 +107,16 @@ protected BAMRecord(final SAMFileHeader header,
final int insertSize,
final byte[] restOfData) {
super(header);
setReferenceIndex(referenceID);
// Set reference index and name directly, avoiding the round-trip through
// setReferenceIndex -> resolveNameFromIndex -> setReferenceName -> resolveIndexFromName
setReferenceNameAndIndex(resolveNameFromIndex(referenceID, header), referenceID);
setAlignmentStart(coordinate);
mReadNameLength = readNameLength;
setMappingQuality(mappingQuality);
mCigarLength = cigarLen;
setFlags(flags);
mReadLength = readLen;
setMateReferenceIndex(mateReferenceID);
setMateReferenceNameAndIndex(resolveNameFromIndex(mateReferenceID, header), mateReferenceID);
setMateAlignmentStart(mateCoordinate);
setInferredInsertSize(insertSize);
mRestOfBinaryData = restOfData;
Expand Down
4 changes: 0 additions & 4 deletions src/main/java/htsjdk/samtools/BAMRecordCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -284,10 +284,6 @@ public SAMRecord decode() {
header, referenceID, coordinate, readNameLength, mappingQuality,
bin, cigarLen, flags, readLen, mateReferenceID, mateCoordinate, insertSize, restOfRecord);

if (null != header) {
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The header is already passed to the BAMRecord constructor through the factory call 4 lines above, so this setHeader call does nothing other than trigger a whole second round of refId/mateRefId resolution.

// don't reset a null header as this will clobber the reference and mate reference indices
ret.setHeader(header);
}
return ret;
}
}
33 changes: 15 additions & 18 deletions src/main/java/htsjdk/samtools/BinaryTagCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -279,11 +279,11 @@ public static SAMBinaryTagAndValue readTags(final byte[] binaryRep, final int of
final byte tagType = byteBuffer.get();
final SAMBinaryTagAndValue tmp;
if (tagType != 'B') {
tmp = new SAMBinaryTagAndValue(tag, readSingleValue(tagType, byteBuffer, validationStringency));
tmp = new SAMBinaryTagAndValue(tag, readSingleValue(tagType, byteBuffer, validationStringency), true);
} else {
final TagValueAndUnsignedArrayFlag valueAndFlag = readArray(byteBuffer, validationStringency);
if (valueAndFlag.isUnsignedArray) tmp = new SAMBinaryTagAndUnsignedArrayValue(tag, valueAndFlag.value);
else tmp = new SAMBinaryTagAndValue(tag, valueAndFlag.value);
if (valueAndFlag.isUnsignedArray) tmp = new SAMBinaryTagAndUnsignedArrayValue(tag, valueAndFlag.value, true);
else tmp = new SAMBinaryTagAndValue(tag, valueAndFlag.value, true);
}

// If samjdk wrote the BAM then the attributes will be in lowest->highest tag order, so inserting at the
Expand All @@ -294,7 +294,7 @@ public static SAMBinaryTagAndValue readTags(final byte[] binaryRep, final int of
tail = tmp;
}
else if (tmp.tag > tail.tag) {
tail.insert(tmp);
tail.next = tmp;
tail = tmp;
}
else {
Expand Down Expand Up @@ -407,19 +407,16 @@ private static TagValueAndUnsignedArrayFlag readArray(final ByteBuffer byteBuffe
}

private static String readNullTerminatedString(final ByteBuffer byteBuffer) {
// Count the number of bytes in the string
byteBuffer.mark();
final int startPosition = byteBuffer.position();
while (byteBuffer.get() != 0) {}
final int endPosition = byteBuffer.position();

// Don't count null terminator
final byte[] buf = new byte[endPosition - startPosition - 1];
// Go back to the start of the string and read out the bytes
byteBuffer.reset();
byteBuffer.get(buf);
// Skip over the null terminator
byteBuffer.get();
return StringUtil.bytesToString(buf);
// Scan the backing array directly to avoid the double-pass of mark/reset/re-read
final byte[] array = byteBuffer.array();
final int start = byteBuffer.arrayOffset() + byteBuffer.position();
final int limit = byteBuffer.arrayOffset() + byteBuffer.limit();
int end = start;
while (end < limit && array[end] != 0) { end++; }
if (end >= limit) {
throw new SAMFormatException("Null-terminated string tag value is not null terminated.");
}
byteBuffer.position(byteBuffer.position() + (end - start) + 1); // advance past null terminator
return StringUtil.bytesToString(array, start, end - start);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ public SAMBinaryTagAndUnsignedArrayValue(final short tag, final Object value) {
}
}

/**
 * Package-private constructor that skips validation, for use in BinaryTagCodec.readTags().
 * It performs no work of its own: all three arguments are forwarded unchanged to the
 * three-argument superclass constructor.
 *
 * @param tag            binary tag identifier, passed through to the superclass
 * @param value          tag value, passed through to the superclass without inspection here
 * @param skipValidation passed through to the superclass; this constructor never reads it
 *                       (NOTE(review): whether {@code false} re-enables validation depends on
 *                       the superclass constructor - confirm before passing {@code false})
 */
SAMBinaryTagAndUnsignedArrayValue(final short tag, final Object value, final boolean skipValidation) {
super(tag, value, skipValidation);
}

/** Creates and returns a shallow copy of the list of tag/values. */
@Override
public SAMBinaryTagAndValue copy() {
Expand Down
9 changes: 9 additions & 0 deletions src/main/java/htsjdk/samtools/SAMBinaryTagAndValue.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,15 @@ public SAMBinaryTagAndValue(final short tag, final Object value) {
this.value = value;
}

/**
 * Package-private fast-path constructor: stores {@code tag} and {@code value} as given,
 * with no type or range checking of the value. Intended for performance-critical callers
 * (e.g. BinaryTagCodec.readTags) that already know the value type is valid.
 *
 * @param tag            binary tag identifier to store
 * @param value          tag value to store; it is not inspected
 * @param skipValidation exists only to distinguish this overload from the public
 *                       two-argument constructor; its value is never read, so validation
 *                       is skipped whether {@code true} or {@code false} is passed
 */
SAMBinaryTagAndValue(final short tag, final Object value, final boolean skipValidation) {
    // The two field writes are independent; no validation happens on this path.
    this.value = value;
    this.tag = tag;
}

// Inspect the proposed value to determine if it is an allowed value type,
// and if the value is in range.
protected static boolean isAllowedAttributeValue(final Object value) {
Expand Down
18 changes: 18 additions & 0 deletions src/main/java/htsjdk/samtools/SAMRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,24 @@ public void setReferenceIndex(final int referenceIndex) {
mReferenceIndex = referenceIndex;
}

/**
 * Stores the reference index and reference name together, exactly as supplied.
 * Neither value is derived from or checked against the other, so the caller must
 * pass a name and index that already agree.
 *
 * @param referenceName  reference sequence name to store
 * @param referenceIndex reference sequence index to store
 */
void setReferenceNameAndIndex(final String referenceName, final int referenceIndex) {
    // Plain field writes only: no name<->index resolution is performed here.
    mReferenceIndex = referenceIndex;
    mReferenceName = referenceName;
}

/**
 * Stores the mate reference index and mate reference name together, exactly as supplied.
 * Neither value is derived from or checked against the other, so the caller must
 * pass a name and index that already agree.
 *
 * @param mateReferenceName  mate reference sequence name to store
 * @param mateReferenceIndex mate reference sequence index to store
 */
void setMateReferenceNameAndIndex(final String mateReferenceName, final int mateReferenceIndex) {
    // Plain field writes only: no name<->index resolution is performed here.
    mMateReferenceIndex = mateReferenceIndex;
    mMateReferenceName = mateReferenceName;
}

/**
* @return Mate reference name, or NO_ALIGNMENT_REFERENCE_NAME (*) if the record has no mate reference name
*/
Expand Down
85 changes: 36 additions & 49 deletions src/main/java/htsjdk/samtools/SAMUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,20 @@ public final class SAMUtils {
'N'
};

/**
 * Decode table with one two-byte entry per possible packed byte value (0-255).
 * For a packed byte {@code b}, the entry starts at index {@code (b & 0xFF) * 2}:
 * that slot holds the ASCII base for the high nibble and the following slot holds the
 * ASCII base for the low nibble, both taken from {@code COMPRESSED_LOOKUP_TABLE}.
 * Decoding a byte therefore needs two array reads and no per-nibble shifting or method
 * calls. The layout follows htslib's {@code code2base} table.
 */
private static final byte[] NIBBLE_PAIR_LOOKUP = new byte[512];
static {
    // Walk every (high nibble, low nibble) combination in packed-byte order;
    // slot advances by two per combination, so it always equals packedByte * 2.
    int slot = 0;
    for (int highNibble = 0; highNibble < 16; highNibble++) {
        for (int lowNibble = 0; lowNibble < 16; lowNibble++) {
            NIBBLE_PAIR_LOOKUP[slot++] = COMPRESSED_LOOKUP_TABLE[highNibble];
            NIBBLE_PAIR_LOOKUP[slot++] = COMPRESSED_LOOKUP_TABLE[lowNibble];
        }
    }
}

public static final int MAX_PHRED_SCORE = 93;

/**
Expand All @@ -140,25 +154,33 @@ static byte[] bytesToCompressedBases(final byte[] readBases) {
}

/**
* Convert from a byte array with bases stored in nybbles, with, for example, =, A, C, G, T, N represented as 0, 1, 2, 4, 8, 15,
* to a byte array containing =AaCcGgTtNn represented as ASCII.
* Convert from BAM's packed nibble representation to an ASCII byte array.
*
* <p>In BAM format, each byte encodes two bases: the high nibble (bits 4-7) holds the first base
* and the low nibble (bits 0-3) holds the second. Nibble values map to IUPAC codes via
* {@code =ACMGRSVTWYHKDBN} (0-15). For odd-length sequences the low nibble of the last byte is
* unused.
*
* <p>Uses a 512-byte pre-computed lookup table ({@link #NIBBLE_PAIR_LOOKUP}) to decode two bases
* per iteration without per-nibble method calls, following the same approach as htslib's
* {@code code2base} table.
*
* @param length Number of bases (not bytes) to convert.
* @param compressedBases Bases represented as nybbles, in BAM binary format.
* @param compressedOffset Byte offset in compressedBases to start.
* @return New byte array with bases as ASCII bytes.
* @param length number of bases (not bytes) to decode
* @param compressedBases packed nibble-encoded bases in BAM binary format
* @param compressedOffset byte offset into {@code compressedBases} at which to start decoding
* @return new byte array of length {@code length} with bases as uppercase ASCII bytes
*/
public static byte[] compressedBasesToBytes(final int length, final byte[] compressedBases, final int compressedOffset) {
final byte[] ret = new byte[length];
int i;
for (i = 1; i < length; i += 2) {
final int compressedIndex = i / 2 + compressedOffset;
ret[i - 1] = compressedBaseToByteHigh(compressedBases[compressedIndex]);
ret[i] = compressedBaseToByteLow(compressedBases[compressedIndex]);
final int pairs = length / 2;
for (int i = 0; i < pairs; i++) {
final int lookupIndex = (compressedBases[i + compressedOffset] & 0xFF) * 2;
ret[i * 2] = NIBBLE_PAIR_LOOKUP[lookupIndex];
ret[i * 2 + 1] = NIBBLE_PAIR_LOOKUP[lookupIndex + 1];
}
// Last nybble
if (i == length) {
ret[i - 1] = compressedBaseToByteHigh(compressedBases[i / 2 + compressedOffset]);
// Odd-length: last base is in the high nibble of the final byte
if ((length & 1) != 0) {
ret[length - 1] = COMPRESSED_LOOKUP_TABLE[(compressedBases[pairs + compressedOffset] >> 4) & 0xF];
}
return ret;
}
Expand Down Expand Up @@ -291,41 +313,6 @@ private static byte charToCompressedBaseHigh(final byte base) {
}
}

/**
 * Returns the ASCII base corresponding to a nybble-encoded base value by indexing
 * {@code COMPRESSED_LOOKUP_TABLE}.
 *
 * @param base nybble value used as the table index (the Low/High helpers in this file pass a
 *             value already masked with {@code 0xf}).
 * @return ASCII base, one of =ACGTNMRSVWYHKDB.
 * @throws IllegalArgumentException if {@code base} is negative or not a valid index into the table.
 */
private static byte compressedBaseToByte(byte base) {
    // Check the index explicitly instead of catching IndexOutOfBoundsException as control flow.
    if (base < 0 || base >= COMPRESSED_LOOKUP_TABLE.length) {
        // The message names this method (it previously named charToCompressedBase by mistake).
        throw new IllegalArgumentException("Bad base passed to compressedBaseToByte: " + Character.toString((char) base) + "(" + base + ")");
    }
    return COMPRESSED_LOOKUP_TABLE[base];
}

/**
 * Decodes the base held in the low-order nybble (bits 0-3) of a packed value to its ASCII byte.
 *
 * @param base packed value; only its lowest four bits are used.
 * @return ASCII base looked up via {@code compressedBaseToByte}.
 */
private static byte compressedBaseToByteLow(final int base) {
    // Isolate bits 0-3, then delegate to the shared table lookup.
    final byte lowNybble = (byte) (base & 0xf);
    return compressedBaseToByte(lowNybble);
}

/**
 * Decodes the base held in the high-order nybble (bits 4-7) of a packed value to its ASCII byte.
 *
 * @param base packed value; only bits 4-7 are used.
 * @return ASCII base looked up via {@code compressedBaseToByte}.
 */
private static byte compressedBaseToByteHigh(final int base) {
    // Shift bits 4-7 down and mask off anything above them, then delegate to the table lookup.
    final byte highNybble = (byte) ((base >> 4) & 0xf);
    return compressedBaseToByte(highNybble);
}

/**
* Convert bases in place into canonical form, upper case, and with no-call represented as N.
*
Expand Down
51 changes: 50 additions & 1 deletion src/test/java/htsjdk/samtools/SAMUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -279,14 +279,63 @@ public void testBytesToCompressedBasesException(final byte[] bases, final char f
}

@Test
public void testCompressedBasesToBytes() {
public void testCompressedBasesToBytesAllNibbleValues() {
// Each byte encodes two bases. Test all 16 nibble values in both high and low positions.
final byte[] compressedBases = new byte[]{1, 18, 36, 72, -113, -1, 51, 85, 102, 119, -103, -86, -69, -52, -35, -18};
final byte[] bytes = SAMUtils.compressedBasesToBytes(2*compressedBases.length, compressedBases, 0);
final byte[] expectedBases = new byte[]{'=', 'A', 'A', 'C', 'C', 'G', 'G', 'T', 'T', 'N', 'N', 'N', 'M', 'M',
'R', 'R', 'S', 'S', 'V', 'V', 'W', 'W', 'Y', 'Y', 'H', 'H', 'K', 'K', 'D', 'D', 'B', 'B'};
Assert.assertEquals(new String(bytes), new String(expectedBases));
}

@Test
public void testCompressedBasesToBytesOddLength() {
    // Odd length: the third base comes from the high nibble of the second byte and that
    // byte's low nibble must be ignored.
    // 0x12 = nibbles 1,2 = A,C; 0x40 = nibbles 4,0 = G,=
    final byte[] compressed = new byte[]{0x12, 0x40};
    final byte[] result = SAMUtils.compressedBasesToBytes(3, compressed, 0);
    // Decode with an explicit charset so the assertion does not depend on the platform default.
    Assert.assertEquals(new String(result, java.nio.charset.StandardCharsets.US_ASCII), "ACG");
}

@Test
public void testCompressedBasesToBytesSingleBase() {
    // Single base: only the high nibble of the first byte is decoded.
    final byte[] compressed = new byte[]{(byte) 0x80}; // nibble 8 = T
    final byte[] result = SAMUtils.compressedBasesToBytes(1, compressed, 0);
    // Decode with an explicit charset so the assertion does not depend on the platform default.
    Assert.assertEquals(new String(result, java.nio.charset.StandardCharsets.US_ASCII), "T");
}

@Test
public void testCompressedBasesToBytesEmptySequence() {
    // Zero bases requested from an empty input must yield an empty array.
    final byte[] noPackedBytes = new byte[0];
    final byte[] decoded = SAMUtils.compressedBasesToBytes(0, noPackedBytes, 0);
    Assert.assertEquals(decoded.length, 0);
}

@Test
public void testCompressedBasesToBytesWithOffset() {
    // Bytes: [junk, 0x12, 0x48]; decoding starts at offset 1, so the junk byte must be skipped
    // and the result is A,C,G,T.
    final byte[] compressed = new byte[]{(byte) 0xFF, 0x12, 0x48};
    final byte[] result = SAMUtils.compressedBasesToBytes(4, compressed, 1);
    // Decode with an explicit charset so the assertion does not depend on the platform default.
    Assert.assertEquals(new String(result, java.nio.charset.StandardCharsets.US_ASCII), "ACGT");
}

@Test
public void testCompressedBasesToBytesRoundTrip() {
    // Round-trip: ASCII bases -> compressed -> back to ASCII (even number of bases).
    // The charset is explicit on every String<->byte[] conversion so the test does not
    // depend on the platform default encoding.
    final java.nio.charset.Charset ascii = java.nio.charset.StandardCharsets.US_ASCII;
    final byte[] originalBases = "ACGTACGTNN".getBytes(ascii);
    final byte[] compressed = SAMUtils.bytesToCompressedBases(originalBases);
    final byte[] decoded = SAMUtils.compressedBasesToBytes(originalBases.length, compressed, 0);
    Assert.assertEquals(new String(decoded, ascii), new String(originalBases, ascii));
}

@Test
public void testCompressedBasesToBytesRoundTripOddLength() {
    // Round-trip with an odd number of bases, so the final packed byte carries only one base.
    // The charset is explicit on every String<->byte[] conversion so the test does not
    // depend on the platform default encoding.
    final java.nio.charset.Charset ascii = java.nio.charset.StandardCharsets.US_ASCII;
    final byte[] originalBases = "ACGTACGTN".getBytes(ascii);
    final byte[] compressed = SAMUtils.bytesToCompressedBases(originalBases);
    final byte[] decoded = SAMUtils.compressedBasesToBytes(originalBases.length, compressed, 0);
    Assert.assertEquals(new String(decoded, ascii), new String(originalBases, ascii));
}


@DataProvider()
public Iterator<Object[]> getOAValues(){
Expand Down
Loading