Skip to content

Commit 3fa0f9f

Browse files
committed
Optimize BAM tag decoding: single-pass string reads and skip redundant validation.
Two targeted optimizations to the BAM tag decoding path: 1. BinaryTagCodec.readNullTerminatedString: replace the double-pass approach (scan forward for null, reset, re-read) with a single-pass scan over the backing byte array, eliminating the intermediate byte[] allocation per string tag. 2. SAMBinaryTagAndValue/SAMBinaryTagAndUnsignedArrayValue: add package-private constructors that skip isAllowedAttributeValue() validation, used from BinaryTagCodec.readTags() where the value types are known to be valid from the BAM type codes. Profiling with eager decode showed tag decoding at 18.2% of CPU. Benchmarking on a 3.9GB BAM (52M records) showed a ~3% improvement in the eager-decode path (40.0s -> 38.9s) with no regression in the lazy path.
1 parent f60723e commit 3fa0f9f

File tree

3 files changed

+25
-18
lines changed

3 files changed

+25
-18
lines changed

src/main/java/htsjdk/samtools/BinaryTagCodec.java

Lines changed: 11 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -279,11 +279,11 @@ public static SAMBinaryTagAndValue readTags(final byte[] binaryRep, final int of
279279
final byte tagType = byteBuffer.get();
280280
final SAMBinaryTagAndValue tmp;
281281
if (tagType != 'B') {
282-
tmp = new SAMBinaryTagAndValue(tag, readSingleValue(tagType, byteBuffer, validationStringency));
282+
tmp = new SAMBinaryTagAndValue(tag, readSingleValue(tagType, byteBuffer, validationStringency), true);
283283
} else {
284284
final TagValueAndUnsignedArrayFlag valueAndFlag = readArray(byteBuffer, validationStringency);
285-
if (valueAndFlag.isUnsignedArray) tmp = new SAMBinaryTagAndUnsignedArrayValue(tag, valueAndFlag.value);
286-
else tmp = new SAMBinaryTagAndValue(tag, valueAndFlag.value);
285+
if (valueAndFlag.isUnsignedArray) tmp = new SAMBinaryTagAndUnsignedArrayValue(tag, valueAndFlag.value, true);
286+
else tmp = new SAMBinaryTagAndValue(tag, valueAndFlag.value, true);
287287
}
288288

289289
// If samjdk wrote the BAM then the attributes will be in lowest->highest tag order, to inserting at the
@@ -294,7 +294,7 @@ public static SAMBinaryTagAndValue readTags(final byte[] binaryRep, final int of
294294
tail = tmp;
295295
}
296296
else if (tmp.tag > tail.tag) {
297-
tail.insert(tmp);
297+
tail.next = tmp;
298298
tail = tmp;
299299
}
300300
else {
@@ -407,19 +407,12 @@ private static TagValueAndUnsignedArrayFlag readArray(final ByteBuffer byteBuffe
407407
}
408408

409409
private static String readNullTerminatedString(final ByteBuffer byteBuffer) {
410-
// Count the number of bytes in the string
411-
byteBuffer.mark();
412-
final int startPosition = byteBuffer.position();
413-
while (byteBuffer.get() != 0) {}
414-
final int endPosition = byteBuffer.position();
415-
416-
// Don't count null terminator
417-
final byte[] buf = new byte[endPosition - startPosition - 1];
418-
// Go back to the start of the string and read out the bytes
419-
byteBuffer.reset();
420-
byteBuffer.get(buf);
421-
// Skip over the null terminator
422-
byteBuffer.get();
423-
return StringUtil.bytesToString(buf);
410+
// Scan the backing array directly to avoid the double-pass of mark/reset/re-read
411+
final byte[] array = byteBuffer.array();
412+
final int start = byteBuffer.arrayOffset() + byteBuffer.position();
413+
int end = start;
414+
while (array[end] != 0) { end++; }
415+
byteBuffer.position(byteBuffer.position() + (end - start) + 1); // advance past null terminator
416+
return StringUtil.bytesToString(array, start, end - start);
424417
}
425418
}

src/main/java/htsjdk/samtools/SAMBinaryTagAndUnsignedArrayValue.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@ public SAMBinaryTagAndUnsignedArrayValue(final short tag, final Object value) {
3939
}
4040
}
4141

42+
/** Package-private constructor that skips validation, for use in BinaryTagCodec.readTags(). */
43+
SAMBinaryTagAndUnsignedArrayValue(final short tag, final Object value, final boolean skipValidation) {
44+
super(tag, value, skipValidation);
45+
}
46+
4247
/** Creates and returns a shallow copy of the list of tag/values. */
4348
@Override
4449
public SAMBinaryTagAndValue copy() {

src/main/java/htsjdk/samtools/SAMBinaryTagAndValue.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,15 @@ public SAMBinaryTagAndValue(final short tag, final Object value) {
6565
this.value = value;
6666
}
6767

68+
/**
69+
* Package-private constructor that skips type validation, for use in performance-critical
70+
* paths (e.g. BinaryTagCodec.readTags) where the value type is known to be valid.
71+
*/
72+
SAMBinaryTagAndValue(final short tag, final Object value, final boolean skipValidation) {
73+
this.tag = tag;
74+
this.value = value;
75+
}
76+
6877
// Inspect the proposed value to determine if it is an allowed value type,
6978
// and if the value is in range.
7079
protected static boolean isAllowedAttributeValue(final Object value) {

0 commit comments

Comments
 (0)