43 commits
eac2364
Enable a naive CRAM 3.1 write profile.
cmnbroad Feb 24, 2025
4ed1076
Changes based on instrumented trial runs.
cmnbroad Apr 7, 2025
7cfddf2
CRAM 3.1 write tests and code cleanup.
cmnbroad Apr 14, 2025
7417691
Temp fix for preSorted=false default.
cmnbroad Apr 14, 2025
fd3b114
Add CRAM 3.1 codec implementations: STRIPE, FQZComp encoder, and Rang…
tfenne Mar 27, 2026
11412bf
Add CRAM compression profiles with trial compression for codec select…
tfenne Mar 27, 2026
23fbf7a
Add CramConverter command-line tool for testing CRAM write profiles.
tfenne Mar 27, 2026
0259215
Strip NM/MD on CRAM encode, regenerate on decode, and implement attac…
tfenne Mar 27, 2026
d8904f7
Optimize NORMAL profile codec assignments based on empirical compress…
tfenne Mar 27, 2026
dec0d7a
Align DataSeries content IDs with htslib for cross-implementation deb…
tfenne Mar 27, 2026
8bea7ee
Split CRAM31 fidelity tests into per-profile classes for parallel exe…
tfenne Mar 27, 2026
07b8671
Fix TrialCompressor method tracking and align SMALL/ARCHIVE trial can…
tfenne Mar 28, 2026
99fa216
Optimize name tokeniser compression with per-type flags, STRIPE, stre…
tfenne Mar 28, 2026
f18d848
Add GzipCodec for direct Deflater/Inflater GZIP compression and wire …
tfenne Mar 28, 2026
95ac8aa
Replace ad-hoc triple-compression for tags with TrialCompressor.
tfenne Mar 28, 2026
dcd2d96
Refactor and optimize all rANS codecs: byte[] API, backwards-write, s…
tfenne Mar 28, 2026
59ecfdb
Reduce memory pressure in unit tests to eliminate OOM failures.
tfenne Mar 29, 2026
16a4c2d
Tune codec assignments and tag trial compression for better compression.
tfenne Mar 29, 2026
4c8bfb6
Optimize FQZComp/Range coder and rANS encoder hot paths.
tfenne Mar 30, 2026
52bbfcc
Optimize name tokeniser encoder: replace regex with hand-written pars…
tfenne Mar 30, 2026
7929f15
Remove unnecessary content digest tags from CRAM slice headers.
tfenne Mar 30, 2026
ad8480c
Pool RANSNx16Decode instance in name tokeniser and make CramConverter…
tfenne Mar 30, 2026
5a0887d
Replace ByteArrayInputStream/OutputStream with unsynchronized CRAMByt…
tfenne Mar 30, 2026
cc0f010
Fix thread-safety bug in VariantContextTestProvider causing non-deter…
tfenne Mar 30, 2026
dcff5a3
Fuse read base restoration, CIGAR building, and NM/MD computation int…
tfenne Mar 30, 2026
d323602
Cache tag key metadata to eliminate per-record String allocation duri…
tfenne Mar 30, 2026
3e30cf1
Update README and CHANGELOG for CRAM 3.1 write support (5.0.0)
tfenne Apr 1, 2026
ecce661
Fix LibdeflateDeflater/Inflater compatibility with ByteBuffer API.
tfenne Apr 11, 2026
4597318
Optimize CRAM write performance (~15% faster encoding)
tfenne Apr 12, 2026
c32a5e7
Address PR review feedback from nh13
tfenne Apr 12, 2026
c436bb7
Add CRAI index query correctness tests and codec roundtrip property t…
tfenne Apr 12, 2026
3809f19
Resolve TODOs in CRAM structure classes (except supplementary read na…
tfenne Apr 12, 2026
f4c5c88
Document supplementary/secondary read name resolution limitation
tfenne Apr 12, 2026
c2fb516
Add hts-specs CRAM 3.0/3.1 compliance tests (decode)
tfenne Apr 12, 2026
e707e9c
Fix crash when reading CRAM containers with no slices
tfenne Apr 12, 2026
9c6226d
Re-enable 1003_qual test with relaxed mate comparison; document 1007_seq
tfenne Apr 12, 2026
8cc3da9
Fix CIGAR reconstruction when sequence is '*' (CF_UNKNOWN_BASES)
tfenne Apr 12, 2026
facabc5
Add hts-specs index query and round-trip compliance tests; fix unmapp…
tfenne Apr 12, 2026
9496664
Add FQZComp round-trip tests using hts-specs quality data files
tfenne Apr 12, 2026
aa2b14a
Speed up CRAM index test classes by caching test data
tfenne Apr 12, 2026
7da3d54
Downsample CEUTrio test CRAM from 654K to 150K records (47MB → 11MB)
tfenne Apr 12, 2026
28cb4b0
Reduce CRAMFileBAIIndexTest from 4 to 2 slice-size variants and sampl…
tfenne Apr 12, 2026
b14949c
Speed up BCF2 and SeekableStream integration tests
tfenne Apr 12, 2026
42 changes: 42 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -10,6 +10,48 @@ early infrastructure for a plugin-based codec framework and resource bundles.

---

## 5.0.0

Adds **CRAM 3.1 write support** to htsjdk. This is the culmination of the read-side codec work
in 4.2.0 and the reader wiring in 4.3.0: htsjdk can now produce CRAM 3.1 files that are
interoperable with samtools/htslib.

### CRAM 3.1 Write Support

- Enable CRAM 3.1 writing with all spec codecs: rANS Nx16, adaptive arithmetic Range coder, FQZComp, Name Tokenisation, and STRIPE
- Add configurable compression profiles (FAST, NORMAL, SMALL, ARCHIVE) with trial compression for automatic codec selection
- Implement `TrialCompressor` to replace ad-hoc triple-compression for tags and align trial candidates with htslib
- Add `GzipCodec` for direct Deflater/Inflater GZIP compression, wired into CRAM as a codec option
- Strip NM/MD tags on CRAM encode and regenerate on decode, matching htslib behavior
- Implement attached (same-slice) mate pair resolution
- Align DataSeries content IDs with htslib for cross-implementation debugging
- Remove content digest tags (BD/SD/B5/S5/B1/S1) from CRAM slice headers, matching htslib/samtools behavior. These are optional per the spec and were expensive to compute. Block-level CRC32 (required by CRAM 3.0+) provides data integrity. This is technically a breaking change but has zero practical impact since no known tools consume these tags.
- Default CRAM version for writing is now 3.1 (was 3.0)
- Add `CramConverter` command-line tool for testing and benchmarking CRAM write profiles
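
The trial-compression idea above can be sketched in a few lines: compress a block with each candidate and keep the smallest result, falling back to the raw bytes when nothing wins. This is an illustrative sketch using only `java.util.zip.Deflater`, not the htsjdk `TrialCompressor` (which trials the CRAM codecs themselves):

```java
import java.io.ByteArrayOutputStream;
import java.util.zip.Deflater;

/** Illustrative sketch of trial compression: try candidates, keep the smallest output. */
public class TrialCompressionSketch {

    /** Deflate {@code data} at the given compression level. */
    static byte[] deflate(final byte[] data, final int level) {
        final Deflater deflater = new Deflater(level);
        deflater.setInput(data);
        deflater.finish();
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final byte[] buf = new byte[4096];
        while (!deflater.finished()) {
            out.write(buf, 0, deflater.deflate(buf));
        }
        deflater.end();
        return out.toByteArray();
    }

    /** Return the smallest of the raw block and each trial candidate. */
    static byte[] pickSmallest(final byte[] data) {
        byte[] best = data; // raw fallback: tiny blocks often don't benefit
        for (final int level : new int[] { Deflater.BEST_SPEED, Deflater.BEST_COMPRESSION }) {
            final byte[] candidate = deflate(data, level);
            if (candidate.length < best.length) {
                best = candidate;
            }
        }
        return best;
    }

    public static void main(final String[] args) {
        final byte[] repetitive = new byte[10_000]; // all zeros: compresses well
        System.out.println(pickSmallest(repetitive).length < repetitive.length); // true
        final byte[] tiny = new byte[] { 1, 2, 3 }; // too small to benefit: raw wins
        System.out.println(pickSmallest(tiny).length); // 3
    }
}
```

The real implementation trials a configurable candidate set per data series (varying with the FAST/NORMAL/SMALL/ARCHIVE profile) rather than just Deflater levels, but the selection rule is the same.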

### Codec and Compression Optimizations

- Refactor and optimize all rANS codecs: byte-array API, backwards-write encoding, and general simplifications
- Optimize Name Tokeniser encoder: replace regex with hand-written parser; add per-type flags, STRIPE support, stream deduplication, and all-MATCH elimination
- Optimize FQZComp, Range coder, and rANS encoder hot paths
- Tune NORMAL profile codec assignments based on empirical compression testing
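
The "replace regex with hand-written parser" change above refers to how the Name Tokeniser splits read names into typed runs (letters, digits, punctuation) so that, for example, digit tokens can be delta-encoded across names. A minimal scanner in that style (this is an illustrative sketch, not the htsjdk implementation; the class and token names are hypothetical):

```java
import java.util.ArrayList;
import java.util.List;

/** Illustrative sketch: split a read name into typed tokens with a hand-written
 *  scanner instead of a regex. */
public class NameTokenizerSketch {

    enum Type { DIGITS, ALPHA, OTHER }

    record Token(Type type, String text) {}

    private static Type classify(final char c) {
        if (Character.isDigit(c)) return Type.DIGITS;
        if (Character.isLetter(c)) return Type.ALPHA;
        return Type.OTHER;
    }

    static List<Token> tokenize(final String name) {
        final List<Token> tokens = new ArrayList<>();
        int i = 0;
        while (i < name.length()) {
            final Type type = classify(name.charAt(i));
            int j = i + 1;
            // extend the run while the character class stays the same
            while (j < name.length() && classify(name.charAt(j)) == type) {
                j++;
            }
            tokens.add(new Token(type, name.substring(i, j)));
            i = j;
        }
        return tokens;
    }

    public static void main(final String[] args) {
        // "HS25:2:1102" -> ALPHA "HS", DIGITS "25", OTHER ":", DIGITS "2", OTHER ":", DIGITS "1102"
        tokenize("HS25:2:1102").forEach(t -> System.out.println(t.type() + " " + t.text()));
    }
}
```

A single linear pass like this avoids regex backtracking and per-name `Matcher` allocation, which is why it shows up in encoder hot-path profiles.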

### Performance

- Replace `ByteArrayInputStream`/`ByteArrayOutputStream` with unsynchronized `CRAMByteReader`/`CRAMByteWriter` to eliminate synchronization overhead
- Fuse read base restoration, CIGAR building, and NM/MD computation into a single pass during decode
- Cache tag key metadata to eliminate per-record `String` allocation during CRAM decode
- Pool `RANSNx16Decode` instances in the Name Tokeniser
- Optimize BAM nibble-to-ASCII base decoding with a bulk lookup table
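
The bulk lookup table mentioned in the last bullet exploits the fact that BAM packs two 4-bit base codes per byte (alphabet `=ACMGRSVTWYHKDBN`, per the SAM/BAM specification). A 256-entry table maps each packed byte directly to two ASCII bases, halving the per-base work. An illustrative sketch (not the htsjdk code):

```java
/** Illustrative sketch of bulk nibble-to-ASCII decoding: a 256-entry table maps each
 *  packed byte (two 4-bit base codes) straight to two ASCII bases. */
public class NibbleDecodeSketch {

    // BAM's 4-bit base alphabet, per the SAM/BAM specification.
    private static final byte[] CODE_TO_BASE = "=ACMGRSVTWYHKDBN".getBytes();

    // TABLE[b*2], TABLE[b*2+1] hold the two ASCII bases for packed byte b (high nibble first).
    private static final byte[] TABLE = new byte[256 * 2];
    static {
        for (int b = 0; b < 256; b++) {
            TABLE[b * 2]     = CODE_TO_BASE[(b >> 4) & 0xF];
            TABLE[b * 2 + 1] = CODE_TO_BASE[b & 0xF];
        }
    }

    /** Decode {@code readLength} bases from packed 4-bit codes. */
    static byte[] decode(final byte[] packed, final int readLength) {
        final byte[] bases = new byte[readLength];
        for (int i = 0; i < readLength / 2; i++) {
            final int off = (packed[i] & 0xFF) * 2;
            bases[i * 2]     = TABLE[off];
            bases[i * 2 + 1] = TABLE[off + 1];
        }
        if ((readLength & 1) == 1) { // odd length: final base is in the high nibble
            bases[readLength - 1] = TABLE[(packed[readLength / 2] & 0xFF) * 2];
        }
        return bases;
    }

    public static void main(final String[] args) {
        // A=1, C=2, G=4, T=8 -> packed bytes {0x12, 0x48} decode to "ACGT"
        System.out.println(new String(decode(new byte[] { 0x12, 0x48 }, 4))); // ACGT
    }
}
```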

### Testing and Infrastructure

- Split CRAM 3.1 fidelity tests into per-profile classes for parallel execution
- Reduce memory pressure in unit tests to eliminate OOM failures
- Fix thread-safety bug in `VariantContextTestProvider` causing non-deterministic test counts

---

## 4.3.0 (2025-05-09)

Completes CRAM 3.1 read support by wiring the codec implementations (added in 4.2.0) into
2 changes: 2 additions & 0 deletions README.md
@@ -11,6 +11,8 @@ manipulating HTS data.

> **NOTE: _HTSJDK has only partial support for the latest Variant Call Format Specification. VCFv4.3 can be read but not written, VCFv4.4 can be read in lenient mode only, and there is no support for BCFv2.2._**

> **NOTE: _HTSJDK now supports both reading and writing CRAM 3.1 files. CRAM 3.1 write support includes all codecs defined in the specification (rANS Nx16, adaptive arithmetic Range coder, FQZComp, Name Tokenisation, and STRIPE), configurable compression profiles (FAST, NORMAL, SMALL, ARCHIVE), and trial compression for automatic codec selection. Files produced by htsjdk are interoperable with samtools/htslib._**

### Documentation & Getting Help

API documentation for all versions of HTSJDK since `1.128` are available through [javadoc.io](http://www.javadoc.io/doc/com.github.samtools/htsjdk).
4 changes: 2 additions & 2 deletions build.gradle
@@ -99,7 +99,7 @@ tasks.withType(Test).configureEach { task ->

// set heap size for the test JVM(s)
task.minHeapSize = "1G"
task.maxHeapSize = "6G"
task.maxHeapSize = "12G"

task.jvmArgs '-Djava.awt.headless=true' //this prevents awt from displaying a java icon while the tests are running

@@ -160,7 +160,7 @@ test {
excludeGroups "slow", "broken", "defaultReference", "optimistic_vcf_4_4", "ftp", "http", "sra", "ena", "htsget", "unix"
}
parallel = "classes"
threadCount = 2 * Runtime.runtime.availableProcessors()
threadCount = Runtime.runtime.availableProcessors()
}
} dependsOn testWithDefaultReference, testWithOptimisticVCF4_4

@@ -22,7 +22,6 @@ public class CRAMEncoderV3_1 extends CRAMEncoder {
*/
public CRAMEncoderV3_1(final Bundle outputBundle, final ReadsEncoderOptions readsEncoderOptions) {
super(outputBundle, readsEncoderOptions);
throw new CRAMException("CRAM v3.1 encoding is not yet supported");
}

@Override
31 changes: 0 additions & 31 deletions src/main/java/htsjdk/beta/plugin/registry/ReadsResolver.java
@@ -1,6 +1,5 @@
package htsjdk.beta.plugin.registry;

import htsjdk.beta.codecs.reads.cram.cramV3_1.CRAMCodecV3_1;
import htsjdk.beta.exception.HtsjdkException;
import htsjdk.beta.exception.HtsjdkPluginException;
import htsjdk.beta.plugin.HtsVersion;
@@ -12,13 +11,9 @@
import htsjdk.beta.plugin.reads.ReadsDecoderOptions;
import htsjdk.beta.plugin.reads.ReadsEncoder;
import htsjdk.beta.plugin.reads.ReadsEncoderOptions;
import htsjdk.beta.plugin.reads.ReadsFormats;
import htsjdk.io.IOPath;
import htsjdk.utils.ValidationUtils;

import java.util.List;
import java.util.stream.Collectors;

/**
* Class with methods for resolving inputs and outputs to reads encoders and decoders.
* <p>
@@ -209,30 +204,4 @@ public ReadsEncoder getReadsEncoder(
.getEncoder(outputBundle, readsEncoderOptions);
}

/**
* Temporarily override to remove the CRAM 3.1 codec from the list of candidate codecs when the request is for
* the newest version, since it has no write implementation yet.
*/
@Override
protected List<ReadsCodec> filterByVersion(final List<ReadsCodec> candidateCodecs, final HtsVersion htsVersion) {
final List<ReadsCodec> preFilteredCodecs;
if (htsVersion.equals(HtsVersion.NEWEST_VERSION)) {
// if the request is for the newest version, then pre-filter out the CRAM 3.1 codec since it has no
// write implementation yet, and then delegate to the superclass to let it find the newest version among
// the remaining codecs
preFilteredCodecs = candidateCodecs.stream().filter(
c -> !(c.getFileFormat().equals(ReadsFormats.CRAM)
&& c.getVersion().equals(CRAMCodecV3_1.VERSION_3_1)))
.collect(Collectors.toList());
final HtsVersion newestVersion = preFilteredCodecs.stream()
.map(c -> c.getVersion())
.reduce(candidateCodecs.get(0).getVersion(),
(HtsVersion a, HtsVersion b) -> a.compareTo(b) > 0 ? a : b);
return candidateCodecs.stream().filter(
c -> c.getVersion().equals(newestVersion)).collect(Collectors.toList());
} else {
preFilteredCodecs = candidateCodecs;
}
return super.filterByVersion(preFilteredCodecs, htsVersion);
}
}
122 changes: 122 additions & 0 deletions src/main/java/htsjdk/samtools/BamConverter.java
@@ -0,0 +1,122 @@
package htsjdk.samtools;

import java.io.File;

/**
* Simple command-line tool for reading and optionally converting BAM files, primarily
* for experimenting with BAM read-path profiling.
*
* <p>Usage:
* <pre>
java -cp htsjdk.jar htsjdk.samtools.BamConverter [--eager] input.bam [output.bam]
* </pre>
*
* <p>If no output is specified, records are read and iterated but not written.
*/
public class BamConverter {

private static final String USAGE = String.join("\n",
"Usage: BamConverter [--eager] <input> [output]",
"",
"Read and optionally convert a BAM file.",
"",
"Arguments:",
"  input    Input BAM file",
"  output   Optional output BAM file (omit to read-only)",
"",
"Options:",
"  --eager  Eagerly decode each record after reading"
);

/**
* Entry point. Parses command-line arguments and performs the read/conversion.
*
* @param args command-line arguments (see USAGE for details)
*/
public static void main(final String[] args) {
if (hasFlag(args, "--help") || hasFlag(args, "-h")) {
System.out.println(USAGE);
System.exit(0);
}
if (args.length < 1) {
System.err.println(USAGE);
System.exit(1);
}

final boolean eager = hasFlag(args, "--eager");
// Collect positional args (non-flag arguments)
final String[] positional = java.util.Arrays.stream(args)
.filter(a -> !a.startsWith("--"))
.toArray(String[]::new);
if (positional.length < 1) {
System.err.println(USAGE);
System.exit(1);
}
final String inputPath = positional[0];
final String outputPath = positional.length > 1 ? positional[1] : null;

if (outputPath != null) {
System.err.printf("Converting %s -> %s%s%n", inputPath, outputPath, eager ? " (eager decode)" : "");
} else {
System.err.printf("Reading %s (no output%s)%n", inputPath, eager ? ", eager decode" : "");
}

final SamReaderFactory readerFactory = SamReaderFactory.makeDefault()
.validationStringency(ValidationStringency.SILENT);

long count = 0;
final long startTime = System.currentTimeMillis();

try (final SamReader reader = readerFactory.open(new File(inputPath))) {
final SAMFileHeader header = reader.getFileHeader();

if (outputPath != null) {
final SAMFileWriterFactory writerFactory = new SAMFileWriterFactory();
try (final SAMFileWriter writer = writerFactory.makeBAMWriter(header, true, new File(outputPath).toPath())) {
for (final SAMRecord record : reader) {
if (eager) record.eagerDecode();
writer.addAlignment(record);
count++;
if (count % 1_000_000 == 0) {
System.err.printf(" ... %,d records%n", count);
}
}
}
} else {
for (final SAMRecord record : reader) {
if (eager) record.eagerDecode();
count++;
if (count % 1_000_000 == 0) {
System.err.printf(" ... %,d records%n", count);
}
}
}
} catch (final Exception e) {
die("Error: " + e.getMessage());
}

final long elapsed = System.currentTimeMillis() - startTime;
final long inputSize = new File(inputPath).length();

if (outputPath != null) {
final long outputSize = new File(outputPath).length();
System.err.printf("Done. %,d records in %.1fs. Input: %,d bytes, Output: %,d bytes (%.1f%%)%n",
count, elapsed / 1000.0, inputSize, outputSize,
inputSize > 0 ? (100.0 * outputSize / inputSize) : 0);
} else {
System.err.printf("Done. %,d records in %.1fs. Input: %,d bytes%n",
count, elapsed / 1000.0, inputSize);
}
}

private static boolean hasFlag(final String[] args, final String flag) {
for (final String arg : args) {
if (flag.equals(arg)) return true;
}
return false;
}

private static void die(final String message) {
System.err.println("ERROR: " + message);
System.err.println();
System.err.println(USAGE);
System.exit(1);
}
}
16 changes: 9 additions & 7 deletions src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java
@@ -2,7 +2,7 @@

import htsjdk.samtools.cram.build.ContainerFactory;
import htsjdk.samtools.cram.build.CramIO;
import htsjdk.samtools.cram.common.CramVersions;
import htsjdk.samtools.cram.common.CRAMVersion;
import htsjdk.samtools.cram.ref.CRAMReferenceSource;
import htsjdk.samtools.cram.structure.*;
import htsjdk.samtools.util.RuntimeIOException;
@@ -19,6 +19,7 @@ public class CRAMContainerStreamWriter {
private final SAMFileHeader samFileHeader;
private final ContainerFactory containerFactory;
private final CRAMIndexer cramIndexer;
private final CRAMVersion cramVersion;

private long streamOffset = 0;

@@ -70,7 +71,7 @@ public CRAMContainerStreamWriter(
* Create a CRAMContainerStreamWriter for writing SAM records into a series of CRAM
* containers on output stream, with an optional index.
*
* @param encodingStrategy encoding strategy values
* @param encodingStrategy encoding strategy values (includes CRAM version)
* @param referenceSource reference cramReferenceSource
* @param samFileHeader {@link SAMFileHeader} to be used. Sort order is determined by the sortOrder property of this arg.
* @param outputStream where to write the CRAM stream.
@@ -88,6 +89,7 @@ public CRAMContainerStreamWriter(
this.outputStream = outputStream;
this.cramIndexer = indexer;
this.outputStreamIdentifier = outputIdentifier;
this.cramVersion = encodingStrategy.getCramVersion();
this.containerFactory = new ContainerFactory(samFileHeader, encodingStrategy, referenceSource);
}

Expand All @@ -103,11 +105,11 @@ public void writeAlignment(final SAMRecord alignment) {
}

/**
* Write a CRAM file header and the previously provided SAM header to the stream.
* Write a CRAM file header and the provided SAM header to the stream.
* Retained for backward compatibility with external projects (disq, GATK).
*/
// TODO: retained for backward compatibility for disq in order to run GATK tests (remove before merging this branch)
public void writeHeader(final SAMFileHeader requestedSAMFileHeader) {
final CramHeader cramHeader = new CramHeader(CramVersions.DEFAULT_CRAM_VERSION, outputStreamIdentifier);
final CramHeader cramHeader = new CramHeader(cramVersion, outputStreamIdentifier);
streamOffset = CramIO.writeCramHeader(cramHeader, outputStream);
streamOffset += Container.writeSAMFileHeaderContainer(cramHeader.getCRAMVersion(), requestedSAMFileHeader, outputStream);
}
@@ -131,7 +133,7 @@ public void finish(final boolean writeEOFContainer) {
writeContainer(container);
}
if (writeEOFContainer) {
CramIO.writeCramEOF(CramVersions.DEFAULT_CRAM_VERSION, outputStream);
CramIO.writeCramEOF(cramVersion, outputStream);
}
outputStream.flush();
if (cramIndexer != null) {
@@ -144,7 +146,7 @@
}

protected void writeContainer(final Container container) {
streamOffset += container.write(CramVersions.DEFAULT_CRAM_VERSION, outputStream);
streamOffset += container.write(cramVersion, outputStream);
if (cramIndexer != null) {
// using silent validation here because the reads have been through validation already or
// they have been generated somehow through the htsjdk
6 changes: 5 additions & 1 deletion src/main/java/htsjdk/samtools/CRAMFileReader.java
@@ -374,7 +374,11 @@ public CloseableIterator<SAMRecord> queryUnmapped() {
try {
seekableStream.seek(0);
iterator = new CRAMIterator(seekableStream, referenceSource, validationStringency);
seekableStream.seek(startOfLastLinearBin >>> 16);
// When startOfLastLinearBin is -1, there are no mapped reads and the entire file is
// unmapped. In that case, iterate from the beginning (already at position 0).
if (startOfLastLinearBin != -1) {
seekableStream.seek(startOfLastLinearBin >>> 16);
}
boolean atAlignments;
do {
atAlignments = iterator.advanceToAlignmentInContainer(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, SAMRecord.NO_ALIGNMENT_START);
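
The `startOfLastLinearBin >>> 16` in the fix above works because BAI linear-index entries are BGZF virtual file offsets: the compressed (on-disk) offset lives in the upper 48 bits and the uncompressed within-block offset in the lower 16, so shifting right by 16 recovers the position to seek to in the compressed stream. A minimal sketch of the packing (helper names here are hypothetical, for illustration only):

```java
/** Illustrative sketch of BGZF virtual file offsets: compressed offset in the
 *  upper 48 bits, uncompressed within-block offset in the lower 16 bits. */
public class VirtualOffsetSketch {

    static long makeVirtualOffset(final long compressedOffset, final int uncompressedOffset) {
        return (compressedOffset << 16) | (uncompressedOffset & 0xFFFF);
    }

    /** Where to seek in the compressed stream: the top 48 bits. */
    static long compressedOffset(final long virtualOffset) {
        return virtualOffset >>> 16;
    }

    /** Offset within the inflated block: the low 16 bits. */
    static int uncompressedOffset(final long virtualOffset) {
        return (int) (virtualOffset & 0xFFFF);
    }

    public static void main(final String[] args) {
        final long v = makeVirtualOffset(123_456L, 789);
        System.out.println(compressedOffset(v));   // 123456
        System.out.println(uncompressedOffset(v)); // 789
    }
}
```

This also explains the `-1` sentinel: with no mapped reads there is no last linear bin, and `-1 >>> 16` would otherwise produce a nonsense seek target.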
6 changes: 5 additions & 1 deletion src/main/java/htsjdk/samtools/CRAMIterator.java
@@ -115,7 +115,11 @@ private BAMIteratorFilter.FilteringIteratorState nextContainer() {
compressorCache,
getSAMFileHeader());
samRecordIterator = samRecords.iterator();
return BAMIteratorFilter.FilteringIteratorState.MATCHES_FILTER;
// A container may match the query but produce no records (e.g. a container with
// only a compression header and no slices). Skip to the next container in that case.
return samRecords.isEmpty()
? BAMIteratorFilter.FilteringIteratorState.CONTINUE_ITERATION
: BAMIteratorFilter.FilteringIteratorState.MATCHES_FILTER;
} else {
return BAMIteratorFilter.FilteringIteratorState.CONTINUE_ITERATION;
}