diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3472be10fc..b9d7a59d85 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,48 @@ early infrastructure for a plugin-based codec framework and resource bundles.
---
+## 5.0.0
+
+Adds **CRAM 3.1 write support** to htsjdk. This is the culmination of the read-side codec work
+in 4.2.0 and the reader wiring in 4.3.0: htsjdk can now produce CRAM 3.1 files that are
+interoperable with samtools/htslib.
+
+### CRAM 3.1 Write Support
+
+- Enable CRAM 3.1 writing with all spec codecs: rANS Nx16, adaptive arithmetic Range coder, FQZComp, Name Tokenisation, and STRIPE
+- Add configurable compression profiles (FAST, NORMAL, SMALL, ARCHIVE) with trial compression for automatic codec selection
+- Implement `TrialCompressor` to replace ad-hoc triple-compression for tags and align trial candidates with htslib
+- Add `GzipCodec` for direct Deflater/Inflater GZIP compression, wired into CRAM as a codec option
+- Strip NM/MD tags on CRAM encode and regenerate on decode, matching htslib behavior
+- Implement attached (same-slice) mate pair resolution
+- Align DataSeries content IDs with htslib for cross-implementation debugging
+- Remove content digest tags (BD/SD/B5/S5/B1/S1) from CRAM slice headers, matching htslib/samtools behavior. These are optional per the spec and were expensive to compute. Block-level CRC32 (required by CRAM 3.0+) provides data integrity. This is technically a breaking change but has zero practical impact since no known tools consume these tags.
+- Default CRAM version for writing is now 3.1 (was 3.0)
+- Add `CramConverter` command-line tool for testing and benchmarking CRAM write profiles
+
+### Codec and Compression Optimizations
+
+- Refactor and optimize all rANS codecs: byte-array API, backwards-write encoding, and general simplifications
+- Optimize Name Tokeniser encoder: replace regex with hand-written parser; add per-type flags, STRIPE support, stream deduplication, and all-MATCH elimination
+- Optimize FQZComp, Range coder, and rANS encoder hot paths
+- Tune NORMAL profile codec assignments based on empirical compression testing
+
+### Performance
+
+- Replace `ByteArrayInputStream`/`ByteArrayOutputStream` with unsynchronized `CRAMByteReader`/`CRAMByteWriter` to eliminate synchronization overhead
+- Fuse read base restoration, CIGAR building, and NM/MD computation into a single pass during decode
+- Cache tag key metadata to eliminate per-record `String` allocation during CRAM decode
+- Pool `RANSNx16Decode` instances in the Name Tokeniser
+- Optimize BAM nibble-to-ASCII base decoding with a bulk lookup table
+
+### Testing and Infrastructure
+
+- Split CRAM 3.1 fidelity tests into per-profile classes for parallel execution
+- Reduce memory pressure in unit tests to eliminate OOM failures
+- Fix thread-safety bug in `VariantContextTestProvider` causing non-deterministic test counts
+
+---
+
## 4.3.0 (2025-05-09)
Completes CRAM 3.1 read support by wiring the codec implementations (added in 4.2.0) into
diff --git a/README.md b/README.md
index c82cbe6434..87cf666ee6 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,8 @@ manipulating HTS data.
> **NOTE: _HTSJDK has only partial support for the latest Variant Call Format Specification. VCFv4.3 can be read but not written, VCFv4.4 can be read in lenient mode only, and there is no support for BCFv2.2._**
+> **NOTE: _HTSJDK now supports both reading and writing CRAM 3.1 files. CRAM 3.1 write support includes all codecs defined in the specification (rANS Nx16, adaptive arithmetic Range coder, FQZComp, Name Tokenisation, and STRIPE), configurable compression profiles (FAST, NORMAL, SMALL, ARCHIVE), and trial compression for automatic codec selection. Files produced by htsjdk are interoperable with samtools/htslib._**
+
### Documentation & Getting Help
API documentation for all versions of HTSJDK since `1.128` are available through [javadoc.io](http://www.javadoc.io/doc/com.github.samtools/htsjdk).
diff --git a/build.gradle b/build.gradle
index 5bffea4506..3700a92772 100644
--- a/build.gradle
+++ b/build.gradle
@@ -99,7 +99,7 @@ tasks.withType(Test).configureEach { task ->
// set heap size for the test JVM(s)
task.minHeapSize = "1G"
- task.maxHeapSize = "6G"
+ task.maxHeapSize = "12G"
task.jvmArgs '-Djava.awt.headless=true' //this prevents awt from displaying a java icon while the tests are running
@@ -160,7 +160,7 @@ test {
excludeGroups "slow", "broken", "defaultReference", "optimistic_vcf_4_4", "ftp", "http", "sra", "ena", "htsget", "unix"
}
parallel = "classes"
- threadCount = 2 * Runtime.runtime.availableProcessors()
+ threadCount = Runtime.runtime.availableProcessors()
}
} dependsOn testWithDefaultReference, testWithOptimisticVCF4_4
diff --git a/src/main/java/htsjdk/beta/codecs/reads/cram/cramV3_1/CRAMEncoderV3_1.java b/src/main/java/htsjdk/beta/codecs/reads/cram/cramV3_1/CRAMEncoderV3_1.java
index fdcc53a8cc..4ae82e91e1 100644
--- a/src/main/java/htsjdk/beta/codecs/reads/cram/cramV3_1/CRAMEncoderV3_1.java
+++ b/src/main/java/htsjdk/beta/codecs/reads/cram/cramV3_1/CRAMEncoderV3_1.java
@@ -22,7 +22,6 @@ public class CRAMEncoderV3_1 extends CRAMEncoder {
*/
public CRAMEncoderV3_1(final Bundle outputBundle, final ReadsEncoderOptions readsEncoderOptions) {
super(outputBundle, readsEncoderOptions);
- throw new CRAMException("CRAM v3.1 encoding is not yet supported");
}
@Override
diff --git a/src/main/java/htsjdk/beta/plugin/registry/ReadsResolver.java b/src/main/java/htsjdk/beta/plugin/registry/ReadsResolver.java
index 4e67a4dffd..b89ffc4a10 100644
--- a/src/main/java/htsjdk/beta/plugin/registry/ReadsResolver.java
+++ b/src/main/java/htsjdk/beta/plugin/registry/ReadsResolver.java
@@ -1,6 +1,5 @@
package htsjdk.beta.plugin.registry;
-import htsjdk.beta.codecs.reads.cram.cramV3_1.CRAMCodecV3_1;
import htsjdk.beta.exception.HtsjdkException;
import htsjdk.beta.exception.HtsjdkPluginException;
import htsjdk.beta.plugin.HtsVersion;
@@ -12,13 +11,9 @@
import htsjdk.beta.plugin.reads.ReadsDecoderOptions;
import htsjdk.beta.plugin.reads.ReadsEncoder;
import htsjdk.beta.plugin.reads.ReadsEncoderOptions;
-import htsjdk.beta.plugin.reads.ReadsFormats;
import htsjdk.io.IOPath;
import htsjdk.utils.ValidationUtils;
-import java.util.List;
-import java.util.stream.Collectors;
-
/**
* Class with methods for resolving inputs and outputs to reads encoders and decoders.
*
@@ -209,30 +204,4 @@ public ReadsEncoder getReadsEncoder(
.getEncoder(outputBundle, readsEncoderOptions);
}
- /**
- * Temporarily override to remove the CRAM 3.1 codec from the list of candidate codecs when the request is for
- * the newest version, since it has no write implementation yet.
- */
- @Override
- protected List filterByVersion(final List candidateCodecs, final HtsVersion htsVersion) {
- final List preFilteredCodecs;
- if (htsVersion.equals(HtsVersion.NEWEST_VERSION)) {
- // if the request is for the newest version, then pre-filter out the CRAM 3.1 codec since it has no
- // write implementation yet, and then delegate to the superclass to let it find the newest version among
- // the remaining codecs
- preFilteredCodecs = candidateCodecs.stream().filter(
- c -> !(c.getFileFormat().equals(ReadsFormats.CRAM)
- && c.getVersion().equals(CRAMCodecV3_1.VERSION_3_1)))
- .collect(Collectors.toList());
- final HtsVersion newestVersion = preFilteredCodecs.stream()
- .map(c -> c.getVersion())
- .reduce(candidateCodecs.get(0).getVersion(),
- (HtsVersion a, HtsVersion b) -> a.compareTo(b) > 0 ? a : b);
- return candidateCodecs.stream().filter(
- c -> c.getVersion().equals(newestVersion)).collect(Collectors.toList());
- } else {
- preFilteredCodecs = candidateCodecs;
- }
- return super.filterByVersion(preFilteredCodecs, htsVersion);
- }
}
diff --git a/src/main/java/htsjdk/samtools/CramConverter.java b/src/main/java/htsjdk/samtools/CramConverter.java
new file mode 100644
index 0000000000..13d2500565
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/CramConverter.java
@@ -0,0 +1,122 @@
+package htsjdk.samtools;
+
+import htsjdk.samtools.cram.structure.CRAMEncodingStrategy;
+
+import java.io.File;
+import java.nio.file.Path;
+
+/**
+ * Simple command-line tool for converting between SAM, BAM, and CRAM formats, primarily
+ * for testing and benchmarking CRAM compression profiles.
+ *
+ *
+ */
+public class CramConverter {
+
+ private static final String USAGE = String.join("\n",
+ "Usage: CramConverter [output] [options]",
+ "",
+ "Convert between SAM, BAM, and CRAM formats.",
+ "If output is omitted, reads all records without writing (useful for benchmarking).",
+ "",
+ "Arguments:",
+ " input Input file (.sam, .bam, or .cram)",
+ " output Output file (.sam, .bam, or .cram) [optional]",
+ "",
+ "Options:",
+ " --reference <path> Reference FASTA (required for CRAM input or output)",
+ " --profile <name> CRAM compression profile: fast, normal (default), small, archive",
+ " --help Print this help message",
+ "",
+ "Examples:",
+ " # Convert BAM to CRAM 3.1 with default (normal) profile:",
+ " CramConverter input.bam output.cram --reference ref.fasta",
+ "",
+ " # Read-only benchmark (no output):",
+ " CramConverter input.cram --reference ref.fasta",
+ "",
+ " # Convert CRAM to CRAM with archive profile:",
+ " CramConverter input.cram output.cram --reference ref.fasta --profile archive",
+ "",
+ " # Convert CRAM to BAM:",
+ " CramConverter input.cram output.bam --reference ref.fasta",
+ "",
+ " # Convert BAM to CRAM with fast profile (writes CRAM 3.0):",
+ " CramConverter input.bam output.cram --reference ref.fasta --profile fast"
+ );
+
+ /**
+ * Entry point. Parses command-line arguments and performs the conversion.
+ *
+ * @param args command-line arguments (see USAGE for details)
+ */
+ public static void main(final String[] args) {
+ if (hasFlag(args, "--help") || hasFlag(args, "-h")) {
+ System.out.println(USAGE);
+ System.exit(0);
+ }
+ if (args.length < 1) {
+ System.err.println(USAGE);
+ System.exit(1);
+ }
+
+ final String inputPath = args[0];
+ // Output is optional — if the second arg looks like a file (not a flag), use it
+ String outputPath = null;
+ int optStart = 1;
+ if (args.length > 1 && !args[1].startsWith("-")) {
+ outputPath = args[1];
+ optStart = 2;
+ }
+ String referencePath = null;
+ String profileName = "normal";
+
+ // Parse optional arguments
+ for (int i = optStart; i < args.length; i++) {
+ switch (args[i]) {
+ case "--reference":
+ case "-r":
+ if (++i >= args.length) die("--reference requires a path argument");
+ referencePath = args[i];
+ break;
+ case "--profile":
+ case "-p":
+ if (++i >= args.length) die("--profile requires a name argument");
+ profileName = args[i];
+ break;
+ default:
+ die("Unknown option: " + args[i]);
+ }
+ }
+
+ // Resolve profile
+ final CRAMCompressionProfile profile;
+ try {
+ profile = CRAMCompressionProfile.valueOfCaseInsensitive(profileName);
+ } catch (final IllegalArgumentException e) {
+ die(e.getMessage());
+ return; // unreachable but keeps compiler happy
+ }
+
+ // Check reference requirement
+ final boolean inputIsCram = inputPath.endsWith(".cram");
+ final boolean outputIsCram = outputPath != null && outputPath.endsWith(".cram");
+ if ((inputIsCram || outputIsCram) && referencePath == null) {
+ die("--reference is required when reading or writing CRAM files");
+ }
+
+ final Path refPath = referencePath != null ? new File(referencePath).toPath() : null;
+ final CRAMEncodingStrategy strategy = profile.toStrategy();
+
+ if (outputPath != null) {
+ System.err.printf("Converting %s -> %s (profile=%s, version=%s)%n",
+ inputPath, outputPath, profile.name().toLowerCase(), strategy.getCramVersion());
+ } else {
+ System.err.printf("Reading %s (no output)%n", inputPath);
+ }
+
+ // Read input
+ final SamReaderFactory readerFactory = SamReaderFactory.makeDefault()
+ .validationStringency(ValidationStringency.SILENT);
+ if (refPath != null) {
+ readerFactory.referenceSequence(refPath);
+ }
+
+ long count = 0;
+ final long startTime = System.currentTimeMillis();
+
+ try (final SamReader reader = readerFactory.open(new File(inputPath))) {
+ final SAMFileHeader header = reader.getFileHeader();
+
+ if (outputPath != null) {
+ // Read and write
+ final SAMFileWriterFactory writerFactory = new SAMFileWriterFactory()
+ .setCRAMEncodingStrategy(strategy);
+
+ try (final SAMFileWriter writer = outputIsCram ?
+ writerFactory.makeCRAMWriter(header, true, new File(outputPath).toPath(), refPath) :
+ writerFactory.makeWriter(header, true, new File(outputPath).toPath(), refPath)) {
+
+ for (final SAMRecord record : reader) {
+ writer.addAlignment(record);
+ count++;
+ if (count % 1_000_000 == 0) {
+ System.err.printf(" ... %,d records%n", count);
+ }
+ }
+ }
+ } else {
+ // Read only — iterate all records without writing
+ for (final SAMRecord record : reader) {
+ count++;
+ if (count % 1_000_000 == 0) {
+ System.err.printf(" ... %,d records%n", count);
+ }
+ }
+ }
+ } catch (final Exception e) {
+ die("Error: " + e.getMessage());
+ }
+
+ final long elapsed = System.currentTimeMillis() - startTime;
+ final long inputSize = new File(inputPath).length();
+
+ if (outputPath != null) {
+ final long outputSize = new File(outputPath).length();
+ System.err.printf("Done. %,d records in %.1fs. Input: %,d bytes, Output: %,d bytes (%.1f%%)%n",
+ count, elapsed / 1000.0, inputSize, outputSize,
+ inputSize > 0 ? (100.0 * outputSize / inputSize) : 0);
+ } else {
+ System.err.printf("Done. %,d records in %.1fs. Input: %,d bytes%n",
+ count, elapsed / 1000.0, inputSize);
+ }
+ }
+
+ private static boolean hasFlag(final String[] args, final String flag) {
+ for (final String arg : args) {
+ if (flag.equals(arg)) return true;
+ }
+ return false;
+ }
+
+ private static void die(final String message) {
+ System.err.println("ERROR: " + message);
+ System.err.println();
+ System.err.println(USAGE);
+ System.exit(1);
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/cram/CramStats.java b/src/main/java/htsjdk/samtools/cram/CramStats.java
new file mode 100644
index 0000000000..dd6925c4b8
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/CramStats.java
@@ -0,0 +1,118 @@
+package htsjdk.samtools.cram;
+
+import htsjdk.samtools.cram.build.CramContainerIterator;
+import htsjdk.samtools.cram.build.CramIO;
+import htsjdk.samtools.cram.structure.*;
+import htsjdk.samtools.cram.structure.block.Block;
+import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
+import htsjdk.samtools.seekablestream.SeekableFileStream;
+
+import java.io.File;
+import java.util.*;
+
+/**
+ * Dumps per-DataSeries compression statistics from a CRAM file. Useful for comparing
+ * compression methods and sizes between different CRAM implementations.
+ */
+public class CramStats {
+
+ /**
+ * Entry point. Dumps per-DataSeries compression statistics for each CRAM file argument.
+ *
+ * @param args one or more paths to CRAM files
+ */
+ public static void main(final String[] args) throws Exception {
+ if (args.length < 1) {
+ System.err.println("Usage: CramStats <file1.cram> [file2.cram ...]");
+ System.exit(1);
+ }
+
+ for (final String path : args) {
+ dumpStats(path);
+ System.out.println();
+ }
+ }
+
+ private static void dumpStats(final String path) throws Exception {
+ System.out.printf("=== %s ===%n", path);
+
+ // Track per-contentID totals: compressed size, uncompressed size, method
+ final Map<Integer, long[]> compressedByContentId = new TreeMap<>(); // contentId -> [compressed, uncompressed]
+ final Map<Integer, Map<BlockCompressionMethod, Integer>> methodsByContentId = new TreeMap<>();
+ long totalCompressed = 0;
+ long totalUncompressed = 0;
+ int containerCount = 0;
+ int sliceCount = 0;
+ int recordCount = 0;
+
+ try (final SeekableFileStream stream = new SeekableFileStream(new File(path))) {
+ final CramContainerIterator iter = new CramContainerIterator(stream);
+ System.out.printf("CRAM version: %s%n", iter.getCramHeader().getCRAMVersion());
+ while (iter.hasNext()) {
+ final Container container = iter.next();
+ containerCount++;
+ recordCount += container.getContainerHeader().getNumberOfRecords();
+
+ for (final Slice slice : container.getSlices()) {
+ sliceCount++;
+ final SliceBlocks blocks = slice.getSliceBlocks();
+
+ // Core block
+ final Block core = blocks.getCoreBlock();
+ accumulate(compressedByContentId, methodsByContentId, -1, core);
+ totalCompressed += core.getCompressedContentSize();
+ totalUncompressed += core.getUncompressedContentSize();
+
+ // External blocks
+ for (final int contentId : blocks.getExternalContentIDs()) {
+ final Block block = blocks.getExternalBlock(contentId);
+ accumulate(compressedByContentId, methodsByContentId, contentId, block);
+ totalCompressed += block.getCompressedContentSize();
+ totalUncompressed += block.getUncompressedContentSize();
+ }
+ }
+ }
+ }
+
+ System.out.printf("Containers: %d, Slices: %d, Records: %,d%n", containerCount, sliceCount, recordCount);
+ System.out.printf("Total: compressed=%,d uncompressed=%,d ratio=%.1f%%%n%n",
+ totalCompressed, totalUncompressed,
+ totalUncompressed > 0 ? (100.0 * totalCompressed / totalUncompressed) : 0);
+
+ // Map content IDs to data series names
+ final Map<Integer, String> contentIdNames = new HashMap<>();
+ for (final DataSeries ds : DataSeries.values()) {
+ contentIdNames.put(ds.getExternalBlockContentId().intValue(), ds.getCanonicalName());
+ }
+ contentIdNames.put(-1, "CORE");
+
+ // Print per-content-ID stats
+ System.out.printf("%-6s %-14s %12s %12s %7s %s%n",
+ "ID", "Series", "Compressed", "Uncompressed", "Ratio", "Methods");
+ System.out.println("-".repeat(80));
+
+ for (final Map.Entry<Integer, long[]> entry : compressedByContentId.entrySet()) {
+ final int id = entry.getKey();
+ final long[] sizes = entry.getValue();
+ final String name = contentIdNames.getOrDefault(id, "TAG:" + id);
+ final String methods = methodsByContentId.getOrDefault(id, Collections.emptyMap()).toString();
+ System.out.printf("%-6d %-14s %,12d %,12d %6.1f%% %s%n",
+ id, name, sizes[0], sizes[1],
+ sizes[1] > 0 ? (100.0 * sizes[0] / sizes[1]) : 0,
+ methods);
+ }
+ }
+
+ private static void accumulate(
+ final Map<Integer, long[]> sizeMap,
+ final Map<Integer, Map<BlockCompressionMethod, Integer>> methodMap,
+ final int contentId,
+ final Block block) {
+ sizeMap.computeIfAbsent(contentId, k -> new long[2]);
+ sizeMap.get(contentId)[0] += block.getCompressedContentSize();
+ sizeMap.get(contentId)[1] += block.getUncompressedContentSize();
+
+ methodMap.computeIfAbsent(contentId, k -> new EnumMap<>(BlockCompressionMethod.class));
+ methodMap.get(contentId).merge(block.getCompressionMethod(), 1, Integer::sum);
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/cram/build/CRAMReferenceRegion.java b/src/main/java/htsjdk/samtools/cram/build/CRAMReferenceRegion.java
index fb44b52e52..4ec5e45c10 100644
--- a/src/main/java/htsjdk/samtools/cram/build/CRAMReferenceRegion.java
+++ b/src/main/java/htsjdk/samtools/cram/build/CRAMReferenceRegion.java
@@ -179,7 +179,7 @@ public void fetchReferenceBasesByRegion(
(zeroBasedStart + requestedFragmentLength));
}
regionStart = zeroBasedStart;
- regionLength = referenceBases.length;
+ regionLength = Math.min(requestedFragmentLength, referenceBases.length);
}
/**
diff --git a/src/main/java/htsjdk/samtools/cram/build/CompressionHeaderFactory.java b/src/main/java/htsjdk/samtools/cram/build/CompressionHeaderFactory.java
index 19375e745b..4463d0a472 100644
--- a/src/main/java/htsjdk/samtools/cram/build/CompressionHeaderFactory.java
+++ b/src/main/java/htsjdk/samtools/cram/build/CompressionHeaderFactory.java
@@ -19,7 +19,10 @@
import htsjdk.samtools.cram.common.MutableInt;
import htsjdk.samtools.cram.compression.ExternalCompressor;
+import htsjdk.samtools.cram.compression.TrialCompressor;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Params;
import htsjdk.samtools.cram.encoding.*;
+import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
import htsjdk.samtools.cram.encoding.core.CanonicalHuffmanIntegerEncoding;
import htsjdk.samtools.cram.encoding.external.*;
import htsjdk.samtools.cram.encoding.readfeatures.ReadFeature;
@@ -59,6 +62,10 @@ public final class CompressionHeaderFactory {
private final Map<Integer, EncodingDetails> bestTagEncodings = new HashMap<>();
private final ByteArrayOutputStream baosForTagValues = new ByteArrayOutputStream(1024 * 1024);
+ // Per-tag trial compressors that persist across containers. Each tag ID gets its own
+ // TrialCompressor that learns which of GZIP/rANS-0/rANS-1 works best for that tag's data.
+ private final Map<Integer, TrialCompressor> tagTrialCompressors = new HashMap<>();
+
/**
* Create a CompressionHeaderFactory using the provided CRAMEncodingStrategy.
* @param encodingStrategy {@link CRAMEncodingStrategy} to use, may not be null
@@ -66,9 +73,9 @@ public final class CompressionHeaderFactory {
public CompressionHeaderFactory(final CRAMEncodingStrategy encodingStrategy) {
ValidationUtils.nonNull(encodingStrategy, "A CRAMEncodingStrategy is required");
- this.encodingMap = encodingStrategy.getCustomCompressionHeaderEncodingMap() == null ?
- new CompressionHeaderEncodingMap(encodingStrategy) :
- encodingStrategy.getCustomCompressionHeaderEncodingMap();
+ this.encodingMap = encodingStrategy.getCustomCompressionHeaderEncodingMap() != null ?
+ encodingStrategy.getCustomCompressionHeaderEncodingMap() :
+ new CompressionHeaderEncodingMap(encodingStrategy);
this.encodingStrategy = encodingStrategy;
}
@@ -89,8 +96,9 @@ public CompressionHeader createCompressionHeader(final List<CRAMCompressionRecord> containerCRAMCompressionRecords) {
- compressionHeader.setTagIdDictionary(buildTagIdDictionary(containerCRAMCompressionRecords));
- buildTagEncodings(containerCRAMCompressionRecords, compressionHeader);
+ final Set<Integer> discoveredTagIds = new HashSet<>();
+ compressionHeader.setTagIdDictionary(buildTagIdDictionary(containerCRAMCompressionRecords, discoveredTagIds));
+ buildTagEncodings(containerCRAMCompressionRecords, compressionHeader, discoveredTagIds);
final SubstitutionMatrix substitutionMatrix = new SubstitutionMatrix(containerCRAMCompressionRecords);
updateSubstitutionCodes(containerCRAMCompressionRecords, substitutionMatrix);
compressionHeader.setSubstitutionMatrix(substitutionMatrix);
@@ -114,20 +122,14 @@ public CRAMEncodingStrategy getEncodingStrategy() {
* @param compressionHeader
* compression header to register encodings
*/
- private void buildTagEncodings(final List<CRAMCompressionRecord> cramCompressionRecords, final CompressionHeader compressionHeader) {
- final Set tagIdSet = new HashSet<>();
-
- for (final CRAMCompressionRecord record : cramCompressionRecords) {
- if (record.getTags() == null || record.getTags().size() == 0) {
- continue;
- }
-
- for (final ReadTag tag : record.getTags()) {
- tagIdSet.add(tag.keyType3BytesAsInt);
- }
- }
-
- for (final int tagId : tagIdSet) {
+ /**
+ * Build tag encodings using tag IDs already discovered during dictionary building,
+ * avoiding a redundant pass over all records.
+ */
+ private void buildTagEncodings(final List<CRAMCompressionRecord> cramCompressionRecords,
+ final CompressionHeader compressionHeader,
+ final Set<Integer> discoveredTagIds) {
+ for (final int tagId : discoveredTagIds) {
if (bestTagEncodings.containsKey(tagId)) {
compressionHeader.addTagEncoding(tagId, bestTagEncodings.get(tagId).compressor, bestTagEncodings.get(tagId).params);
} else {
@@ -171,7 +173,8 @@ static void updateSubstitutionCodes(final List cramCompre
* records holding the tags
* @return a 3D byte array: a set of unique lists of tag ids.
*/
- private static byte[][][] buildTagIdDictionary(final List<CRAMCompressionRecord> cramCompressionRecords) {
+ private static byte[][][] buildTagIdDictionary(final List<CRAMCompressionRecord> cramCompressionRecords,
+ final Set<Integer> discoveredTagIds) {
final Comparator<ReadTag> comparator = new Comparator<ReadTag>() {
@Override
public int compare(final ReadTag o1, final ReadTag o2) {
@@ -209,9 +212,11 @@ public int compare(final byte[] o1, final byte[] o2) {
int tagIndex = 0;
for (int i = 0; i < record.getTags().size(); i++) {
- tagIds[i * 3] = (byte) record.getTags().get(tagIndex).keyType3Bytes.charAt(0);
- tagIds[i * 3 + 1] = (byte) record.getTags().get(tagIndex).keyType3Bytes.charAt(1);
- tagIds[i * 3 + 2] = (byte) record.getTags().get(tagIndex).keyType3Bytes.charAt(2);
+ final ReadTag tag = record.getTags().get(tagIndex);
+ tagIds[i * 3] = (byte) tag.keyType3Bytes.charAt(0);
+ tagIds[i * 3 + 1] = (byte) tag.keyType3Bytes.charAt(1);
+ tagIds[i * 3 + 2] = (byte) tag.keyType3Bytes.charAt(2);
+ discoveredTagIds.add(tag.keyType3BytesAsInt);
tagIndex++;
}
@@ -255,13 +260,32 @@ static byte getTagType(final int tagID) {
}
/**
- * Get the best external compressor to use for the given byte array.
- *
- * @param data byte array to compress
- * @return best compressor to use for the data
+ * Get the trial compressor for a tag, creating one if it doesn't exist yet. The trial
+ * compressor tries GZIP, rANS order-0, and rANS order-1 on the first few blocks, then
+ * uses the winner for subsequent blocks until re-trial.
*/
- public ExternalCompressor getBestExternalCompressor(final byte[] data) {
- return encodingMap.getBestExternalCompressor(data, encodingStrategy);
+ private ExternalCompressor getTagTrialCompressor(final int tagID) {
+ return tagTrialCompressors.computeIfAbsent(tagID, id -> {
+ final CompressorCache cache = new CompressorCache();
+ // Extract the two-character tag name from the encoded tag ID
+ final char tag1 = (char) ((id >> 16) & 0xFF);
+ final char tag2 = (char) ((id >> 8) & 0xFF);
+
+ // BZIP2's BWT excels on structured alignment tags (SA:Z, XA:Z) where
+ // repeated substrings like contig names and CIGAR patterns are common
+ final boolean useBzip2 = (tag1 == 'S' && tag2 == 'A') || (tag1 == 'X' && tag2 == 'A');
+ if (useBzip2) {
+ return new TrialCompressor(List.of(
+ cache.getCompressorForMethod(BlockCompressionMethod.GZIP, encodingStrategy.getGZIPCompressionLevel()),
+ cache.getCompressorForMethod(BlockCompressionMethod.RANSNx16, RANSNx16Params.ORDER.ZERO.ordinal()),
+ cache.getCompressorForMethod(BlockCompressionMethod.BZIP2, ExternalCompressor.NO_COMPRESSION_ARG)
+ ));
+ }
+ return new TrialCompressor(List.of(
+ cache.getCompressorForMethod(BlockCompressionMethod.GZIP, encodingStrategy.getGZIPCompressionLevel()),
+ cache.getCompressorForMethod(BlockCompressionMethod.RANSNx16, RANSNx16Params.ORDER.ZERO.ordinal())
+ ));
+ });
}
byte[] getDataForTag(final List<CRAMCompressionRecord> cramCompressionRecords, final int tagID) {
@@ -368,12 +392,11 @@ private EncodingDescriptor buildTagEncodingForSize(final int tagValueSize, final
*/
private EncodingDetails buildEncodingForTag(final List<CRAMCompressionRecord> records, final int tagID) {
final EncodingDetails details = new EncodingDetails();
- final byte[] data = getDataForTag(records, tagID);
-
- details.compressor = getBestExternalCompressor(data);
+ details.compressor = getTagTrialCompressor(tagID);
final byte type = getTagType(tagID);
switch (type) {
+ // Fixed-size types — no record iteration needed
case 'A':
case 'c':
case 'C':
@@ -391,39 +414,66 @@ private EncodingDetails buildEncodingForTag(final List re
details.params = buildTagEncodingForSize(2, tagID);
return details;
+ // Z-type strings — always use stop-byte encoding, no record iteration needed
case 'Z':
+ details.params = new ByteArrayStopEncoding((byte) '\t', tagID).toEncodingDescriptor();
+ return details;
+
+ // B-type arrays — need record iteration for size range and data
case 'B':
- final ByteSizeRange stats = getByteSizeRangeOfTagValues(records, tagID);
- final boolean singleSize = stats.min == stats.max;
- if (singleSize) {
- details.params = buildTagEncodingForSize(stats.min, tagID);
- return details;
- }
+ return buildEncodingForVariableLengthTag(records, tagID, type, details);
+ default:
+ throw new IllegalArgumentException("Unknown tag type: " + (char) type);
+ }
+ }
- if (type == 'Z') {
- details.params = new ByteArrayStopEncoding((byte) '\t', tagID).toEncodingDescriptor();
- return details;
- }
+ /**
+ * Build encoding for B-type array tags using a single pass over records
+ * to collect size range and raw data bytes (for getUnusedByte check).
+ */
+ private EncodingDetails buildEncodingForVariableLengthTag(
+ final List<CRAMCompressionRecord> records, final int tagID, final byte type,
+ final EncodingDetails details) {
+ baosForTagValues.reset();
- final int minSize_threshold_ForByteArrayStopEncoding = 100;
- if (stats.min > minSize_threshold_ForByteArrayStopEncoding) {
- final int unusedByte = getUnusedByte(data);
- if (unusedByte > ALL_BYTES_USED) {
- details.params = new ByteArrayStopEncoding((byte) unusedByte, tagID).toEncodingDescriptor();
- return details;
+ int min = Integer.MAX_VALUE;
+ int max = Integer.MIN_VALUE;
+ for (final CRAMCompressionRecord record : records) {
+ if (record.getTags() != null) {
+ for (final ReadTag tag : record.getTags()) {
+ if (tag.keyType3BytesAsInt == tagID) {
+ final int size = getTagValueByteSize(type, tag.getValue());
+ if (size < min) min = size;
+ if (size > max) max = size;
+ try {
+ baosForTagValues.write(tag.getValueAsByteArray());
+ } catch (final IOException e) {
+ throw new RuntimeIOException(e);
+ }
}
}
+ }
+ }
+
+ if (min == max) {
+ details.params = buildTagEncodingForSize(min, tagID);
+ return details;
+ }
- // NOTE: This usage of ByteArrayLenEncoding does NOT split the stream between core
- // and external, since both the length and byte encoding instantiated here are
- // external. But it does create two different external encodings with the same externalID (???)
- details.params = new ByteArrayLenEncoding(
- new ExternalIntegerEncoding(tagID),
- new ExternalByteArrayEncoding(tagID)).toEncodingDescriptor();
+ // B-type with variable sizes
+ final int minSize_threshold_ForByteArrayStopEncoding = 100;
+ if (min > minSize_threshold_ForByteArrayStopEncoding) {
+ final int unusedByte = getUnusedByte(baosForTagValues.toByteArray());
+ if (unusedByte > ALL_BYTES_USED) {
+ details.params = new ByteArrayStopEncoding((byte) unusedByte, tagID).toEncodingDescriptor();
return details;
- default:
- throw new IllegalArgumentException("Unknown tag type: " + (char) type);
+ }
}
+
+ details.params = new ByteArrayLenEncoding(
+ new ExternalIntegerEncoding(tagID),
+ new ExternalByteArrayEncoding(tagID)).toEncodingDescriptor();
+ return details;
}
/**
diff --git a/src/main/java/htsjdk/samtools/cram/common/CramVersions.java b/src/main/java/htsjdk/samtools/cram/common/CramVersions.java
index 852a135563..be0c86f909 100644
--- a/src/main/java/htsjdk/samtools/cram/common/CramVersions.java
+++ b/src/main/java/htsjdk/samtools/cram/common/CramVersions.java
@@ -17,7 +17,7 @@ public final class CramVersions {
/**
* The default CRAM version when creating a new CRAM output file or stream.
*/
- public static final CRAMVersion DEFAULT_CRAM_VERSION = CRAM_v3;
+ public static final CRAMVersion DEFAULT_CRAM_VERSION = CRAM_v3_1;
/**
* Return true if {@code candidateVersion} is a supported CRAM version.
diff --git a/src/main/java/htsjdk/samtools/cram/compression/CompressionUtils.java b/src/main/java/htsjdk/samtools/cram/compression/CompressionUtils.java
index c8a5bf04d9..7aff455df4 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/CompressionUtils.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/CompressionUtils.java
@@ -6,7 +6,18 @@
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
+/**
+ * Utility methods shared across CRAM 3.1 compression codecs (rANS, Range, Name Tokeniser, etc.),
+ * including uint7 encoding, bit-packing, and STRIPE data transformation.
+ */
public class CompressionUtils {
+ /**
+ * Write an unsigned integer using 7-bit variable-length encoding (uint7). Each output byte uses
+ * 7 bits for data and the high bit as a continuation flag (1 = more bytes follow).
+ *
+ * @param i the value to write (must be non-negative)
+ * @param cp the output buffer
+ */
public static void writeUint7(final int i, final ByteBuffer cp) {
int s = 0;
int X = i;
@@ -22,18 +33,63 @@ public static void writeUint7(final int i, final ByteBuffer cp) {
} while (s > 0);
}
+ /**
+ * Read an unsigned integer using 7-bit variable-length encoding (uint7). Each byte uses
+ * 7 bits for data and the high bit as a continuation flag (1 = more bytes follow).
+ *
+ * @param cp the input buffer
+ * @return the decoded unsigned integer value
+ */
public static int readUint7(final ByteBuffer cp) {
int i = 0;
int c;
do {
- //read byte
c = cp.get();
i = (i << 7) | (c & 0x7f);
} while ((c & 0x80) != 0);
return i;
}
- // Implementation of the spec bit-packing algorithm for range coding.
+ /** Write uint7 into byte[] at posHolder[0], advancing posHolder[0]. */
+ public static void writeUint7(final int i, final byte[] buf, final int[] posHolder) {
+ int s = 0;
+ int X = i;
+ do {
+ s += 7;
+ X >>= 7;
+ } while (X > 0);
+ int pos = posHolder[0];
+ do {
+ s -= 7;
+ final int s_ = (s > 0) ? 1 : 0;
+ buf[pos++] = (byte) (((i >> s) & 0x7f) + (s_ << 7));
+ } while (s > 0);
+ posHolder[0] = pos;
+ }
+
+ /** Read uint7 from byte[] at posHolder[0], advancing posHolder[0]. */
+ public static int readUint7(final byte[] buf, final int[] posHolder) {
+ int i = 0;
+ int c;
+ do {
+ c = buf[posHolder[0]++];
+ i = (i << 7) | (c & 0x7f);
+ } while ((c & 0x80) != 0);
+ return i;
+ }
+
+ /**
+ * Pack input symbols into a smaller number of bits per value based on the number of distinct
+ * symbols. Writes the pack header (symbol count, mapping table, packed length) to outBuffer
+ * and returns the packed data as a separate buffer.
+ *
+ * @param inBuffer the input data to pack
+ * @param outBuffer the output buffer for the pack header (symbol count, mapping table, packed length)
+ * @param frequencyTable frequency counts for each byte value (0-255)
+ * @param packMappingTable mapping from original symbol to packed value
+ * @param numSymbols the number of distinct symbols in the input
+ * @return a ByteBuffer containing the packed data
+ */
public static ByteBuffer encodePack(
final ByteBuffer inBuffer,
final ByteBuffer outBuffer,
@@ -97,6 +153,16 @@ public static ByteBuffer encodePack(
return encodedBuffer; // Here position = 0 since we have always accessed the data buffer using index
}
+ /**
+ * Unpack bit-packed data back to one byte per symbol, reversing the transformation
+ * performed by {@link #encodePack}.
+ *
+ * @param inBuffer the packed input data
+ * @param packMappingTable mapping from packed value back to original symbol
+ * @param numSymbols the number of distinct symbols (determines bits per value)
+ * @param uncompressedPackOutputLength the expected number of output bytes
+ * @return a ByteBuffer containing the unpacked data
+ */
public static ByteBuffer decodePack(
final ByteBuffer inBuffer,
final byte[] packMappingTable,
@@ -148,6 +214,13 @@ else if (numSymbols <= 16){
return outBufferPack;
}
+ /**
+ * Allocate an output buffer large enough to hold compressed rANS data, including worst-case
+ * frequency table overhead and header bytes.
+ *
+ * @param inSize the uncompressed input size
+ * @return a little-endian ByteBuffer sized for the worst-case compressed output
+ */
public static ByteBuffer allocateOutputBuffer(final int inSize) {
// This calculation is identical to the one in samtools rANS_static.c
// Presumably the frequency table (always big enough for order 1) = 257*257,
@@ -156,33 +229,94 @@ public static ByteBuffer allocateOutputBuffer(final int inSize) {
final int compressedSize = (int) (inSize + 257 * 257 * 3 + 9);
final ByteBuffer outputBuffer = allocateByteBuffer(compressedSize);
if (outputBuffer.remaining() < compressedSize) {
- throw new CRAMException("Failed to allocate sufficient buffer size for RANS coder.");
+ throw new CRAMException("Failed to allocate sufficient buffer size for CRAM codec.");
}
return outputBuffer;
}
- // returns a new LITTLE_ENDIAN ByteBuffer of size = bufferSize
+ /**
+ * Allocate a new little-endian ByteBuffer of the specified size.
+ *
+ * @param bufferSize the capacity of the buffer
+ * @return a new little-endian ByteBuffer
+ */
public static ByteBuffer allocateByteBuffer(final int bufferSize){
return ByteBuffer.allocate(bufferSize).order(ByteOrder.LITTLE_ENDIAN);
}
- // returns a LITTLE_ENDIAN ByteBuffer that is created by wrapping a byte[]
+ /**
+ * Wrap a byte array in a little-endian ByteBuffer.
+ *
+ * @param inputBytes the byte array to wrap
+ * @return a little-endian ByteBuffer backed by the input array
+ */
public static ByteBuffer wrap(final byte[] inputBytes){
return ByteBuffer.wrap(inputBytes).order(ByteOrder.LITTLE_ENDIAN);
}
- // returns a LITTLE_ENDIAN ByteBuffer that is created by inputBuffer.slice()
+ /**
+ * Create a little-endian slice of the given ByteBuffer (from position to limit).
+ *
+ * @param inputBuffer the buffer to slice
+ * @return a new little-endian ByteBuffer sharing the input's content
+ */
public static ByteBuffer slice(final ByteBuffer inputBuffer){
return inputBuffer.slice().order(ByteOrder.LITTLE_ENDIAN);
}
+ /** Number of interleaved streams used by the STRIPE transformation. */
+ private static final int STRIPE_NUM_STREAMS = 4;
+
+ /**
+ * Compute the uncompressed size for each stripe stream. Earlier streams get the extra bytes
+ * when totalSize is not evenly divisible by the number of streams.
+ *
+ * @param totalSize the total uncompressed size
+ * @return array of per-stream sizes
+ */
+ public static int[] buildStripeUncompressedSizes(final int totalSize) {
+ final int[] sizes = new int[STRIPE_NUM_STREAMS];
+ final int q = totalSize / STRIPE_NUM_STREAMS;
+ final int r = totalSize % STRIPE_NUM_STREAMS;
+ for (int i = 0; i < STRIPE_NUM_STREAMS; i++) {
+ sizes[i] = (i < r) ? q + 1 : q;
+ }
+ return sizes;
+ }
+
+ /**
+ * Transpose (de-interleave) input data into N=4 separate streams using round-robin byte distribution.
+ * Stream i gets bytes at positions i, i+4, i+8, ...
+ *
+ * @param inBuffer the input data (position to limit)
+ * @param sizes per-stream uncompressed sizes from {@link #buildStripeUncompressedSizes}
+ * @return array of ByteBuffers, one per stream
+ */
+ public static ByteBuffer[] stripeTranspose(final ByteBuffer inBuffer, final int[] sizes) {
+ final ByteBuffer[] chunks = new ByteBuffer[sizes.length];
+ for (int i = 0; i < sizes.length; i++) {
+ chunks[i] = allocateByteBuffer(sizes[i]);
+ for (int j = 0; j < sizes[i]; j++) {
+ chunks[i].put(j, inBuffer.get(inBuffer.position() + j * sizes.length + i));
+ }
+ }
+ return chunks;
+ }
+
+ /**
+ * @return the number of streams used by the STRIPE codec (always 4)
+ */
+ public static int getStripeNumStreams() {
+ return STRIPE_NUM_STREAMS;
+ }
+
/**
- * Return a byte array with a size that matches the limit of the provided ByteBuffer. If the ByteBuffer is
- * backed by a byte array that matches the limit of the ByteBuffer, the backing array will be returned directly.
- * Otherwise, copy the contents of the ByteBuffer into a new byte array and return the new byte array.
- * @param buffer input ByteBuffer which is the source of the byte array
- * @return A byte array. If the ByteBuffer is backed by a byte array that matches the limit of the ByteBuffer,
- * return the backing array directly. Otherwise, copy the contents of the ByteBuffer into a new byte array.
+ * Return a byte array with contents matching the ByteBuffer from position 0 to limit.
+ * If the buffer is backed by an array that exactly matches its limit, returns the
+ * backing array directly (no copy). Otherwise copies the data into a new array.
+ *
+ * @param buffer the source ByteBuffer
+ * @return a byte array containing the buffer's data
*/
public static byte[] toByteArray(final ByteBuffer buffer) {
if (buffer.hasArray() && buffer.arrayOffset() == 0 && buffer.array().length == buffer.limit()) {
diff --git a/src/main/java/htsjdk/samtools/cram/compression/ExternalCompressor.java b/src/main/java/htsjdk/samtools/cram/compression/ExternalCompressor.java
index 339b79d289..82e64a6dd7 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/ExternalCompressor.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/ExternalCompressor.java
@@ -9,24 +9,43 @@
import htsjdk.samtools.cram.compression.range.RangeDecode;
import htsjdk.samtools.cram.compression.range.RangeEncode;
import htsjdk.samtools.cram.compression.range.RangeExternalCompressor;
-import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Decode;
-import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Encode;
-import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Decode;
-import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Encode;
+import htsjdk.samtools.cram.compression.rans.RANS4x8Decode;
+import htsjdk.samtools.cram.compression.rans.RANS4x8Encode;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Decode;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Encode;
import htsjdk.samtools.cram.structure.CRAMCodecModelContext;
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
import htsjdk.utils.ValidationUtils;
+/**
+ * Abstract base class for CRAM external block compressors. Each subclass wraps a specific
+ * compression algorithm (GZIP, rANS, BZIP2, etc.) and provides compress/uncompress operations
+ * on raw byte arrays. Instances are typically obtained via {@link #getCompressorForMethod}.
+ */
public abstract class ExternalCompressor {
final public static int NO_COMPRESSION_ARG = -1;
final private static String argErrorMessage = "Invalid compression arg (%d) requested for CRAM %s compressor";
private BlockCompressionMethod method;
+ /**
+ * @param method the compression method for this compressor, or null if the method will be
+ * determined later (e.g., by {@link TrialCompressor} after its first trial)
+ */
protected ExternalCompressor(final BlockCompressionMethod method) {
this.method = method;
}
+ /**
+ * Set the compression method. For use by subclasses like {@link TrialCompressor} that
+ * determine their method after construction.
+ *
+ * @param method the compression method to set
+ */
+ protected void setMethod(final BlockCompressionMethod method) {
+ this.method = method;
+ }
+
/**
* Compress the data using the codec-specific context model.
* @param data the data to compress
@@ -35,13 +54,30 @@ protected ExternalCompressor(final BlockCompressionMethod method) {
*/
public abstract byte[] compress(byte[] data, CRAMCodecModelContext contextModel);
+ /**
+ * Decompress the data.
+ *
+ * @param data the compressed data
+ * @return the decompressed data
+ */
public abstract byte[] uncompress(byte[] data);
- public BlockCompressionMethod getMethod() { return method; }
+ /**
+ * @return the compression method used by this compressor
+ * @throws IllegalStateException if the method has not been determined yet (compress() must
+ * be called first for compressors like {@link TrialCompressor})
+ */
+ public BlockCompressionMethod getMethod() {
+ if (method == null) {
+ throw new IllegalStateException(
+ "Compression method has not been determined yet — compress() must be called before getMethod()");
+ }
+ return method;
+ }
@Override
public String toString() {
- return this.getMethod().toString();
+ return method != null ? method.toString() : "UNDETERMINED";
}
@Override
@@ -51,12 +87,12 @@ public boolean equals(Object o) {
ExternalCompressor that = (ExternalCompressor) o;
- return getMethod() == that.getMethod();
+ return method == that.method;
}
@Override
public int hashCode() {
- return getMethod().hashCode();
+ return method != null ? method.hashCode() : 0;
}
/**
diff --git a/src/main/java/htsjdk/samtools/cram/compression/GZIPExternalCompressor.java b/src/main/java/htsjdk/samtools/cram/compression/GZIPExternalCompressor.java
index cbb493952d..03b5bf6470 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/GZIPExternalCompressor.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/GZIPExternalCompressor.java
@@ -25,75 +25,66 @@
package htsjdk.samtools.cram.compression;
import htsjdk.samtools.Defaults;
-import htsjdk.samtools.cram.io.InputStreamUtils;
import htsjdk.samtools.cram.structure.CRAMCodecModelContext;
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
-import htsjdk.samtools.util.IOUtil;
-import htsjdk.samtools.util.RuntimeIOException;
+import htsjdk.samtools.util.GzipCodec;
import htsjdk.utils.ValidationUtils;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
+import java.nio.ByteBuffer;
import java.util.zip.Deflater;
-import java.util.zip.GZIPInputStream;
-import java.util.zip.GZIPOutputStream;
public final class GZIPExternalCompressor extends ExternalCompressor {
- // The writeCompressionLevel value is used for write only. When this class is used to read
- // (uncompress) data read from a CRAM block, writeCompressionLevel does not necessarily reflect
- // the level that was used to compress that data (the compression level that used to create a
- // gzip compressed stream is not recovered from Slice block itself).
private final int writeCompressionLevel;
+ private final GzipCodec codec;
public GZIPExternalCompressor() {
this(Defaults.COMPRESSION_LEVEL);
}
public GZIPExternalCompressor(final int compressionLevel) {
+ this(compressionLevel, Deflater.DEFAULT_STRATEGY);
+ }
+
+ public GZIPExternalCompressor(final int compressionLevel, final int deflateStrategy) {
super(BlockCompressionMethod.GZIP);
- ValidationUtils.validateArg(compressionLevel >= Deflater.NO_COMPRESSION && compressionLevel <= Deflater.BEST_COMPRESSION,
+ ValidationUtils.validateArg(compressionLevel >= Deflater.NO_COMPRESSION && compressionLevel <= Deflater.BEST_COMPRESSION,
String.format("Invalid compression level (%d) requested for CRAM GZIP compression", compressionLevel));
this.writeCompressionLevel = compressionLevel;
+ this.codec = new GzipCodec(compressionLevel, deflateStrategy);
}
- /**
- * @return the gzip compression level used by this compressor's compress method
- */
+ /** @return the gzip compression level used by this compressor's compress method */
public int getWriteCompressionLevel() { return writeCompressionLevel; }
@Override
public byte[] compress(final byte[] data, final CRAMCodecModelContext unused_contextModel) {
- final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
- try (final GZIPOutputStream gos = new GZIPOutputStream(byteArrayOutputStream) {
- {
- def.setLevel(writeCompressionLevel);
- }
- }) {
- IOUtil.copyStream(new ByteArrayInputStream(data), gos);
- } catch (final IOException e) {
- throw new RuntimeIOException(e);
- }
-
- return byteArrayOutputStream.toByteArray();
+ final ByteBuffer compressed = codec.compress(ByteBuffer.wrap(data));
+ final byte[] result = new byte[compressed.remaining()];
+ compressed.get(result);
+ return result;
}
@Override
public byte[] uncompress(byte[] data) {
- // Note that when uncompressing data that was retrieved from a (slice) data block
- // embedded in a CRAM stream, the writeCompressionLevel value is not recovered
- // from the block, and therefore does not necessarily reflect the value that was used
- // to compress the data that is now being uncompressed
- try (final GZIPInputStream gzipInputStream = new GZIPInputStream(new ByteArrayInputStream(data))) {
- return InputStreamUtils.readFully(gzipInputStream);
- } catch (final IOException e) {
- throw new RuntimeIOException(e);
- }
+ final ByteBuffer decompressed = codec.decompress(ByteBuffer.wrap(data));
+ final byte[] result = new byte[decompressed.remaining()];
+ decompressed.get(result);
+ return result;
+ }
+
+ @Override
+ public boolean equals(final Object o) {
+ if (!super.equals(o)) return false;
+ return writeCompressionLevel == ((GZIPExternalCompressor) o).writeCompressionLevel;
+ }
+
+ @Override
+ public int hashCode() {
+ return 31 * super.hashCode() + writeCompressionLevel;
}
@Override
public String toString() {
return String.format("%s(%d)", super.toString(), writeCompressionLevel);
}
-
}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/RANS4x8ExternalCompressor.java b/src/main/java/htsjdk/samtools/cram/compression/RANS4x8ExternalCompressor.java
index a45afe46ec..3d81365995 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/RANS4x8ExternalCompressor.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/RANS4x8ExternalCompressor.java
@@ -25,15 +25,19 @@
package htsjdk.samtools.cram.compression;
import htsjdk.samtools.cram.compression.rans.RANSParams;
-import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Decode;
-import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Encode;
-import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Params;
+import htsjdk.samtools.cram.compression.rans.RANS4x8Decode;
+import htsjdk.samtools.cram.compression.rans.RANS4x8Encode;
+import htsjdk.samtools.cram.compression.rans.RANS4x8Params;
import htsjdk.samtools.cram.structure.CRAMCodecModelContext;
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
-import java.nio.ByteBuffer;
import java.util.Objects;
+/**
+ * CRAM external compressor that uses the rANS 4x8 entropy coder (CRAM 3.0).
+ * Wraps shared {@link RANS4x8Encode} and {@link RANS4x8Decode} instances to
+ * avoid repeated allocation of large internal tables.
+ */
public final class RANS4x8ExternalCompressor extends ExternalCompressor {
private final RANSParams.ORDER order;
private final RANS4x8Encode ransEncode;
@@ -50,6 +54,13 @@ public RANS4x8ExternalCompressor(
this(RANSParams.ORDER.ZERO, ransEncode, ransDecode);
}
+ /**
+ * Create a rANS 4x8 compressor with the specified order (as an integer).
+ *
+ * @param order the rANS order (0 or 1)
+ * @param ransEncode shared encoder instance
+ * @param ransDecode shared decoder instance
+ */
public RANS4x8ExternalCompressor(
final int order,
final RANS4x8Encode ransEncode,
@@ -57,6 +68,13 @@ public RANS4x8ExternalCompressor(
this(RANSParams.ORDER.fromInt(order), ransEncode, ransDecode);
}
+ /**
+ * Create a rANS 4x8 compressor with the specified order.
+ *
+ * @param order the rANS order (ZERO or ONE)
+ * @param ransEncode shared encoder instance
+ * @param ransDecode shared decoder instance
+ */
public RANS4x8ExternalCompressor(
final RANSParams.ORDER order,
final RANS4x8Encode ransEncode,
@@ -69,15 +87,12 @@ public RANS4x8ExternalCompressor(
@Override
public byte[] compress(final byte[] data, final CRAMCodecModelContext unused_contextModel) {
- final RANS4x8Params params = new RANS4x8Params(order);
- final ByteBuffer buffer = ransEncode.compress(CompressionUtils.wrap(data), params);
- return toByteArray(buffer);
+ return ransEncode.compress(data, new RANS4x8Params(order));
}
@Override
public byte[] uncompress(byte[] data) {
- final ByteBuffer buf = ransDecode.uncompress(CompressionUtils.wrap(data));
- return toByteArray(buf);
+ return ransDecode.uncompress(data);
}
@Override
@@ -100,14 +115,4 @@ public int hashCode() {
return Objects.hash(getMethod(), order);
}
- private byte[] toByteArray(final ByteBuffer buffer) {
- if (buffer.hasArray() && buffer.arrayOffset() == 0 && buffer.array().length == buffer.limit()) {
- return buffer.array();
- }
-
- final byte[] bytes = new byte[buffer.remaining()];
- buffer.get(bytes);
- return bytes;
- }
-
}
\ No newline at end of file
diff --git a/src/main/java/htsjdk/samtools/cram/compression/RANSNx16ExternalCompressor.java b/src/main/java/htsjdk/samtools/cram/compression/RANSNx16ExternalCompressor.java
index 0a9ba87f7c..ddb052257e 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/RANSNx16ExternalCompressor.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/RANSNx16ExternalCompressor.java
@@ -1,13 +1,19 @@
package htsjdk.samtools.cram.compression;
-import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Decode;
-import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Encode;
-import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Params;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Decode;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Encode;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Params;
import htsjdk.samtools.cram.structure.CRAMCodecModelContext;
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
import java.util.Objects;
+/**
+ * CRAM external compressor that uses the rANS Nx16 entropy coder (CRAM 3.1).
+ * Supports order-0, order-1, bit-packing, RLE, striping, and CAT modes via
+ * flag combinations. Wraps shared {@link RANSNx16Encode} and {@link RANSNx16Decode}
+ * instances to avoid repeated allocation of large internal tables.
+ */
public final class RANSNx16ExternalCompressor extends ExternalCompressor {
private final int flags;
private final RANSNx16Encode ransEncode;
@@ -25,6 +31,13 @@ public RANSNx16ExternalCompressor(
this(0, ransEncode, ransDecode); // order 0
}
+ /**
+ * Create a rANS Nx16 compressor with the specified flag combination.
+ *
+ * @param flags bitmask of rANS Nx16 flags (order, pack, RLE, stripe, etc.)
+ * @param ransEncode shared encoder instance
+ * @param ransDecode shared decoder instance
+ */
public RANSNx16ExternalCompressor(
final int flags,
final RANSNx16Encode ransEncode,
@@ -37,13 +50,12 @@ public RANSNx16ExternalCompressor(
@Override
public byte[] compress(final byte[] data, final CRAMCodecModelContext unused_contextModel) {
- final RANSNx16Params params = new RANSNx16Params(flags);
- return CompressionUtils.toByteArray(ransEncode.compress(CompressionUtils.wrap(data), params));
+ return ransEncode.compress(data, new RANSNx16Params(flags));
}
@Override
public byte[] uncompress(byte[] data) {
- return CompressionUtils.toByteArray(ransDecode.uncompress(CompressionUtils.wrap(data)));
+ return ransDecode.uncompress(data);
}
@Override
diff --git a/src/main/java/htsjdk/samtools/cram/compression/TrialCompressor.java b/src/main/java/htsjdk/samtools/cram/compression/TrialCompressor.java
new file mode 100644
index 0000000000..4314941c79
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/TrialCompressor.java
@@ -0,0 +1,162 @@
+package htsjdk.samtools.cram.compression;
+
+import htsjdk.samtools.cram.structure.CRAMCodecModelContext;
+
+import java.util.List;
+
+/**
+ * An {@link ExternalCompressor} that tries multiple candidate compressors and selects the one
+ * that produces the smallest output. Matches htslib's trial compression approach.
+ *
+ * <p>Not thread-safe. Instances must not be shared across threads.
+ *
+ * <p>Operation alternates between trial and production phases:
+ *
+ * <ol>
+ *     <li>Trial phase ({@code nTrials} non-empty blocks): compresses with
+ *     ALL candidates, picks the smallest for each block, and accumulates
+ *     sizes to determine the overall winner.</li>
+ *     <li>Production phase (next {@code trialSpan} blocks): uses the cached winner
+ *     only.</li>
+ *     <li>Re-trial: after {@code trialSpan} blocks, re-enters the trial phase to adapt
+ *     to changing data characteristics.</li>
+ * </ol>
+ *
+ * <p>The compression method is initially null and is set after the first
+ * non-empty block is compressed. Calling {@link #getMethod()} before any
+ * non-empty {@link #compress} call will throw
+ * {@link IllegalStateException}.
+ *
+ * @see ExternalCompressor
+ */
+public class TrialCompressor extends ExternalCompressor {
+ /** Default number of non-empty blocks to trial before selecting a winner (matching htslib NTRIALS). */
+ private static final int DEFAULT_NTRIALS = 3;
+
+ /** Default number of blocks between re-trials (matching htslib TRIAL_SPAN). */
+ private static final int DEFAULT_TRIAL_SPAN = 70;
+
+    private final List<ExternalCompressor> candidates;
+ private final long[] accumulatedSizes;
+
+ private int nTrials = DEFAULT_NTRIALS;
+ private int trialSpan = DEFAULT_TRIAL_SPAN;
+
+ private ExternalCompressor winner;
+ private int trialBlocksRemaining = nTrials;
+ private int blocksUntilRetrial = 0;
+
+ /**
+ * Create a trial compressor with the given candidate compressors.
+ *
+ * @param candidates the candidate compressors to try (must have at least 2)
+ */
+    public TrialCompressor(final List<ExternalCompressor> candidates) {
+ super(null); // method unknown until first trial
+ if (candidates.size() < 2) {
+ throw new IllegalArgumentException("TrialCompressor requires at least 2 candidates");
+ }
+ this.candidates = List.copyOf(candidates);
+ this.accumulatedSizes = new long[candidates.size()];
+ }
+
+ /**
+ * Set the number of non-empty blocks to trial before selecting a winner.
+ * Must be called before the first {@link #compress} call. Intended for testing.
+ */
+ TrialCompressor setNTrials(final int nTrials) {
+ this.nTrials = nTrials;
+ this.trialBlocksRemaining = nTrials;
+ return this;
+ }
+
+ /**
+ * Set the number of blocks between re-trials.
+ * Must be called before the first {@link #compress} call. Intended for testing.
+ */
+ TrialCompressor setTrialSpan(final int trialSpan) {
+ this.trialSpan = trialSpan;
+ return this;
+ }
+
+ /**
+ * Compress data. During the trial phase, tries all candidates and accumulates size statistics.
+ * After {@code nTrials} non-empty blocks, selects the overall winner and uses it exclusively
+ * until the next re-trial.
+ *
+ * @param data the data to compress
+ * @param contextModel optional codec context model
+ * @return the compressed data from the best compressor for this block
+ */
+ @Override
+ public byte[] compress(final byte[] data, final CRAMCodecModelContext contextModel) {
+ if (data.length == 0) {
+ // Empty blocks don't count as trials but still need valid compressed output
+ final ExternalCompressor comp = this.winner == null ? candidates.get(0) : winner;
+ setMethod(comp.getMethod());
+ return comp.compress(data, contextModel);
+ }
+
+ if (winner != null) {
+ // Production phase: use cached winner
+ if (blocksUntilRetrial > 0) {
+ blocksUntilRetrial--;
+ setMethod(winner.getMethod());
+ return winner.compress(data, contextModel);
+ }
+ // Time for re-training: reset trial counter but leave blocksUntilRetrial at 0
+ // so subsequent calls fall through to the trial code below until the trial completes.
+ else {
+ this.trialBlocksRemaining = nTrials;
+ this.winner = null;
+
+ // Halve accumulated sizes to weight the new trial blocks
+ for (int i = 0; i < accumulatedSizes.length; i++) {
+ accumulatedSizes[i] /= 2;
+ }
+ }
+ }
+
+ // Trial: compress with all candidates, track sizes and per-candidate results
+ final byte[][] results = new byte[candidates.size()][];
+ for (int i = 0; i < candidates.size(); i++) {
+ results[i] = candidates.get(i).compress(data, contextModel);
+ accumulatedSizes[i] += results[i].length;
+ }
+
+ trialBlocksRemaining--;
+ if (trialBlocksRemaining == 0) {
+ // Trial complete — select overall winner from accumulated sizes
+ long bestAccum = Long.MAX_VALUE;
+ int winnerIdx = 0;
+ for (int i = 0; i < candidates.size(); i++) {
+ if (accumulatedSizes[i] < bestAccum) {
+ bestAccum = accumulatedSizes[i];
+ winnerIdx = i;
+ }
+ }
+ winner = candidates.get(winnerIdx);
+ blocksUntilRetrial = trialSpan;
+ setMethod(winner.getMethod());
+
+ // Return the winner's result for this block (even if not the smallest for this block)
+ return results[winnerIdx];
+ }
+ else {
+ // Still mid-trial — return the smallest result for this block
+ int bestIdx = 0;
+ for (int i = 1; i < results.length; i++) {
+ if (results[i].length < results[bestIdx].length) {
+ bestIdx = i;
+ }
+ }
+ setMethod(candidates.get(bestIdx).getMethod());
+ return results[bestIdx];
+ }
+ }
+
+ /**
+ * Decompress data. Delegates to the winner if one has been selected, otherwise to the
+ * first candidate. In practice, decompression is handled by the method-specific decompressor
+ * selected based on the block's compression method ID, not through this trial compressor.
+ */
+ @Override
+ public byte[] uncompress(final byte[] data) {
+ return (winner != null ? winner : candidates.get(0)).uncompress(data);
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompDecode.java b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompDecode.java
index 80afdbada9..d5c7b008a9 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompDecode.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompDecode.java
@@ -17,6 +17,14 @@ public class FQZCompDecode {
private static final int NUMBER_OF_SYMBOLS = 256;
private static int SUPPORTED_FQZCOMP_VERSION = 5; // 5 because the spec says so
+ /**
+ * Decompress a FQZComp-compressed quality score block. Reads the uncompressed size, version,
+ * parameters, and then decodes quality symbols using adaptive arithmetic coding with the
+ * context model defined by the parameters.
+ *
+ * @param inBuffer the compressed FQZComp data (consumed by this call)
+ * @return a rewound ByteBuffer containing the decompressed quality scores
+ */
public static ByteBuffer uncompress( final ByteBuffer inBuffer) {
final int outBufferLength = CompressionUtils.readUint7(inBuffer);
final int version = inBuffer.get() & 0xFF;
@@ -77,7 +85,10 @@ public static ByteBuffer uncompress( final ByteBuffer inBuffer) {
}
- // If duplicate returns 1, else 0
+ /**
+ * Decode the header for a new record: selector, length, reverse flag, and duplicate flag.
+ * Updates the FQZ state with the decoded record metadata.
+ */
public static void decodeFQZNewRecord(
final ByteBuffer inBuffer,
final RangeCoder rangeCoder,
@@ -125,6 +136,16 @@ public static void decodeFQZNewRecord(
state.setReadOrdinal(state.getReadOrdinal() + 1);
}
+ /**
+ * Update the 16-bit context value after encoding/decoding a quality score. Incorporates
+ * quality history, position, delta, and selector into the context based on the parameter
+ * configuration. Also decrements the remaining bases counter.
+ *
+ * @param params the parameter block controlling context bit allocation
+ * @param state the mutable encoder/decoder state
+ * @param quality the quality value just encoded/decoded
+ * @return the new 16-bit context value
+ */
public static int fqzUpdateContext(final FQZParam params,
final FQZState state,
final int quality) {
@@ -148,6 +169,10 @@ public static int fqzUpdateContext(final FQZParam params,
return last & 0xffff;
}
+ /**
+ * Reverse quality score arrays for records that were flagged for reversal during encoding.
+ * Called after all quality scores have been decoded when the global DO_REVERSE flag is set.
+ */
public static void reverseQualities(final ByteBuffer outBuffer, final int outBufferLength, final FQZState fqzState) {
final boolean[] toReverse = fqzState.getReverseArray();
final int[] qualityLengths = fqzState.getQualityLengthArray();
diff --git a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompEncode.java b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompEncode.java
index 0ad7d050fa..dc00d4837d 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompEncode.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompEncode.java
@@ -1,18 +1,578 @@
package htsjdk.samtools.cram.compression.fqzcomp;
-import htsjdk.samtools.cram.CRAMException;
+import htsjdk.samtools.cram.compression.CompressionUtils;
+import htsjdk.samtools.cram.compression.range.ByteModel;
+import htsjdk.samtools.cram.compression.range.RangeCoder;
+
import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
/**
- * Placeholder for the (not yet implemented) CRAM 3.1 FQZComp quality score encoder.
+ * Encoder for the CRAM 3.1 FQZComp quality score compression codec. Uses an adaptive arithmetic
+ * (range) coder with a 16-bit context model built from quality history, position within read,
+ * and optionally delta and selector information.
+ *
+ * <p>The encoder analyzes quality data to select context model parameters, matching the adaptive
+ * parameter training in htslib's {@code fqz_pick_parameters} ({@code fqzcomp_qual.c}). Features:
+ *
+ * <ul>
+ *     <li>Quality map for sparse symbol sets (e.g., NovaSeq 4-bin
+ *     quality)</li>
+ *     <li>Delta table with approximate-sqrt mapping for running quality
+ *     difference context</li>
+ *     <li>Symbol-count-based tuning (NovaSeq, HiSeqX, small
+ *     datasets)</li>
+ *     <li>Duplicate detection for consecutive identical quality
+ *     strings</li>
+ *     <li>Quality reversal for reverse-complemented reads
+ *     (GFLAG_DO_REV)</li>
+ *     <li>Fixed-length read optimization</li>
+ * </ul>
+ *
+ * @see FQZCompDecode
+ * @see <a href="https://samtools.github.io/hts-specs/CRAMcodecs.pdf">CRAM 3.1
+ * specification</a>
+ */
public class FQZCompEncode {
- // This method assumes that inBuffer is already rewound.
- // It compresses the data in the inBuffer, leaving it consumed.
- // Returns a rewound ByteBuffer containing the compressed data.
- public ByteBuffer compress(final ByteBuffer inBuffer) {
- throw new CRAMException("FQZComp compression is not implemented");
+ private static final int FQZ_VERSION = 5;
+ private static final int CTX_SIZE = 1 << 16;
+ private static final int NUMBER_OF_SYMBOLS = 256;
+
+ // Reusable buffer to avoid per-call allocation of quality data array
+ private byte[] pooledQualData;
+
+ // Pooled model arrays to avoid 65K+ allocations per compress() call.
+ // qualityModels are keyed by maxSymbol+1 (numSymbols) since that varies per block.
+ private ByteModel[] pooledQualityModels;
+ private int pooledQualityModelsNumSymbols;
+ private final ByteModel[] pooledLengthModels = new ByteModel[4];
+ private ByteModel pooledReverseModel;
+ private ByteModel pooledDupModel;
+
+ private ByteModel[] getOrCreateQualityModels(final int numSymbols) {
+ if (pooledQualityModels == null || pooledQualityModelsNumSymbols != numSymbols) {
+ pooledQualityModels = new ByteModel[CTX_SIZE];
+ for (int i = 0; i < CTX_SIZE; i++) {
+ pooledQualityModels[i] = new ByteModel(numSymbols);
+ }
+ pooledQualityModelsNumSymbols = numSymbols;
+ } else {
+ for (int i = 0; i < CTX_SIZE; i++) {
+ pooledQualityModels[i].reset();
+ }
+ }
+ return pooledQualityModels;
+ }
+
+ private ByteModel[] getOrCreateLengthModels() {
+ for (int i = 0; i < 4; i++) {
+ if (pooledLengthModels[i] == null) {
+ pooledLengthModels[i] = new ByteModel(NUMBER_OF_SYMBOLS);
+ } else {
+ pooledLengthModels[i].reset();
+ }
+ }
+ return pooledLengthModels;
+ }
+
+ private ByteModel getOrCreateReverseModel() {
+ if (pooledReverseModel == null) {
+ pooledReverseModel = new ByteModel(2);
+ } else {
+ pooledReverseModel.reset();
+ }
+ return pooledReverseModel;
+ }
+
+ private ByteModel getOrCreateDupModel() {
+ if (pooledDupModel == null) {
+ pooledDupModel = new ByteModel(2);
+ } else {
+ pooledDupModel.reset();
+ }
+ return pooledDupModel;
+ }
+
+ // Parameter flag masks (same as in FQZParam)
+ private static final int PFLAG_DO_DEDUP = 0x02;
+ private static final int PFLAG_DO_LEN = 0x04;
+ private static final int PFLAG_HAVE_QMAP = 0x10;
+ private static final int PFLAG_HAVE_PTAB = 0x20;
+ private static final int PFLAG_HAVE_DTAB = 0x40;
+
+ // Global flag masks (same as in FQZGlobalFlags)
+ private static final int GFLAG_DO_REV = 0x04;
+
+ // Approximate sqrt table for delta context mapping (from htslib fqzcomp_qual.c)
+ private static final int[] DSQR = {
+ 0, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+ };
+
+ /** BAM flag for reverse-complemented read */
+ private static final int BAM_FREVERSE = 0x10;
+
+ /**
+ * Compress concatenated quality scores using the FQZComp codec.
+ *
+ * @param inBuffer concatenated quality scores for all records (position to limit is compressed)
+ * @param recordLengths per-record quality score lengths; sum must equal inBuffer.remaining()
+ * @return a rewound ByteBuffer containing the compressed data
+ */
+ public ByteBuffer compress(final ByteBuffer inBuffer, final int[] recordLengths) {
+ return compress(inBuffer, recordLengths, null);
+ }
+
+ /**
+ * Compress concatenated quality scores using the FQZComp codec with per-record BAM flags.
+ *
+ * @param inBuffer concatenated quality scores for all records (position to limit is compressed)
+ * @param recordLengths per-record quality score lengths; sum must equal inBuffer.remaining()
+ * @param bamFlags per-record BAM flags (for reverse-complement and dedup detection), or null
+ * @return a rewound ByteBuffer containing the compressed data
+ */
+ public ByteBuffer compress(final ByteBuffer inBuffer, final int[] recordLengths, final int[] bamFlags) {
+ final int uncompressedSize = inBuffer.remaining();
+ if (uncompressedSize == 0) {
+ return CompressionUtils.allocateByteBuffer(0);
+ }
+
+ // Build parameters from data analysis
+ final EncoderParams params = buildParameters(inBuffer, recordLengths, bamFlags, uncompressedSize);
+
+ // Allocate output buffer (worst case: header + data with some growth)
+ final int worstCase = (int) ((uncompressedSize + recordLengths.length * 5) * 1.1) + 10000;
+ final ByteBuffer outBuffer = CompressionUtils.allocateByteBuffer(worstCase);
+ outBuffer.order(ByteOrder.LITTLE_ENDIAN);
+
+ // Write header: uncompressed size + parameters
+ CompressionUtils.writeUint7(uncompressedSize, outBuffer);
+ writeParameters(outBuffer, params);
+
+ // Pre-shift ptab and dtab values to eliminate shifts in hot loop (matches htslib optimization)
+ final int[] ptab = params.ptab != null ? params.ptab.clone() : null;
+ if (ptab != null) {
+ for (int i = 0; i < ptab.length; i++) {
+ ptab[i] <<= params.ploc;
+ }
+ }
+ final int[] dtab = params.dtab != null ? params.dtab.clone() : null;
+ if (dtab != null) {
+ for (int i = 0; i < dtab.length; i++) {
+ dtab[i] <<= params.dloc;
+ }
+ }
+
+ // Pre-process: copy quality data into reusable buffer, reverse if needed
+ if (pooledQualData == null || pooledQualData.length < uncompressedSize) {
+ pooledQualData = new byte[uncompressedSize];
+ }
+ final byte[] qualData = pooledQualData;
+ inBuffer.get(qualData, 0, uncompressedSize);
+ if (params.doReverse) {
+ reverseQualitiesInPlace(qualData, recordLengths, bamFlags);
+ }
+
+ // Initialize models (pooled to avoid 65K+ allocations per call) and range coder
+ final ByteModel[] qualityModels = getOrCreateQualityModels(params.maxSymbol + 1);
+ final ByteModel[] lengthModels = getOrCreateLengthModels();
+ final ByteModel reverseModel = getOrCreateReverseModel();
+ final ByteModel dupModel = getOrCreateDupModel();
+ final RangeCoder rangeCoder = new RangeCoder();
+
+ // Set up range coder to write to byte[] starting after the header
+ final int headerSize = outBuffer.position();
+ final byte[] outArray = outBuffer.array();
+ rangeCoder.setOutput(outArray, headerSize);
+
+ // Encoding loop
+ int last = 0;
+ int qctx = 0;
+ int delta = 0;
+ int prevQ = 0;
+ int recIdx = 0;
+ int basesRemaining = 0;
+ boolean firstLen = true;
+ int lastLen = 0;
+
+ for (int i = 0; i < uncompressedSize; i++) {
+ if (basesRemaining == 0) {
+ // Start of new record
+ final int len = recordLengths[recIdx];
+
+ // Encode length (unless fixed-length and not first record)
+ if (!params.fixedLen || firstLen) {
+ lengthModels[0].modelEncode(rangeCoder, (len) & 0xFF);
+ lengthModels[1].modelEncode(rangeCoder, (len >> 8) & 0xFF);
+ lengthModels[2].modelEncode(rangeCoder, (len >> 16) & 0xFF);
+ lengthModels[3].modelEncode(rangeCoder, (len >> 24) & 0xFF);
+ firstLen = false;
+ }
+
+ // Encode reverse flag
+ if (params.doReverse) {
+ final boolean isReverse = bamFlags != null && (bamFlags[recIdx] & BAM_FREVERSE) != 0;
+ reverseModel.modelEncode(rangeCoder, isReverse ? 1 : 0);
+ }
+
+ // Encode duplicate flag
+ if (params.doDedup) {
+ final boolean isDup = i > 0 && len == lastLen &&
+ arraysEqual(qualData, i - lastLen, qualData, i, len);
+ dupModel.modelEncode(rangeCoder, isDup ? 1 : 0);
+ if (isDup) {
+ lastLen = len;
+ i += len - 1; // skip quality encoding, -1 because loop increments
+ recIdx++;
+ basesRemaining = 0;
+ continue;
+ }
+ }
+
+ lastLen = len;
+ basesRemaining = len;
+ last = 0;
+ qctx = 0;
+ delta = 0;
+ prevQ = 0;
+ recIdx++;
+ }
+
+ // Map quality through qmap and encode
+ final int rawQ = qualData[i] & 0xFF;
+ final int q = params.qmap[rawQ];
+ qualityModels[last].modelEncode(rangeCoder, q);
+
+ // Update context
+ qctx = (qctx << params.qshift) + params.qtab[q];
+ last = (qctx & params.qmask) << params.qloc;
+ if (ptab != null) {
+ last += ptab[Math.min(basesRemaining, 1023)];
+ }
+ if (dtab != null) {
+ last += dtab[Math.min(delta, 255)];
+ delta += (prevQ != q) ? 1 : 0;
+ prevQ = q;
+ }
+ last &= (CTX_SIZE - 1);
+
+ basesRemaining--;
+ }
+
+ rangeCoder.rangeEncodeEnd();
+ // Update the ByteBuffer position to match what the range coder wrote
+ outBuffer.position(rangeCoder.getOutputPosition());
+
+ // Post-process: undo reversal to restore input data
+ if (params.doReverse) {
+ reverseQualitiesInPlace(qualData, recordLengths, bamFlags);
+ }
+
+ outBuffer.limit(outBuffer.position());
+ outBuffer.rewind();
+ return outBuffer;
+ }
+
+ /**
+ * Build encoder parameters by analyzing the quality data. Adapts context model parameters
+ * based on symbol count, data size, and record characteristics, matching htslib's
+ * {@code fqz_pick_parameters}.
+ */
+ private EncoderParams buildParameters(final ByteBuffer inBuffer, final int[] recordLengths,
+ final int[] bamFlags, final int dataSize) {
+ final EncoderParams params = new EncoderParams();
+
+ // Scan for symbol statistics
+ final int[] qhist = new int[NUMBER_OF_SYMBOLS];
+ int maxSymbol = 0;
+ int nsym = 0;
+ for (int i = inBuffer.position(); i < inBuffer.limit(); i++) {
+ final int q = inBuffer.get(i) & 0xFF;
+ qhist[q]++;
+ if (q > maxSymbol) maxSymbol = q;
+ }
+ for (int i = 0; i <= maxSymbol; i++) {
+ if (qhist[i] > 0) nsym++;
+ }
+
+ // Start with strategy 0 defaults (basic, matching htslib strat_opts[0])
+ int qbits = 10, qshift = 5, pbits = 4, dbits = 2, dshift = 1;
+ int qloc = 0, sloc = 14, ploc = 10, dloc = 14;
+ int pshift = -1; // auto
+
+ // Symbol-count-based tuning (htslib lines 817-835)
+ if (nsym <= 4) {
+ // NovaSeq-style binned quality (4 values)
+ qshift = 2;
+ if (dataSize < 5_000_000) {
+ pbits = 2;
+ pshift = 5;
+ }
+ } else if (nsym <= 8) {
+ // HiSeqX-style quality (8 values)
+ qbits = Math.min(qbits, 9);
+ qshift = 3;
+ if (dataSize < 5_000_000) {
+ qbits = 6;
+ }
+ }
+
+ // Small dataset adjustment (htslib line 832-835)
+ if (dataSize < 300_000) {
+ qbits = qshift;
+ dbits = 2;
+ }
+
+ // Auto-compute pshift from read length (htslib line 814-815)
+ if (pshift < 0) {
+ pshift = recordLengths.length > 0 ?
+ Math.max(0, (int) (Math.log((double) recordLengths[0] / (1 << pbits)) / Math.log(2) + 0.5)) : 0;
+ }
+
+ params.qbits = qbits;
+ params.qshift = qshift;
+ params.qmask = (1 << qbits) - 1;
+ params.qloc = qloc;
+ params.sloc = sloc;
+ params.ploc = ploc;
+ params.dloc = dloc;
+
+ // Quality map for sparse symbol sets (htslib line 800, 849-863)
+ final boolean storeQmap = (nsym <= 8 && nsym * 2 < maxSymbol);
+ params.qmap = new int[NUMBER_OF_SYMBOLS];
+ if (storeQmap) {
+ // Forward map: qmap[originalQ] = encodedIndex
+ // Reverse map: reverseQmap[encodedIndex] = originalQ (for serialization)
+ params.reverseQmap = new int[nsym];
+ int j = 0;
+ for (int i = 0; i < NUMBER_OF_SYMBOLS; i++) {
+ if (qhist[i] > 0) {
+ params.qmap[i] = j;
+ params.reverseQmap[j] = i;
+ j++;
+ } else {
+ params.qmap[i] = 0;
+ }
+ }
+ params.maxSymbol = nsym;
+ } else {
+ for (int i = 0; i < NUMBER_OF_SYMBOLS; i++) {
+ params.qmap[i] = i;
+ }
+ params.maxSymbol = maxSymbol;
+ }
+
+ // Quality context table (identity - htslib line 867-873)
+ params.qtab = new int[NUMBER_OF_SYMBOLS];
+ for (int i = 0; i < NUMBER_OF_SYMBOLS; i++) {
+ params.qtab[i] = i;
+ }
+
+ // Position table (htslib line 877-885)
+ if (pbits > 0) {
+ params.ptab = new int[1024];
+ for (int i = 0; i < 1024; i++) {
+ params.ptab[i] = Math.min((1 << pbits) - 1, i >> pshift);
+ }
+ }
+
+ // Delta table with approximate-sqrt mapping (htslib line 845-890)
+ if (dbits > 0) {
+ // Clamp dsqr values to fit in dbits
+ final int[] dsqr = DSQR.clone();
+ for (int i = 0; i < dsqr.length; i++) {
+ if (dsqr[i] > (1 << dbits) - 1) {
+ dsqr[i] = (1 << dbits) - 1;
+ }
+ }
+ params.dtab = new int[NUMBER_OF_SYMBOLS];
+ for (int i = 0; i < NUMBER_OF_SYMBOLS; i++) {
+ params.dtab[i] = dsqr[Math.min(dsqr.length - 1, i >> dshift)];
+ }
+ }
+
+ // Check for fixed-length reads
+ params.fixedLen = true;
+ if (recordLengths.length > 1) {
+ for (int i = 1; i < recordLengths.length; i++) {
+ if (recordLengths[i] != recordLengths[0]) {
+ params.fixedLen = false;
+ break;
+ }
+ }
+ }
+
+ // Duplicate detection (htslib: enabled when dup rate > 1/500)
+ if (bamFlags != null) {
+ int dupCount = 0;
+ int offset = 0;
+ int prevLen = 0;
+ final byte[] data = new byte[inBuffer.remaining()];
+ final int pos = inBuffer.position();
+ inBuffer.get(data);
+ inBuffer.position(pos); // restore position
+ for (int rec = 0; rec < recordLengths.length; rec++) {
+ final int len = recordLengths[rec];
+ if (rec > 0 && len == prevLen && offset >= prevLen &&
+ arraysEqual(data, offset - prevLen, data, offset, len)) {
+ dupCount++;
+ }
+ offset += len;
+ prevLen = len;
+ }
+ params.doDedup = ((recordLengths.length + 1) / (dupCount + 1) < 500);
+ }
+
+ // Reverse flag for CRAM 3.x (htslib line 763-764)
+ params.doReverse = (bamFlags != null);
+
+ // Parameter flags
+ params.pflags =
+ (params.ptab != null ? PFLAG_HAVE_PTAB : 0) |
+ (params.dtab != null ? PFLAG_HAVE_DTAB : 0) |
+ (params.fixedLen ? PFLAG_DO_LEN : 0) |
+ (params.doDedup ? PFLAG_DO_DEDUP : 0) |
+ (storeQmap ? PFLAG_HAVE_QMAP : 0);
+
+ // Global flags
+ params.gflags = params.doReverse ? GFLAG_DO_REV : 0;
+
+ return params;
+ }
+
+ /**
+ * Serialize FQZComp parameters to the output buffer.
+ */
+ private void writeParameters(final ByteBuffer outBuffer, final EncoderParams params) {
+ // Version
+ outBuffer.put((byte) FQZ_VERSION);
+
+ // Global flags
+ outBuffer.put((byte) params.gflags);
+
+ // Single parameter block: context (little-endian u16)
+ outBuffer.put((byte) 0);
+ outBuffer.put((byte) 0);
+
+ // pflags
+ outBuffer.put((byte) params.pflags);
+
+ // max_sym
+ outBuffer.put((byte) params.maxSymbol);
+
+ // qbits(4) | qshift(4)
+ outBuffer.put((byte) ((params.qbits << 4) | params.qshift));
+
+ // qloc(4) | sloc(4)
+ outBuffer.put((byte) ((params.qloc << 4) | params.sloc));
+
+ // ploc(4) | dloc(4)
+ outBuffer.put((byte) ((params.ploc << 4) | params.dloc));
+
+ // Quality map (PFLAG_HAVE_QMAP): write the original quality values for each encoded index.
+ // The decoder reads maxSymbol bytes into qualityMap[]; qualityMap[encoded_index] = original_quality.
+ if ((params.pflags & PFLAG_HAVE_QMAP) != 0) {
+ for (int i = 0; i < params.maxSymbol; i++) {
+ outBuffer.put((byte) params.reverseQmap[i]);
+ }
+ }
+
+ // Position table (PFLAG_HAVE_PTAB)
+ if (params.ptab != null) {
+ storeArray(outBuffer, params.ptab, 1024);
+ }
+
+ // Delta table (PFLAG_HAVE_DTAB)
+ if (params.dtab != null) {
+ storeArray(outBuffer, params.dtab, NUMBER_OF_SYMBOLS);
+ }
}
+ /**
+ * Serialize an array using the two-level run-length encoding used by FQZComp for tables.
+ * This is the inverse of {@link FQZUtils#readArray(ByteBuffer, int[], int)}.
+ *
+ * @param outBuffer output buffer to write to
+ * @param array the array to serialize
+ * @param size number of elements to serialize
+ */
+ public static void storeArray(final ByteBuffer outBuffer, final int[] array, final int size) {
+ final byte[] tmp = new byte[4096];
+ int k = 0;
+ int j = 0;
+ for (int i = 0; i < size; j++) {
+ int runLen = i;
+ while (i < size && array[i] == j) {
+ i++;
+ }
+ runLen = i - runLen;
+
+ int r;
+ do {
+ r = Math.min(255, runLen);
+ tmp[k++] = (byte) r;
+ runLen -= r;
+ } while (r == 255);
+ }
+
+ int last = -1;
+ for (int i = 0; i < k; ) {
+ outBuffer.put(tmp[i]);
+ if ((tmp[i] & 0xFF) == last) {
+ int n = i + 1;
+ while (n < k && (tmp[n] & 0xFF) == last) {
+ n++;
+ }
+ outBuffer.put((byte) (n - i - 1));
+ i = n;
+ } else {
+ last = tmp[i] & 0xFF;
+ i++;
+ }
+ }
+ }
+
+ /**
+ * Reverse quality arrays in place for reverse-complemented reads.
+ * Called before encoding (to match decoder's post-decode reversal).
+ */
+ private static void reverseQualitiesInPlace(final byte[] data, final int[] lengths, final int[] bamFlags) {
+ if (bamFlags == null) return;
+ int offset = 0;
+ for (int rec = 0; rec < lengths.length; rec++) {
+ if ((bamFlags[rec] & BAM_FREVERSE) != 0) {
+ int lo = offset;
+ int hi = offset + lengths[rec] - 1;
+ while (lo < hi) {
+ final byte tmp = data[lo];
+ data[lo] = data[hi];
+ data[hi] = tmp;
+ lo++;
+ hi--;
+ }
+ }
+ offset += lengths[rec];
+ }
+ }
+
+ /** Compare sub-arrays for equality. */
+ private static boolean arraysEqual(final byte[] a, final int aOff, final byte[] b, final int bOff, final int len) {
+ for (int i = 0; i < len; i++) {
+ if (a[aOff + i] != b[bOff + i]) return false;
+ }
+ return true;
+ }
+
+ /** Internal parameter holder for the encoder. */
+ private static class EncoderParams {
+ int maxSymbol;
+ int qbits, qshift, qmask;
+ int qloc, sloc, ploc, dloc;
+ int[] qmap; // forward map: qmap[originalQ] = encodedIndex
+ int[] reverseQmap; // reverse map: reverseQmap[encodedIndex] = originalQ (for serialization)
+ int[] qtab;
+ int[] ptab;
+ int[] dtab;
+ boolean fixedLen;
+ boolean doDedup;
+ boolean doReverse;
+ int pflags;
+ int gflags;
+ }
}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompExternalCompressor.java b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompExternalCompressor.java
index 4686b80a20..839cc3315c 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompExternalCompressor.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompExternalCompressor.java
@@ -5,26 +5,110 @@
import htsjdk.samtools.cram.structure.CRAMCodecModelContext;
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
+/**
+ * External compressor wrapper for the FQZComp quality score codec. Bridges the
+ * {@link ExternalCompressor} interface with the FQZComp encoder/decoder, extracting
+ * per-record metadata from the {@link CRAMCodecModelContext} that the encoder needs
+ * to determine record boundaries.
+ */
public class FQZCompExternalCompressor extends ExternalCompressor {
- // this codec is decode only; not implemented for writing
+ private final FQZCompEncode fqzCompEncoder;
private final FQZCompDecode fqzCompDecoder;
public FQZCompExternalCompressor(
- final FQZCompEncode unused_fqzCompEncoder,
+ final FQZCompEncode fqzCompEncoder,
final FQZCompDecode fqzCompDecoder) {
super(BlockCompressionMethod.FQZCOMP);
+ this.fqzCompEncoder = fqzCompEncoder;
this.fqzCompDecoder = fqzCompDecoder;
}
+ /**
+ * Compress quality score data using FQZComp. Per-record quality score lengths are extracted
+ * from the context model; if unavailable or mismatched, the data is treated as a single record.
+ * Empty data is returned as-is.
+ *
+ * @param data concatenated quality scores
+ * @param contextModel context containing per-record lengths and flags, or null
+ * @return compressed data
+ */
@Override
- public byte[] compress(byte[] data, final CRAMCodecModelContext unused_contextModel) {
- throw new UnsupportedOperationException("FQZComp compression is not implemented");
+ public byte[] compress(byte[] data, final CRAMCodecModelContext contextModel) {
+ if (data.length == 0) {
+ return data;
+ }
+
+ final int[] allLengths = getRecordLengths(data.length, contextModel);
+ final int[] allFlags = contextModel != null ? contextModel.getBamFlags() : null;
+
+ // FQZComp iterates quality bytes and advances its record index when it exhausts each
+ // record's length. Records with zero quality bytes (missing quality scores) are never
+ // visited, so we must filter them out of both the lengths and flags arrays.
+ final int[] lengths = filterNonZero(allLengths);
+ final int[] bamFlags = allFlags != null ? filterByNonZero(allFlags, allLengths) : null;
+
+ return CompressionUtils.toByteArray(
+ fqzCompEncoder.compress(CompressionUtils.wrap(data), lengths, bamFlags));
+ }
+
+ /**
+ * Determine per-record quality score lengths. Uses the context model when available and
+ * the lengths sum to the data size (indicating all records have preserved quality scores).
+ * Falls back to treating the entire data as a single record otherwise.
+ */
+ private static int[] getRecordLengths(final int dataLength, final CRAMCodecModelContext contextModel) {
+ if (contextModel != null && contextModel.getQualityScoreLengths() != null) {
+ final int[] lengths = contextModel.getQualityScoreLengths();
+ int sum = 0;
+ for (final int len : lengths) {
+ sum += len;
+ }
+ if (sum == dataLength) {
+ return lengths;
+ }
+ }
+ // Fall back: treat the entire data as a single record
+ return new int[]{dataLength};
+ }
+
+ /** Returns a new array containing only the non-zero values from the input. */
+ private static int[] filterNonZero(final int[] values) {
+ int count = 0;
+ for (final int v : values) {
+ if (v != 0) count++;
+ }
+ if (count == values.length) return values;
+ final int[] result = new int[count];
+ int j = 0;
+ for (final int v : values) {
+ if (v != 0) result[j++] = v;
+ }
+ return result;
+ }
+
+ /** Returns elements from {@code values} at positions where {@code filter} is non-zero. */
+ private static int[] filterByNonZero(final int[] values, final int[] filter) {
+ int count = 0;
+ for (final int f : filter) {
+ if (f != 0) count++;
+ }
+ if (count == filter.length) return values;
+ final int[] result = new int[count];
+ int j = 0;
+ for (int i = 0; i < filter.length; i++) {
+ if (filter[i] != 0) result[j++] = values[i];
+ }
+ return result;
}
+ /** {@inheritDoc} */
@Override
public byte[] uncompress(byte[] data) {
+ if (data.length == 0) {
+ return data;
+ }
return fqzCompDecoder.uncompress(CompressionUtils.wrap(data)).array();
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZGlobalFlags.java b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZGlobalFlags.java
index 671965488e..37b05cab77 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZGlobalFlags.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZGlobalFlags.java
@@ -2,6 +2,12 @@
import java.nio.ByteBuffer;
+/**
+ * Global flags byte for the FQZComp codec, read from the first byte after the version number
+ * in the FQZComp parameter header. Controls whether multiple parameter blocks are present,
+ * whether a selector table maps records to parameter blocks, and whether quality scores
+ * should be reversed for reverse-complemented reads.
+ */
public class FQZGlobalFlags {
public static final int MULTI_PARAM_FLAG_MASK = 0x01;
public static final int SELECTOR_TABLE_FLAG_MASK = 0x02;
diff --git a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZModels.java b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZModels.java
index 696971888a..323d9081fc 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZModels.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZModels.java
@@ -2,6 +2,14 @@
import htsjdk.samtools.cram.compression.range.ByteModel;
+/**
+ * Adaptive arithmetic coding models used by FQZComp for encoding and decoding quality scores.
+ * Contains 65,536 quality models (one per possible 16-bit context), plus models for record
+ * length, reverse-complement flag, duplicate flag, and parameter selector.
+ *
+ * @see ByteModel
+ * @see FQZParams
+ */
public class FQZModels {
private final ByteModel[] quality; // Primary model for quality values
diff --git a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZParam.java b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZParam.java
index a9fa371168..0bf02f79db 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZParam.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZParam.java
@@ -2,6 +2,22 @@
import java.nio.ByteBuffer;
+/**
+ * A single parameter block for the FQZComp quality score codec. Defines the context model used
+ * to compress quality scores, including the number of context bits allocated to quality history,
+ * position within read, running delta, and selector. Also holds optional lookup tables (qtab, ptab,
+ * dtab) and quality maps for remapping sparse quality value alphabets.
+ *
+ * <p>
+ * The 16-bit context for each quality score is assembled from:
+ * <ul>
+ *     <li>{@code qbits} bits of quality history at bit position {@code qloc}</li>
+ *     <li>Position context from {@code ptab[]} at bit position {@code ploc}</li>
+ *     <li>Delta context from {@code dtab[]} at bit position {@code dloc}</li>
+ *     <li>Selector bits at bit position {@code sloc}</li>
+ * </ul>
+ *
+ * @see FQZParams
+ */
public class FQZParam {
private static final int DEDUP_FLAG_MASK = 0x02;
private static final int FIXED_LEN_FLAG_MASK = 0x04;
@@ -160,7 +176,7 @@ public void setFixedLen(int fixedLen) {
private void cacheParameterFlags(int parameterFlags) {
this.doDedup = (parameterFlags & DEDUP_FLAG_MASK) != 0;
- setFixedLen(parameterFlags & FIXED_LEN_FLAG_MASK); //TODO: f'd up - is this a flag or an int ?
+ setFixedLen(parameterFlags & FIXED_LEN_FLAG_MASK);
this.doSel = (parameterFlags & SEL_FLAG_MASK) != 0;
this.doQmap = (parameterFlags & QMAP_FLAG_MASK) != 0;
this.doPos = (parameterFlags & PTAB_FLAG_MASK) != 0;
diff --git a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZParams.java b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZParams.java
index 8a6e30527c..275bfcd45d 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZParams.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZParams.java
@@ -4,6 +4,14 @@
import java.util.ArrayList;
import java.util.List;
+/**
+ * Global parameters for the FQZComp quality score codec. Contains one or more {@link FQZParam} blocks,
+ * an optional selector table for choosing between parameter blocks per record, and the global
+ * flags controlling features like quality reversal and multi-parameter mode.
+ *
+ * @see FQZParam
+ * @see FQZGlobalFlags
+ */
public class FQZParams {
private static final int NUMBER_OF_SYMBOLS = 256;
diff --git a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZState.java b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZState.java
index 66743d361a..d86f74901f 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZState.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZState.java
@@ -2,6 +2,11 @@
import htsjdk.samtools.cram.structure.CRAMEncodingStrategy;
+/**
+ * Mutable state tracked during FQZComp encoding and decoding. Updated per quality score as the
+ * context model evolves, and reset at each record boundary. Tracks quality context history,
+ * position within read, running delta, and per-record metadata (lengths, reversal flags).
+ */
public class FQZState {
private int qualityContext; // Qual-only sub-context
private int previousQuality; // Previous quality value
diff --git a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZUtils.java b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZUtils.java
index e1ea14a9f0..3c46157e29 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZUtils.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZUtils.java
@@ -2,8 +2,23 @@
import java.nio.ByteBuffer;
+/**
+ * Utility methods for FQZComp table serialization. Tables (qtab, ptab, dtab, stab) are stored
+ * using a two-level run-length encoding scheme where values are sequential from 0.
+ *
+ * @see FQZCompEncode#storeArray(java.nio.ByteBuffer, int[], int)
+ */
public class FQZUtils {
+ /**
+ * Read a table from a two-level run-length encoded stream. The encoding first stores
+ * run lengths for each successive value (0, 1, 2, ...), using 255 as a continuation marker.
+ * A second RLE level compresses consecutive identical run-length values.
+ *
+ * @param inBuffer the input stream to read from
+ * @param table output array to populate with decoded values
+ * @param size number of elements to decode into the table
+ */
public static void readArray(final ByteBuffer inBuffer, final int[] table, final int size) {
int j = 0; // array value
int z = 0; // array index: table[j]
diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java
index 63e0a29bb2..d9ea8639c9 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java
@@ -2,6 +2,7 @@
import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.compression.CompressionUtils;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Decode;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
@@ -22,6 +23,9 @@ public class NameTokenisationDecode {
public static final int DEFAULT_POSITION_ALLOCATION = 30;
+ // Reusable rANS decoder — avoids allocating 1MB+ of arrays per token stream
+ private final RANSNx16Decode sharedRansDecoder = new RANSNx16Decode();
+
/**
* Uncompress the compressed name data in the input buffer. Return is a byte[] containing the read names,
* each separated by the byte value specified by nameSeparator, including a terminating separator.
@@ -35,7 +39,7 @@ public byte[] uncompress(final ByteBuffer inBuffer, final byte nameSeparator) {
final int numNames = inBuffer.getInt() & 0xFFFFFFFF;
final int useArith = inBuffer.get() & 0xFF;
- final TokenStreams tokenStreams = new TokenStreams(inBuffer, useArith, numNames);
+ final TokenStreams tokenStreams = new TokenStreams(inBuffer, useArith, numNames, sharedRansDecoder);
// two-dimensional array of previously decoded tokens, indexed as (nameIndex, tokenPosition - 1); note
// that unlike the TYPE stream in TokenStreams, where token position 1 is located at index 1 because of
diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java
index d17220d1fc..5df46118db 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java
@@ -5,8 +5,8 @@
import htsjdk.samtools.cram.compression.nametokenisation.tokens.EncodeToken;
import htsjdk.samtools.cram.compression.range.RangeEncode;
import htsjdk.samtools.cram.compression.range.RangeParams;
-import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Encode;
-import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Params;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Encode;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Params;
import htsjdk.samtools.cram.structure.CRAMEncodingStrategy;
import java.nio.ByteBuffer;
@@ -15,28 +15,52 @@
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
/**
- * A very naive implementation of a name tokenization encoder.
- *
- * It does not currently:
- *
- * - recognize and encode for duplicate streams (that is, it does not ever set the DUP_PREVIOUS_STREAM_FLAG_MASK flag)
- * - detect and encode for streams that are all match, as mentioned in the spec ("if a byte stream of token types
- * is entirely MATCH apart from the very first value it is discarded. It is possible to regenerate this during decode
- * by observing the other byte streams.")
+ * Name tokenization encoder that compresses read names by tokenizing them and encoding
+ * each token stream independently using rANS or arithmetic coding. Uses per-token-type
+ * flag selection to match htslib's tok3 encoder behavior.
*/
public class NameTokenisationEncode {
- private final static String READ_NAME_TOK_REGEX = "([a-zA-Z0-9]{1,9})|([^a-zA-Z0-9]+)";
- private final static Pattern READ_NAME_PATTERN = Pattern.compile(READ_NAME_TOK_REGEX);
- private final static String DIGITS0_REGEX = "^0+[0-9]*$";
- private final static Pattern DIGITS0_PATTERN = Pattern.compile(DIGITS0_REGEX);
-
- private final static String DIGITS_REGEX = "^[0-9]+$";
- private final static Pattern DIGITS_PATTERN = Pattern.compile(DIGITS_REGEX);
+ // Per-token-type rANS flag sets, matching htslib's tok3 level -3 profile.
+ // Each row is indexed by token type constant (TOKEN_TYPE=0 through TOKEN_END=12).
+ // Within each row, the encoder tries all listed flag combinations and keeps the smallest.
+ private static final int[][] RANS_FLAG_SETS_BY_TOKEN_TYPE = {
+ /* TOKEN_TYPE (0x00) */ {RANSNx16Params.PACK_FLAG_MASK | RANSNx16Params.RLE_FLAG_MASK, 0},
+ /* TOKEN_STRING (0x01) */ {RANSNx16Params.PACK_FLAG_MASK | RANSNx16Params.ORDER_FLAG_MASK, RANSNx16Params.ORDER_FLAG_MASK, 0},
+ /* TOKEN_CHAR (0x02) */ {0},
+ /* TOKEN_DIGITS0 (0x03) */ {RANSNx16Params.PACK_FLAG_MASK | RANSNx16Params.STRIPE_FLAG_MASK, 0},
+ /* TOKEN_DZLEN (0x04) */ {0},
+ /* TOKEN_DUP (0x05) */ {RANSNx16Params.PACK_FLAG_MASK | RANSNx16Params.RLE_FLAG_MASK | RANSNx16Params.STRIPE_FLAG_MASK},
+ /* TOKEN_DIFF (0x06) */ {RANSNx16Params.PACK_FLAG_MASK | RANSNx16Params.STRIPE_FLAG_MASK},
+ /* TOKEN_DIGITS (0x07) */ {RANSNx16Params.PACK_FLAG_MASK | RANSNx16Params.RLE_FLAG_MASK | RANSNx16Params.STRIPE_FLAG_MASK},
+ /* TOKEN_DELTA (0x08) */ {0},
+ /* TOKEN_DELTA0 (0x09) */ {RANSNx16Params.PACK_FLAG_MASK},
+ /* TOKEN_MATCH (0x0A) */ {0},
+ /* TOKEN_NOP (0x0B) */ {0},
+ /* TOKEN_END (0x0C) */ {0},
+ };
+
+ // Per-token-type Range (arithmetic) flag sets, mirroring the rANS sets above
+ private static final int[][] RANGE_FLAG_SETS_BY_TOKEN_TYPE = {
+ /* TOKEN_TYPE (0x00) */ {RangeParams.PACK_FLAG_MASK | RangeParams.RLE_FLAG_MASK, 0},
+ /* TOKEN_STRING (0x01) */ {RangeParams.PACK_FLAG_MASK | RangeParams.ORDER_FLAG_MASK, RangeParams.ORDER_FLAG_MASK, 0},
+ /* TOKEN_CHAR (0x02) */ {0},
+ /* TOKEN_DIGITS0 (0x03) */ {RangeParams.PACK_FLAG_MASK | RangeParams.STRIPE_FLAG_MASK, 0},
+ /* TOKEN_DZLEN (0x04) */ {0},
+ /* TOKEN_DUP (0x05) */ {RangeParams.PACK_FLAG_MASK | RangeParams.RLE_FLAG_MASK | RangeParams.STRIPE_FLAG_MASK},
+ /* TOKEN_DIFF (0x06) */ {RangeParams.PACK_FLAG_MASK | RangeParams.STRIPE_FLAG_MASK},
+ /* TOKEN_DIGITS (0x07) */ {RangeParams.PACK_FLAG_MASK | RangeParams.RLE_FLAG_MASK | RangeParams.STRIPE_FLAG_MASK},
+ /* TOKEN_DELTA (0x08) */ {0},
+ /* TOKEN_DELTA0 (0x09) */ {RangeParams.PACK_FLAG_MASK},
+ /* TOKEN_MATCH (0x0A) */ {0},
+ /* TOKEN_NOP (0x0B) */ {0},
+ /* TOKEN_END (0x0C) */ {0},
+ };
+
+ // Reusable encoder instance — avoids allocating 256x256 RANSEncodingSymbol matrix per trial
+ private final RANSNx16Encode reusableRansEncoder = new RANSNx16Encode();
private int maxPositions; // the maximum number of tokenised columns seen across all names
private int maxStringValueLength; // longest *String* value for any token
@@ -50,6 +74,10 @@ public class NameTokenisationEncode {
* @return the compressed buffer
*/
public ByteBuffer compress(final ByteBuffer inBuffer, final boolean useArith, final byte nameSeparator) {
+ // Reset per-block state from any previous compress() call
+ maxPositions = 0;
+ maxStringValueLength = 0;
+
// strictly speaking, keeping this list isn't necessary, but since the first thing that we need to write
// to the output stream is the number of names, we have to scan the entire input anyway to count them,
// so just extract them while we're scanning
@@ -91,12 +119,16 @@ public ByteBuffer compress(final ByteBuffer inBuffer, final boolean useArith, fi
);
}
+ // Track all previously compressed streams for cross-position duplicate detection.
+ // Each entry maps compressed bytes to the (position, tokenType) of the first occurrence.
+ final List<CompressedStream> compressedStreamRegistry = new ArrayList<>();
+
for (int position = 0; position < maxPositions; position++) {
final List<ByteBuffer> streamsForPosition = distributeTokensForPosition(
encodedTokensByName,
position,
numNames);
- serializeTokenStreams(streamsForPosition, outBuffer, useArith);
+ serializeTokenStreams(streamsForPosition, outBuffer, useArith, position, compressedStreamRegistry);
}
// set the limit to current position (important because we initially dramatically over-allocated the buffer,
@@ -115,37 +147,42 @@ private List<EncodeToken> tokeniseName(
if (nameIndexMap.containsKey(name)) {
// duplicate name, there is no need to tokenise the name, just encode the index of the duplicate
- final String duplicateIndex = String.valueOf(nameIndex - nameIndexMap.get(name));
- return List.of(new EncodeToken.DupOrDiffToken(TokenStreams.TOKEN_DUP, duplicateIndex));
+ final int duplicateIndex = nameIndex - nameIndexMap.get(name);
+ return List.of(new EncodeToken.DupOrDiffToken(TokenStreams.TOKEN_DUP, String.valueOf(duplicateIndex)));
}
final List<EncodeToken> encodedTokens = new ArrayList<>(NameTokenisationDecode.DEFAULT_POSITION_ALLOCATION);
- // if this name is the first name, the diff value (which indicates the relative position of the record against
- // which we are diffing) must be 0; otherwise for now use a naive strategy that only/always diffs against the
- // (immediately) preceding name by specifying a value of 1
encodedTokens.add(0, new EncodeToken.DupOrDiffToken(TokenStreams.TOKEN_DIFF, String.valueOf(nameIndex == 0 ? 0 : 1)));
nameIndexMap.put(name, nameIndex);
final int prevNameIndex = nameIndex - 1;
- // tokenise the current name
- final Matcher matcher = READ_NAME_PATTERN.matcher(name);
- for (int i = 1; matcher.find(); i++) {
+ // Tokenize the name by splitting on alphanumeric / non-alphanumeric boundaries.
+ // Equivalent to regex: "([a-zA-Z0-9]{1,9})|([^a-zA-Z0-9]+)" but without regex overhead.
+ int pos = 0;
+ for (int i = 1; pos < name.length(); i++) {
+ final int start = pos;
+ if (isAlphanumeric(name.charAt(pos))) {
+ final int limit = Math.min(name.length(), start + 9);
+ do { pos++; } while (pos < limit && isAlphanumeric(name.charAt(pos)));
+ } else {
+ do { pos++; } while (pos < name.length() && !isAlphanumeric(name.charAt(pos)));
+ }
+ final String fragmentValue = name.substring(start, pos);
+
byte type = TokenStreams.TOKEN_STRING;
- final String fragmentValue = matcher.group(); // absolute value of the token
String relativeValue = fragmentValue; // relative value of the token (comparing to prev name's token at the same token position)
- if (DIGITS0_PATTERN.matcher(fragmentValue).matches()) {
+ if (isAllDigitsLeadingZero(fragmentValue)) {
type = TokenStreams.TOKEN_DIGITS0;
- } else if (DIGITS_PATTERN.matcher(fragmentValue).matches()) {
+ } else if (isAllDigits(fragmentValue)) {
type = TokenStreams.TOKEN_DIGITS;
} else if (fragmentValue.length() == 1) {
type = TokenStreams.TOKEN_CHAR;
} // else just treat it as an absolute string
- // compare the current token with the corresponding token from the previous name (this implementation always
- // compares against immediately previous name only), but ONLY if the previous name actually has a
- // corresponding token, and that token is not the terminal token
+ // compare the current token with the corresponding token from the previous name,
+ // but ONLY if the previous name actually has a corresponding token and is not the terminal token
final EncodeToken prevToken = prevNameIndex >= 0 && encodedTokensByName.get(prevNameIndex).size() > i + 1?
encodedTokensByName.get(prevNameIndex).get(i) :
null;
@@ -156,20 +193,24 @@ private List<EncodeToken> tokeniseName(
relativeValue = null;
} else if (type==TokenStreams.TOKEN_DIGITS &&
(prevToken.getTokenType() == TokenStreams.TOKEN_DIGITS || prevToken.getTokenType() == TokenStreams.TOKEN_DELTA)) {
- int d = Integer.parseInt(relativeValue) - Integer.parseInt(prevToken.getActualValue());
+ final int curVal = Integer.parseInt(relativeValue);
+ final int d = curVal - Integer.parseInt(prevToken.getActualValue());
tokenFrequencies[i]++;
if (d >= 0 && d < 256 && tokenFrequencies[i] > nameIndex / 2) {
type = TokenStreams.TOKEN_DELTA;
- relativeValue = String.valueOf(d);
+ encodedTokens.add(new EncodeToken(type, fragmentValue, d));
+ continue;
}
} else if (type == TokenStreams.TOKEN_DIGITS0 &&
prevToken.getActualValue().length() == relativeValue.length() &&
(prevToken.getTokenType() == TokenStreams.TOKEN_DIGITS0 || prevToken.getTokenType() == TokenStreams.TOKEN_DELTA0)) {
- int d = Integer.parseInt(relativeValue) - Integer.parseInt(prevToken.getActualValue());
+ final int curVal = Integer.parseInt(relativeValue);
+ final int d = curVal - Integer.parseInt(prevToken.getActualValue());
tokenFrequencies[i]++;
if (d >= 0 && d < 256 && tokenFrequencies[i] > nameIndex / 2) {
type = TokenStreams.TOKEN_DELTA0;
- relativeValue = String.valueOf(d);
+ encodedTokens.add(new EncodeToken(type, fragmentValue, d));
+ continue;
}
}
}
@@ -192,6 +233,30 @@ private List<EncodeToken> tokeniseName(
return encodedTokens;
}
+ /** Check if a character is alphanumeric (a-z, A-Z, 0-9). */
+ private static boolean isAlphanumeric(final char c) {
+ return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
+ }
+
+ /** Check if all characters are ASCII digits (equivalent to ^[0-9]+$). */
+ private static boolean isAllDigits(final String s) {
+ for (int i = 0; i < s.length(); i++) {
+ final char c = s.charAt(i);
+ if (c < '0' || c > '9') return false;
+ }
+ return s.length() > 0;
+ }
+
+ /** Check if string matches ^0+[0-9]*$ (starts with at least one zero, rest are digits). */
+ private static boolean isAllDigitsLeadingZero(final String s) {
+ if (s.length() == 0 || s.charAt(0) != '0') return false;
+ for (int i = 1; i < s.length(); i++) {
+ final char c = s.charAt(i);
+ if (c < '0' || c > '9') return false;
+ }
+ return true;
+ }
+
// extract the individual names from the input buffer and return in a list
private static List<String> extractInputNames(
final ByteBuffer inBuffer,
@@ -255,12 +320,12 @@ private List<ByteBuffer> distributeTokensForPosition(
switch (type) {
case TokenStreams.TOKEN_DIFF:
getByteBufferFor(tokenStreams, TokenStreams.TOKEN_DIFF, numNames * 4)
- .putInt(Integer.parseInt(encodeToken.getRelativeValue()));
+ .putInt(encodeToken.getRelativeValueAsInt());
break;
case TokenStreams.TOKEN_DUP:
getByteBufferFor(tokenStreams, TokenStreams.TOKEN_DUP, numNames * 4)
- .putInt(Integer.parseInt(encodeToken.getRelativeValue()));
+ .putInt(encodeToken.getRelativeValueAsInt());
break;
case TokenStreams.TOKEN_STRING:
@@ -277,24 +342,24 @@ private List distributeTokensForPosition(
case TokenStreams.TOKEN_DIGITS:
getByteBufferFor(tokenStreams, TokenStreams.TOKEN_DIGITS, numNames * 4)
- .putInt(Integer.parseInt(encodeToken.getRelativeValue()));
+ .putInt(encodeToken.getRelativeValueAsInt());
break;
case TokenStreams.TOKEN_DIGITS0:
getByteBufferFor(tokenStreams, TokenStreams.TOKEN_DIGITS0, numNames * 4)
- .putInt(Integer.parseInt(encodeToken.getRelativeValue()));
+ .putInt(encodeToken.getRelativeValueAsInt());
getByteBufferFor(tokenStreams, TokenStreams.TOKEN_DZLEN, numNames)
.put((byte) encodeToken.getRelativeValue().length());
break;
case TokenStreams.TOKEN_DELTA:
getByteBufferFor(tokenStreams, TokenStreams.TOKEN_DELTA, numNames * 1)
- .put((byte)Integer.parseInt(encodeToken.getRelativeValue()));
+ .put((byte) encodeToken.getRelativeValueAsInt());
break;
case TokenStreams.TOKEN_DELTA0:
getByteBufferFor(tokenStreams, TokenStreams.TOKEN_DELTA0, numNames * 1)
- .put((byte)Integer.parseInt(encodeToken.getRelativeValue()));
+ .put((byte) encodeToken.getRelativeValueAsInt());
break;
case TokenStreams.TOKEN_NOP:
@@ -331,88 +396,154 @@ private ByteBuffer getByteBufferFor(
}
private static void writeString(final ByteBuffer tokenStreamBuffer, final String val) {
- tokenStreamBuffer.put(val.getBytes());
+ tokenStreamBuffer.put(val.getBytes(StandardCharsets.US_ASCII));
tokenStreamBuffer.put((byte) 0);
}
- private static ByteBuffer tryCompress(final ByteBuffer nameTokenStream, final boolean useArith) {
- // compress with different formatFlags
- // and return the compressed output ByteBuffer with the least number of bytes
+ /**
+ * Try multiple compression flag combinations for the given token stream and return the
+ * smallest compressed result. Flag sets are selected per token type to match htslib's
+ * tok3 encoder behavior.
+ */
+ private ByteBuffer tryCompress(final ByteBuffer nameTokenStream, final boolean useArith, final int tokenType) {
int bestCompressedLength = 1 << 30;
ByteBuffer compressedByteBuffer = null;
+ final int streamSize = nameTokenStream.limit();
- if (useArith == true) { // use the range encoder
- final int[] rangeEncoderFlagsSets = {
- 0,
- RangeParams.ORDER_FLAG_MASK,
- RangeParams.RLE_FLAG_MASK, //64
- RangeParams.RLE_FLAG_MASK | RangeParams.ORDER_FLAG_MASK, //65
- RangeParams.PACK_FLAG_MASK, //128,
- RangeParams.PACK_FLAG_MASK | RangeParams.ORDER_FLAG_MASK, //129
- // we don't include stripe here since it's not implemented for write
- RangeParams.PACK_FLAG_MASK | RangeParams.RLE_FLAG_MASK | RangeParams.ORDER_FLAG_MASK // 193+8
- };
- for (int rangeEncoderFlagSet : rangeEncoderFlagsSets) {
- if ((rangeEncoderFlagSet & RangeParams.ORDER_FLAG_MASK) != 0 && nameTokenStream.remaining() < 100) {
- continue;
- }
- if ((rangeEncoderFlagSet & RangeParams.STRIPE_FLAG_MASK) != 0 && (nameTokenStream.remaining() % 4) != 0) {
- continue;
- }
- // Encode using Range
- final RangeEncode rangeEncode = new RangeEncode();
- nameTokenStream.rewind();
- final ByteBuffer tmpByteBuffer = rangeEncode.compress(nameTokenStream, new RangeParams(rangeEncoderFlagSet));
- if (bestCompressedLength > tmpByteBuffer.limit()) {
- bestCompressedLength = tmpByteBuffer.limit();
- compressedByteBuffer = tmpByteBuffer;
- }
+ final int[] flagSets = useArith
+ ? RANGE_FLAG_SETS_BY_TOKEN_TYPE[tokenType]
+ : RANS_FLAG_SETS_BY_TOKEN_TYPE[tokenType];
+
+ for (final int flagSet : flagSets) {
+ if ((flagSet & RANSNx16Params.ORDER_FLAG_MASK) != 0 && streamSize < 100) {
+ continue;
}
- } else {
- final int[] ransNx16FlagsSets = {
- 0,
- RANSNx16Params.ORDER_FLAG_MASK,
- RANSNx16Params.RLE_FLAG_MASK, //64
- RANSNx16Params.RLE_FLAG_MASK | RANSNx16Params.ORDER_FLAG_MASK, //65
- RANSNx16Params.PACK_FLAG_MASK, //128,
- RANSNx16Params.PACK_FLAG_MASK | RANSNx16Params.ORDER_FLAG_MASK, //129
- // we don't include stripe here since it's not implemented for write
- RANSNx16Params.PACK_FLAG_MASK | RANSNx16Params.RLE_FLAG_MASK | RANSNx16Params.ORDER_FLAG_MASK // 193+8
- };
- for (int ransNx16FlagSet : ransNx16FlagsSets) {
- if ((ransNx16FlagSet & RANSNx16Params.ORDER_FLAG_MASK) != 0 && nameTokenStream.remaining() < 100) {
- continue;
- }
- if ((ransNx16FlagSet & RANSNx16Params.STRIPE_FLAG_MASK) != 0 && (nameTokenStream.remaining() % 4) != 0) {
- continue;
- }
- // Encode using RANSnx16
- final RANSNx16Encode ransEncode = new RANSNx16Encode();
- nameTokenStream.rewind();
- final ByteBuffer tmpByteBuffer = ransEncode.compress(nameTokenStream, new RANSNx16Params(ransNx16FlagSet));
- if (bestCompressedLength > tmpByteBuffer.limit()) {
- bestCompressedLength = tmpByteBuffer.limit();
- compressedByteBuffer = tmpByteBuffer;
- }
+ if ((flagSet & RANSNx16Params.STRIPE_FLAG_MASK) != 0 && (streamSize % 4) != 0) {
+ continue;
+ }
+ nameTokenStream.rewind();
+ final ByteBuffer tmpByteBuffer;
+ if (useArith) {
+ tmpByteBuffer = new RangeEncode().compress(nameTokenStream, new RangeParams(flagSet));
+ } else {
+ final byte[] streamBytes = new byte[nameTokenStream.remaining()];
+ nameTokenStream.get(streamBytes);
+ final byte[] compressed = reusableRansEncoder.compress(streamBytes, new RANSNx16Params(flagSet));
+ tmpByteBuffer = ByteBuffer.wrap(compressed);
+ }
+ if (bestCompressedLength > tmpByteBuffer.limit()) {
+ bestCompressedLength = tmpByteBuffer.limit();
+ compressedByteBuffer = tmpByteBuffer;
+ }
+ }
+
+ if (bestCompressedLength > nameTokenStream.limit()) {
+ // compression doesn't buy us anything; fall back to CAT (uncompressed)
+ nameTokenStream.rewind();
+ if (useArith) {
+ compressedByteBuffer = new RangeEncode().compress(nameTokenStream, new RangeParams(RangeParams.CAT_FLAG_MASK));
+ } else {
+ final byte[] streamBytes = new byte[nameTokenStream.remaining()];
+ nameTokenStream.get(streamBytes);
+ final byte[] compressed = reusableRansEncoder.compress(streamBytes, new RANSNx16Params(RANSNx16Params.CAT_FLAG_MASK));
+ compressedByteBuffer = ByteBuffer.wrap(compressed);
}
}
return compressedByteBuffer;
}
+ /**
+ * Tracks a compressed stream's bytes and its source coordinates (position, tokenType) for
+ * cross-position duplicate detection.
+ */
+ private static class CompressedStream {
+ final byte[] compressedBytes;
+ final int position;
+ final int tokenType;
+
+ CompressedStream(final byte[] compressedBytes, final int position, final int tokenType) {
+ this.compressedBytes = compressedBytes;
+ this.position = position;
+ this.tokenType = tokenType;
+ }
+ }
+
private void serializeTokenStreams(
final List<ByteBuffer> tokenStreams,
final ByteBuffer outBuffer,
- final boolean useArith) {
- // Compress and serialise the non-null tokenStreams
+ final boolean useArith,
+ final int currentPosition,
+ final List<CompressedStream> compressedStreamRegistry) {
+ // Check if the TOKEN_TYPE stream is all MATCH after the first byte. If so, the spec allows
+ // us to omit it entirely — the decoder regenerates it from the first non-null stream's type.
+ boolean omitTypeStream = false;
+ final ByteBuffer typeStream = tokenStreams.get(TokenStreams.TOKEN_TYPE);
+ if (typeStream != null && typeStream.limit() > 1) {
+ typeStream.rewind();
+ typeStream.get(); // skip byte 0 (the non-MATCH type)
+ boolean allMatch = true;
+ while (typeStream.hasRemaining()) {
+ if (typeStream.get() != TokenStreams.TOKEN_MATCH) {
+ allMatch = false;
+ break;
+ }
+ }
+ if (allMatch) {
+ // Check that at least one other stream exists for this position (otherwise we can't omit TYPE)
+ for (int t = 1; t <= TokenStreams.TOKEN_END; t++) {
+ final ByteBuffer s = tokenStreams.get(t);
+ if (s != null && s.limit() > 0) {
+ omitTypeStream = true;
+ break;
+ }
+ }
+ }
+ typeStream.rewind();
+ }
+
+ // Compress and serialize the non-null tokenStreams
+ boolean firstStreamForPosition = true;
for (int tokenStreamType = 0; tokenStreamType <= TokenStreams.TOKEN_END; tokenStreamType++) {
+ if (omitTypeStream && tokenStreamType == TokenStreams.TOKEN_TYPE) {
+ continue;
+ }
final ByteBuffer tokenBytes = tokenStreams.get(tokenStreamType);
- if (tokenBytes != null && tokenBytes.position() > 0) {
- // if this encoder was aware of duplicate streams, we would need to detect and encode them
- // here, and set the DUP_PREVIOUS_STREAM_FLAG_MASK bit
- outBuffer.put((byte) (tokenStreamType | (tokenStreamType == 0 ? TokenStreams.NEW_POSITION_FLAG_MASK : 0)));
- final ByteBuffer tempOutByteBuffer = tryCompress(tokenBytes, useArith);
- CompressionUtils.writeUint7(tempOutByteBuffer.limit(), outBuffer);
- outBuffer.put(tempOutByteBuffer);
+ if (tokenBytes != null && tokenBytes.limit() > 0) {
+ byte headerByte = (byte) tokenStreamType;
+ if (firstStreamForPosition) {
+ headerByte |= TokenStreams.NEW_POSITION_FLAG_MASK;
+ firstStreamForPosition = false;
+ }
+
+ final ByteBuffer compressedBuffer = tryCompress(tokenBytes, useArith, tokenStreamType);
+ final byte[] compressedBytes = new byte[compressedBuffer.limit()];
+ compressedBuffer.rewind();
+ compressedBuffer.get(compressedBytes);
+
+ // Check for a duplicate among previously compressed streams
+ CompressedStream dupSource = null;
+ if (compressedBytes.length > 4) {
+ for (final CompressedStream prev : compressedStreamRegistry) {
+ if (prev.compressedBytes.length == compressedBytes.length &&
+ Arrays.equals(prev.compressedBytes, compressedBytes)) {
+ dupSource = prev;
+ break;
+ }
+ }
+ }
+
+ if (dupSource != null) {
+ // Emit a 3-byte dup reference instead of the full compressed data
+ outBuffer.put((byte) (headerByte | TokenStreams.DUP_PREVIOUS_STREAM_FLAG_MASK));
+ outBuffer.put((byte) dupSource.position);
+ outBuffer.put((byte) dupSource.tokenType);
+ } else {
+ // Emit the compressed data and register it for future dedup
+ outBuffer.put(headerByte);
+ CompressionUtils.writeUint7(compressedBytes.length, outBuffer);
+ outBuffer.put(compressedBytes);
+ compressedStreamRegistry.add(new CompressedStream(compressedBytes, currentPosition, tokenStreamType));
+ }
}
}
}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java
index 3989a6abe8..dda29eafd4 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java
@@ -3,11 +3,16 @@
import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.compression.CompressionUtils;
import htsjdk.samtools.cram.compression.range.RangeDecode;
-import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Decode;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Decode;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
+/**
+ * Token streams for the CRAM 3.1 name tokeniser. Stores decompressed byte streams indexed
+ * by (position, tokenType). Accepts an optional shared {@link RANSNx16Decode} instance to
+ * avoid allocating a fresh 1MB+ decoder per stream.
+ */
public class TokenStreams {
public static final byte TOKEN_TYPE = 0x00;
public static final byte TOKEN_STRING = 0x01;
@@ -44,6 +49,17 @@ public class TokenStreams {
* @param numNames - the number of read names in the slice for which this token stream is being created
*/
public TokenStreams(final ByteBuffer inputByteBuffer, final int useArith, final int numNames) {
+ this(inputByteBuffer, useArith, numNames, null);
+ }
+
+ /**
+ * @param inputByteBuffer the input buffer of token streams
+ * @param useArith true to use range coding; false for rANS coding
+ * @param numNames the number of read names in the slice
+ * @param sharedRansDecoder optional shared rANS decoder instance (avoids per-stream allocation)
+ */
+ public TokenStreams(final ByteBuffer inputByteBuffer, final int useArith, final int numNames,
+ final RANSNx16Decode sharedRansDecoder) {
// pre-allocate enough room for 32 token positions; we'll reallocate if we exceed this; it is ok if
// the actual number is less than the pre-allocated amount
// note that this array is often very sparse (unused cells have null instead of an actual ByteBuffer)
@@ -97,18 +113,33 @@ public TokenStreams(final ByteBuffer inputByteBuffer, final int useArith, final
final RangeDecode rangeDecode = new RangeDecode();
uncompressedTokenStream = rangeDecode.uncompress(CompressionUtils.wrap(compressedTokenStream));
} else {
- final RANSNx16Decode ransDecode = new RANSNx16Decode();
- uncompressedTokenStream = ransDecode.uncompress(CompressionUtils.wrap(compressedTokenStream));
+ final RANSNx16Decode ransDecode = sharedRansDecoder != null ? sharedRansDecoder : new RANSNx16Decode();
+ uncompressedTokenStream = ByteBuffer.wrap(ransDecode.uncompress(compressedTokenStream))
+ .order(java.nio.ByteOrder.LITTLE_ENDIAN);
}
getStreamsForPos(tokenPosition)[tokenType] = uncompressedTokenStream;
}
}
}
+ /**
+ * Return the array of token streams for a given column position.
+ * The returned array is indexed by token type constant (e.g., {@link #TOKEN_TYPE}).
+ *
+ * @param pos the column position (0-based)
+ * @return array of ByteBuffers indexed by token type; entries may be null
+ */
public ByteBuffer[] getStreamsForPos(final int pos) {
return tokenStreams[pos];
}
+ /**
+ * Return the token stream for a specific position and token type.
+ *
+ * @param tokenPosition the column position (0-based)
+ * @param tokenType the token type constant (e.g., {@link #TOKEN_STRING})
+ * @return the ByteBuffer for that stream, or null if not present
+ */
public ByteBuffer getStream(final int tokenPosition, final int tokenType) {
return getStreamsForPos(tokenPosition)[tokenType];
}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/EncodeToken.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/EncodeToken.java
index 853752b5ca..b16f1e4b75 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/EncodeToken.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/EncodeToken.java
@@ -23,6 +23,10 @@ public class EncodeToken {
private String actualValue;
private String relativeValue;
+ /** Cached parsed integer for relativeValue, avoiding repeated Integer.parseInt() calls. */
+ private int relativeValueInt;
+ private boolean hasRelativeValueInt;
+
/**
* Token types TOKEN_DELTA, TOKEN_DELTA0, TOKEN_DIGITS, TOKEN_DIGITS0 all have a relative value that
* differs from the actual value of the original fragment, and for those token types, we need to preserve
@@ -40,6 +44,23 @@ public EncodeToken(final byte type, final String actualValue, final String relat
this.relativeValue = relativeValue;
}
+ public EncodeToken(final byte type, final String actualValue, final int relativeValueInt) {
+ this.tokenType = type;
+ this.actualValue = actualValue;
+ this.relativeValue = null;
+ this.relativeValueInt = relativeValueInt;
+ this.hasRelativeValueInt = true;
+ }
+
+ /** Get the relative value as an int, parsing from string only on first call. */
+ public int getRelativeValueAsInt() {
+ if (!hasRelativeValueInt) {
+ relativeValueInt = Integer.parseInt(relativeValue);
+ hasRelativeValueInt = true;
+ }
+ return relativeValueInt;
+ }
+
public byte getTokenType() {
return tokenType;
}
@@ -49,6 +70,9 @@ public String getActualValue() {
}
public String getRelativeValue() {
+ if (relativeValue == null && hasRelativeValueInt) {
+ relativeValue = String.valueOf(relativeValueInt);
+ }
return relativeValue;
}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/range/ByteModel.java b/src/main/java/htsjdk/samtools/cram/compression/range/ByteModel.java
index f2f71c4e2a..0047e062aa 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/range/ByteModel.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/range/ByteModel.java
@@ -2,106 +2,138 @@
import java.nio.ByteBuffer;
+/**
+ * Adaptive frequency model for the CRAM 3.1 arithmetic range coder. Maintains per-symbol frequency
+ * counts and provides encode/decode operations that update the model after each symbol. Symbols
+ * are kept approximately sorted by frequency (descending) for cache-friendly access.
+ *
+ *
+ * <p>Symbols and frequencies are interleaved in a single {@code int[]} array for cache locality:
+ * even indices hold frequencies, odd indices hold symbol values. This eliminates the cache
+ * thrashing that occurs with separate arrays during the linear scan.
+ *
+ * <p>Each symbol starts with a frequency of 1. After encoding/decoding a symbol, its frequency
Each symbol starts with a frequency of 1. After encoding/decoding a symbol, its frequency
+ * is incremented by {@link Constants#STEP} (16). When total frequency exceeds {@link Constants#MAX_FREQ},
+ * all frequencies are halved (avoiding zeros).
+ *
+ * @see RangeCoder
+ */
public class ByteModel {
- // spec: To encode any symbol the entropy encoder needs to know
- // the frequency of the symbol to encode,
- // the cumulative frequencies of all symbols prior to this symbol,
- // and the total of all frequencies.
public int totalFrequency;
public final int maxSymbol;
- public final int[] symbols;
- public final int[] frequencies;
+ /**
+ * Interleaved frequency/symbol pairs: {@code data[i*2]} = frequency, {@code data[i*2+1]} = symbol.
+ * Keeping these adjacent improves cache hit rate during the linear scan in encode/decode.
+ */
+ public final int[] data;
+
+ /**
+ * Create a new model for the given number of distinct symbols (0 to numSymbols-1),
+ * each starting with frequency 1.
+ *
+ * @param numSymbols number of distinct symbols this model can encode/decode
+ */
public ByteModel(final int numSymbols) {
// Spec: ModelCreate method
this.totalFrequency = numSymbols;
this.maxSymbol = numSymbols - 1;
- frequencies = new int[maxSymbol+1];
- symbols = new int[maxSymbol+1];
+ data = new int[(maxSymbol + 1) * 2];
+ reset();
+ }
+
+ /** Reset all frequencies to 1 and restore natural symbol ordering. */
+ public void reset() {
+ totalFrequency = maxSymbol + 1;
for (int i = 0; i <= maxSymbol; i++) {
- this.symbols[i] = i;
- this.frequencies[i] = 1;
+ data[i * 2] = 1; // frequency
+ data[i * 2 + 1] = i; // symbol
}
}
+ /**
+ * Decode one symbol from the compressed stream, update the model frequencies, and return the symbol.
+ *
+ * @param inBuffer the compressed input stream
+ * @param rangeCoder the range coder state (must have been started with {@link RangeCoder#rangeDecodeStart})
+ * @return the decoded symbol value
+ */
public int modelDecode(final ByteBuffer inBuffer, final RangeCoder rangeCoder){
-
- // decodes one symbol
final int freq = rangeCoder.rangeGetFrequency(totalFrequency);
int cumulativeFrequency = 0;
int x = 0;
- while (cumulativeFrequency + frequencies[x] <= freq){
- cumulativeFrequency += frequencies[x++];
+ while (cumulativeFrequency + data[x * 2] <= freq){
+ cumulativeFrequency += data[x * 2];
+ x++;
}
// update rangecoder
- rangeCoder.rangeDecode(inBuffer,cumulativeFrequency,frequencies[x]);
+ rangeCoder.rangeDecode(inBuffer, cumulativeFrequency, data[x * 2]);
// update model frequencies
- frequencies[x] += Constants.STEP;
+ data[x * 2] += Constants.STEP;
totalFrequency += Constants.STEP;
if (totalFrequency > Constants.MAX_FREQ){
- // if totalFrequency is too high, the frequencies are halved, making
- // sure to avoid any zero frequencies being created.
modelRenormalize();
}
// keep symbols approximately frequency sorted
- final int symbol = symbols[x];
- if (x > 0 && frequencies[x] > frequencies[x-1]){
- // Swap frequencies[x], frequencies[x-1]
- int tmp = frequencies[x];
- frequencies[x] = frequencies[x-1];
- frequencies[x-1] = tmp;
-
- // Swap symbols[x], symbols[x-1]
- tmp = symbols[x];
- symbols[x] = symbols[x-1];
- symbols[x-1] = tmp;
+ final int symbol = data[x * 2 + 1];
+ if (x > 0 && data[x * 2] > data[(x - 1) * 2]){
+ // Swap frequency and symbol pairs
+ final int tmpFreq = data[x * 2];
+ final int tmpSym = data[x * 2 + 1];
+ data[x * 2] = data[(x - 1) * 2];
+ data[x * 2 + 1] = data[(x - 1) * 2 + 1];
+ data[(x - 1) * 2] = tmpFreq;
+ data[(x - 1) * 2 + 1] = tmpSym;
}
return symbol;
}
- public void modelRenormalize(){
- // frequencies are halved
+ /** Halve all frequencies (avoiding zeros) when total frequency exceeds {@link Constants#MAX_FREQ}. */
+ public void modelRenormalize(){
totalFrequency = 0;
- for (int i=0; i <= maxSymbol; i++){
- frequencies[i] -= Math.floorDiv(frequencies[i],2);
- totalFrequency += frequencies[i];
+ for (int i = 0; i <= maxSymbol; i++){
+ data[i * 2] -= data[i * 2] >> 1;
+ totalFrequency += data[i * 2];
}
}
- public void modelEncode(final ByteBuffer outBuffer, final RangeCoder rangeCoder, final int symbol){
-
- // encodes one input symbol
+ /**
+ * Encode one symbol to the compressed stream and update the model frequencies.
+ * Output is written to the range coder's internal byte[] buffer.
+ *
+ * @param rangeCoder the range coder state (must have output set via {@link RangeCoder#setOutput})
+ * @param symbol the symbol value to encode (must be in range 0 to maxSymbol)
+ */
+ public void modelEncode(final RangeCoder rangeCoder, final int symbol){
int cumulativeFrequency = 0;
- int i;
- for( i = 0; symbols[i] != symbol; i++){
- cumulativeFrequency += frequencies[i];
+ int i = 0;
+ while (data[i * 2 + 1] != symbol) {
+ cumulativeFrequency += data[i * 2];
+ i++;
}
// Encode
- rangeCoder.rangeEncode(outBuffer, cumulativeFrequency, frequencies[i],totalFrequency);
+ rangeCoder.rangeEncode(cumulativeFrequency, data[i * 2], totalFrequency);
// Update Model
- frequencies[i] += Constants.STEP;
+ data[i * 2] += Constants.STEP;
totalFrequency += Constants.STEP;
if (totalFrequency > Constants.MAX_FREQ){
modelRenormalize();
}
// Keep symbols approximately frequency sorted (ascending order)
- if (i > 0 && frequencies[i] > frequencies[i-1]){
- // swap frequencies
- int tmp = frequencies[i];
- frequencies[i] = frequencies[i-1];
- frequencies[i-1]=tmp;
-
- // swap symbols
- tmp = symbols[i];
- symbols[i] = symbols[i-1];
- symbols[i-1] = tmp;
+ if (i > 0 && data[i * 2] > data[(i - 1) * 2]){
+ // swap frequency and symbol pairs
+ final int tmpFreq = data[i * 2];
+ final int tmpSym = data[i * 2 + 1];
+ data[i * 2] = data[(i - 1) * 2];
+ data[i * 2 + 1] = data[(i - 1) * 2 + 1];
+ data[(i - 1) * 2] = tmpFreq;
+ data[(i - 1) * 2 + 1] = tmpSym;
}
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/range/RangeCoder.java b/src/main/java/htsjdk/samtools/cram/compression/range/RangeCoder.java
index 6de5550850..514d0abdf1 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/range/RangeCoder.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/range/RangeCoder.java
@@ -2,6 +2,22 @@
import java.nio.ByteBuffer;
+/**
+ * Arithmetic range coder used by the CRAM 3.1 Range (adaptive arithmetic) codec and FQZComp quality
+ * score codec. Implements both encoding and decoding using a 32-bit range with carry propagation
+ * for output byte generation.
+ *
+ * <p>The range coder maintains a probability interval [low, low+range) and narrows it for each
+ * symbol based on cumulative and symbol frequencies. When the range becomes too small (< 2^24),
+ * it renormalizes by shifting out the top byte.
+ *
+ * <p>Encoding output is written to an internal {@code byte[]} buffer (set via {@link #setOutput})
+ * rather than a {@link ByteBuffer}, eliminating bounds checking and position tracking overhead
+ * in the hot encoding loop.
+ *
+ * @see ByteModel
+ * @see "CRAM 3.1 specification, Section 3.5"
+ */
public class RangeCoder {
private long low;
@@ -11,16 +27,44 @@ public class RangeCoder {
private boolean carry;
private int cache;
+ // Encoding output buffer (byte[] for performance — avoids ByteBuffer overhead in hot loop)
+ private byte[] outBuf;
+ private int outPos;
+
public RangeCoder() {
- // Spec: RangeEncodeStart
this.low = 0;
- this.range = Constants.MAX_RANGE; // 4 bytes of all 1's
+ this.range = Constants.MAX_RANGE;
this.code = 0;
this.FFnum = 0;
this.carry = false;
this.cache = 0;
}
+ /**
+ * Set the output buffer for encoding. Must be called before any encode operations.
+ *
+ * @param buf the byte array to write compressed output to
+ * @param pos the starting write position in the buffer
+ */
+ public void setOutput(final byte[] buf, final int pos) {
+ this.outBuf = buf;
+ this.outPos = pos;
+ }
+
+ /**
+ * Return the current write position in the output buffer. Call after encoding is complete
+ * to determine how many bytes were written.
+ */
+ public int getOutputPosition() {
+ return outPos;
+ }
+
+ /**
+ * Initialize the decoder by reading the first 5 bytes of the compressed stream into the code register.
+ * Must be called before any calls to {@link ByteModel#modelDecode}.
+ *
+ * @param inBuffer the compressed input stream
+ */
public void rangeDecodeStart(final ByteBuffer inBuffer){
for (int i = 0; i < 5; i++){
code = (code << 8) + (inBuffer.get() & 0xFF);
@@ -28,6 +72,13 @@ public void rangeDecodeStart(final ByteBuffer inBuffer){
code &= Constants.MAX_RANGE;
}
+ /**
+ * Update the decoder state after a symbol has been decoded.
+ *
+ * @param inBuffer the compressed input stream (for renormalization reads)
+ * @param cumulativeFrequency cumulative frequency of symbols before the decoded symbol
+ * @param symbolFrequency frequency of the decoded symbol
+ */
protected void rangeDecode(final ByteBuffer inBuffer, final int cumulativeFrequency, final int symbolFrequency){
code -= cumulativeFrequency * range;
range *= symbolFrequency;
@@ -38,18 +89,31 @@ protected void rangeDecode(final ByteBuffer inBuffer, final int cumulativeFreque
}
}
+ /**
+ * Compute the scaled frequency for symbol lookup during decoding.
+ *
+ * @param totalFrequency the sum of all symbol frequencies
+ * @return the scaled frequency value used to identify the decoded symbol
+ */
protected int rangeGetFrequency(final int totalFrequency){
- range = (long) Math.floor(range / totalFrequency);
- return (int) Math.floor(code / range);
+ range = range / totalFrequency;
+ return (int) (code / range);
}
+ /**
+ * Encode a symbol by narrowing the range interval and emitting output bytes as needed.
+ * Output is written to the internal byte[] buffer (set via {@link #setOutput}).
+ *
+ * @param cumulativeFrequency cumulative frequency of all symbols before this one
+ * @param symbolFrequency frequency of the symbol being encoded
+ * @param totalFrequency sum of all symbol frequencies
+ */
protected void rangeEncode(
- final ByteBuffer outBuffer,
final int cumulativeFrequency,
final int symbolFrequency,
final int totalFrequency){
final long old_low = low;
- range = (long) Math.floor(range/totalFrequency);
+ range = range / totalFrequency;
low += cumulativeFrequency * range;
low &= 0xFFFFFFFFL; // keep bottom 4 bytes, shift the top byte out of low
range *= symbolFrequency;
@@ -61,18 +125,22 @@ protected void rangeEncode(
// Renormalise if range gets too small
while (range < (1<<24)) {
range <<= 8;
- rangeShiftLow(outBuffer);
+ rangeShiftLow();
}
}
- protected void rangeEncodeEnd(final ByteBuffer outBuffer){
+ /**
+ * Flush the encoder state by emitting the final 5 bytes. Must be called after all symbols
+ * have been encoded to produce a valid compressed stream.
+ */
+ public void rangeEncodeEnd(){
for(int i = 0; i < 5; i++){
- rangeShiftLow(outBuffer);
+ rangeShiftLow();
}
}
- private void rangeShiftLow(final ByteBuffer outBuffer) {
+ private void rangeShiftLow() {
// rangeShiftLow tracks the total number of extra bytes to emit and
// carry indicates whether they are a string of 0xFF or 0x00 values
@@ -81,15 +149,15 @@ private void rangeShiftLow(final ByteBuffer outBuffer) {
if ((low < 0xff000000L) || carry) {
if (carry == false) {
- outBuffer.put((byte) cache);
+ outBuf[outPos++] = (byte) cache;
while (FFnum > 0) {
- outBuffer.put((byte) 0xFF);
+ outBuf[outPos++] = (byte) 0xFF;
FFnum--;
}
} else {
- outBuffer.put((byte) (cache + 1));
+ outBuf[outPos++] = (byte) (cache + 1);
while (FFnum > 0) {
- outBuffer.put((byte) 0x00);
+ outBuf[outPos++] = (byte) 0x00;
FFnum--;
}
@@ -102,4 +170,4 @@ private void rangeShiftLow(final ByteBuffer outBuffer) {
low = low<<8 & (0xFFFFFFFFL); // force low to be +ve
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/range/RangeDecode.java b/src/main/java/htsjdk/samtools/cram/compression/range/RangeDecode.java
index 7c6e5b11ed..66271f788e 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/range/RangeDecode.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/range/RangeDecode.java
@@ -19,9 +19,13 @@ public class RangeDecode {
private static final ByteBuffer EMPTY_BUFFER = CompressionUtils.allocateByteBuffer(0);
- // This method assumes that inBuffer is already rewound.
- // It uncompresses the data in the inBuffer, leaving it consumed.
- // Returns a rewound ByteBuffer containing the uncompressed data.
+ /**
+ * Decompress data that was compressed with the CRAM 3.1 arithmetic (range) codec.
+ * The input buffer is consumed (position advanced to limit) and the returned buffer is rewound.
+ *
+ * @param inBuffer compressed input data (position to limit is decompressed)
+ * @return a rewound ByteBuffer containing the decompressed data
+ */
public ByteBuffer uncompress(final ByteBuffer inBuffer) {
return uncompress(inBuffer, 0);
}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/range/RangeEncode.java b/src/main/java/htsjdk/samtools/cram/compression/range/RangeEncode.java
index d570b598a4..32a6f3eeaa 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/range/RangeEncode.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/range/RangeEncode.java
@@ -17,9 +17,14 @@ public class RangeEncode {
private static final ByteBuffer EMPTY_BUFFER = CompressionUtils.allocateByteBuffer(0);
- // This method assumes that inBuffer is already rewound.
- // It compresses the data in the inBuffer, leaving it consumed.
- // Returns a rewound ByteBuffer containing the compressed data.
+ /**
+ * Compress data using the CRAM 3.1 arithmetic (range) codec with the given parameters.
+ * The input buffer is consumed (position advanced to limit) and the returned buffer is rewound.
+ *
+ * @param inBuffer input data to compress (position to limit is compressed)
+ * @param rangeParams encoding parameters controlling order, RLE, PACK, STRIPE, and other flags
+ * @return a rewound ByteBuffer containing the compressed data
+ */
public ByteBuffer compress(final ByteBuffer inBuffer, final RangeParams rangeParams) {
if (inBuffer.remaining() == 0) {
return EMPTY_BUFFER;
@@ -37,9 +42,9 @@ public ByteBuffer compress(final ByteBuffer inBuffer, final RangeParams rangePar
ByteBuffer inputBuffer = inBuffer;
- // Stripe flag is not implemented in the write implementation
if (rangeParams.isStripe()) {
- throw new CRAMException("Range Encoding with Stripe Flag is not implemented.");
+ compressStripe(inputBuffer, outBuffer);
+ return outBuffer;
}
final int inSize = inputBuffer.remaining(); // e_len -> inSize
@@ -74,7 +79,7 @@ public ByteBuffer compress(final ByteBuffer inBuffer, final RangeParams rangePar
outBuffer.rewind(); // set position to 0
} else if (rangeParams.isExternalCompression()) {
final byte[] rawBytes = new byte[inputBuffer.remaining()];
- inputBuffer.get(rawBytes, inBuffer.position(), inputBuffer.remaining());
+ inputBuffer.get(rawBytes, 0, inputBuffer.remaining());
final BZIP2ExternalCompressor compressor = new BZIP2ExternalCompressor();
final byte[] extCompressedBytes = compressor.compress(rawBytes, null);
outBuffer.put(extCompressedBytes);
@@ -121,10 +126,12 @@ private void compressOrder0(
final ByteModel byteModel = new ByteModel(maxSymbol);
outBuffer.put((byte) maxSymbol);
final RangeCoder rangeCoder = new RangeCoder();
+ rangeCoder.setOutput(outBuffer.array(), outBuffer.position());
for (int i = 0; i < inSize; i++) {
- byteModel.modelEncode(outBuffer, rangeCoder, inBuffer.get(i) & 0xFF);
+ byteModel.modelEncode(rangeCoder, inBuffer.get(i) & 0xFF);
}
- rangeCoder.rangeEncodeEnd(outBuffer);
+ rangeCoder.rangeEncodeEnd();
+ outBuffer.position(rangeCoder.getOutputPosition());
outBuffer.limit(outBuffer.position());
outBuffer.rewind();
}
@@ -146,12 +153,14 @@ private void compressOrder1(
}
outBuffer.put((byte) maxSymbol);
final RangeCoder rangeCoder = new RangeCoder();
+ rangeCoder.setOutput(outBuffer.array(), outBuffer.position());
int last = 0;
for (int i = 0; i < inSize; i++) {
- byteModelList.get(last).modelEncode(outBuffer, rangeCoder, inBuffer.get(i) & 0xFF);
+ byteModelList.get(last).modelEncode(rangeCoder, inBuffer.get(i) & 0xFF);
last = inBuffer.get(i) & 0xFF;
}
- rangeCoder.rangeEncodeEnd(outBuffer);
+ rangeCoder.rangeEncodeEnd();
+ outBuffer.position(rangeCoder.getOutputPosition());
outBuffer.limit(outBuffer.position());
outBuffer.rewind();
}
@@ -166,7 +175,7 @@ private void compressRLEOrder0(
maxSymbols = inBuffer.get(i) & 0xFF;
}
}
- maxSymbols++; // FIXME not what spec states!
+ maxSymbols++; // number of symbols [0..max], stored as byte (256 wraps to 0, decoded as 256)
final ByteModel modelLit = new ByteModel(maxSymbols);
final List byteModelRunsList = new ArrayList(258);
@@ -176,9 +185,10 @@ private void compressRLEOrder0(
}
outBuffer.put((byte) maxSymbols);
final RangeCoder rangeCoder = new RangeCoder();
+ rangeCoder.setOutput(outBuffer.array(), outBuffer.position());
int i = 0;
while (i < inSize) {
- modelLit.modelEncode(outBuffer, rangeCoder, inBuffer.get(i) & 0xFF);
+ modelLit.modelEncode(rangeCoder, inBuffer.get(i) & 0xFF);
int run = 1;
while (i + run < inSize && (inBuffer.get(i + run) & 0xFF) == (inBuffer.get(i) & 0xFF)) {
run++;
@@ -187,17 +197,18 @@ private void compressRLEOrder0(
int rctx = inBuffer.get(i) & 0xFF;
i += run + 1;
int part = run >= 3 ? 3 : run;
- byteModelRunsList.get(rctx).modelEncode(outBuffer, rangeCoder, part);
+ byteModelRunsList.get(rctx).modelEncode(rangeCoder, part);
run -= part;
rctx = 256;
while (part == 3) {
part = run >= 3 ? 3 : run;
- byteModelRunsList.get(rctx).modelEncode(outBuffer, rangeCoder, part);
+ byteModelRunsList.get(rctx).modelEncode(rangeCoder, part);
rctx = 257;
run -= part;
}
}
- rangeCoder.rangeEncodeEnd(outBuffer);
+ rangeCoder.rangeEncodeEnd();
+ outBuffer.position(rangeCoder.getOutputPosition());
outBuffer.limit(outBuffer.position());
outBuffer.rewind();
}
@@ -212,7 +223,7 @@ private void compressRLEOrder1(
maxSymbols = inBuffer.get(i) & 0xFF;
}
}
- maxSymbols++; // FIXME not what spec states!
+ maxSymbols++; // number of symbols [0..max], stored as byte (256 wraps to 0, decoded as 256)
final List modelLitList = new ArrayList<>(maxSymbols);
for (int i = 0; i < maxSymbols; i++) {
@@ -224,10 +235,11 @@ private void compressRLEOrder1(
}
outBuffer.put((byte) maxSymbols);
final RangeCoder rangeCoder = new RangeCoder();
+ rangeCoder.setOutput(outBuffer.array(), outBuffer.position());
int i = 0;
int last = 0;
while (i < inSize) {
- modelLitList.get(last).modelEncode(outBuffer, rangeCoder, inBuffer.get(i) & 0xFF);
+ modelLitList.get(last).modelEncode(rangeCoder, inBuffer.get(i) & 0xFF);
int run = 1;
while (i + run < inSize && inBuffer.get(i + run) == inBuffer.get(i)) {
run++;
@@ -237,19 +249,54 @@ private void compressRLEOrder1(
last = inBuffer.get(i) & 0xFF;
i += run + 1;
int part = run >= 3 ? 3 : run;
- byteModelRunsList.get(rctx).modelEncode(outBuffer, rangeCoder, part);
+ byteModelRunsList.get(rctx).modelEncode(rangeCoder, part);
run -= part;
rctx = 256;
while (part == 3) {
part = run >= 3 ? 3 : run;
- byteModelRunsList.get(rctx).modelEncode(outBuffer, rangeCoder, part);
+ byteModelRunsList.get(rctx).modelEncode(rangeCoder, part);
rctx = 257;
run -= part;
}
}
- rangeCoder.rangeEncodeEnd(outBuffer);
+ rangeCoder.rangeEncodeEnd();
+ outBuffer.position(rangeCoder.getOutputPosition());
outBuffer.limit(outBuffer.position());
outBuffer.rewind();
}
-}
\ No newline at end of file
+ /**
+ * Compress data using the STRIPE transformation: de-interleave the input into 4 streams,
+ * compress each independently with order-0 arithmetic coding (NOSZ flag), and write the
+ * stripe framing format: [num_streams (1 byte)] [uint7 compressed sizes] [compressed data].
+ *
+ * @param inBuffer input data to compress
+ * @param outBuffer output buffer (must already contain the format flags and optional size header)
+ */
+ private void compressStripe(final ByteBuffer inBuffer, final ByteBuffer outBuffer) {
+ final int numStreams = CompressionUtils.getStripeNumStreams();
+ final int[] sizes = CompressionUtils.buildStripeUncompressedSizes(inBuffer.remaining());
+ final ByteBuffer[] chunks = CompressionUtils.stripeTranspose(inBuffer, sizes);
+
+ // Compress each chunk independently using NOSZ flag (sizes are in the stripe framing)
+ final ByteBuffer[] compressedChunks = new ByteBuffer[numStreams];
+ for (int i = 0; i < numStreams; i++) {
+ compressedChunks[i] = compress(chunks[i], new RangeParams(RangeParams.NOSZ_FLAG_MASK));
+ }
+
+ // Write stripe framing: [numStreams] [compressed_size x N] [compressed_data x N]
+ outBuffer.put((byte) numStreams);
+ for (int i = 0; i < numStreams; i++) {
+ CompressionUtils.writeUint7(compressedChunks[i].remaining(), outBuffer);
+ }
+ for (int i = 0; i < numStreams; i++) {
+ outBuffer.put(compressedChunks[i]);
+ }
+
+ // Mark input as consumed (stripe uses absolute get, so position isn't advanced by transpose)
+ inBuffer.position(inBuffer.limit());
+ outBuffer.limit(outBuffer.position());
+ outBuffer.rewind();
+ }
+
+}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/ArithmeticDecoder.java b/src/main/java/htsjdk/samtools/cram/compression/rans/ArithmeticDecoder.java
deleted file mode 100644
index bfc7f33795..0000000000
--- a/src/main/java/htsjdk/samtools/cram/compression/rans/ArithmeticDecoder.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2019 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-package htsjdk.samtools.cram.compression.rans;
-
-final public class ArithmeticDecoder {
- public final int[] frequencies = new int[Constants.NUMBER_OF_SYMBOLS];
-
- // reverse lookup table
- public final byte[] reverseLookup = new byte[Constants.TOTAL_FREQ];
-
- public ArithmeticDecoder() {
- for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
- frequencies[i] = 0;
- }
- }
-
- public void reset() {
- for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
- frequencies[i] = 0;
- }
- for (int i = 0; i < Constants.TOTAL_FREQ; i++) {
- reverseLookup[i] = 0;
- }
- }
-
-}
\ No newline at end of file
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/RANS4x8Decode.java b/src/main/java/htsjdk/samtools/cram/compression/rans/RANS4x8Decode.java
new file mode 100644
index 0000000000..b790c342ca
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/rans/RANS4x8Decode.java
@@ -0,0 +1,245 @@
+package htsjdk.samtools.cram.compression.rans;
+
+import htsjdk.samtools.cram.CRAMException;
+import htsjdk.samtools.cram.compression.CompressionUtils;
+import htsjdk.samtools.cram.compression.rans.Constants;
+import htsjdk.samtools.cram.compression.rans.RANSDecode;
+import htsjdk.samtools.cram.compression.rans.RANSDecodingSymbol;
+import htsjdk.samtools.cram.compression.rans.RANSParams;
+import htsjdk.samtools.cram.compression.rans.Utils;
+
+import java.util.Arrays;
+
+/**
+ * Decoder for the CRAM 3.0 rANS 4x8 codec. Supports Order-0 and Order-1 decoding
+ * with 4-way interleaved rANS states. Each state processes one quarter of the output,
+ * enabling instruction-level parallelism.
+ */
+public class RANS4x8Decode extends RANSDecode {
+
+ private static final int RAW_BYTE_LENGTH = 4;
+
+ /**
+ * Uncompress a rANS 4x8 encoded byte stream. The first byte of the input
+ * indicates the order (0 or 1), followed by compressed and uncompressed lengths,
+ * the frequency table, and the encoded data.
+ *
+ * @param input the compressed byte stream
+ * @return the uncompressed data
+ */
+ @Override
+ public byte[] uncompress(final byte[] input) {
+ if (input.length == 0) {
+ return new byte[0];
+ }
+
+ final int[] inPos = {0};
+ final RANSParams.ORDER order = RANSParams.ORDER.fromInt(input[inPos[0]++]);
+
+ // compressed bytes length (LE int32)
+ final int inSize = readLittleEndianInt(input, inPos);
+ if (inSize != input.length - inPos[0] - RAW_BYTE_LENGTH) {
+ throw new CRAMException("Invalid input length detected in a CRAM rans 4x8 input stream.");
+ }
+
+ // uncompressed bytes length (LE int32)
+ final int outSize = readLittleEndianInt(input, inPos);
+ final byte[] out = new byte[outSize];
+ resetDecoderState();
+
+ switch (order) {
+ case ZERO:
+ uncompressOrder0Way4(input, inPos, out, outSize);
+ return out;
+ case ONE:
+ uncompressOrder1Way4(input, inPos, out, outSize);
+ return out;
+ default:
+ throw new CRAMException("Unknown rANS order: " + order);
+ }
+ }
+
+ private void uncompressOrder0Way4(final byte[] in, final int[] inPos, final byte[] out, final int outSize) {
+ readStatsOrder0(in, inPos);
+
+ long rans0 = readLittleEndianInt(in, inPos);
+ long rans1 = readLittleEndianInt(in, inPos);
+ long rans2 = readLittleEndianInt(in, inPos);
+ long rans3 = readLittleEndianInt(in, inPos);
+
+ final int out_end = outSize & ~3;
+ final byte[] revLookup0 = getReverseLookup()[0];
+ final RANSDecodingSymbol[] syms = getDecodingSymbols()[0];
+
+ for (int i = 0; i < out_end; i += 4) {
+ final byte c0 = revLookup0[Utils.RANSGetCumulativeFrequency(rans0, Constants.TOTAL_FREQ_SHIFT)];
+ final byte c1 = revLookup0[Utils.RANSGetCumulativeFrequency(rans1, Constants.TOTAL_FREQ_SHIFT)];
+ final byte c2 = revLookup0[Utils.RANSGetCumulativeFrequency(rans2, Constants.TOTAL_FREQ_SHIFT)];
+ final byte c3 = revLookup0[Utils.RANSGetCumulativeFrequency(rans3, Constants.TOTAL_FREQ_SHIFT)];
+
+ out[i] = c0;
+ out[i + 1] = c1;
+ out[i + 2] = c2;
+ out[i + 3] = c3;
+
+ rans0 = syms[0xFF & c0].advanceSymbolStep(rans0, Constants.TOTAL_FREQ_SHIFT);
+ rans1 = syms[0xFF & c1].advanceSymbolStep(rans1, Constants.TOTAL_FREQ_SHIFT);
+ rans2 = syms[0xFF & c2].advanceSymbolStep(rans2, Constants.TOTAL_FREQ_SHIFT);
+ rans3 = syms[0xFF & c3].advanceSymbolStep(rans3, Constants.TOTAL_FREQ_SHIFT);
+
+ rans0 = Utils.RANSDecodeRenormalize4x8(rans0, in, inPos);
+ rans1 = Utils.RANSDecodeRenormalize4x8(rans1, in, inPos);
+ rans2 = Utils.RANSDecodeRenormalize4x8(rans2, in, inPos);
+ rans3 = Utils.RANSDecodeRenormalize4x8(rans3, in, inPos);
+ }
+
+ int outIdx = out_end;
+ switch (outSize & 3) {
+ case 3:
+ out[outIdx++] = revLookup0[Utils.RANSGetCumulativeFrequency(rans0, Constants.TOTAL_FREQ_SHIFT)];
+ out[outIdx++] = revLookup0[Utils.RANSGetCumulativeFrequency(rans1, Constants.TOTAL_FREQ_SHIFT)];
+ out[outIdx++] = revLookup0[Utils.RANSGetCumulativeFrequency(rans2, Constants.TOTAL_FREQ_SHIFT)];
+ break;
+ case 2:
+ out[outIdx++] = revLookup0[Utils.RANSGetCumulativeFrequency(rans0, Constants.TOTAL_FREQ_SHIFT)];
+ out[outIdx++] = revLookup0[Utils.RANSGetCumulativeFrequency(rans1, Constants.TOTAL_FREQ_SHIFT)];
+ break;
+ case 1:
+ out[outIdx++] = revLookup0[Utils.RANSGetCumulativeFrequency(rans0, Constants.TOTAL_FREQ_SHIFT)];
+ break;
+ }
+ }
+
+ private void uncompressOrder1Way4(final byte[] in, final int[] inPos, final byte[] out, final int outSize) {
+ readStatsOrder1(in, inPos);
+
+ long rans0 = readLittleEndianInt(in, inPos);
+ long rans1 = readLittleEndianInt(in, inPos);
+ long rans2 = readLittleEndianInt(in, inPos);
+ long rans7 = readLittleEndianInt(in, inPos);
+
+ final int isz4 = outSize >> 2;
+ int i0 = 0, i1 = isz4, i2 = 2 * isz4, i7 = 3 * isz4;
+ byte l0 = 0, l1 = 0, l2 = 0, l7 = 0;
+ final byte[][] revLookup = getReverseLookup();
+ final RANSDecodingSymbol[][] syms = getDecodingSymbols();
+
+ for (; i0 < isz4; i0++, i1++, i2++, i7++) {
+ final byte c0 = revLookup[l0 & 0xFF][Utils.RANSGetCumulativeFrequency(rans0, Constants.TOTAL_FREQ_SHIFT)];
+ final byte c1 = revLookup[l1 & 0xFF][Utils.RANSGetCumulativeFrequency(rans1, Constants.TOTAL_FREQ_SHIFT)];
+ final byte c2 = revLookup[l2 & 0xFF][Utils.RANSGetCumulativeFrequency(rans2, Constants.TOTAL_FREQ_SHIFT)];
+ final byte c7 = revLookup[l7 & 0xFF][Utils.RANSGetCumulativeFrequency(rans7, Constants.TOTAL_FREQ_SHIFT)];
+
+ out[i0] = c0;
+ out[i1] = c1;
+ out[i2] = c2;
+ out[i7] = c7;
+
+ rans0 = syms[l0 & 0xFF][c0 & 0xFF].advanceSymbolStep(rans0, Constants.TOTAL_FREQ_SHIFT);
+ rans1 = syms[l1 & 0xFF][c1 & 0xFF].advanceSymbolStep(rans1, Constants.TOTAL_FREQ_SHIFT);
+ rans2 = syms[l2 & 0xFF][c2 & 0xFF].advanceSymbolStep(rans2, Constants.TOTAL_FREQ_SHIFT);
+ rans7 = syms[l7 & 0xFF][c7 & 0xFF].advanceSymbolStep(rans7, Constants.TOTAL_FREQ_SHIFT);
+
+ rans0 = Utils.RANSDecodeRenormalize4x8(rans0, in, inPos);
+ rans1 = Utils.RANSDecodeRenormalize4x8(rans1, in, inPos);
+ rans2 = Utils.RANSDecodeRenormalize4x8(rans2, in, inPos);
+ rans7 = Utils.RANSDecodeRenormalize4x8(rans7, in, inPos);
+
+ l0 = c0; l1 = c1; l2 = c2; l7 = c7;
+ }
+
+ // Remainder
+ for (; i7 < outSize; i7++) {
+ final byte c7 = revLookup[l7 & 0xFF][Utils.RANSGetCumulativeFrequency(rans7, Constants.TOTAL_FREQ_SHIFT)];
+ out[i7] = c7;
+ rans7 = syms[l7 & 0xFF][c7 & 0xFF].advanceSymbolStep(rans7, Constants.TOTAL_FREQ_SHIFT);
+ rans7 = Utils.RANSDecodeRenormalize4x8(rans7, in, inPos);
+ l7 = c7;
+ }
+ }
+
+ private void readStatsOrder0(final byte[] in, final int[] inPos) {
+ markRowUsed(0);
+ final int[] freq = getFrequencies()[0];
+ final byte[] revLookup = getReverseLookup()[0];
+ final RANSDecodingSymbol[] decodingSymbols = getDecodingSymbols()[0];
+ int rle = 0;
+ int cumulativeFrequency = 0;
+ int symbol = in[inPos[0]++] & 0xFF;
+ do {
+ if ((freq[symbol] = (in[inPos[0]++] & 0xFF)) >= 0x80) {
+ freq[symbol] &= ~0x80;
+ freq[symbol] = ((freq[symbol] & 0x7F) << 8) | (in[inPos[0]++] & 0xFF);
+ }
+ decodingSymbols[symbol].set(cumulativeFrequency, freq[symbol]);
+ Arrays.fill(revLookup, cumulativeFrequency, cumulativeFrequency + freq[symbol], (byte) symbol);
+ cumulativeFrequency += freq[symbol];
+
+ if (rle == 0 && symbol + 1 == (in[inPos[0]] & 0xFF)) {
+ symbol = in[inPos[0]++] & 0xFF;
+ rle = in[inPos[0]++] & 0xFF;
+ } else if (rle != 0) {
+ rle--;
+ symbol++;
+ } else {
+ symbol = in[inPos[0]++] & 0xFF;
+ }
+ } while (symbol != 0);
+ }
+
+ private void readStatsOrder1(final byte[] in, final int[] inPos) {
+ final int[][] freq = getFrequencies();
+ final byte[][] revLookup = getReverseLookup();
+ final RANSDecodingSymbol[][] decodingSymbols = getDecodingSymbols();
+ int rle_i = 0;
+ int i = in[inPos[0]++] & 0xFF;
+ do {
+ markRowUsed(i);
+ int rle_j = 0;
+ int cumulativeFrequency = 0;
+ int j = in[inPos[0]++] & 0xFF;
+ do {
+ if ((freq[i][j] = (in[inPos[0]++] & 0xFF)) >= 0x80) {
+ freq[i][j] &= ~0x80;
+ freq[i][j] = ((freq[i][j] & 0x7F) << 8) | (in[inPos[0]++] & 0xFF);
+ }
+ if (freq[i][j] == 0) {
+ freq[i][j] = Constants.TOTAL_FREQ;
+ }
+ decodingSymbols[i][j].set(cumulativeFrequency, freq[i][j]);
+ Arrays.fill(revLookup[i], cumulativeFrequency, cumulativeFrequency + freq[i][j], (byte) j);
+ cumulativeFrequency += freq[i][j];
+
+ if (rle_j == 0 && j + 1 == (in[inPos[0]] & 0xFF)) {
+ j = in[inPos[0]++] & 0xFF;
+ rle_j = in[inPos[0]++] & 0xFF;
+ } else if (rle_j != 0) {
+ rle_j--;
+ j++;
+ } else {
+ j = in[inPos[0]++] & 0xFF;
+ }
+ } while (j != 0);
+
+ if (rle_i == 0 && i + 1 == (in[inPos[0]] & 0xFF)) {
+ i = in[inPos[0]++] & 0xFF;
+ rle_i = in[inPos[0]++] & 0xFF;
+ } else if (rle_i != 0) {
+ rle_i--;
+ i++;
+ } else {
+ i = in[inPos[0]++] & 0xFF;
+ }
+ } while (i != 0);
+ }
+
+ private static int readLittleEndianInt(final byte[] in, final int[] inPos) {
+ int pos = inPos[0];
+ final int value = (in[pos] & 0xFF)
+ | ((in[pos + 1] & 0xFF) << 8)
+ | ((in[pos + 2] & 0xFF) << 16)
+ | ((in[pos + 3] & 0xFF) << 24);
+ inPos[0] = pos + 4;
+ return value;
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/RANS4x8Encode.java b/src/main/java/htsjdk/samtools/cram/compression/rans/RANS4x8Encode.java
new file mode 100644
index 0000000000..4491a6ecc3
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/rans/RANS4x8Encode.java
@@ -0,0 +1,315 @@
+package htsjdk.samtools.cram.compression.rans;
+
+import htsjdk.samtools.cram.CRAMException;
+import htsjdk.samtools.cram.compression.rans.Constants;
+import htsjdk.samtools.cram.compression.rans.RANSEncode;
+import htsjdk.samtools.cram.compression.rans.RANSEncodingSymbol;
+import htsjdk.samtools.cram.compression.rans.RANSParams;
+
+/**
+ * Encoder for the CRAM 3.0 rANS 4x8 codec. Supports Order-0 and Order-1 encoding
+ * with 4-way interleaved rANS states. Encoding proceeds backwards through the input
+ * to produce a stream that can be decoded forwards.
+ */
+public class RANS4x8Encode extends RANSEncode<RANS4x8Params> {
+
+ private static final int MINIMUM_ORDER_1_SIZE = 4;
+
+ /**
+ * Compress a byte array using the rANS 4x8 codec. Inputs shorter than
+ * {@code MINIMUM_ORDER_1_SIZE} are always compressed with Order-0 regardless
+ * of the requested order.
+ *
+ * @param input the data to compress
+ * @param params encoding parameters specifying Order-0 or Order-1
+ * @return the compressed byte stream including header, frequency table, and encoded data
+ */
+ @Override
+ public byte[] compress(final byte[] input, final RANS4x8Params params) {
+ if (input.length == 0) {
+ return new byte[0];
+ }
+ if (input.length < MINIMUM_ORDER_1_SIZE) {
+ return compressOrder0Way4(input);
+ }
+ switch (params.getOrder()) {
+ case ZERO:
+ return compressOrder0Way4(input);
+ case ONE:
+ return compressOrder1Way4(input);
+ default:
+ throw new CRAMException("Unknown rANS order: " + params.getOrder());
+ }
+ }
+
+ private byte[] compressOrder0Way4(final byte[] in) {
+ final int inputSize = in.length;
+ final int[] F = calcFrequenciesOrder0(in);
+ buildSymsOrder0(F);
+
+ // Write frequency table
+ final byte[] freqTable = new byte[1024];
+ final int[] freqPos = {0};
+ writeFrequenciesOrder0(freqTable, freqPos, F);
+ final int frequencyTableSize = freqPos[0];
+
+ // Encode backwards
+ final RANSEncodingSymbol[] syms = getEncodingSymbols()[0];
+ long rans0 = Constants.RANS_4x8_LOWER_BOUND;
+ long rans1 = Constants.RANS_4x8_LOWER_BOUND;
+ long rans2 = Constants.RANS_4x8_LOWER_BOUND;
+ long rans3 = Constants.RANS_4x8_LOWER_BOUND;
+
+ final int maxCompressedSize = inputSize + inputSize / 4 + 64;
+ final byte[] compressedData = new byte[maxCompressedSize];
+ final int[] writePos = {maxCompressedSize};
+
+ // Remainder
+ int i;
+ switch (i = inputSize & 3) {
+ case 3: rans2 = syms[in[inputSize - (i - 2)] & 0xFF].putSymbol4x8(rans2, compressedData, writePos);
+ case 2: rans1 = syms[in[inputSize - (i - 1)] & 0xFF].putSymbol4x8(rans1, compressedData, writePos);
+ case 1: rans0 = syms[in[inputSize - i] & 0xFF].putSymbol4x8(rans0, compressedData, writePos);
+ case 0: break;
+ }
+
+ // Main loop
+ for (i = inputSize & ~3; i > 0; i -= 4) {
+ rans3 = syms[in[i - 1] & 0xFF].putSymbol4x8(rans3, compressedData, writePos);
+ rans2 = syms[in[i - 2] & 0xFF].putSymbol4x8(rans2, compressedData, writePos);
+ rans1 = syms[in[i - 3] & 0xFF].putSymbol4x8(rans1, compressedData, writePos);
+ rans0 = syms[in[i - 4] & 0xFF].putSymbol4x8(rans0, compressedData, writePos);
+ }
+
+ // Flush states: rans3 first (highest addr), rans0 last (lowest addr) = LE
+ flushState4x8(rans3, compressedData, writePos);
+ flushState4x8(rans2, compressedData, writePos);
+ flushState4x8(rans1, compressedData, writePos);
+ flushState4x8(rans0, compressedData, writePos);
+
+ final int compressedSize = maxCompressedSize - writePos[0];
+ return assembleOutput(RANSParams.ORDER.ZERO, inputSize, freqTable, frequencyTableSize, compressedData, writePos[0], compressedSize);
+ }
+
+ private byte[] compressOrder1Way4(final byte[] in) {
+ final int inSize = in.length;
+ final int[][] F = calcFrequenciesOrder1(in);
+ buildSymsOrder1(F);
+
+ // Write frequency table
+ final byte[] freqTable = new byte[257 * 256 * 3 + 256];
+ final int[] freqPos = {0};
+ writeFrequenciesOrder1(freqTable, freqPos, F);
+ final int frequencyTableSize = freqPos[0];
+
+ // Encode backwards
+ final RANSEncodingSymbol[][] syms = getEncodingSymbols();
+ long rans0 = Constants.RANS_4x8_LOWER_BOUND;
+ long rans1 = Constants.RANS_4x8_LOWER_BOUND;
+ long rans2 = Constants.RANS_4x8_LOWER_BOUND;
+ long rans3 = Constants.RANS_4x8_LOWER_BOUND;
+
+ final int maxCompressedSize = inSize + inSize / 4 + 64;
+ final byte[] compressedData = new byte[maxCompressedSize];
+ final int[] writePos = {maxCompressedSize};
+
+ final int isz4 = inSize >> 2;
+ int i0 = isz4 - 2;
+ int i1 = 2 * isz4 - 2;
+ int i2 = 3 * isz4 - 2;
+ int i3;
+
+ byte l0 = (i0 + 1 >= 0) ? in[i0 + 1] : 0;
+ byte l1 = (i1 + 1 >= 0) ? in[i1 + 1] : 0;
+ byte l2 = (i2 + 1 >= 0) ? in[i2 + 1] : 0;
+ byte l3 = in[inSize - 1];
+
+ // Remainder
+ for (i3 = inSize - 2; i3 > 4 * isz4 - 2 && i3 >= 0; i3--) {
+ final byte c3 = in[i3];
+ rans3 = syms[c3 & 0xFF][l3 & 0xFF].putSymbol4x8(rans3, compressedData, writePos);
+ l3 = c3;
+ }
+
+ // Main loop
+ for (; i0 >= 0; i0--, i1--, i2--, i3--) {
+ rans3 = syms[in[i3] & 0xFF][l3 & 0xFF].putSymbol4x8(rans3, compressedData, writePos);
+ rans2 = syms[in[i2] & 0xFF][l2 & 0xFF].putSymbol4x8(rans2, compressedData, writePos);
+ rans1 = syms[in[i1] & 0xFF][l1 & 0xFF].putSymbol4x8(rans1, compressedData, writePos);
+ rans0 = syms[in[i0] & 0xFF][l0 & 0xFF].putSymbol4x8(rans0, compressedData, writePos);
+ l0 = in[i0]; l1 = in[i1]; l2 = in[i2]; l3 = in[i3];
+ }
+
+ // Final context=0 symbols
+ rans3 = syms[0][l3 & 0xFF].putSymbol4x8(rans3, compressedData, writePos);
+ rans2 = syms[0][l2 & 0xFF].putSymbol4x8(rans2, compressedData, writePos);
+ rans1 = syms[0][l1 & 0xFF].putSymbol4x8(rans1, compressedData, writePos);
+ rans0 = syms[0][l0 & 0xFF].putSymbol4x8(rans0, compressedData, writePos);
+
+ // Flush states
+ flushState4x8(rans3, compressedData, writePos);
+ flushState4x8(rans2, compressedData, writePos);
+ flushState4x8(rans1, compressedData, writePos);
+ flushState4x8(rans0, compressedData, writePos);
+
+ final int compressedSize = maxCompressedSize - writePos[0];
+ return assembleOutput(RANSParams.ORDER.ONE, inSize, freqTable, frequencyTableSize, compressedData, writePos[0], compressedSize);
+ }
+
+ /** Write a 4-byte LE state backwards into the compressed data array. */
+ private static void flushState4x8(final long rans, final byte[] out, final int[] writePos) {
+ final int state = (int) rans;
+ out[--writePos[0]] = (byte) ((state >> 24) & 0xFF);
+ out[--writePos[0]] = (byte) ((state >> 16) & 0xFF);
+ out[--writePos[0]] = (byte) ((state >> 8) & 0xFF);
+ out[--writePos[0]] = (byte) (state & 0xFF);
+ }
+
+ /** Assemble the final output: [order(1)] [compressedLen(4)] [uncompressedLen(4)] [freqTable] [compressedData] */
+ private static byte[] assembleOutput(
+ final RANSParams.ORDER order, final int uncompressedSize,
+ final byte[] freqTable, final int freqTableSize,
+ final byte[] compressedData, final int compDataOffset, final int compDataSize) {
+ final int totalCompressed = freqTableSize + compDataSize;
+ final byte[] result = new byte[Constants.RANS_4x8_PREFIX_BYTE_LENGTH + totalCompressed];
+
+ // Prefix: order(1) + compressedLen(4 LE) + uncompressedLen(4 LE)
+ result[0] = (byte) (order == RANSParams.ORDER.ZERO ? 0 : 1);
+ writeLittleEndianInt(result, Constants.RANS_4x8_ORDER_BYTE_LENGTH, totalCompressed);
+ writeLittleEndianInt(result, Constants.RANS_4x8_ORDER_BYTE_LENGTH + Constants.RANS_4x8_COMPRESSED_BYTE_LENGTH, uncompressedSize);
+
+ // Frequency table + compressed data
+ System.arraycopy(freqTable, 0, result, Constants.RANS_4x8_PREFIX_BYTE_LENGTH, freqTableSize);
+ System.arraycopy(compressedData, compDataOffset, result, Constants.RANS_4x8_PREFIX_BYTE_LENGTH + freqTableSize, compDataSize);
+ return result;
+ }
+
+ private static void writeLittleEndianInt(final byte[] out, final int offset, final int value) {
+ out[offset] = (byte) (value & 0xFF);
+ out[offset + 1] = (byte) ((value >> 8) & 0xFF);
+ out[offset + 2] = (byte) ((value >> 16) & 0xFF);
+ out[offset + 3] = (byte) ((value >> 24) & 0xFF);
+ }
+
+ // ---- Frequency calculation and writing (byte[]) ----
+
+ private static int[] calcFrequenciesOrder0(final byte[] in) {
+ final int T = in.length;
+ final int[] F = new int[Constants.NUMBER_OF_SYMBOLS];
+ for (final byte b : in) F[b & 0xFF]++;
+
+ int m = 0, M = 0;
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
+ if (m < F[j]) { m = F[j]; M = j; }
+ }
+
+ final long tr = ((long) Constants.TOTAL_FREQ << 31) / T + (1 << 30) / T;
+ int fsum = 0;
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
+ if (F[j] == 0) continue;
+ if ((F[j] = (int) ((F[j] * tr) >> 31)) == 0) F[j] = 1;
+ fsum += F[j];
+ }
+ if (fsum < Constants.TOTAL_FREQ) F[M] += Constants.TOTAL_FREQ - fsum;
+ else F[M] -= fsum - Constants.TOTAL_FREQ;
+ return F;
+ }
+
+ private static int[][] calcFrequenciesOrder1(final byte[] in) {
+ final int in_size = in.length;
+ final int[][] F = new int[Constants.NUMBER_OF_SYMBOLS][Constants.NUMBER_OF_SYMBOLS];
+ final int[] T = new int[Constants.NUMBER_OF_SYMBOLS];
+ int last_i = 0;
+ for (int i = 0; i < in_size; i++) {
+ int c = in[i] & 0xFF;
+ F[last_i][c]++;
+ T[last_i]++;
+ last_i = c;
+ }
+ F[0][in[in_size >> 2] & 0xFF]++;
+ F[0][in[2 * (in_size >> 2)] & 0xFF]++;
+ F[0][in[3 * (in_size >> 2)] & 0xFF]++;
+ T[0] += 3;
+
+ for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
+ if (T[i] == 0) continue;
+ final double p = ((double) Constants.TOTAL_FREQ) / T[i];
+ int t2 = 0, m = 0, M = 0;
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
+ if (F[i][j] == 0) continue;
+ if (m < F[i][j]) { m = F[i][j]; M = j; }
+ if ((F[i][j] *= p) == 0) F[i][j] = 1;
+ t2 += F[i][j];
+ }
+ if (t2 < Constants.TOTAL_FREQ) F[i][M] += Constants.TOTAL_FREQ - t2;
+ else F[i][M] -= t2 - Constants.TOTAL_FREQ;
+ }
+ return F;
+ }
+
+ private static void writeFrequenciesOrder0(final byte[] out, final int[] pos, final int[] F) {
+ int rle = 0;
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
+ if (F[j] != 0) {
+ if (rle != 0) { rle--; } else {
+ out[pos[0]++] = (byte) j;
+ if (j != 0 && F[j - 1] != 0) {
+ for (rle = j + 1; rle < Constants.NUMBER_OF_SYMBOLS && F[rle] != 0; rle++) ;
+ rle -= j + 1;
+ out[pos[0]++] = (byte) rle;
+ }
+ }
+ if (F[j] < 128) {
+ out[pos[0]++] = (byte) F[j];
+ } else {
+ out[pos[0]++] = (byte) (128 | (F[j] >> 8));
+ out[pos[0]++] = (byte) (F[j] & 0xFF);
+ }
+ }
+ }
+ out[pos[0]++] = 0;
+ }
+
+ private static void writeFrequenciesOrder1(final byte[] out, final int[] pos, final int[][] F) {
+ final int[] T = new int[Constants.NUMBER_OF_SYMBOLS];
+ for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++)
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++)
+ T[i] += F[i][j];
+
+ int rle_i = 0;
+ for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
+ if (T[i] == 0) continue;
+ if (rle_i != 0) { rle_i--; } else {
+ out[pos[0]++] = (byte) i;
+ if (i != 0 && T[i - 1] != 0) {
+ for (rle_i = i + 1; rle_i < Constants.NUMBER_OF_SYMBOLS && T[rle_i] != 0; rle_i++) ;
+ rle_i -= i + 1;
+ out[pos[0]++] = (byte) rle_i;
+ }
+ }
+
+ final int[] F_i = F[i];
+ int rle_j = 0;
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
+ if (F_i[j] != 0) {
+ if (rle_j != 0) { rle_j--; } else {
+ out[pos[0]++] = (byte) j;
+ if (j != 0 && F_i[j - 1] != 0) {
+ for (rle_j = j + 1; rle_j < Constants.NUMBER_OF_SYMBOLS && F_i[rle_j] != 0; rle_j++) ;
+ rle_j -= j + 1;
+ out[pos[0]++] = (byte) rle_j;
+ }
+ }
+ if (F_i[j] < 128) {
+ out[pos[0]++] = (byte) F_i[j];
+ } else {
+ out[pos[0]++] = (byte) (128 | (F_i[j] >> 8));
+ out[pos[0]++] = (byte) (F_i[j] & 0xFF);
+ }
+ }
+ }
+ out[pos[0]++] = 0;
+ }
+ out[pos[0]++] = 0;
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/RANS4x8Params.java b/src/main/java/htsjdk/samtools/cram/compression/rans/RANS4x8Params.java
new file mode 100644
index 0000000000..8621910fc7
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/rans/RANS4x8Params.java
@@ -0,0 +1,26 @@
+package htsjdk.samtools.cram.compression.rans;
+
+/** Parameters for the rANS 4x8 codec: only the encoding order (0 or 1). */
+public final class RANS4x8Params implements RANSParams {
+
+ private final ORDER order;
+
+ public RANS4x8Params(final ORDER order) {
+ this.order = order;
+ }
+
+ @Override
+ public String toString() {
+ return "RANS4x8Params{" + "order=" + order + "}";
+ }
+
+ @Override
+ public ORDER getOrder() {
+ return order;
+ }
+
+ @Override
+ public int getFormatFlags() {
+ return order == ORDER.ONE ? RANSNx16Params.ORDER_FLAG_MASK : 0;
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/RANSDecode.java b/src/main/java/htsjdk/samtools/cram/compression/rans/RANSDecode.java
index 154cfa9614..4440b07823 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/rans/RANSDecode.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/rans/RANSDecode.java
@@ -1,51 +1,78 @@
package htsjdk.samtools.cram.compression.rans;
-import java.nio.ByteBuffer;
-
+/**
+ * Abstract base class for rANS decoders (both 4x8 and Nx16). Holds the shared decoding
+ * state: per-context frequency tables, reverse-lookup tables, and decoding symbols.
+ *
+ * <p>State is allocated once at construction and reused across calls. Between calls,
+ * only the rows that were actually used in the previous decode are reset, avoiding
+ * the O(65536) full reset that would otherwise be required.
+ */
public abstract class RANSDecode {
- private ArithmeticDecoder[] D;
- private RANSDecodingSymbol[][] decodingSymbols;
+ private final int[][] frequencies;
+ private final byte[][] reverseLookup;
+ private final RANSDecodingSymbol[][] decodingSymbols;
+ private final boolean[] usedRows;
+ private int usedRowCount;
+
+ protected RANSDecode() {
+ frequencies = new int[Constants.NUMBER_OF_SYMBOLS][Constants.NUMBER_OF_SYMBOLS];
+ reverseLookup = new byte[Constants.NUMBER_OF_SYMBOLS][Constants.TOTAL_FREQ];
+ decodingSymbols = new RANSDecodingSymbol[Constants.NUMBER_OF_SYMBOLS][Constants.NUMBER_OF_SYMBOLS];
+ for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
+ decodingSymbols[i][j] = new RANSDecodingSymbol();
+ }
+ }
+ usedRows = new boolean[Constants.NUMBER_OF_SYMBOLS];
+ }
+
+ protected final int[][] getFrequencies() {
+ return frequencies;
+ }
- // GETTERS
- protected ArithmeticDecoder[] getD() {
- return D;
+ protected final byte[][] getReverseLookup() {
+ return reverseLookup;
}
- protected RANSDecodingSymbol[][] getDecodingSymbols() {
+ protected final RANSDecodingSymbol[][] getDecodingSymbols() {
return decodingSymbols;
}
- // This method assumes that inBuffer is already rewound.
- // It uncompresses the data in the inBuffer, leaving it consumed.
- // Returns a rewound ByteBuffer containing the uncompressed data.
- public abstract ByteBuffer uncompress(final ByteBuffer inBuffer);
-
- // Lazy initialization of working memory for the decoder
- protected void initializeRANSDecoder() {
- if (D == null) {
- D = new ArithmeticDecoder[Constants.NUMBER_OF_SYMBOLS];
- for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
- D[i] = new ArithmeticDecoder();
- }
- } else {
- for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
- D[i].reset();
- }
+ /**
+ * Uncompress a rANS-encoded byte stream.
+ *
+ * @param input the compressed byte stream (format-specific header + encoded data)
+ * @return the uncompressed data
+ */
+ public abstract byte[] uncompress(final byte[] input);
+
+ /**
+ * Mark a context row as used. Called by subclass readFrequencyTable methods when
+ * populating a row. Enables selective reset on the next {@link #resetDecoderState} call.
+ */
+ protected final void markRowUsed(final int row) {
+ if (!usedRows[row]) {
+ usedRows[row] = true;
+ usedRowCount++;
}
- if (decodingSymbols == null) {
- decodingSymbols = new RANSDecodingSymbol[Constants.NUMBER_OF_SYMBOLS][Constants.NUMBER_OF_SYMBOLS];
- for (int i = 0; i < decodingSymbols.length; i++) {
- for (int j = 0; j < decodingSymbols[i].length; j++) {
- decodingSymbols[i][j] = new RANSDecodingSymbol();
- }
- }
- } else {
- for (int i = 0; i < decodingSymbols.length; i++) {
- for (int j = 0; j < decodingSymbols[i].length; j++) {
+ }
+
+ /**
+ * Reset only the decoder rows that were used in the previous decode operation.
+ * Called at the start of each uncompress to prepare for new data.
+ */
+ protected final void resetDecoderState() {
+ for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS && usedRowCount > 0; i++) {
+ if (usedRows[i]) {
+ java.util.Arrays.fill(frequencies[i], 0);
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
decodingSymbols[i][j].set(0, 0);
}
+ usedRows[i] = false;
+ usedRowCount--;
}
}
+ usedRowCount = 0;
}
-
-}
\ No newline at end of file
+}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/RANSDecodingSymbol.java b/src/main/java/htsjdk/samtools/cram/compression/rans/RANSDecodingSymbol.java
index 34d0bc7dda..e4211efcae 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/rans/RANSDecodingSymbol.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/rans/RANSDecodingSymbol.java
@@ -24,62 +24,43 @@
*/
package htsjdk.samtools.cram.compression.rans;
-import java.nio.ByteBuffer;
-
-final public class RANSDecodingSymbol {
- int start; // Start of range.
- int freq; // Symbol frequency.
+/**
+ * Holds the start and frequency for a single symbol in the rANS decoding table.
+ * The reverse-lookup table mapping cumulative frequency to symbol is held separately
+ * in the decoder's {@code reverseLookup} arrays.
+ *
+ * <p>Decoding a symbol is a two-step process:
+ * <ol>
+ *     <li>{@link #advanceSymbolStep} — updates the rANS state using the symbol's range</li>
+ *     <li>{@link Utils#RANSDecodeRenormalizeNx16} or {@link Utils#RANSDecodeRenormalize4x8}
+ *     — renormalizes by reading bytes from the compressed stream</li>
+ * </ol>
+ */
+public final class RANSDecodingSymbol {
+ int start;
+ int freq;
+ /**
+ * Set the decoding parameters for this symbol.
+ *
+ * @param start the cumulative frequency of all preceding symbols
+ * @param freq the frequency of this symbol
+ */
public void set(final int start, final int freq) {
- // This method gets called a LOT so this validation is too expensive to leave in.
- //ValidationUtils.validateArg(start <= (1 << 16), "invalid RANSDecodingSymbol start");
- //ValidationUtils.validateArg(freq <= (1 << 16) - start, "invalid RANSDecodingSymbol frequency");
this.start = start;
this.freq = freq;
}
- // Advances in the bit stream by "popping" a single symbol with range start
- // "start" and frequency "freq". All frequencies are assumed to sum to
- // "1 << scale_bits".
- // No renormalization or output happens.
+ /**
+ * Advance the rANS state by one decoded symbol. Does not renormalize — the caller
+ * must call the appropriate {@code Utils.RANSDecodeRenormalize*} method after this.
+ *
+ * @param r the current rANS state
+ * @param scaleBits the frequency scale (log2 of total frequency sum)
+ * @return the updated rANS state (before renormalization)
+ */
public long advanceSymbolStep(final long r, final int scaleBits) {
- final int mask = ((1 << scaleBits) - 1);
-
- // s, x = D(x)
- return freq * (r >> scaleBits) + (r & mask) - start;
- }
-
- // Advances in the bit stream by "popping" a single symbol with range start
- // "start" and frequency "freq". All frequencies are assumed to sum to
- // "1 << scale_bits".
- public long advanceSymbol4x8(final long rIn, final ByteBuffer byteBuffer, final int scaleBits) {
- final int mask = (1 << scaleBits) - 1;
-
- // s, x = D(x)
- long ret = freq * (rIn >> scaleBits) + (rIn & mask) - start;
-
- // re-normalize
- if (ret < Constants.RANS_4x8_LOWER_BOUND) {
- do {
- final int b = 0xFF & byteBuffer.get();
- ret = (ret << 8) | b;
- } while (ret < Constants.RANS_4x8_LOWER_BOUND);
- }
- return ret;
- }
-
- public long advanceSymbolNx16(final long rIn, final ByteBuffer byteBuffer, final int scaleBits) {
final int mask = (1 << scaleBits) - 1;
-
- // s, x = D(x)
- long ret = freq * (rIn >> scaleBits) + (rIn & mask) - start;
-
- // re-normalize
- if (ret < (Constants.RANS_Nx16_LOWER_BOUND)){
- final int i = (0xFF & byteBuffer.get()) | ((0xFF & byteBuffer.get()) << 8);
- ret = (ret << 16) + i;
- }
- return ret;
+ return freq * (r >> scaleBits) + (r & mask) - start;
}
-
-}
\ No newline at end of file
+}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/RANSEncode.java b/src/main/java/htsjdk/samtools/cram/compression/rans/RANSEncode.java
index 49b12dd275..04f966614d 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/rans/RANSEncode.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/rans/RANSEncode.java
@@ -1,58 +1,64 @@
package htsjdk.samtools.cram.compression.rans;
-import java.nio.ByteBuffer;
-
+/**
+ * Abstract base class for rANS encoders (both 4x8 and Nx16). Holds the shared encoding
+ * symbol matrix and provides helper methods for frequency-to-symbol setup.
+ *
+ * <p>The encoding symbol matrix is allocated once at construction and reused across
+ * compress calls. Between calls, only the symbols that were actually used are reset.
+ */
public abstract class RANSEncode<T extends RANSParams> {
- private RANSEncodingSymbol[][] encodingSymbols;
-
- // Getter
- protected RANSEncodingSymbol[][] getEncodingSymbols() {
- return encodingSymbols;
- }
-
- // This method assumes that inBuffer is already rewound.
- // It compresses the data in the inBuffer, leaving it consumed.
- // Returns a rewound ByteBuffer containing the compressed data.
- public abstract ByteBuffer compress(final ByteBuffer inBuffer, final T params);
+ private final RANSEncodingSymbol[][] encodingSymbols;
- // Lazy initialization of working memory for the encoder
- protected void initializeRANSEncoder() {
- if (encodingSymbols == null) {
- encodingSymbols = new RANSEncodingSymbol[Constants.NUMBER_OF_SYMBOLS][Constants.NUMBER_OF_SYMBOLS];
- for (int i = 0; i < encodingSymbols.length; i++) {
- for (int j = 0; j < encodingSymbols[i].length; j++) {
- encodingSymbols[i][j] = new RANSEncodingSymbol();
- }
- }
- } else {
- for (int i = 0; i < encodingSymbols.length; i++) {
- for (int j = 0; j < encodingSymbols[i].length; j++) {
- encodingSymbols[i][j].reset();
- }
+ protected RANSEncode() {
+ encodingSymbols = new RANSEncodingSymbol[Constants.NUMBER_OF_SYMBOLS][Constants.NUMBER_OF_SYMBOLS];
+ for (int i = 0; i < encodingSymbols.length; i++) {
+ for (int j = 0; j < encodingSymbols[i].length; j++) {
+ encodingSymbols[i][j] = new RANSEncodingSymbol();
}
}
}
- protected void buildSymsOrder0(final int[] frequencies) {
- updateEncodingSymbols(frequencies, getEncodingSymbols()[0]);
+ protected final RANSEncodingSymbol[][] getEncodingSymbols() {
+ return encodingSymbols;
}
- protected void buildSymsOrder1(final int[][] frequencies) {
- final RANSEncodingSymbol[][] encodingSymbols = getEncodingSymbols();
+ /**
+ * Compress a byte array using this rANS encoder.
+ *
+ * @param input the data to compress
+ * @param params encoder-specific parameters (order, flags, etc.)
+ * @return the compressed byte stream
+ */
+ public abstract byte[] compress(final byte[] input, final T params);
+
+ /**
+ * Set up encoding symbols for Order-0 from the given normalized frequency table.
+ * Only symbols with non-zero frequency are initialized; others are reset to zero.
+ */
+ protected final void buildSymsOrder0(final int[] frequencies) {
+ resetAndUpdateEncodingSymbols(frequencies, encodingSymbols[0]);
+ }
+
+ /**
+ * Set up encoding symbols for Order-1 from the given normalized frequency tables.
+ * Each row corresponds to one context symbol.
+ */
+ protected final void buildSymsOrder1(final int[][] frequencies) {
for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
- updateEncodingSymbols(frequencies[i], encodingSymbols[i]);
+ resetAndUpdateEncodingSymbols(frequencies[i], encodingSymbols[i]);
}
}
- private void updateEncodingSymbols(int[] frequencies, RANSEncodingSymbol[] encodingSymbols) {
+ private void resetAndUpdateEncodingSymbols(final int[] frequencies, final RANSEncodingSymbol[] symbols) {
+ // No explicit reset needed: set() overwrites all fields, and symbols with zero frequency
+ // are never accessed during encoding (only symbols present in the input are encoded).
int cumulativeFreq = 0;
for (int symbol = 0; symbol < Constants.NUMBER_OF_SYMBOLS; symbol++) {
if (frequencies[symbol] != 0) {
- //For each symbol, set start = cumulative frequency and freq = frequencies[symbol]
- encodingSymbols[symbol].set(cumulativeFreq, frequencies[symbol], Constants.TOTAL_FREQ_SHIFT);
+ symbols[symbol].set(cumulativeFreq, frequencies[symbol], Constants.TOTAL_FREQ_SHIFT);
cumulativeFreq += frequencies[symbol];
}
}
}
-
-}
\ No newline at end of file
+}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/RANSEncodingSymbol.java b/src/main/java/htsjdk/samtools/cram/compression/rans/RANSEncodingSymbol.java
index 8188d1a825..67e79c2857 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/rans/RANSEncodingSymbol.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/rans/RANSEncodingSymbol.java
@@ -24,21 +24,37 @@
*/
package htsjdk.samtools.cram.compression.rans;
-import htsjdk.utils.ValidationUtils;
-
-import java.nio.ByteBuffer;
-
+/**
+ * Encoding state for a single symbol in the rANS codec. Fields are package-private to allow
+ * the encode loops in {@link RANSNx16Encode} and {@link RANS4x8Encode} to inline the
+ * renormalization and state-update arithmetic directly, avoiding method call overhead
+ * in the hot inner loop.
+ */
public final class RANSEncodingSymbol {
- private long xMax; // (Exclusive) upper bound of pre-normalization interval
- private int rcpFreq; // Fixed-point reciprocal frequency
- private int bias; // Bias
- private int cmplFreq; // Complement of frequency: (1 << scaleBits) - freq
- private int rcpShift; // Reciprocal shift
+ /** (Exclusive) upper bound of pre-normalization interval. */
+ long xMax;
+ /** Fixed-point reciprocal frequency for integer division. */
+ int rcpFreq;
+ /** Bias term for the encoding formula. */
+ int bias;
+ /** Complement of frequency: (1 << scaleBits) - freq. */
+ int cmplFreq;
+ /** Reciprocal shift (includes the +32 adjustment). */
+ int rcpShift;
+ /** Reset all encoding parameters to zero. */
public void reset() {
xMax = rcpFreq = bias = cmplFreq = rcpShift = 0;
}
+ /**
+ * Initialize encoding parameters for a symbol given its position in the frequency table.
+ * Computes the reciprocal frequency and bias needed for fast integer division during encoding.
+ *
+ * @param start cumulative frequency of all preceding symbols
+ * @param freq frequency of this symbol (must be > 0)
+ * @param scaleBits log2 of the total frequency sum
+ */
public void set(final int start, final int freq, final int scaleBits) {
// Rans4x8: xMax = ((Constants.RANS_BYTE_L_4x8 >> scaleBits) << 8) * freq = (1<< 31-scaleBits) * freq
@@ -67,54 +83,43 @@ public void set(final int start, final int freq, final int scaleBits) {
rcpShift += 32; // Avoid the extra >>32 in RansEncPutSymbol
}
- public long putSymbol4x8(final long r, final ByteBuffer byteBuffer) {
- ValidationUtils.validateArg(xMax != 0, "can't encode symbol with freq=0");
-
- // re-normalize
+ /**
+ * byte[] variant for Nx16 encoding — writes backwards (decrementing posHolder[0]).
+ * Renormalization bytes are written so the final memory layout is little-endian
+ * (LSB at lower address), matching htslib's RansEncPutSymbol output format.
+ */
+ public long putSymbolNx16(final long r, final byte[] out, final int[] posHolder) {
long retSymbol = r;
+ int pos = posHolder[0];
if (retSymbol >= xMax) {
- byteBuffer.put((byte) (retSymbol & 0xFF));
- retSymbol >>= 8;
+ // Write 2-byte LE renorm word: MSB at higher addr (written first), LSB at lower addr (written second)
+ out[--pos] = (byte) ((retSymbol >> 8) & 0xFF);
+ out[--pos] = (byte) (retSymbol & 0xFF);
+ retSymbol >>= 16;
if (retSymbol >= xMax) {
- byteBuffer.put((byte) (retSymbol & 0xFF));
- retSymbol >>= 8;
+ out[--pos] = (byte) ((retSymbol >> 8) & 0xFF);
+ out[--pos] = (byte) (retSymbol & 0xFF);
+ retSymbol >>= 16;
}
}
-
- // x = C(s,x)
- // NOTE: written this way so we get a 32-bit "multiply high" when
- // available. If you're on a 64-bit platform with cheap multiplies
- // (e.g. x64), just bake the +32 into rcp_shift.
- // int q = (int) (((uint64_t)x * sym.rcp_freq) >> 32) >> sym.rcp_shift;
-
- // The extra >>32 has already been added to RansEncSymbolInit
+ posHolder[0] = pos;
final long q = ((retSymbol * (0xFFFFFFFFL & rcpFreq)) >> rcpShift);
return retSymbol + bias + q * cmplFreq;
}
- public long putSymbolNx16(final long r, final ByteBuffer byteBuffer) {
- ValidationUtils.validateArg(xMax != 0, "can't encode symbol with freq=0");
-
- // re-normalize
+ /** byte[] variant for 4x8 encoding — writes backwards, decrementing posHolder[0]. */
+ public long putSymbol4x8(final long r, final byte[] out, final int[] posHolder) {
long retSymbol = r;
+ int pos = posHolder[0];
if (retSymbol >= xMax) {
- byteBuffer.put((byte) ((retSymbol>>8) & 0xFF)); // extra line - 1 more byte
- byteBuffer.put((byte) (retSymbol & 0xFF));
- retSymbol >>=16;
+ out[--pos] = (byte) (retSymbol & 0xFF);
+ retSymbol >>= 8;
if (retSymbol >= xMax) {
- byteBuffer.put((byte) ((retSymbol>>8) & 0xFF)); // extra line - 1 more byte
- byteBuffer.put((byte) (retSymbol & 0xFF));
- retSymbol >>=16;
+ out[--pos] = (byte) (retSymbol & 0xFF);
+ retSymbol >>= 8;
}
}
-
- // x = C(s,x)
- // NOTE: written this way so we get a 32-bit "multiply high" when
- // available. If you're on a 64-bit platform with cheap multiplies
- // (e.g. x64), just bake the +32 into rcp_shift.
- // int q = (int) (((uint64_t)x * sym.rcp_freq) >> 32) >> sym.rcp_shift;
-
- // The extra >>32 has already been added to RansEncSymbolInit
+ posHolder[0] = pos;
final long q = ((retSymbol * (0xFFFFFFFFL & rcpFreq)) >> rcpShift);
return retSymbol + bias + q * cmplFreq;
}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/RANSNx16Decode.java b/src/main/java/htsjdk/samtools/cram/compression/rans/RANSNx16Decode.java
new file mode 100644
index 0000000000..c895079c5d
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/rans/RANSNx16Decode.java
@@ -0,0 +1,426 @@
+package htsjdk.samtools.cram.compression.rans;
+
+import htsjdk.samtools.cram.CRAMException;
+import htsjdk.samtools.cram.compression.CompressionUtils;
+import htsjdk.samtools.cram.compression.rans.Constants;
+import htsjdk.samtools.cram.compression.rans.RANSDecode;
+import htsjdk.samtools.cram.compression.rans.RANSDecodingSymbol;
+import htsjdk.samtools.cram.compression.rans.Utils;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+
+/**
+ * Decoder for the CRAM 3.1 rANSNx16 codec. Internal operations use byte[] with explicit
+ * offset tracking for performance (PACK decoding still bridges through ByteBuffer). The public API accepts and returns byte[].
+ */
+public class RANSNx16Decode extends RANSDecode {
+ private static final int FREQ_TABLE_OPTIONALLY_COMPRESSED_MASK = 0x01;
+ private static final int RLE_META_OPTIONALLY_COMPRESSED_MASK = 0x01;
+
+ /**
+ * Uncompress a rANS Nx16 encoded byte stream. The format flags byte at the start
+ * of the stream determines which transformations (PACK, RLE, STRIPE, CAT) are applied,
+ * along with the order (0 or 1) and interleave width (4 or 32).
+ *
+ * @param input the compressed byte stream
+ * @return the uncompressed data
+ */
+ @Override
+ public byte[] uncompress(final byte[] input) {
+ if (input.length == 0) {
+ return new byte[0];
+ }
+ return uncompressInternal(input, new int[]{0}, 0);
+ }
+
+ /**
+ * Internal uncompress that works on byte[] with an explicit read position.
+ * Used for recursive calls (stripe, freq table decompression).
+ */
+ private byte[] uncompressInternal(final byte[] in, final int[] inPos, final int outSize) {
+ if (inPos[0] >= in.length) {
+ return new byte[0];
+ }
+
+ final int formatFlags = in[inPos[0]++] & 0xFF;
+ final RANSNx16Params ransNx16Params = new RANSNx16Params(formatFlags);
+
+ int uncompressedSize = ransNx16Params.isNosz() ? outSize : CompressionUtils.readUint7(in, inPos);
+
+ // Stripe: decode each sub-stream and transpose
+ if (ransNx16Params.isStripe()) {
+ return decodeStripe(in, inPos, uncompressedSize);
+ }
+
+ // Pack metadata
+ int packDataLength = 0;
+ int numSymbols = 0;
+ byte[] packMappingTable = null;
+ if (ransNx16Params.isPack()) {
+ packDataLength = uncompressedSize;
+ numSymbols = in[inPos[0]++] & 0xFF;
+ if (numSymbols <= 16 && numSymbols != 0) {
+ packMappingTable = new byte[numSymbols];
+ System.arraycopy(in, inPos[0], packMappingTable, 0, numSymbols);
+ inPos[0] += numSymbols;
+ uncompressedSize = CompressionUtils.readUint7(in, inPos);
+ } else {
+ throw new CRAMException("Bit Packing is not permitted when number of distinct symbols is greater than 16 or equal to 0. " +
+ "Number of distinct symbols: " + numSymbols);
+ }
+ }
+
+ // RLE metadata
+ int uncompressedRLEOutputLength = 0;
+ int[] rleSymbols = null;
+ byte[] rleMetaData = null;
+ int[] rleMetaPos = null; // position into rleMetaData for reading run-lengths
+ if (ransNx16Params.isRLE()) {
+ rleSymbols = new int[Constants.NUMBER_OF_SYMBOLS];
+ final int uncompressedRLEMetaDataLength = CompressionUtils.readUint7(in, inPos);
+ uncompressedRLEOutputLength = uncompressedSize;
+ uncompressedSize = CompressionUtils.readUint7(in, inPos);
+ rleMetaPos = new int[]{0};
+ rleMetaData = decodeRLEMeta(in, inPos, uncompressedRLEMetaDataLength, rleSymbols, rleMetaPos, ransNx16Params);
+ }
+
+ byte[] out;
+
+ if (ransNx16Params.isCAT()) {
+ out = new byte[uncompressedSize];
+ System.arraycopy(in, inPos[0], out, 0, uncompressedSize);
+ inPos[0] += uncompressedSize;
+ } else {
+ if (uncompressedSize == 0) {
+ throw new CRAMException("Unexpected uncompressed size of 0 in RANSNx16 stream");
+ }
+ out = new byte[uncompressedSize];
+ switch (ransNx16Params.getOrder()) {
+ case ZERO:
+ uncompressOrder0WayN(in, inPos, out, uncompressedSize, ransNx16Params);
+ break;
+ case ONE:
+ uncompressOrder1WayN(in, inPos, out, uncompressedSize, ransNx16Params);
+ break;
+ default:
+ throw new CRAMException("Unknown rANSNx16 order: " + ransNx16Params.getOrder());
+ }
+ }
+
+ if (ransNx16Params.isRLE()) {
+ out = decodeRLE(out, rleSymbols, rleMetaData, rleMetaPos, uncompressedRLEOutputLength);
+ }
+
+ if (ransNx16Params.isPack()) {
+ // decodePack still uses ByteBuffer — bridge at this boundary
+ final ByteBuffer packed = ByteBuffer.wrap(out).order(ByteOrder.LITTLE_ENDIAN);
+ final ByteBuffer unpacked = CompressionUtils.decodePack(packed, packMappingTable, numSymbols, packDataLength);
+ out = new byte[unpacked.remaining()];
+ unpacked.get(out);
+ }
+ return out;
+ }
+
+ private void uncompressOrder0WayN(
+ final byte[] in, final int[] inPos,
+ final byte[] out, final int outSize,
+ final RANSNx16Params ransNx16Params) {
+ resetDecoderState();
+ readFrequencyTableOrder0(in, inPos);
+
+ final int Nway = ransNx16Params.getNumInterleavedRANSStates();
+ final long[] rans = new long[Nway];
+ for (int r = 0; r < Nway; r++) {
+ rans[r] = readLittleEndianInt(in, inPos);
+ }
+
+ final int interleaveSize = (Nway == 4) ? (outSize >> 2) : (outSize >> 5);
+ int remSize = outSize - (interleaveSize * Nway);
+ final int out_end = outSize - remSize;
+ final byte[] reverseLookup0 = getReverseLookup()[0];
+ final RANSDecodingSymbol[] syms = getDecodingSymbols()[0];
+
+ for (int i = 0; i < out_end; i += Nway) {
+ for (int r = 0; r < Nway; r++) {
+ final byte decodedSymbol = reverseLookup0[Utils.RANSGetCumulativeFrequency(rans[r], Constants.TOTAL_FREQ_SHIFT)];
+ out[i + r] = decodedSymbol;
+ rans[r] = syms[0xFF & decodedSymbol].advanceSymbolStep(rans[r], Constants.TOTAL_FREQ_SHIFT);
+ rans[r] = Utils.RANSDecodeRenormalizeNx16(rans[r], in, inPos);
+ }
+ }
+
+ int reverseIndex = 0;
+ int outIdx = out_end;
+ while (remSize > 0) {
+ final byte remainingSymbol = reverseLookup0[Utils.RANSGetCumulativeFrequency(rans[reverseIndex], Constants.TOTAL_FREQ_SHIFT)];
+ rans[reverseIndex] = syms[0xFF & remainingSymbol].advanceSymbolStep(rans[reverseIndex], Constants.TOTAL_FREQ_SHIFT);
+ rans[reverseIndex] = Utils.RANSDecodeRenormalizeNx16(rans[reverseIndex], in, inPos);
+ out[outIdx++] = remainingSymbol;
+ remSize--;
+ reverseIndex++;
+ }
+ }
+
+ private void uncompressOrder1WayN(
+ final byte[] in, final int[] inPos,
+ final byte[] out, final int outputSize,
+ final RANSNx16Params ransNx16Params) {
+
+ final int frequencyTableFirstByte = in[inPos[0]++] & 0xFF;
+ final boolean optionalCompressFlag = ((frequencyTableFirstByte & FREQ_TABLE_OPTIONALLY_COMPRESSED_MASK) != 0);
+
+ byte[] freqTableBytes;
+ int[] freqTablePos;
+ if (optionalCompressFlag) {
+ final int uncompressedLength = CompressionUtils.readUint7(in, inPos);
+ final int compressedLength = CompressionUtils.readUint7(in, inPos);
+ final byte[] compressedFreqTable = new byte[compressedLength];
+ System.arraycopy(in, inPos[0], compressedFreqTable, 0, compressedLength);
+ inPos[0] += compressedLength;
+
+ // Decompress freq table using raw Order-0 (no format-flags framing)
+ freqTableBytes = new byte[uncompressedLength];
+ final int[] compPos = new int[]{0};
+ uncompressOrder0WayN(compressedFreqTable, compPos, freqTableBytes, uncompressedLength,
+ new RANSNx16Params(~(RANSNx16Params.ORDER_FLAG_MASK | RANSNx16Params.N32_FLAG_MASK)));
+ freqTablePos = new int[]{0};
+ } else {
+ freqTableBytes = in;
+ freqTablePos = inPos;
+ }
+
+        // Re-initialize decoder state: the nested Order-0 call above may have clobbered it
+ resetDecoderState();
+ final int shift = frequencyTableFirstByte >> 4;
+ readFrequencyTableOrder1(freqTableBytes, freqTablePos, shift);
+
+        // If the freq table was decompressed into a separate buffer, inPos already points past the
+        // compressed table data; if freqTableBytes == in, readFrequencyTableOrder1 advanced inPos itself.
+
+ final int Nway = ransNx16Params.getNumInterleavedRANSStates();
+ final long[] rans = new long[Nway];
+ final int[] interleaveStreamIndex = new int[Nway];
+ final int[] context = new int[Nway];
+ final int interleaveSize = (Nway == 4) ? (outputSize >> 2) : (outputSize >> 5);
+
+ for (int r = 0; r < Nway; r++) {
+ rans[r] = readLittleEndianInt(in, inPos);
+ interleaveStreamIndex[r] = r * interleaveSize;
+ context[r] = 0;
+ }
+
+ final byte[][] reverseLookup = getReverseLookup();
+ final RANSDecodingSymbol[][] syms = getDecodingSymbols();
+ final int[] symbol = new int[Nway];
+
+ while (interleaveStreamIndex[0] < interleaveSize) {
+ for (int r = 0; r < Nway; r++) {
+ symbol[r] = 0xFF & reverseLookup[context[r]][Utils.RANSGetCumulativeFrequency(rans[r], shift)];
+ out[interleaveStreamIndex[r]] = (byte) symbol[r];
+ rans[r] = syms[context[r]][symbol[r]].advanceSymbolStep(rans[r], shift);
+ rans[r] = Utils.RANSDecodeRenormalizeNx16(rans[r], in, inPos);
+ context[r] = symbol[r];
+ }
+ for (int r = 0; r < Nway; r++) {
+ interleaveStreamIndex[r]++;
+ }
+ }
+
+ // Remainder
+ for (; interleaveStreamIndex[Nway - 1] < outputSize; interleaveStreamIndex[Nway - 1]++) {
+ symbol[Nway - 1] = 0xFF & reverseLookup[context[Nway - 1]][Utils.RANSGetCumulativeFrequency(rans[Nway - 1], shift)];
+ out[interleaveStreamIndex[Nway - 1]] = (byte) symbol[Nway - 1];
+ rans[Nway - 1] = syms[context[Nway - 1]][symbol[Nway - 1]].advanceSymbolStep(rans[Nway - 1], shift);
+ rans[Nway - 1] = Utils.RANSDecodeRenormalizeNx16(rans[Nway - 1], in, inPos);
+ context[Nway - 1] = symbol[Nway - 1];
+ }
+ }
+
+ private void readFrequencyTableOrder0(final byte[] in, final int[] inPos) {
+ final int[] alphabet = readAlphabet(in, inPos);
+ markRowUsed(0);
+ final int[] freq = getFrequencies()[0];
+ final byte[] revLookup = getReverseLookup()[0];
+
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
+ if (alphabet[j] > 0) {
+ freq[j] = CompressionUtils.readUint7(in, inPos);
+ }
+ }
+ Utils.normaliseFrequenciesOrder0Shift(freq, Constants.TOTAL_FREQ_SHIFT);
+
+ final RANSDecodingSymbol[] decodingSymbols = getDecodingSymbols()[0];
+ int cumulativeFrequency = 0;
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
+ if (alphabet[j] > 0) {
+ decodingSymbols[j].set(cumulativeFrequency, freq[j]);
+ Arrays.fill(revLookup, cumulativeFrequency, cumulativeFrequency + freq[j], (byte) j);
+ cumulativeFrequency += freq[j];
+ }
+ }
+ }
+
+ private void readFrequencyTableOrder1(final byte[] in, final int[] inPos, final int shift) {
+ final int[][] freq = getFrequencies();
+ final byte[][] revLookup = getReverseLookup();
+ final RANSDecodingSymbol[][] decodingSymbols = getDecodingSymbols();
+ final int[] alphabet = readAlphabet(in, inPos);
+
+ for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
+ if (alphabet[i] > 0) {
+ markRowUsed(i);
+ int run = 0;
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
+ if (alphabet[j] > 0) {
+ if (run > 0) {
+ run--;
+ } else {
+ freq[i][j] = CompressionUtils.readUint7(in, inPos);
+ if (freq[i][j] == 0) {
+ run = in[inPos[0]++] & 0xFF;
+ }
+ }
+ }
+ }
+
+ Utils.normaliseFrequenciesOrder0Shift(freq[i], shift);
+ int cumulativeFreq = 0;
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
+ decodingSymbols[i][j].set(cumulativeFreq, freq[i][j]);
+ Arrays.fill(revLookup[i], cumulativeFreq, cumulativeFreq + freq[i][j], (byte) j);
+ cumulativeFreq += freq[i][j];
+ }
+ }
+ }
+ }
+
+ private static int[] readAlphabet(final byte[] in, final int[] inPos) {
+ final int[] alphabet = new int[Constants.NUMBER_OF_SYMBOLS];
+ int rle = 0;
+ int symbol = in[inPos[0]++] & 0xFF;
+ int lastSymbol = symbol;
+ do {
+ alphabet[symbol] = 1;
+ if (rle != 0) {
+ rle--;
+ symbol++;
+ } else {
+ symbol = in[inPos[0]++] & 0xFF;
+ if (symbol == lastSymbol + 1) {
+ rle = in[inPos[0]++] & 0xFF;
+ }
+ }
+ lastSymbol = symbol;
+ } while (symbol != 0);
+ return alphabet;
+ }
+
+ /**
+ * Decode RLE metadata: extract the symbol list and set rleMetaPos to the start of
+ * the run-length data within the returned byte array.
+ */
+ private byte[] decodeRLEMeta(
+ final byte[] in, final int[] inPos,
+ final int uncompressedRLEMetaDataLength,
+ final int[] rleSymbols,
+ final int[] rleMetaPos,
+ final RANSNx16Params ransNx16Params) {
+ final byte[] uncompressedRLEMetaData;
+
+ if ((uncompressedRLEMetaDataLength & RLE_META_OPTIONALLY_COMPRESSED_MASK) != 0) {
+ final int len = (uncompressedRLEMetaDataLength - 1) / 2;
+ uncompressedRLEMetaData = new byte[len];
+ System.arraycopy(in, inPos[0], uncompressedRLEMetaData, 0, len);
+ inPos[0] += len;
+ } else {
+ final int compressedLen = CompressionUtils.readUint7(in, inPos);
+ final byte[] compressed = new byte[compressedLen];
+ System.arraycopy(in, inPos[0], compressed, 0, compressedLen);
+ inPos[0] += compressedLen;
+ // Decompress using raw Order-0 (not through uncompressInternal, since the data
+ // doesn't have format-flags framing — it was compressed with compressOrder0WayN directly)
+ uncompressedRLEMetaData = new byte[uncompressedRLEMetaDataLength / 2];
+ final int[] compPos = new int[]{0};
+ uncompressOrder0WayN(compressed, compPos, uncompressedRLEMetaData, uncompressedRLEMetaDataLength / 2,
+ new RANSNx16Params(0x00 | ransNx16Params.getFormatFlags() & RANSNx16Params.N32_FLAG_MASK));
+ }
+
+        // Read the symbol list from the metadata via a local cursor; rleMetaPos[0] is then set to the first run-length byte
+ int pos = 0;
+ int numRLESymbols = uncompressedRLEMetaData[pos++] & 0xFF;
+ if (numRLESymbols == 0) {
+ numRLESymbols = Constants.NUMBER_OF_SYMBOLS;
+ }
+ for (int i = 0; i < numRLESymbols; i++) {
+ rleSymbols[uncompressedRLEMetaData[pos++] & 0xFF] = 1;
+ }
+
+ // Set rleMetaPos to point past the symbol list, at the run-length data
+ rleMetaPos[0] = pos;
+ return uncompressedRLEMetaData;
+ }
+
+ private static byte[] decodeRLE(
+ final byte[] in,
+ final int[] rleSymbols,
+ final byte[] rleMetaData,
+ final int[] rleMetaPos,
+ final int uncompressedRLEOutputLength) {
+ final byte[] out = new byte[uncompressedRLEOutputLength];
+ int j = 0;
+ for (int i = 0; j < uncompressedRLEOutputLength; i++) {
+ final byte sym = in[i];
+ if (rleSymbols[sym & 0xFF] != 0) {
+ final int run = CompressionUtils.readUint7(rleMetaData, rleMetaPos);
+ for (int r = 0; r <= run; r++) {
+ out[j++] = sym;
+ }
+ } else {
+ out[j++] = sym;
+ }
+ }
+ return out;
+ }
+
+ private byte[] decodeStripe(final byte[] in, final int[] inPos, final int outSize) {
+ final int numInterleaveStreams = in[inPos[0]++] & 0xFF;
+
+ // Read (and discard) compressed lengths
+ for (int j = 0; j < numInterleaveStreams; j++) {
+ CompressionUtils.readUint7(in, inPos);
+ }
+
+ // Decode each sub-stream
+ final int[] uncompressedLengths = new int[numInterleaveStreams];
+ final byte[][] transposedData = new byte[numInterleaveStreams][];
+ for (int j = 0; j < numInterleaveStreams; j++) {
+ uncompressedLengths[j] = outSize / numInterleaveStreams;
+ if ((outSize % numInterleaveStreams) > j) {
+ uncompressedLengths[j]++;
+ }
+ transposedData[j] = uncompressInternal(in, inPos, uncompressedLengths[j]);
+ }
+
+ // Transpose
+ final byte[] out = new byte[outSize];
+ for (int j = 0; j < numInterleaveStreams; j++) {
+ for (int i = 0; i < uncompressedLengths[j]; i++) {
+ out[(i * numInterleaveStreams) + j] = transposedData[j][i];
+ }
+ }
+ return out;
+ }
+
+    /** Read a 4-byte little-endian unsigned int from in at inPos[0], advancing inPos[0] by 4; the value is returned zero-extended in a long. */
+ private static long readLittleEndianInt(final byte[] in, final int[] inPos) {
+ int pos = inPos[0];
+ final long value = (in[pos] & 0xFFL)
+ | ((in[pos + 1] & 0xFFL) << 8)
+ | ((in[pos + 2] & 0xFFL) << 16)
+ | ((in[pos + 3] & 0xFFL) << 24);
+ inPos[0] = pos + 4;
+ return value;
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/RANSNx16Encode.java b/src/main/java/htsjdk/samtools/cram/compression/rans/RANSNx16Encode.java
new file mode 100644
index 0000000000..e263f29288
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/rans/RANSNx16Encode.java
@@ -0,0 +1,455 @@
+package htsjdk.samtools.cram.compression.rans;
+
+import htsjdk.samtools.cram.CRAMException;
+import htsjdk.samtools.cram.compression.CompressionUtils;
+import htsjdk.samtools.cram.compression.rans.Constants;
+import htsjdk.samtools.cram.compression.rans.RANSEncode;
+import htsjdk.samtools.cram.compression.rans.RANSEncodingSymbol;
+import htsjdk.samtools.cram.compression.rans.RANSParams;
+import htsjdk.samtools.cram.compression.rans.Utils;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Encoder for the CRAM 3.1 rANSNx16 codec. Internal encoding uses byte[] with backwards-write
+ * to eliminate the O(N) reverse pass. Pack/RLE/Stripe preprocessing still bridges through ByteBuffer
+ * where CompressionUtils methods require it.
+ */
+public class RANSNx16Encode extends RANSEncode {
+
+ /**
+ * Compress a byte array using the rANS Nx16 codec. Applies the transformations
+ * specified by the params (PACK, RLE, STRIPE) as preprocessing, then encodes the
+ * result with Order-0 or Order-1 rANS using 4-way or 32-way interleaving.
+ *
+ * @param input the data to compress
+ * @param ransNx16Params encoding parameters specifying order, interleave width, and transformations
+ * @return the compressed byte stream
+ */
+ @Override
+ public byte[] compress(final byte[] input, final RANSNx16Params ransNx16Params) {
+ if (input.length == 0) {
+ return new byte[0];
+ }
+ final ByteBuffer outBuffer = CompressionUtils.allocateOutputBuffer(input.length);
+ final int formatFlags = ransNx16Params.getFormatFlags();
+ outBuffer.put((byte) formatFlags);
+
+ if (!ransNx16Params.isNosz()) {
+ CompressionUtils.writeUint7(input.length, outBuffer);
+ }
+
+ ByteBuffer inputBuffer = CompressionUtils.wrap(input);
+
+ // Stripe
+ if (ransNx16Params.isStripe()) {
+ compressStripe(inputBuffer, outBuffer);
+ final byte[] result = new byte[outBuffer.remaining()];
+ outBuffer.get(result);
+ return result;
+ }
+
+ // Pack
+ if (ransNx16Params.isPack()) {
+ final int[] frequencyTable = new int[Constants.NUMBER_OF_SYMBOLS];
+ for (int i = 0; i < input.length; i++) {
+ frequencyTable[input[i] & 0xFF]++;
+ }
+ int numSymbols = 0;
+ final int[] packMappingTable = new int[Constants.NUMBER_OF_SYMBOLS];
+ for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
+ if (frequencyTable[i] > 0) {
+ packMappingTable[i] = numSymbols++;
+ }
+ }
+ if (numSymbols > 1 && numSymbols <= 16) {
+ inputBuffer = CompressionUtils.encodePack(inputBuffer, outBuffer, frequencyTable, packMappingTable, numSymbols);
+ } else {
+ outBuffer.put(0, (byte) (outBuffer.get(0) & ~RANSNx16Params.PACK_FLAG_MASK));
+ }
+ }
+
+ // RLE
+ if (ransNx16Params.isRLE()) {
+ inputBuffer = encodeRLE(inputBuffer, outBuffer, ransNx16Params);
+ }
+
+ // Extract input bytes for the core encoder
+ final byte[] in = new byte[inputBuffer.remaining()];
+ inputBuffer.get(in);
+
+ if (ransNx16Params.isCAT()) {
+ outBuffer.put(in);
+ outBuffer.limit(outBuffer.position());
+ outBuffer.rewind();
+ final byte[] result = new byte[outBuffer.remaining()];
+ outBuffer.get(result);
+ return result;
+ }
+
+ final int Nway = ransNx16Params.getNumInterleavedRANSStates();
+ RANSNx16Params effectiveParams = ransNx16Params;
+ if (in.length < Nway && ransNx16Params.getOrder() == RANSParams.ORDER.ONE) {
+ outBuffer.put(0, (byte) (outBuffer.get(0) & ~RANSNx16Params.ORDER_FLAG_MASK));
+ effectiveParams = new RANSNx16Params(outBuffer.get(0));
+ if (in.length == 0) {
+ outBuffer.limit(outBuffer.position());
+ outBuffer.rewind();
+ final byte[] result = new byte[outBuffer.remaining()];
+ outBuffer.get(result);
+ return result;
+ }
+ }
+
+ final int prefixSize = outBuffer.position();
+ final byte[] encoded;
+ switch (effectiveParams.getOrder()) {
+ case ZERO:
+ encoded = compressOrder0WayN(in, effectiveParams);
+ break;
+ case ONE:
+ encoded = compressOrder1WayN(in, effectiveParams);
+ break;
+ default:
+ throw new CRAMException("Unknown rANS order: " + effectiveParams.getOrder());
+ }
+
+ final byte[] result = new byte[prefixSize + encoded.length];
+ outBuffer.rewind();
+ outBuffer.get(result, 0, prefixSize);
+ System.arraycopy(encoded, 0, result, prefixSize, encoded.length);
+ return result;
+ }
+
+ // ---- Core Order-0 encoder (byte[] backwards-write) ----
+
+ private byte[] compressOrder0WayN(final byte[] in, final RANSNx16Params params) {
+ final int inSize = in.length;
+ int bitSize = inSize <= 1 ? 0 : 32 - Integer.numberOfLeadingZeros(inSize - 1);
+ if (bitSize > Constants.TOTAL_FREQ_SHIFT) bitSize = Constants.TOTAL_FREQ_SHIFT;
+
+ final int[] F = buildFrequenciesOrder0(in);
+ Utils.normaliseFrequenciesOrder0(F, bitSize);
+
+ final byte[] freqTable = new byte[1024];
+ final int[] freqPos = {0};
+ writeFrequenciesOrder0(freqTable, freqPos, F);
+ final int frequencyTableSize = freqPos[0];
+
+ if (bitSize != Constants.TOTAL_FREQ_SHIFT) {
+ Utils.normaliseFrequenciesOrder0Shift(F, Constants.TOTAL_FREQ_SHIFT);
+ }
+ buildSymsOrder0(F);
+
+ final int Nway = params.getNumInterleavedRANSStates();
+ final int interleaveSize = (Nway == 4) ? (inSize >> 2) : (inSize >> 5);
+ int remainingSize = inSize - (interleaveSize * Nway);
+ final long[] rans = new long[Nway];
+ for (int r = 0; r < Nway; r++) rans[r] = Constants.RANS_Nx16_LOWER_BOUND;
+
+ final int maxCompressedSize = inSize + inSize / 4 + Nway * 4 + 64;
+ final byte[] compressedData = new byte[maxCompressedSize];
+ int pos = maxCompressedSize; // write position, decrements — kept as local for register allocation
+
+ final RANSEncodingSymbol[] syms = getEncodingSymbols()[0];
+
+ // Remainder symbols (inlined putSymbolNx16 to keep pos in a register)
+ int reverseIndex = 1;
+ while (remainingSize > 0) {
+ final RANSEncodingSymbol sym = syms[in[inSize - reverseIndex] & 0xFF];
+ long x = rans[remainingSize - 1];
+ if (x >= sym.xMax) {
+ compressedData[--pos] = (byte) ((x >> 8) & 0xFF);
+ compressedData[--pos] = (byte) (x & 0xFF);
+ x >>= 16;
+ if (x >= sym.xMax) {
+ compressedData[--pos] = (byte) ((x >> 8) & 0xFF);
+ compressedData[--pos] = (byte) (x & 0xFF);
+ x >>= 16;
+ }
+ }
+ rans[remainingSize - 1] = x + sym.bias + ((x * (0xFFFFFFFFL & sym.rcpFreq)) >> sym.rcpShift) * sym.cmplFreq;
+ remainingSize--;
+ reverseIndex++;
+ }
+
+ // Main interleaved encoding loop (inlined putSymbolNx16)
+ for (int i = interleaveSize * Nway; i > 0; i -= Nway) {
+ for (int r = Nway - 1; r >= 0; r--) {
+ final RANSEncodingSymbol sym = syms[in[i - (Nway - r)] & 0xFF];
+ long x = rans[r];
+ if (x >= sym.xMax) {
+ compressedData[--pos] = (byte) ((x >> 8) & 0xFF);
+ compressedData[--pos] = (byte) (x & 0xFF);
+ x >>= 16;
+ if (x >= sym.xMax) {
+ compressedData[--pos] = (byte) ((x >> 8) & 0xFF);
+ compressedData[--pos] = (byte) (x & 0xFF);
+ x >>= 16;
+ }
+ }
+ rans[r] = x + sym.bias + ((x * (0xFFFFFFFFL & sym.rcpFreq)) >> sym.rcpShift) * sym.cmplFreq;
+ }
+ }
+
+ // Flush states: rans[Nway-1] first (highest addr), rans[0] last (lowest addr)
+ for (int i = Nway - 1; i >= 0; i--) {
+ final int state = (int) rans[i];
+ compressedData[--pos] = (byte) ((state >> 24) & 0xFF);
+ compressedData[--pos] = (byte) ((state >> 16) & 0xFF);
+ compressedData[--pos] = (byte) ((state >> 8) & 0xFF);
+ compressedData[--pos] = (byte) (state & 0xFF);
+ }
+
+ final int compressedSize = maxCompressedSize - pos;
+ final byte[] result = new byte[frequencyTableSize + compressedSize];
+ System.arraycopy(freqTable, 0, result, 0, frequencyTableSize);
+ System.arraycopy(compressedData, pos, result, frequencyTableSize, compressedSize);
+ return result;
+ }
+
+ // ---- Core Order-1 encoder (byte[] backwards-write) ----
+
+ private byte[] compressOrder1WayN(final byte[] in, final RANSNx16Params params) {
+ final int inputSize = in.length;
+ final int Nway = params.getNumInterleavedRANSStates();
+ final int[][] frequencies = buildFrequenciesOrder1(in, Nway);
+
+ Utils.normaliseFrequenciesOrder1(frequencies, Constants.TOTAL_FREQ_SHIFT);
+
+ final byte[] uncompFreqTable = new byte[257 * 256 * 3 + 256];
+ final int[] uncompPos = {0};
+ writeFrequenciesOrder1(uncompFreqTable, uncompPos, frequencies);
+ final int uncompFreqTableSize = uncompPos[0];
+
+ final byte[] compFreqTable = compressOrder0WayN(
+ java.util.Arrays.copyOf(uncompFreqTable, uncompFreqTableSize),
+ new RANSNx16Params(~(RANSNx16Params.ORDER_FLAG_MASK | RANSNx16Params.N32_FLAG_MASK)));
+
+
+ final byte[] freqHeader;
+ if (compFreqTable.length < uncompFreqTableSize) {
+ final byte[] h = new byte[1 + 10 + 10 + compFreqTable.length];
+ final int[] hp = {0};
+ h[hp[0]++] = (byte) (1 | Constants.TOTAL_FREQ_SHIFT << 4);
+ CompressionUtils.writeUint7(uncompFreqTableSize, h, hp);
+ CompressionUtils.writeUint7(compFreqTable.length, h, hp);
+ System.arraycopy(compFreqTable, 0, h, hp[0], compFreqTable.length);
+ hp[0] += compFreqTable.length;
+ freqHeader = java.util.Arrays.copyOf(h, hp[0]);
+ } else {
+ freqHeader = new byte[1 + uncompFreqTableSize];
+ freqHeader[0] = (byte) (0 | Constants.TOTAL_FREQ_SHIFT << 4);
+ System.arraycopy(uncompFreqTable, 0, freqHeader, 1, uncompFreqTableSize);
+ }
+
+ Utils.normaliseFrequenciesOrder1Shift(frequencies, Constants.TOTAL_FREQ_SHIFT);
+ buildSymsOrder1(frequencies);
+
+ final long[] rans = new long[Nway];
+ for (int r = 0; r < Nway; r++) rans[r] = Constants.RANS_Nx16_LOWER_BOUND;
+
+ final int interleaveSize = (Nway == 4) ? inputSize >> 2 : inputSize >> 5;
+ final int[] idx = new int[Nway];
+ final byte[] symbol = new byte[Nway];
+ for (int r = 0; r < Nway; r++) {
+ idx[r] = (r + 1) * interleaveSize - 2;
+ symbol[r] = 0;
+ if (idx[r] + 1 >= 0 && r != Nway - 1) symbol[r] = in[idx[r] + 1];
+ if (r == Nway - 1) symbol[r] = in[inputSize - 1];
+ }
+
+ final int maxCompressedSize = inputSize + inputSize / 4 + Nway * 4 + 64;
+ final byte[] compressedData = new byte[maxCompressedSize];
+ final int[] writePos = {maxCompressedSize};
+
+ final RANSEncodingSymbol[][] syms = getEncodingSymbols();
+ final byte[] context = new byte[Nway];
+
+ // Remainder
+ for (idx[Nway - 1] = inputSize - 2;
+ idx[Nway - 1] > Nway * interleaveSize - 2 && idx[Nway - 1] >= 0;
+ idx[Nway - 1]--) {
+ context[Nway - 1] = in[idx[Nway - 1]];
+ rans[Nway - 1] = syms[context[Nway - 1] & 0xFF][symbol[Nway - 1] & 0xFF].putSymbolNx16(rans[Nway - 1], compressedData, writePos);
+ symbol[Nway - 1] = context[Nway - 1];
+ }
+
+ // Main loop
+ while (idx[0] >= 0) {
+ for (int r = 0; r < Nway; r++) {
+ context[Nway - 1 - r] = in[idx[Nway - 1 - r]];
+ rans[Nway - 1 - r] = syms[context[Nway - 1 - r] & 0xFF][symbol[Nway - 1 - r] & 0xFF].putSymbolNx16(rans[Nway - 1 - r], compressedData, writePos);
+ symbol[Nway - 1 - r] = context[Nway - 1 - r];
+ }
+ for (int r = 0; r < Nway; r++) idx[r]--;
+ }
+
+ // Final context=0 symbols
+ for (int r = 0; r < Nway; r++) {
+ rans[Nway - 1 - r] = syms[0][symbol[Nway - 1 - r] & 0xFF].putSymbolNx16(rans[Nway - 1 - r], compressedData, writePos);
+ }
+
+ // Flush states (same pattern as O0)
+ for (int i = Nway - 1; i >= 0; i--) {
+ final int state = (int) rans[i];
+ compressedData[--writePos[0]] = (byte) ((state >> 24) & 0xFF);
+ compressedData[--writePos[0]] = (byte) ((state >> 16) & 0xFF);
+ compressedData[--writePos[0]] = (byte) ((state >> 8) & 0xFF);
+ compressedData[--writePos[0]] = (byte) (state & 0xFF);
+ }
+
+ final int compressedSize = maxCompressedSize - writePos[0];
+ final byte[] result = new byte[freqHeader.length + compressedSize];
+ System.arraycopy(freqHeader, 0, result, 0, freqHeader.length);
+ System.arraycopy(compressedData, writePos[0], result, freqHeader.length, compressedSize);
+ return result;
+ }
+
+ // ---- Frequency helpers (byte[]) ----
+
+ private static int[] buildFrequenciesOrder0(final byte[] in) {
+ final int[] F = new int[Constants.NUMBER_OF_SYMBOLS];
+ for (final byte b : in) F[b & 0xFF]++;
+ return F;
+ }
+
+ private static int[][] buildFrequenciesOrder1(final byte[] in, final int Nway) {
+ final int inputSize = in.length;
+ final int[][] F = new int[Constants.NUMBER_OF_SYMBOLS + 1][Constants.NUMBER_OF_SYMBOLS];
+ byte ctx = 0;
+ for (int i = 0; i < inputSize; i++) {
+ F[Constants.NUMBER_OF_SYMBOLS][ctx & 0xFF]++;
+ F[ctx & 0xFF][in[i] & 0xFF]++;
+ ctx = in[i];
+ }
+ F[Constants.NUMBER_OF_SYMBOLS][ctx & 0xFF]++;
+ for (int n = 1; n < Nway; n++) {
+ final int pos = Nway == 4 ? (n * (inputSize >> 2)) : (n * (inputSize >> 5));
+ F[0][in[pos] & 0xFF]++;
+ }
+ F[Constants.NUMBER_OF_SYMBOLS][0] += Nway - 1;
+ return F;
+ }
+
+ private static void writeFrequenciesOrder0(final byte[] out, final int[] pos, final int[] F) {
+ writeAlphabet(out, pos, F);
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
+ if (F[j] != 0) {
+ if (F[j] < 128) {
+ out[pos[0]++] = (byte) (F[j] & 0x7f);
+ } else {
+ out[pos[0]++] = (byte) (128 | (F[j] >> 7));
+ out[pos[0]++] = (byte) (F[j] & 0x7f);
+ }
+ }
+ }
+ }
+
+ private static void writeFrequenciesOrder1(final byte[] out, final int[] pos, final int[][] F) {
+ writeAlphabet(out, pos, F[Constants.NUMBER_OF_SYMBOLS]);
+ for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
+ if (F[Constants.NUMBER_OF_SYMBOLS][i] == 0) continue;
+ int run = 0;
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
+ if (F[Constants.NUMBER_OF_SYMBOLS][j] == 0) continue;
+ if (run > 0) { run--; continue; }
+ CompressionUtils.writeUint7(F[i][j], out, pos);
+ if (F[i][j] == 0) {
+ for (int k = j + 1; k < Constants.NUMBER_OF_SYMBOLS; k++) {
+ if (F[Constants.NUMBER_OF_SYMBOLS][k] == 0) continue;
+ if (F[i][k] == 0) run++; else break;
+ }
+ out[pos[0]++] = (byte) run;
+ }
+ }
+ }
+ }
+
+ private static void writeAlphabet(final byte[] out, final int[] pos, final int[] F) {
+ int rle = 0;
+ for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
+ if (F[j] != 0) {
+ if (rle != 0) { rle--; } else {
+ out[pos[0]++] = (byte) j;
+ if (j != 0 && F[j - 1] != 0) {
+ for (rle = j + 1; rle < Constants.NUMBER_OF_SYMBOLS && F[rle] != 0; rle++) ;
+ rle -= j + 1;
+ out[pos[0]++] = (byte) rle;
+ }
+ }
+ }
+ }
+ out[pos[0]++] = 0;
+ }
+
+ // ---- RLE and Stripe (ByteBuffer bridge) ----
+
+ private ByteBuffer encodeRLE(final ByteBuffer inBuffer, final ByteBuffer outBuffer, final RANSNx16Params ransNx16Params) {
+ final int[] runCounts = new int[Constants.NUMBER_OF_SYMBOLS];
+ final int inputSize = inBuffer.remaining();
+ int lastSymbol = -1;
+ for (int i = 0; i < inputSize; i++) {
+ final int s = inBuffer.get(i) & 0xFF;
+ runCounts[s] += (s == lastSymbol ? 1 : -1);
+ lastSymbol = s;
+ }
+ int numRLESymbols = 0;
+ for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) if (runCounts[i] > 0) numRLESymbols++;
+ if (numRLESymbols == 0) { numRLESymbols = 1; runCounts[0] = 1; }
+
+ final ByteBuffer rleMetaData = CompressionUtils.allocateByteBuffer(numRLESymbols + 1 + inputSize);
+ rleMetaData.put((byte) numRLESymbols);
+ for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) if (runCounts[i] > 0) rleMetaData.put((byte) i);
+
+ final ByteBuffer encodedBuffer = CompressionUtils.allocateByteBuffer(inputSize);
+ int idx = 0;
+ for (int i = 0; i < inputSize; i++) {
+ encodedBuffer.put(idx++, inBuffer.get(i));
+ if (runCounts[inBuffer.get(i) & 0xFF] > 0) {
+ lastSymbol = inBuffer.get(i) & 0xFF;
+ int run = 0;
+ while (i + run + 1 < inputSize && (inBuffer.get(i + run + 1) & 0xFF) == lastSymbol) run++;
+ CompressionUtils.writeUint7(run, rleMetaData);
+ i += run;
+ }
+ }
+ encodedBuffer.limit(idx);
+ rleMetaData.limit(rleMetaData.position());
+ rleMetaData.rewind();
+
+ final byte[] rleMeta = new byte[rleMetaData.remaining()];
+ rleMetaData.get(rleMeta);
+ final byte[] compressedRleMeta = compressOrder0WayN(rleMeta,
+ new RANSNx16Params(0x00 | ransNx16Params.getFormatFlags() & RANSNx16Params.N32_FLAG_MASK));
+
+ CompressionUtils.writeUint7(rleMeta.length * 2, outBuffer);
+ CompressionUtils.writeUint7(idx, outBuffer);
+ CompressionUtils.writeUint7(compressedRleMeta.length, outBuffer);
+ outBuffer.put(compressedRleMeta);
+
+ inBuffer.position(inBuffer.limit());
+ return encodedBuffer;
+ }
+
+ private void compressStripe(final ByteBuffer inBuffer, final ByteBuffer outBuffer) {
+ final int numStreams = CompressionUtils.getStripeNumStreams();
+ final int[] sizes = CompressionUtils.buildStripeUncompressedSizes(inBuffer.remaining());
+ final ByteBuffer[] chunks = CompressionUtils.stripeTranspose(inBuffer, sizes);
+
+ final byte[][] compressedChunks = new byte[numStreams][];
+ for (int i = 0; i < numStreams; i++) {
+ final byte[] chunkBytes = new byte[chunks[i].remaining()];
+ chunks[i].get(chunkBytes);
+ compressedChunks[i] = compress(chunkBytes, new RANSNx16Params(RANSNx16Params.NOSZ_FLAG_MASK));
+ }
+
+ outBuffer.put((byte) numStreams);
+ for (int i = 0; i < numStreams; i++) CompressionUtils.writeUint7(compressedChunks[i].length, outBuffer);
+ for (int i = 0; i < numStreams; i++) outBuffer.put(compressedChunks[i]);
+
+ inBuffer.position(inBuffer.limit());
+ outBuffer.limit(outBuffer.position());
+ outBuffer.rewind();
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/RANSNx16Params.java b/src/main/java/htsjdk/samtools/cram/compression/rans/RANSNx16Params.java
new file mode 100644
index 0000000000..5f2197e42d
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/compression/rans/RANSNx16Params.java
@@ -0,0 +1,72 @@
+package htsjdk.samtools.cram.compression.rans;
+
+/**
+ * Parameters for the rANS Nx16 codec. The format flags byte encodes the combination
+ * of transformations (ORDER, N32, STRIPE, NOSZ, CAT, RLE, PACK) to apply.
+ */
+public final class RANSNx16Params implements RANSParams {
+
+ public static final int ORDER_FLAG_MASK = 0x01;
+ public static final int N32_FLAG_MASK = 0x04;
+ public static final int STRIPE_FLAG_MASK = 0x08;
+ public static final int NOSZ_FLAG_MASK = 0x10;
+ public static final int CAT_FLAG_MASK = 0x20;
+ public static final int RLE_FLAG_MASK = 0x40;
+ public static final int PACK_FLAG_MASK = 0x80;
+
+ private static final int FORMAT_FLAG_MASK = 0xFF;
+
+ private final int formatFlags;
+
+ /**
+ * @param formatFlags the raw format flags byte from the compressed stream header
+ */
+ public RANSNx16Params(final int formatFlags) {
+ this.formatFlags = formatFlags;
+ }
+
+ @Override
+ public String toString() {
+ return "RANSNx16Params{" + "formatFlags=" + formatFlags + "}";
+ }
+
+ @Override
+ public ORDER getOrder() {
+ return ORDER.fromInt(formatFlags & ORDER_FLAG_MASK);
+ }
+
+ @Override
+ public int getFormatFlags() {
+ return formatFlags & FORMAT_FLAG_MASK;
+ }
+
+ /** @return the number of interleaved rANS states: 4 (default) or 32 (if N32 flag set). */
+ public int getNumInterleavedRANSStates() {
+ return ((formatFlags & N32_FLAG_MASK) == 0) ? 4 : 32;
+ }
+
+ /** @return true if the STRIPE transformation flag is set. */
+ public boolean isStripe() {
+ return (formatFlags & STRIPE_FLAG_MASK) != 0;
+ }
+
+ /** @return true if the NOSZ (no-size) flag is set, meaning output size is externally provided. */
+ public boolean isNosz() {
+ return (formatFlags & NOSZ_FLAG_MASK) != 0;
+ }
+
+ /** @return true if the CAT (concatenation/uncompressed) flag is set. */
+ public boolean isCAT() {
+ return (formatFlags & CAT_FLAG_MASK) != 0;
+ }
+
+ /** @return true if the RLE (run-length encoding) preprocessing flag is set. */
+ public boolean isRLE() {
+ return (formatFlags & RLE_FLAG_MASK) != 0;
+ }
+
+ /** @return true if the PACK (bit-packing) preprocessing flag is set. */
+ public boolean isPack() {
+ return (formatFlags & PACK_FLAG_MASK) != 0;
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/Utils.java b/src/main/java/htsjdk/samtools/cram/compression/rans/Utils.java
index 06abbca89d..039f5b9f4c 100644
--- a/src/main/java/htsjdk/samtools/cram/compression/rans/Utils.java
+++ b/src/main/java/htsjdk/samtools/cram/compression/rans/Utils.java
@@ -1,74 +1,54 @@
package htsjdk.samtools.cram.compression.rans;
-import java.nio.ByteBuffer;
-
-final public class Utils {
-
- private static void reverse(final byte[] array, final int offset, final int size) {
- if (array == null) {
- return;
- }
- int i = offset;
- int j = offset + size - 1;
- while (j > i) {
- byte tmp = array[j];
- array[j] = array[i];
- array[i] = tmp;
- j--;
- i++;
- }
- }
-
- public static void reverse(final ByteBuffer byteBuffer) {
- if (byteBuffer.hasArray()) {
- reverse(byteBuffer.array(), byteBuffer.arrayOffset(), byteBuffer.limit());
- } else {
- for (int i = 0; i < byteBuffer.limit(); i++) {
- byteBuffer.put(i, byteBuffer.get(byteBuffer.limit() - i - 1));
- byteBuffer.put(byteBuffer.limit() - i - 1, byteBuffer.get(i));
- }
- }
- }
-
- // Returns the current cumulative frequency (map it to a symbol yourself!)
+/**
+ * Utility methods for rANS encoding and decoding: cumulative frequency lookup,
+ * state renormalization, and frequency table normalization.
+ */
+public final class Utils {
+
+ /**
+ * Extract the cumulative frequency from a rANS state by masking off the lower {@code scaleBits} bits.
+ *
+ * @param r the current rANS state
+ * @param scaleBits log2 of the total frequency sum
+ * @return the cumulative frequency used to look up the decoded symbol
+ */
public static int RANSGetCumulativeFrequency(final long r, final int scaleBits) {
- return (int) (r & ((1 << scaleBits) - 1)); // since cumulative frequency will be a maximum of 4096
+ return (int) (r & ((1 << scaleBits) - 1));
}
- public static long RANSDecodeRenormalize4x8(final long r, final ByteBuffer byteBuffer) {
+ /** Nx16 renormalization: reads 2 LE bytes from buf at posHolder[0] if state is below lower bound. */
+ public static long RANSDecodeRenormalizeNx16(final long r, final byte[] buf, final int[] posHolder) {
long ret = r;
- while (ret < Constants.RANS_4x8_LOWER_BOUND) {
- ret = (ret << 8) | (0xFF & byteBuffer.get());
+ if (ret < Constants.RANS_Nx16_LOWER_BOUND) {
+ int pos = posHolder[0];
+ ret = (ret << 16) | (buf[pos++] & 0xFF) | ((buf[pos++] & 0xFF) << 8);
+ posHolder[0] = pos;
}
return ret;
}
- public static long RANSDecodeRenormalizeNx16(final long r, final ByteBuffer byteBuffer) {
+ /** 4x8 renormalization: reads 1 byte at a time from buf at posHolder[0] until state reaches lower bound. */
+ public static long RANSDecodeRenormalize4x8(final long r, final byte[] buf, final int[] posHolder) {
long ret = r;
- if (ret < (Constants.RANS_Nx16_LOWER_BOUND)) {
- final int i = (0xFF & byteBuffer.get()) | ((0xFF & byteBuffer.get()) << 8);
- ret = (ret << 16) | i;
+ while (ret < Constants.RANS_4x8_LOWER_BOUND) {
+ ret = (ret << 8) | (buf[posHolder[0]++] & 0xFF);
}
return ret;
}
+ /**
+ * Normalize symbol frequencies so they sum to {@code 1 << bits}.
+ * Uses fixed-point arithmetic to scale frequencies proportionally.
+ */
public static void normaliseFrequenciesOrder0(final int[] F, final int bits) {
- // Returns an array of normalised Frequencies,
- // such that the frequencies add up to 1<0)?(((long) (renormFreq) << 31) / T + (1 << 30) / T):0;
+ final long tr = (T > 0) ? (((long) renormFreq << 31) / T + (1 << 30) / T) : 0;
int fsum = 0;
for (int symbol = 0; symbol < Constants.NUMBER_OF_SYMBOLS; symbol++) {
if (F[symbol] == 0) {
continue;
}
-
- // As per spec, total frequencies after normalization should be 4096 (4095 could be considered legacy value)
- // using tr to normalize symbol frequencies such that their total = renormFreq
if ((F[symbol] = (int) ((F[symbol] * tr) >> 31)) == 0) {
-
- // A non-zero symbol frequency should not be incorrectly set to 0.
- // If the calculated value is 0, change it to 1
F[symbol] = 1;
}
fsum += F[symbol];
}
- // adjust the frequency of the symbol "M" such that
- // the sum of frequencies of all the symbols = renormFreq
if (fsum < renormFreq) {
F[M] += renormFreq - fsum;
} else if (fsum > renormFreq) {
@@ -107,63 +77,59 @@ public static void normaliseFrequenciesOrder0(final int[] F, final int bits) {
}
}
+ /**
+ * Normalize Order-1 frequency tables: for each context symbol with non-zero frequency,
+ * compute the minimum bit size and normalize that context's frequency table.
+ */
public static void normaliseFrequenciesOrder1(final int[][] F, final int shift) {
- // calculate the minimum bit size required for representing the frequency array for each symbol
- // and normalise the frequency array using the calculated bit size
for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
- if (F[Constants.NUMBER_OF_SYMBOLS][j]==0){
+ if (F[Constants.NUMBER_OF_SYMBOLS][j] == 0) {
continue;
}
- // log2 N = Math.log(N)/Math.log(2)
int bitSize = (int) Math.ceil(Math.log(F[Constants.NUMBER_OF_SYMBOLS][j]) / Math.log(2));
- if (bitSize > shift)
+ if (bitSize > shift) {
bitSize = shift;
+ }
+ if (bitSize == 0) {
+ bitSize = 1;
+ }
- // TODO: check if handling bitSize = 0 is required
- if (bitSize == 0)
- bitSize = 1; // bitSize cannot be zero
-
- // special case -> if a symbol occurs only once and at the end of the input,
- // then the order 0 freq table associated with it should have a frequency of 1 for symbol 0
- // i.e, F[sym][0] = 1
normaliseFrequenciesOrder0(F[j], bitSize);
}
}
- public static void normaliseFrequenciesOrder0Shift(final int[] frequencies, final int bits){
-
- // compute total frequency
+ /**
+ * Shift-based frequency normalization: scale frequencies by a power of 2 so they sum to {@code 1 << bits}.
+ */
+ public static void normaliseFrequenciesOrder0Shift(final int[] frequencies, final int bits) {
int totalFrequency = 0;
- for (int freq : frequencies) {
+ for (final int freq : frequencies) {
totalFrequency += freq;
}
- if (totalFrequency == 0 || totalFrequency == (1<> 2;
- int i0 = 0;
- int i1 = isz4;
- int i2 = 2 * isz4;
- int i7 = 3 * isz4;
- byte l0 = 0;
- byte l1 = 0;
- byte l2 = 0;
- byte l7 = 0;
- final ArithmeticDecoder[] D = getD();
- final RANSDecodingSymbol[][] syms = getDecodingSymbols();
- for (; i0 < isz4; i0++, i1++, i2++, i7++) {
- final byte c0 = D[0xFF & l0].reverseLookup[Utils.RANSGetCumulativeFrequency(rans0, Constants.TOTAL_FREQ_SHIFT)];
- final byte c1 = D[0xFF & l1].reverseLookup[Utils.RANSGetCumulativeFrequency(rans1, Constants.TOTAL_FREQ_SHIFT)];
- final byte c2 = D[0xFF & l2].reverseLookup[Utils.RANSGetCumulativeFrequency(rans2, Constants.TOTAL_FREQ_SHIFT)];
- final byte c7 = D[0xFF & l7].reverseLookup[Utils.RANSGetCumulativeFrequency(rans7, Constants.TOTAL_FREQ_SHIFT)];
-
- outBuffer.put(i0, c0);
- outBuffer.put(i1, c1);
- outBuffer.put(i2, c2);
- outBuffer.put(i7, c7);
-
- rans0 = syms[0xFF & l0][0xFF & c0].advanceSymbolStep(rans0, Constants.TOTAL_FREQ_SHIFT);
- rans1 = syms[0xFF & l1][0xFF & c1].advanceSymbolStep(rans1, Constants.TOTAL_FREQ_SHIFT);
- rans2 = syms[0xFF & l2][0xFF & c2].advanceSymbolStep(rans2, Constants.TOTAL_FREQ_SHIFT);
- rans7 = syms[0xFF & l7][0xFF & c7].advanceSymbolStep(rans7, Constants.TOTAL_FREQ_SHIFT);
-
- rans0 = Utils.RANSDecodeRenormalize4x8(rans0, inBuffer);
- rans1 = Utils.RANSDecodeRenormalize4x8(rans1, inBuffer);
- rans2 = Utils.RANSDecodeRenormalize4x8(rans2, inBuffer);
- rans7 = Utils.RANSDecodeRenormalize4x8(rans7, inBuffer);
-
- l0 = c0;
- l1 = c1;
- l2 = c2;
- l7 = c7;
- }
-
- // Remainder
- for (; i7 < out_sz; i7++) {
- final byte c7 = D[0xFF & l7].reverseLookup[Utils.RANSGetCumulativeFrequency(rans7, Constants.TOTAL_FREQ_SHIFT)];
- outBuffer.put(i7, c7);
- rans7 = syms[0xFF & l7][0xFF & c7].advanceSymbol4x8(rans7, inBuffer, Constants.TOTAL_FREQ_SHIFT);
- // TODO: the spec specifies renormalize here
- // rans7 = Utils.RANSDecodeRenormalize4x8(rans7, inBuffer);
- l7 = c7;
- }
- }
-
- private void readStatsOrder0(final ByteBuffer cp) {
- // Pre-compute reverse lookup of frequency.
- final ArithmeticDecoder decoder = getD()[0];
- final RANSDecodingSymbol[] decodingSymbols = getDecodingSymbols()[0];
- int rle = 0;
- int cumulativeFrequency = 0;
- int symbol = cp.get() & 0xFF;
- do {
- if ((decoder.frequencies[symbol] = (cp.get() & 0xFF)) >= 0x80) {
-
- // read a variable sized unsigned integer with ITF8 encoding
- decoder.frequencies[symbol] &= ~0x80;
- decoder.frequencies[symbol] = ((decoder.frequencies[symbol] & 0x7F) << 8) | (cp.get() & 0xFF);
- }
-
- decodingSymbols[symbol].set(cumulativeFrequency, decoder.frequencies[symbol]);
-
- /* Build reverse lookup table */
- Arrays.fill(decoder.reverseLookup, cumulativeFrequency, cumulativeFrequency + decoder.frequencies[symbol], (byte) symbol);
-
- cumulativeFrequency += decoder.frequencies[symbol];
-
- if (rle == 0 && symbol + 1 == (0xFF & cp.get(cp.position()))) {
- symbol = cp.get() & 0xFF;
- rle = cp.get() & 0xFF;
- } else if (rle != 0) {
- rle--;
- symbol++;
- } else {
- symbol = cp.get() & 0xFF;
- }
- } while (symbol != 0);
-
- assert (cumulativeFrequency <= Constants.TOTAL_FREQ);
- }
-
- private void readStatsOrder1(final ByteBuffer cp) {
- final ArithmeticDecoder[] D = getD();
- final RANSDecodingSymbol[][] decodingSymbols = getDecodingSymbols();
- int rle_i = 0;
- int i = 0xFF & cp.get();
- do {
- int rle_j = 0;
- int cumulativeFrequency = 0;
- int j = 0xFF & cp.get();
- do {
- if ((D[i].frequencies[j] = (0xFF & cp.get())) >= 0x80) {
-
- // read a variable sized unsigned integer with ITF8 encoding
- D[i].frequencies[j] &= ~0x80;
- D[i].frequencies[j] = ((D[i].frequencies[j] & 0x7F) << 8) | (0xFF & cp.get());
- }
-
- if (D[i].frequencies[j] == 0) {
- D[i].frequencies[j] = Constants.TOTAL_FREQ;
- }
-
- decodingSymbols[i][j].set(
- cumulativeFrequency,
- D[i].frequencies[j]
- );
-
- /* Build reverse lookup table */
- Arrays.fill(D[i].reverseLookup, cumulativeFrequency, cumulativeFrequency + D[i].frequencies[j], (byte) j);
-
- cumulativeFrequency += D[i].frequencies[j];
- assert (cumulativeFrequency <= Constants.TOTAL_FREQ);
-
- if (rle_j == 0 && j + 1 == (0xFF & cp.get(cp.position()))) {
- j = (0xFF & cp.get());
- rle_j = (0xFF & cp.get());
- } else if (rle_j != 0) {
- rle_j--;
- j++;
- } else {
- j = (0xFF & cp.get());
- }
- } while (j != 0);
-
- if (rle_i == 0 && i + 1 == (0xFF & cp.get(cp.position()))) {
- i = (0xFF & cp.get());
- rle_i = (0xFF & cp.get());
- } else if (rle_i != 0) {
- rle_i--;
- i++;
- } else {
- i = (0xFF & cp.get());
- }
- } while (i != 0);
- }
-
-}
\ No newline at end of file
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/rans4x8/RANS4x8Encode.java b/src/main/java/htsjdk/samtools/cram/compression/rans/rans4x8/RANS4x8Encode.java
deleted file mode 100644
index 638882fb67..0000000000
--- a/src/main/java/htsjdk/samtools/cram/compression/rans/rans4x8/RANS4x8Encode.java
+++ /dev/null
@@ -1,442 +0,0 @@
-package htsjdk.samtools.cram.compression.rans.rans4x8;
-
-import htsjdk.samtools.cram.CRAMException;
-import htsjdk.samtools.cram.compression.CompressionUtils;
-import htsjdk.samtools.cram.compression.rans.Constants;
-import htsjdk.samtools.cram.compression.rans.RANSEncode;
-import htsjdk.samtools.cram.compression.rans.RANSEncodingSymbol;
-import htsjdk.samtools.cram.compression.rans.RANSParams;
-import htsjdk.samtools.cram.compression.rans.Utils;
-import htsjdk.utils.ValidationUtils;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-
-public class RANS4x8Encode extends RANSEncode {
-
- // streams smaller than this value don't have sufficient symbol context for ORDER-1 encoding,
- // so always use ORDER-0
- private static final int MINIMUM_ORDER_1_SIZE = 4;
- private static final ByteBuffer EMPTY_BUFFER = CompressionUtils.allocateByteBuffer(0);
-
- // This method assumes that inBuffer is already rewound.
- // It compresses the data in the inBuffer, leaving it consumed.
- // Returns a rewound ByteBuffer containing the compressed data.
- public ByteBuffer compress(final ByteBuffer inBuffer, final RANS4x8Params params) {
- if (inBuffer.remaining() == 0) {
- return EMPTY_BUFFER;
- }
- initializeRANSEncoder();
- if (inBuffer.remaining() < MINIMUM_ORDER_1_SIZE) {
- // ORDER-1 encoding of less than 4 bytes is not permitted, so just use ORDER-0
- return compressOrder0Way4(inBuffer);
- }
- final RANSParams.ORDER order= params.getOrder();
- switch (order) {
- case ZERO:
- return compressOrder0Way4(inBuffer);
-
- case ONE:
- return compressOrder1Way4(inBuffer);
-
- default:
- throw new CRAMException("Unknown rANS order: " + params.getOrder());
- }
- }
-
- private ByteBuffer compressOrder0Way4(final ByteBuffer inBuffer) {
- final int inputSize = inBuffer.remaining();
- final ByteBuffer outBuffer = CompressionUtils.allocateOutputBuffer(inputSize);
-
- // move the output buffer ahead to the start of the frequency table (we'll come back and
- // write the output stream prefix at the end of this method)
- outBuffer.position(Constants.RANS_4x8_PREFIX_BYTE_LENGTH); // start of frequency table
-
- // get the normalised frequencies of the alphabets
- final int[] normalizedFreq = calcFrequenciesOrder0(inBuffer);
-
- // using the normalised frequencies, set the RANSEncodingSymbols
- buildSymsOrder0(normalizedFreq);
- final ByteBuffer cp = CompressionUtils.slice(outBuffer);
-
- // write Frequency table
- final int frequencyTableSize = writeFrequenciesOrder0(cp, normalizedFreq);
-
- inBuffer.rewind();
-
- final RANSEncodingSymbol[] syms = getEncodingSymbols()[0];
- final int in_size = inBuffer.remaining();
- long rans0, rans1, rans2, rans3;
- final ByteBuffer ptr = CompressionUtils.slice(cp);
- rans0 = Constants.RANS_4x8_LOWER_BOUND;
- rans1 = Constants.RANS_4x8_LOWER_BOUND;
- rans2 = Constants.RANS_4x8_LOWER_BOUND;
- rans3 = Constants.RANS_4x8_LOWER_BOUND;
-
- int i;
- switch (i = (in_size & 3)) {
- case 3:
- rans2 = syms[0xFF & inBuffer.get(in_size - (i - 2))].putSymbol4x8(rans2, ptr);
- case 2:
- rans1 = syms[0xFF & inBuffer.get(in_size - (i - 1))].putSymbol4x8(rans1, ptr);
- case 1:
- rans0 = syms[0xFF & inBuffer.get(in_size - (i))].putSymbol4x8(rans0, ptr);
- case 0:
- break;
- }
- for (i = (in_size & ~3); i > 0; i -= 4) {
- final byte c3 = inBuffer.get(i - 1);
- final byte c2 = inBuffer.get(i - 2);
- final byte c1 = inBuffer.get(i - 3);
- final byte c0 = inBuffer.get(i - 4);
-
- rans3 = syms[0xFF & c3].putSymbol4x8(rans3, ptr);
- rans2 = syms[0xFF & c2].putSymbol4x8(rans2, ptr);
- rans1 = syms[0xFF & c1].putSymbol4x8(rans1, ptr);
- rans0 = syms[0xFF & c0].putSymbol4x8(rans0, ptr);
- }
-
- ptr.order(ByteOrder.BIG_ENDIAN);
- ptr.putInt((int) rans3);
- ptr.putInt((int) rans2);
- ptr.putInt((int) rans1);
- ptr.putInt((int) rans0);
- ptr.flip();
- final int cdata_size = ptr.limit();
- // reverse the compressed bytes, so that they become in REVERSE order:
- Utils.reverse(ptr);
- inBuffer.position(inBuffer.limit());
-
- // write the prefix at the beginning of the output buffer
- writeCompressionPrefix(RANSParams.ORDER.ZERO, outBuffer, inputSize, frequencyTableSize, cdata_size);
- return outBuffer;
- }
-
- private ByteBuffer compressOrder1Way4(final ByteBuffer inBuffer) {
- final int inSize = inBuffer.remaining();
- final ByteBuffer outBuffer = CompressionUtils.allocateOutputBuffer(inSize);
-
- // move to start of frequency
- outBuffer.position(Constants.RANS_4x8_PREFIX_BYTE_LENGTH);
-
- // get normalized frequencies
- final int[][] normalizedFreq = calcFrequenciesOrder1(inBuffer);
-
- // using the normalised frequencies, set the RANSEncodingSymbols
- buildSymsOrder1(normalizedFreq);
-
- final ByteBuffer cp = CompressionUtils.slice(outBuffer);
- final int frequencyTableSize = writeFrequenciesOrder1(cp, normalizedFreq);
- inBuffer.rewind();
- final int in_size = inBuffer.remaining();
- long rans0, rans1, rans2, rans3;
- rans0 = Constants.RANS_4x8_LOWER_BOUND;
- rans1 = Constants.RANS_4x8_LOWER_BOUND;
- rans2 = Constants.RANS_4x8_LOWER_BOUND;
- rans3 = Constants.RANS_4x8_LOWER_BOUND;
-
- final int isz4 = in_size >> 2;
- int i0 = isz4 - 2;
- int i1 = 2 * isz4 - 2;
- int i2 = 3 * isz4 - 2;
- int i3 = 4 * isz4 - 2;
-
- byte l0 = 0;
- if (i0 + 1 >= 0) {
- l0 = inBuffer.get(i0 + 1);
- }
- byte l1 = 0;
- if (i1 + 1 >= 0) {
- l1 = inBuffer.get(i1 + 1);
- }
- byte l2 = 0;
- if (i2 + 1 >= 0) {
- l2 = inBuffer.get(i2 + 1);
- }
-
- // Deal with the remainder
- byte l3 = inBuffer.get(in_size - 1);
-
- // Slicing is needed for buffer reversing later
- final ByteBuffer ptr = CompressionUtils.slice(cp);
- final RANSEncodingSymbol[][] syms = getEncodingSymbols();
- for (i3 = in_size - 2; i3 > 4 * isz4 - 2 && i3 >= 0; i3--) {
- final byte c3 = inBuffer.get(i3);
- rans3 = syms[0xFF & c3][0xFF & l3].putSymbol4x8(rans3, ptr);
- l3 = c3;
- }
-
- for (; i0 >= 0; i0--, i1--, i2--, i3--) {
- final byte c0 = inBuffer.get(i0);
- final byte c1 = inBuffer.get(i1);
- final byte c2 = inBuffer.get(i2);
- final byte c3 = inBuffer.get(i3);
-
- rans3 = syms[0xFF & c3][0xFF & l3].putSymbol4x8(rans3, ptr);
- rans2 = syms[0xFF & c2][0xFF & l2].putSymbol4x8(rans2, ptr);
- rans1 = syms[0xFF & c1][0xFF & l1].putSymbol4x8(rans1, ptr);
- rans0 = syms[0xFF & c0][0xFF & l0].putSymbol4x8(rans0, ptr);
-
- l0 = c0;
- l1 = c1;
- l2 = c2;
- l3 = c3;
- }
-
- rans3 = syms[0][0xFF & l3].putSymbol4x8(rans3, ptr);
- rans2 = syms[0][0xFF & l2].putSymbol4x8(rans2, ptr);
- rans1 = syms[0][0xFF & l1].putSymbol4x8(rans1, ptr);
- rans0 = syms[0][0xFF & l0].putSymbol4x8(rans0, ptr);
-
- ptr.order(ByteOrder.BIG_ENDIAN);
- ptr.putInt((int) rans3);
- ptr.putInt((int) rans2);
- ptr.putInt((int) rans1);
- ptr.putInt((int) rans0);
- ptr.flip();
- final int compressedBlobSize = ptr.limit();
- Utils.reverse(ptr);
- /*
- * Depletion of the in buffer cannot be confirmed because of the get(int
- * position) method use during encoding, hence enforcing:
- */
- inBuffer.position(inBuffer.limit());
-
- // write the prefix at the beginning of the output buffer
- writeCompressionPrefix(RANSParams.ORDER.ONE, outBuffer, inSize, frequencyTableSize, compressedBlobSize);
- return outBuffer;
- }
-
- private static void writeCompressionPrefix(
- final RANSParams.ORDER order,
- final ByteBuffer outBuffer,
- final int inSize,
- final int frequencyTableSize,
- final int compressedBlobSize) {
- ValidationUtils.validateArg(order == RANSParams.ORDER.ONE || order == RANSParams.ORDER.ZERO,"unrecognized RANS order");
- outBuffer.limit(Constants.RANS_4x8_PREFIX_BYTE_LENGTH + frequencyTableSize + compressedBlobSize);
-
- // go back to the beginning of the stream and write the prefix values
- // write the (ORDER as a single byte at offset 0)
- outBuffer.put(0, (byte) (order == RANSParams.ORDER.ZERO ? 0 : 1));
- // move past the ORDER and write the compressed size
- outBuffer.putInt(Constants.RANS_4x8_ORDER_BYTE_LENGTH, frequencyTableSize + compressedBlobSize);
- // move past the compressed size and write the uncompressed size
- outBuffer.putInt(Constants.RANS_4x8_ORDER_BYTE_LENGTH + Constants.RANS_4x8_COMPRESSED_BYTE_LENGTH, inSize);
- outBuffer.rewind();
- }
-
- private static int[] calcFrequenciesOrder0(final ByteBuffer inBuffer) {
- // TODO: remove duplicate code -use Utils.normalise here
- final int T = inBuffer.remaining();
-
- // Compute statistics
- // T = total of true counts = inBuffer size
- // F = scaled integer frequencies
- // M = sum(fs)
- final int[] F = new int[Constants.NUMBER_OF_SYMBOLS];
- for (int i = 0; i < T; i++) {
- F[0xFF & inBuffer.get()]++;
- }
-
- // Normalise so T == TOTFREQ
- // m is the maximum frequency value
- // M is the symbol that has the maximum frequency
- int m = 0;
- int M = 0;
- for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
- if (m < F[j]) {
- m = F[j];
- M = j;
- }
- }
-
- final long tr = ((long) Constants.TOTAL_FREQ << 31) / T + (1 << 30) / T;
- int fsum = 0;
- for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
- if (F[j] == 0) {
- continue;
- }
- // using tr to normalize symbol frequencies such that their total = (1<<12) = 4096
- if ((F[j] = (int) ((F[j] * tr) >> 31)) == 0) {
- // make sure that a non-zero symbol frequency is not incorrectly set to 0.
- // Change it to 1 if the calculated value is 0.
- F[j] = 1;
- }
- fsum += F[j];
- }
-
- // Commenting the below line as it is incrementing fsum by 1, which does not make sense
- // and it also makes total normalised frequency = 4095 and not 4096.
- // fsum++;
-
- // adjust the frequency of the symbol with maximum frequency to make sure that
- // the sum of frequencies of all the symbols = 4096
- if (fsum < Constants.TOTAL_FREQ) {
- F[M] += Constants.TOTAL_FREQ - fsum;
- } else {
- F[M] -= fsum - Constants.TOTAL_FREQ;
- }
- return F;
- }
-
- private static int[][] calcFrequenciesOrder1(final ByteBuffer in) {
- final int in_size = in.remaining();
-
- final int[][] F = new int[Constants.NUMBER_OF_SYMBOLS][Constants.NUMBER_OF_SYMBOLS];
- final int[] T = new int[Constants.NUMBER_OF_SYMBOLS];
- int last_i = 0;
- for (int i = 0; i < in_size; i++) {
- int c = 0xFF & in.get();
- F[last_i][c]++;
- T[last_i]++;
- last_i = c;
- }
- F[0][0xFF & in.get((in_size >> 2))]++;
- F[0][0xFF & in.get(2 * (in_size >> 2))]++;
- F[0][0xFF & in.get(3 * (in_size >> 2))]++;
- T[0] += 3;
-
- for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
- if (T[i] == 0) {
- continue;
- }
-
- final double p = ((double) Constants.TOTAL_FREQ) / T[i];
- int t2 = 0, m = 0, M = 0;
- for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
- if (F[i][j] == 0)
- continue;
-
- if (m < F[i][j]) {
- m = F[i][j];
- M = j;
- }
-
- if ((F[i][j] *= p) == 0)
- F[i][j] = 1;
- t2 += F[i][j];
- }
-
- // Commenting the below line as it is incrementing t2 by 1, which does not make sense
- // and it also makes total normalised frequency = 4095 and not 4096.
- // t2++;
-
- if (t2 < Constants.TOTAL_FREQ) {
- F[i][M] += Constants.TOTAL_FREQ - t2;
- } else {
- F[i][M] -= t2 - Constants.TOTAL_FREQ;
- }
- }
-
- return F;
- }
-
- private static int writeFrequenciesOrder0(final ByteBuffer cp, final int[] F) {
- final int start = cp.position();
-
- int rle = 0;
- for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
- if (F[j] != 0) {
- // j
- if (rle != 0) {
- rle--;
- } else {
- // write the symbol if it is the first symbol or if rle = 0.
- // if rle != 0, then skip writing the symbol.
- cp.put((byte) j);
- // We've encoded two symbol frequencies in a row.
- // How many more are there? Store that count so
- // we can avoid writing consecutive symbols.
- // Note: maximum possible rle = 254
- // rle requires atmost 1 byte
- if (rle == 0 && j != 0 && F[j - 1] != 0) {
- for (rle = j + 1; rle < Constants.NUMBER_OF_SYMBOLS && F[rle] != 0; rle++)
- ;
- rle -= j + 1;
- cp.put((byte) rle);
- }
- }
-
- // F[j]
- if (F[j] < 128) {
- cp.put((byte) (F[j]));
- } else {
- // if F[j] >127, it is written in 2 bytes
- cp.put((byte) (128 | (F[j] >> 8)));
- cp.put((byte) (F[j] & 0xff));
- }
- }
- }
-
- // write 0 indicating the end of frequency table
- cp.put((byte) 0);
- return cp.position() - start;
- }
-
- private static int writeFrequenciesOrder1(final ByteBuffer cp, final int[][] F) {
- final int start = cp.position();
- final int[] T = new int[Constants.NUMBER_OF_SYMBOLS];
-
- for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
- for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
- T[i] += F[i][j];
- }
- }
-
- int rle_i = 0;
- for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
- if (T[i] == 0) {
- continue;
- }
-
- // Store frequency table
- // i
- if (rle_i != 0) {
- rle_i--;
- } else {
- cp.put((byte) i);
- // FIXME: could use order-0 statistics to observe which alphabet
- // symbols are present and base RLE on that ordering instead.
- if (i != 0 && T[i - 1] != 0) {
- for (rle_i = i + 1; rle_i < Constants.NUMBER_OF_SYMBOLS && T[rle_i] != 0; rle_i++)
- ;
- rle_i -= i + 1;
- cp.put((byte) rle_i);
- }
- }
-
- final int[] F_i_ = F[i];
- int rle_j = 0;
- for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
- if (F_i_[j] != 0) {
-
- // j
- if (rle_j != 0) {
- rle_j--;
- } else {
- cp.put((byte) j);
- if (rle_j == 0 && j != 0 && F_i_[j - 1] != 0) {
- for (rle_j = j + 1; rle_j < Constants.NUMBER_OF_SYMBOLS && F_i_[rle_j] != 0; rle_j++)
- ;
- rle_j -= j + 1;
- cp.put((byte) rle_j);
- }
- }
-
- // F_i_[j]
- if (F_i_[j] < 128) {
- cp.put((byte) F_i_[j]);
- } else {
- cp.put((byte) (128 | (F_i_[j] >> 8)));
- cp.put((byte) (F_i_[j] & 0xff));
- }
- }
- }
- cp.put((byte) 0);
- }
- cp.put((byte) 0);
-
- return cp.position() - start;
- }
-
-}
\ No newline at end of file
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/rans4x8/RANS4x8Params.java b/src/main/java/htsjdk/samtools/cram/compression/rans/rans4x8/RANS4x8Params.java
deleted file mode 100644
index 8ea6c9e855..0000000000
--- a/src/main/java/htsjdk/samtools/cram/compression/rans/rans4x8/RANS4x8Params.java
+++ /dev/null
@@ -1,30 +0,0 @@
-package htsjdk.samtools.cram.compression.rans.rans4x8;
-
-import htsjdk.samtools.cram.compression.rans.RANSParams;
-import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Params;
-
-public class RANS4x8Params implements RANSParams {
-
- private final ORDER order;
-
- public RANS4x8Params(final ORDER order) {
- this.order = order;
- }
-
- @Override
- public String toString() {
- return "RANS4x8Params{" + "order=" + order + "}";
- }
-
- @Override
- public ORDER getOrder() {
- return order;
- }
-
- public int getFormatFlags(){
- return order == ORDER.ONE ?
- RANSNx16Params.ORDER_FLAG_MASK :
- 0;
- }
-
-}
\ No newline at end of file
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/ransnx16/RANSNx16Decode.java b/src/main/java/htsjdk/samtools/cram/compression/rans/ransnx16/RANSNx16Decode.java
deleted file mode 100644
index d6875190b2..0000000000
--- a/src/main/java/htsjdk/samtools/cram/compression/rans/ransnx16/RANSNx16Decode.java
+++ /dev/null
@@ -1,448 +0,0 @@
-package htsjdk.samtools.cram.compression.rans.ransnx16;
-
-import htsjdk.samtools.cram.CRAMException;
-import htsjdk.samtools.cram.compression.CompressionUtils;
-import htsjdk.samtools.cram.compression.rans.ArithmeticDecoder;
-import htsjdk.samtools.cram.compression.rans.Constants;
-import htsjdk.samtools.cram.compression.rans.RANSDecode;
-import htsjdk.samtools.cram.compression.rans.RANSDecodingSymbol;
-import htsjdk.samtools.cram.compression.rans.Utils;
-
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.util.Arrays;
-
-/**
- * Decoder for the CRAM 3.1 rANSNx16 codec with 16-bit state renormalization (as opposed to the rAns4x8 codec,
- * which uses 8-bit state renormalization), and order-0 or order-1 context. Also supports bit-packing, run length
- * encoding and striping (see the spec).
- *
- * This codec is also used internally by the read name NameTokenisation codec.
- */
-public class RANSNx16Decode extends RANSDecode {
- private static final ByteBuffer EMPTY_BUFFER = CompressionUtils.allocateByteBuffer(0);
- private static final int FREQ_TABLE_OPTIONALLY_COMPRESSED_MASK = 0x01;
- private static final int RLE_META_OPTIONALLY_COMPRESSED_MASK = 0x01;
-
- // This method assumes that inBuffer is already rewound.
- // It uncompresses the data in the inBuffer, leaving it consumed.
- // Returns a rewound ByteBuffer containing the uncompressed data.
- public ByteBuffer uncompress(final ByteBuffer inBuffer) {
-
- // For RANS decoding, the bytes are read in little endian from the input stream
- inBuffer.order(ByteOrder.LITTLE_ENDIAN);
- return uncompress(inBuffer, 0);
- }
-
- private ByteBuffer uncompress(final ByteBuffer inBuffer, final int outSize) {
- if (inBuffer.remaining() == 0) {
- return EMPTY_BUFFER;
- }
-
- // the first byte of compressed stream gives the formatFlags
- final int formatFlags = inBuffer.get() & 0xFF;
- final RANSNx16Params ransNx16Params = new RANSNx16Params(formatFlags);
-
- // if nosz flag is set, then uncompressed size is not recorded.
- int uncompressedSize = ransNx16Params.isNosz() ? outSize : CompressionUtils.readUint7(inBuffer);
-
- // if stripe, then decodeStripe
- if (ransNx16Params.isStripe()) {
- return decodeStripe(inBuffer, uncompressedSize);
- }
-
- // if pack, get pack metadata, which will be used later to decode packed data
- int packDataLength = 0;
- int numSymbols = 0;
- byte[] packMappingTable = null;
- if (ransNx16Params.isPack()) {
- packDataLength = uncompressedSize;
- numSymbols = inBuffer.get() & 0xFF;
-
- // if (numSymbols > 16 or numSymbols==0), raise exception
- if (numSymbols <= 16 && numSymbols != 0) {
- packMappingTable = new byte[numSymbols];
- for (int i = 0; i < numSymbols; i++) {
- packMappingTable[i] = inBuffer.get();
- }
- uncompressedSize = CompressionUtils.readUint7(inBuffer);
- } else {
- throw new CRAMException("Bit Packing is not permitted when number of distinct symbols is greater than 16 or equal to 0. " +
- "Number of distinct symbols: " + numSymbols);
- }
- }
-
- // if rle, get rle metadata, which will be used later to decode rle
- int uncompressedRLEOutputLength = 0;
- int[] rleSymbols = null;
- ByteBuffer uncompressedRLEMetaData = null;
- if (ransNx16Params.isRLE()) {
- rleSymbols = new int[Constants.NUMBER_OF_SYMBOLS];
- final int uncompressedRLEMetaDataLength = CompressionUtils.readUint7(inBuffer);
- uncompressedRLEOutputLength = uncompressedSize;
- uncompressedSize = CompressionUtils.readUint7(inBuffer);
- uncompressedRLEMetaData = decodeRLEMeta(inBuffer, uncompressedRLEMetaDataLength, rleSymbols, ransNx16Params);
- }
-
- ByteBuffer outBuffer;
-
- // If CAT is set then, the input is uncompressed
- if (ransNx16Params.isCAT()) {
- outBuffer = CompressionUtils.slice(inBuffer);
- outBuffer.limit(uncompressedSize);
- // While resetting the position to the end is not strictly necessary,
- // it is being done for the sake of completeness and
- // to meet the requirements of the tests that verify the boundary conditions.
- inBuffer.position(inBuffer.position()+uncompressedSize);
- } else {
- outBuffer = CompressionUtils.allocateByteBuffer(uncompressedSize);
-
- if (uncompressedSize == 0) {
- throw new CRAMException("Unexpected uncompressed size of 0 in RANSNx16 stream");
- }
- switch (ransNx16Params.getOrder()) {
- case ZERO:
- uncompressOrder0WayN(inBuffer, outBuffer, uncompressedSize, ransNx16Params);
- break;
- case ONE:
- uncompressOrder1WayN(inBuffer, outBuffer, ransNx16Params);
- break;
- default:
- throw new CRAMException("Unknown rANSNx16 order: " + ransNx16Params.getOrder());
- }
- }
-
- // if rle, then decodeRLE
- if (ransNx16Params.isRLE()) {
- outBuffer = decodeRLE(outBuffer, rleSymbols, uncompressedRLEMetaData, uncompressedRLEOutputLength);
- }
-
- // if pack, then decodePack
- if (ransNx16Params.isPack()) {
- outBuffer = CompressionUtils.decodePack(outBuffer, packMappingTable, numSymbols, packDataLength);
- }
- return outBuffer;
- }
-
- private void uncompressOrder0WayN(
- final ByteBuffer inBuffer,
- final ByteBuffer outBuffer,
- final int outSize,
- final RANSNx16Params ransNx16Params) {
- initializeRANSDecoder();
-
- // read the frequency table, get the normalised frequencies and use it to set the RANSDecodingSymbols
- readFrequencyTableOrder0(inBuffer);
-
- // uncompress using Nway rans states
- final int Nway = ransNx16Params.getNumInterleavedRANSStates();
-
- // Nway parallel rans states. Nway = 4 or 32
- final long[] rans = new long[Nway];
-
- for (int r=0; r> 2) : (outSize >> 5);
-
- // Number of elements that don't fall into the Nway streams
- int remSize = outSize - (interleaveSize * Nway);
- final int out_end = outSize - remSize;
- final ArithmeticDecoder D = getD()[0];
- final RANSDecodingSymbol[] syms = getDecodingSymbols()[0];
- for (int i = 0; i < out_end; i += Nway) {
- for (int r=0; r0){
- byte remainingSymbol = D.reverseLookup[Utils.RANSGetCumulativeFrequency(rans[reverseIndex], Constants.TOTAL_FREQ_SHIFT)];
- syms[0xFF & remainingSymbol].advanceSymbolNx16(rans[reverseIndex], inBuffer, Constants.TOTAL_FREQ_SHIFT);
- outBuffer.put(remainingSymbol);
- remSize --;
- reverseIndex ++;
- }
- outBuffer.rewind();
- }
-
- private void uncompressOrder1WayN(
- final ByteBuffer inBuffer,
- final ByteBuffer outBuffer,
- final RANSNx16Params ransNx16Params) {
-
- // read the first byte
- final int frequencyTableFirstByte = (inBuffer.get() & 0xFF);
- final boolean optionalCompressFlag = ((frequencyTableFirstByte & FREQ_TABLE_OPTIONALLY_COMPRESSED_MASK)!=0);
- final ByteBuffer freqTableSource;
- if (optionalCompressFlag) {
-
- // spec: The order-1 frequency table itself may still be quite large,
- // so is optionally compressed using the order-0 rANSNx16 codec with a fixed 4-way interleaving.
-
- // if optionalCompressFlag is true, the frequency table was compressed using RANS Nx16, N=4 Order 0
- final int uncompressedLength = CompressionUtils.readUint7(inBuffer);
- final int compressedLength = CompressionUtils.readUint7(inBuffer);
- byte[] compressedFreqTable = new byte[compressedLength];
-
- // read compressedLength bytes into compressedFreqTable byte array
- inBuffer.get(compressedFreqTable,0,compressedLength);
-
- // decode the compressedFreqTable to get the uncompressedFreqTable using RANS Nx16, N=4 Order 0 uncompress
- freqTableSource = CompressionUtils.allocateByteBuffer(uncompressedLength);
- final ByteBuffer compressedFrequencyTableBuffer = CompressionUtils.wrap(compressedFreqTable);
-
- // uncompress using RANSNx16 Order 0, Nway = 4
- // formatFlags = (~RANSNx16Params.ORDER_FLAG_MASK & ~RANSNx16Params.N32_FLAG_MASK) = ~(RANSNx16Params.ORDER_FLAG_MASK | RANSNx16Params.N32_FLAG_MASK)
- uncompressOrder0WayN(compressedFrequencyTableBuffer, freqTableSource, uncompressedLength,new RANSNx16Params(~(RANSNx16Params.ORDER_FLAG_MASK | RANSNx16Params.N32_FLAG_MASK))); // format flags = 0
- }
- else {
- freqTableSource = inBuffer;
- }
-
- // Moving initializeRANSDecoder() from the beginning of this method to this point in the code
- // due to the nested call to uncompressOrder0WayN, which also invokes the initializeRANSDecoder() method.
- // TODO: we should work on a more permanent solution for this issue!
- initializeRANSDecoder();
- final int shift = frequencyTableFirstByte >> 4;
- readFrequencyTableOrder1(freqTableSource, shift);
- final int outputSize = outBuffer.remaining();
-
- // Nway parallel rans states. Nway = 4 or 32
- final int Nway = ransNx16Params.getNumInterleavedRANSStates();
- final long[] rans = new long[Nway];
- final int[] interleaveStreamIndex = new int[Nway];
- final int[] context = new int[Nway];
-
- // size of interleaved stream = outputSize / Nway
- // For Nway = 4, division by 4 is the same as right shift by 2 bits
- // For Nway = 32, division by 32 is the same as right shift by 5 bits
- final int interleaveSize = (Nway==4) ? (outputSize >> 2): (outputSize >> 5);
-
- for (int r=0; r 0) {
- decoder.frequencies[j] = CompressionUtils.readUint7(cp);
- }
- }
- Utils.normaliseFrequenciesOrder0Shift(decoder.frequencies, Constants.TOTAL_FREQ_SHIFT);
-
- final RANSDecodingSymbol[] decodingSymbols = getDecodingSymbols()[0];
- int cumulativeFrequency = 0;
- for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
- if(alphabet[j]>0){
-
- // set RANSDecodingSymbol
- decodingSymbols[j].set(cumulativeFrequency, decoder.frequencies[j]);
-
- // update Reverse Lookup table
- Arrays.fill(decoder.reverseLookup, cumulativeFrequency, cumulativeFrequency + decoder.frequencies[j], (byte) j);
- cumulativeFrequency += decoder.frequencies[j];
- }
- }
- }
-
- private void readFrequencyTableOrder1(
- final ByteBuffer cp,
- final int shift) {
- final ArithmeticDecoder[] D = getD();
- final RANSDecodingSymbol[][] decodingSymbols = getDecodingSymbols();
- final int[] alphabet = readAlphabet(cp);
- for (int i=0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
- if (alphabet[i] > 0) {
- int run = 0;
- for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
- if (alphabet[j] > 0) {
- if (run > 0) {
- run--;
- } else {
- D[i].frequencies[j] = CompressionUtils.readUint7(cp);
- if (D[i].frequencies[j] == 0){
- run = cp.get() & 0xFF;
- }
- }
- }
- }
-
- // For each symbol, normalise it's order 0 frequency table
- Utils.normaliseFrequenciesOrder0Shift(D[i].frequencies,shift);
- int cumulativeFreq=0;
-
- // set decoding symbols
- for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
- decodingSymbols[i][j].set(
- cumulativeFreq,
- D[i].frequencies[j]
- );
- /* Build reverse lookup table */
- Arrays.fill(D[i].reverseLookup, cumulativeFreq, cumulativeFreq + D[i].frequencies[j], (byte) j);
- cumulativeFreq+=D[i].frequencies[j];
- }
- }
- }
- }
-
- private static int[] readAlphabet(final ByteBuffer cp){
- // gets the list of alphabets whose frequency!=0
- final int[] alphabet = new int[Constants.NUMBER_OF_SYMBOLS];
- int rle = 0;
- int symbol = cp.get() & 0xFF;
- int lastSymbol = symbol;
- do {
- alphabet[symbol] = 1;
- if (rle!=0) {
- rle--;
- symbol++;
- } else {
- symbol = cp.get() & 0xFF;
- if (symbol == lastSymbol+1) {
- rle = cp.get() & 0xFF;
- }
- }
- lastSymbol = symbol;
- } while (symbol != 0);
- return alphabet;
- }
-
- private ByteBuffer decodeRLEMeta(
- final ByteBuffer inBuffer,
- final int uncompressedRLEMetaDataLength,
- final int[] rleSymbols,
- final RANSNx16Params ransNx16Params) {
- final ByteBuffer uncompressedRLEMetaData;
-
- // The bottom bit of uncompressedRLEMetaDataLength is a flag to indicate
- // whether rle metadata is uncompressed (1) or com- pressed (0).
- if ((uncompressedRLEMetaDataLength & RLE_META_OPTIONALLY_COMPRESSED_MASK)!=0) {
- final byte[] uncompressedRLEMetaDataArray = new byte[(uncompressedRLEMetaDataLength-1)/2];
- inBuffer.get(uncompressedRLEMetaDataArray, 0, (uncompressedRLEMetaDataLength-1)/2);
- uncompressedRLEMetaData = CompressionUtils.wrap(uncompressedRLEMetaDataArray);
- } else {
- final int compressedRLEMetaDataLength = CompressionUtils.readUint7(inBuffer);
- final byte[] compressedRLEMetaDataArray = new byte[compressedRLEMetaDataLength];
- inBuffer.get(compressedRLEMetaDataArray,0,compressedRLEMetaDataLength);
- final ByteBuffer compressedRLEMetaData = CompressionUtils.wrap(compressedRLEMetaDataArray);
- uncompressedRLEMetaData = CompressionUtils.allocateByteBuffer(uncompressedRLEMetaDataLength / 2);
- // uncompress using Order 0 and N = Nway
- uncompressOrder0WayN(
- compressedRLEMetaData,
- uncompressedRLEMetaData,
- uncompressedRLEMetaDataLength / 2,
- new RANSNx16Params(0x00 | ransNx16Params.getFormatFlags() & RANSNx16Params.N32_FLAG_MASK));
- }
-
- int numRLESymbols = uncompressedRLEMetaData.get() & 0xFF;
- if (numRLESymbols == 0) {
- numRLESymbols = Constants.NUMBER_OF_SYMBOLS;
- }
- for (int i = 0; i< numRLESymbols; i++) {
- rleSymbols[uncompressedRLEMetaData.get() & 0xFF] = 1;
- }
- return uncompressedRLEMetaData;
- }
-
- private ByteBuffer decodeRLE(
- final ByteBuffer inBuffer,
- final int[] rleSymbols,
- final ByteBuffer uncompressedRLEMetaData,
- final int uncompressedRLEOutputLength) {
- final ByteBuffer rleOutBuffer = CompressionUtils.allocateByteBuffer(uncompressedRLEOutputLength);
- int j = 0;
- for(int i = 0; j< uncompressedRLEOutputLength; i++){
- final byte sym = inBuffer.get(i);
- if (rleSymbols[sym & 0xFF]!=0){
- final int run = CompressionUtils.readUint7(uncompressedRLEMetaData);
- for (int r=0; r<= run; r++){
- rleOutBuffer.put(j++, sym);
- }
- }else {
- rleOutBuffer.put(j++, sym);
- }
- }
- return rleOutBuffer;
- }
-
- private ByteBuffer decodeStripe(final ByteBuffer inBuffer, final int outSize){
- final int numInterleaveStreams = inBuffer.get() & 0xFF;
-
- // read lengths of compressed interleaved streams
- for ( int j=0; j j){
- uncompressedLengths[j]++;
- }
-
- transposedData[j] = uncompress(inBuffer, uncompressedLengths[j]);
- }
-
- // Transpose
- final ByteBuffer outBuffer = CompressionUtils.allocateByteBuffer(outSize);
- for (int j = 0; j {
- /////////////////////////////////////////////////////////////////////////////////////////////////
- // Stripe flag is not implemented in the write implementation
- /////////////////////////////////////////////////////////////////////////////////////////////////
-
- private static final ByteBuffer EMPTY_BUFFER = CompressionUtils.allocateByteBuffer(0);
-
- // This method assumes that inBuffer is already rewound.
- // It compresses the data in the inBuffer, leaving it consumed.
- // Returns a rewound ByteBuffer containing the compressed data.
- public ByteBuffer compress(final ByteBuffer inBuffer, final RANSNx16Params ransNx16Params) {
- if (inBuffer.remaining() == 0) {
- return EMPTY_BUFFER;
- }
- final ByteBuffer outBuffer = CompressionUtils.allocateOutputBuffer(inBuffer.remaining());
- final int formatFlags = ransNx16Params.getFormatFlags();
- outBuffer.put((byte) (formatFlags)); // one byte for formatFlags
-
- // NoSize
- if (!ransNx16Params.isNosz()) {
- // original size is not recorded
- CompressionUtils.writeUint7(inBuffer.remaining(),outBuffer);
- }
-
- ByteBuffer inputBuffer = inBuffer;
-
- // Stripe
- // Stripe flag is not implemented in the write implementation
- if (ransNx16Params.isStripe()) {
- throw new CRAMException("RANSNx16 Encoding with Stripe Flag is not implemented.");
- }
-
- // Pack
- if (ransNx16Params.isPack()) {
- final int[] frequencyTable = new int[Constants.NUMBER_OF_SYMBOLS];
- final int inSize = inputBuffer.remaining();
- for (int i = 0; i < inSize; i ++) {
- frequencyTable[inputBuffer.get(i) & 0xFF]++;
- }
- int numSymbols = 0;
- final int[] packMappingTable = new int[Constants.NUMBER_OF_SYMBOLS];
- for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
- if (frequencyTable[i]>0) {
- packMappingTable[i] = numSymbols++;
- }
- }
-
- // skip Packing if numSymbols < 2 or numSymbols > 16 (if there aren't at least 2 symbols to encode, then
- // bit packing would result in no data being emitted to the rAns stream, since we would only need to
- // consult the packing table; so the spec says to skip packing in this case)
- if (numSymbols > 1 && numSymbols <= 16) {
- inputBuffer = CompressionUtils.encodePack(inputBuffer, outBuffer, frequencyTable, packMappingTable, numSymbols);
- } else {
- // unset pack flag in the first byte of the outBuffer
- outBuffer.put(0,(byte)(outBuffer.get(0) & ~RANSNx16Params.PACK_FLAG_MASK));
- }
- }
-
- // RLE
- if (ransNx16Params.isRLE()){
- inputBuffer = encodeRLE(inputBuffer, outBuffer, ransNx16Params);
- }
-
- if (ransNx16Params.isCAT()) {
- // Data is uncompressed
- outBuffer.put(inputBuffer);
- outBuffer.limit(outBuffer.position());
- outBuffer.rewind(); // set position to 0
- return outBuffer;
- }
-
- // if after encoding pack and rle, the inputBuffer size < Nway, then use order 0
- if (inputBuffer.remaining() < ransNx16Params.getNumInterleavedRANSStates() && ransNx16Params.getOrder() == RANSParams.ORDER.ONE) {
-
- // set order flag to "0" in the first byte of the outBuffer
- outBuffer.put(0,(byte)(outBuffer.get(0) & ~RANSNx16Params.ORDER_FLAG_MASK));
- if (inputBuffer.remaining() == 0){
- outBuffer.limit(outBuffer.position());
- outBuffer.rewind();
- return outBuffer;
- }
- compressOrder0WayN(inputBuffer, new RANSNx16Params(outBuffer.get(0)), outBuffer);
- return outBuffer;
- }
-
- switch (ransNx16Params.getOrder()) {
- case ZERO:
- compressOrder0WayN(inputBuffer, ransNx16Params, outBuffer);
- return outBuffer;
- case ONE:
- compressOrder1WayN(inputBuffer, ransNx16Params, outBuffer);
- return outBuffer;
- default:
- throw new CRAMException("Unknown rANS order: " + ransNx16Params.getOrder());
- }
- }
-
- private void compressOrder0WayN (
- final ByteBuffer inBuffer,
- final RANSNx16Params ransNx16Params,
- final ByteBuffer outBuffer) {
- initializeRANSEncoder();
- final int inSize = inBuffer.remaining();
- int bitSize = (int) Math.ceil(Math.log(inSize) / Math.log(2));
- if (bitSize > Constants.TOTAL_FREQ_SHIFT) {
- bitSize = Constants.TOTAL_FREQ_SHIFT;
- }
- final int prefix_size = outBuffer.position();
- final int[] F = buildFrequenciesOrder0(inBuffer);
- final ByteBuffer cp = CompressionUtils.slice(outBuffer);
-
- // Normalize Frequencies such that sum of Frequencies = 1 << bitsize
- Utils.normaliseFrequenciesOrder0(F, bitSize);
-
- // Write the Frequency table. Keep track of the size for later
- final int frequencyTableSize = writeFrequenciesOrder0(cp, F);
-
- // Normalise Frequencies such that sum of Frequencies = 1 << 12
- // Since, Frequencies are already normalised to be a sum of power of 2,
- // for further normalisation, calculate the bit shift that is required to scale the frequencies to (1 << bits)
- if (bitSize != Constants.TOTAL_FREQ_SHIFT) {
- Utils.normaliseFrequenciesOrder0Shift(F, Constants.TOTAL_FREQ_SHIFT);
- }
-
- // using the normalised frequencies, set the RANSEncodingSymbols
- buildSymsOrder0(F);
- inBuffer.rewind();
- final int Nway = ransNx16Params.getNumInterleavedRANSStates();
-
- // number of remaining elements = inputSize % Nway = inputSize - (interleaveSize * Nway)
- // For Nway = 4, division by 4 is the same as right shift by 2 bits
- // For Nway = 32, division by 32 is the same as right shift by 5 bits
- final int inputSize = inBuffer.remaining();
- final int interleaveSize = (Nway == 4) ? (inputSize >> 2) : (inputSize >> 5);
- int remainingSize = inputSize - (interleaveSize * Nway);
- int reverseIndex = 1;
- final long[] rans = new long[Nway];
-
- // initialize rans states
- for (int r=0; r0){
-
- // encode remaining elements first
- int remainingSymbol = 0xFF & inBuffer.get(inputSize - reverseIndex);
- rans[remainingSize - 1] = ransEncodingSymbols[remainingSymbol].putSymbolNx16(rans[remainingSize - 1], ptr);
- remainingSize --;
- reverseIndex ++;
- }
- final byte[] symbol = new byte[Nway];
- for (int i = (interleaveSize * Nway); i > 0; i -= Nway) {
- for (int r = Nway - 1; r >= 0; r--){
-
- // encode using Nway parallel rans states. Nway = 4 or 32
- symbol[r] = inBuffer.get(i - (Nway - r));
- rans[r] = ransEncodingSymbols[0xFF & symbol[r]].putSymbolNx16(rans[r], ptr);
- }
- }
-
- ptr.order(ByteOrder.BIG_ENDIAN);
- for (int i=Nway-1; i>=0; i--){
- ptr.putInt((int) rans[i]);
- }
- ptr.position();
- ptr.flip();
- final int compressedDataSize = ptr.limit();
-
- // since the data is encoded in reverse order,
- // reverse the compressed bytes, so that it is in correct order when uncompressed.
- Utils.reverse(ptr);
- inBuffer.position(inBuffer.limit());
- outBuffer.rewind(); // set position to 0
- outBuffer.limit(prefix_size + frequencyTableSize + compressedDataSize);
- }
-
- private void compressOrder1WayN (
- final ByteBuffer inBuffer,
- final RANSNx16Params ransNx16Params,
- final ByteBuffer outBuffer) {
- final int[][] frequencies = buildFrequenciesOrder1(inBuffer, ransNx16Params.getNumInterleavedRANSStates());
-
- // normalise frequencies with a variable shift calculated
- // using the minimum bit size that is needed to represent a frequency context array
- Utils.normaliseFrequenciesOrder1(frequencies, Constants.TOTAL_FREQ_SHIFT);
- final int prefix_size = outBuffer.position();
-
- ByteBuffer frequencyTable = CompressionUtils.allocateOutputBuffer(1);
- final ByteBuffer compressedFrequencyTable = CompressionUtils.allocateOutputBuffer(1);
-
- // uncompressed frequency table
- final int uncompressedFrequencyTableSize = writeFrequenciesOrder1(frequencyTable,frequencies);
- frequencyTable.limit(uncompressedFrequencyTableSize);
- frequencyTable.rewind();
-
- // Compress using RANSNx16 Order 0, Nway = 4.
- // formatFlags = (~RANSNx16Params.ORDER_FLAG_MASK & ~RANSNx16Params.N32_FLAG_MASK) = ~(RANSNx16Params.ORDER_FLAG_MASK | RANSNx16Params.N32_FLAG_MASK)
- compressOrder0WayN(frequencyTable, new RANSNx16Params(~(RANSNx16Params.ORDER_FLAG_MASK | RANSNx16Params.N32_FLAG_MASK)), compressedFrequencyTable);
- frequencyTable.rewind();
-
- // Moving initializeRANSEncoder() from the beginning of this method to this point in the code
- // due to the nested call to compressOrder0WayN, which also invokes the initializeRANSEncoder() method.
- // TODO: we should work on a more permanent solution for this issue!
- initializeRANSEncoder();
- final int compressedFrequencyTableSize = compressedFrequencyTable.limit();
- final ByteBuffer cp = CompressionUtils.slice(outBuffer);
-
- // spec: The order-1 frequency table itself may still be quite large,
- // so is optionally compressed using the order-0 rANSNx16 codec with a fixed 4-way interleaving.
- if (compressedFrequencyTableSize < uncompressedFrequencyTableSize) {
-
- // first byte
- cp.put((byte) (1 | Constants.TOTAL_FREQ_SHIFT << 4 ));
- CompressionUtils.writeUint7(uncompressedFrequencyTableSize,cp);
- CompressionUtils.writeUint7(compressedFrequencyTableSize,cp);
-
- // write bytes from compressedFrequencyTable to cp
- int i=0;
- while (i> 2: inputSize >> 5;
- final int[] interleaveStreamIndex = new int[Nway];
- final byte[] symbol = new byte[Nway];
- for (int r=0; r= 0) && (r!= Nway-1)){
- symbol[r] = inBuffer.get(interleaveStreamIndex[r] + 1);
- }
- if ( r == Nway-1 ){
- symbol[r] = inBuffer.get(inputSize - 1);
- }
- }
-
- // Slicing is needed for buffer reversing later.
- final ByteBuffer ptr = CompressionUtils.slice(cp);
- final RANSEncodingSymbol[][] ransEncodingSymbols = getEncodingSymbols();
- final byte[] context = new byte[Nway];
-
- // deal with the reminder
- for (
- interleaveStreamIndex[Nway - 1] = inputSize - 2;
- interleaveStreamIndex[Nway - 1] > Nway * interleaveSize - 2 && interleaveStreamIndex[Nway - 1] >= 0;
- interleaveStreamIndex[Nway - 1]-- ) {
- context[Nway - 1] = inBuffer.get(interleaveStreamIndex[Nway - 1]);
- rans[Nway - 1] = ransEncodingSymbols[0xFF & context[Nway - 1]][0xFF & symbol[Nway - 1]].putSymbolNx16(rans[Nway - 1], ptr);
- symbol[Nway - 1] = context[Nway - 1];
- }
-
- while (interleaveStreamIndex[0] >= 0) {
- for (int r=0; r=0; r-- ){
- ptr.putInt((int) rans[r]);
- }
-
- ptr.flip();
- final int compressedBlobSize = ptr.limit();
- Utils.reverse(ptr);
-
- /*
- * Depletion of the in buffer cannot be confirmed because of the get(int
- * position) method use during encoding, hence enforcing:
- */
- inBuffer.position(inBuffer.limit());
- outBuffer.rewind();
- outBuffer.limit(prefix_size + frequencyTableSize + compressedBlobSize);
- }
-
- private static int[] buildFrequenciesOrder0(final ByteBuffer inBuffer) {
- // Returns an array of raw symbol frequencies
- final int inSize = inBuffer.remaining();
- final int[] F = new int[Constants.NUMBER_OF_SYMBOLS];
- for (int i = 0; i < inSize; i++) {
- F[0xFF & inBuffer.get()]++;
- }
- return F;
- }
-
- private static int[][] buildFrequenciesOrder1(final ByteBuffer inBuffer, final int Nway) {
- // Returns an array of raw symbol frequencies
- final int inputSize = inBuffer.remaining();
-
- // context is stored in frequency[Constants.NUMBER_OF_SYMBOLS] array
- final int[][] frequency = new int[Constants.NUMBER_OF_SYMBOLS+1][Constants.NUMBER_OF_SYMBOLS];
-
- // ‘\0’ is the initial context
- byte contextSymbol = 0;
- for (int i = 0; i < inputSize; i++) {
-
- // update the context array
- frequency[Constants.NUMBER_OF_SYMBOLS][0xFF & contextSymbol]++;
- final byte srcSymbol = inBuffer.get(i);
- frequency[0xFF & contextSymbol][0xFF & srcSymbol ]++;
- contextSymbol = srcSymbol;
- }
- frequency[Constants.NUMBER_OF_SYMBOLS][0xFF & contextSymbol]++;
-
- // set ‘\0’ as context for the first byte in the N interleaved streams.
- // the first byte of the first interleaved stream is already accounted for.
- for (int n = 1; n < Nway; n++){
- // For Nway = 4, division by 4 is the same as right shift by 2 bits
- // For Nway = 32, division by 32 is the same as right shift by 5 bits
- final int symbol = Nway == 4 ? (0xFF & inBuffer.get((n*(inputSize >> 2)))) : (0xFF & inBuffer.get((n*(inputSize >> 5))));
- frequency[0][symbol]++;
- }
- frequency[Constants.NUMBER_OF_SYMBOLS][0] += Nway-1;
- return frequency;
- }
-
- private static int writeFrequenciesOrder0(final ByteBuffer cp, final int[] F) {
- // Order 0 frequencies store the complete alphabet of observed
- // symbols using run length encoding, followed by a table of frequencies
- // for each symbol in the alphabet.
- final int start = cp.position();
-
- // write the alphabet first and then their frequencies
- writeAlphabet(cp,F);
- for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
- if (F[j] != 0) {
- if (F[j] < 128) {
- cp.put((byte) (F[j] & 0x7f));
- } else {
-
- // if F[j] >127, it is written in 2 bytes
- // right shift by 7 and get the most Significant Bits.
- // Set the Most Significant Bit of the first byte to 1 indicating that the frequency comprises of 2 bytes
- cp.put((byte) (128 | (F[j] >> 7)));
- cp.put((byte) (F[j] & 0x7f)); //Least Significant 7 Bits
- }
- }
- }
- return cp.position() - start;
- }
-
- private static int writeFrequenciesOrder1(final ByteBuffer cp, final int[][] F) {
- final int start = cp.position();
-
- // writeAlphabet uses rle to write all the symbols whose frequency!=0
- writeAlphabet(cp,F[Constants.NUMBER_OF_SYMBOLS]);
-
- for (int i=0; i 0) {
- run--;
- } else {
- CompressionUtils.writeUint7(F[i][j],cp);
- if (F[i][j] == 0) {
- // Count how many more zero-freqs we have
- for (int k = j+1; k < Constants.NUMBER_OF_SYMBOLS; k++) {
- if (F[Constants.NUMBER_OF_SYMBOLS][k] == 0) {
- continue;
- }
- if (F[i][k] == 0) {
- run++;
- } else {
- break;
- }
- }
- cp.put((byte) run);
- }
- }
- }
- }
- return cp.position() - start;
- }
-
- private static void writeAlphabet(final ByteBuffer cp, final int[] F) {
- // Uses Run Length Encoding to write all the symbols whose frequency!=0
- int rle = 0;
- for (int j = 0; j < Constants.NUMBER_OF_SYMBOLS; j++) {
- if (F[j] != 0) {
- if (rle != 0) {
- rle--;
- } else {
-
- // write the symbol if it is the first symbol or if rle = 0.
- // if rle != 0, then skip writing the symbol.
- cp.put((byte) j);
-
- // We've encoded two symbol frequencies in a row.
- // How many more are there? Store that count so
- // we can avoid writing consecutive symbols.
- // Note: maximum possible rle = 254
- // rle requires atmost 1 byte
- if (rle == 0 && j != 0 && F[j - 1] != 0) {
- for (rle = j + 1; rle < Constants.NUMBER_OF_SYMBOLS && F[rle] != 0; rle++);
- rle -= j + 1;
- cp.put((byte) rle);
- }
- }
- }
- }
-
- // write 0 indicating the end of alphabet
- cp.put((byte) 0);
- }
-
- private ByteBuffer encodeRLE(final ByteBuffer inBuffer, final ByteBuffer outBuffer, final RANSNx16Params ransNx16Params){
-
- // Find the symbols that benefit from RLE, i.e, the symbols that occur more than 2 times in succession.
- // spec: For symbols that occur many times in succession, we can replace them with a single symbol and a count.
- final int[] runCounts = new int[Constants.NUMBER_OF_SYMBOLS];
- int inputSize = inBuffer.remaining();
-
- int lastSymbol = -1;
- for (int i = 0; i < inputSize; i++) {
- final int currentSymbol = inBuffer.get(i)&0xFF;
- runCounts[currentSymbol] += (currentSymbol==lastSymbol ? 1:-1);
- lastSymbol = currentSymbol;
- }
-
- // numRLESymbols is the number of symbols that are run length encoded
- int numRLESymbols = 0;
- for (int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i++) {
- if (runCounts[i]>0) {
- numRLESymbols++;
- }
- }
-
- if (numRLESymbols==0) {
- // Format cannot cope with zero RLE symbols, so pick one!
- numRLESymbols = 1;
- runCounts[0] = 1;
- }
-
- // create rleMetaData buffer to store rle metadata.
- // This buffer will be compressed using compressOrder0WayN towards the end of this method
- // TODO: How did we come up with this calculation for Buffer size? numRLESymbols+1+inputSize
- final ByteBuffer rleMetaData = CompressionUtils.allocateByteBuffer(numRLESymbols+1+inputSize); // rleMetaData
-
- // write number of symbols that are run length encoded
- rleMetaData.put((byte) numRLESymbols);
-
- for (int i=0; i0){
- // write the symbols that are run length encoded
- rleMetaData.put((byte) i);
- }
-
- }
-
- // Apply RLE
- // encodedBuffer -> input src data without repetition
- final ByteBuffer encodedBuffer = CompressionUtils.allocateByteBuffer(inputSize); // rleInBuffer
- int encodedBufferIdx = 0; // rleInBufferIndex
-
- for (int i = 0; i < inputSize; i++) {
- encodedBuffer.put(encodedBufferIdx++,inBuffer.get(i));
- if (runCounts[inBuffer.get(i)&0xFF]>0) {
- lastSymbol = inBuffer.get(i) & 0xFF;
- int run = 0;
-
- // calculate the run value for current symbol
- while (i+run+1 < inputSize && (inBuffer.get(i+run+1)& 0xFF)==lastSymbol) {
- run++;
- }
-
- // write the run value to metadata
- CompressionUtils.writeUint7(run, rleMetaData);
-
- // go to the next element that is not equal to its previous element
- i += run;
- }
- }
-
- encodedBuffer.limit(encodedBufferIdx);
- // limit and rewind
- rleMetaData.limit(rleMetaData.position());
- rleMetaData.rewind();
-
- // compress the rleMetaData Buffer
- final ByteBuffer compressedRleMetaData = CompressionUtils.allocateOutputBuffer(rleMetaData.remaining());
-
- // compress using Order 0 and N = Nway
- compressOrder0WayN(rleMetaData, new RANSNx16Params(0x00 | ransNx16Params.getFormatFlags() & RANSNx16Params.N32_FLAG_MASK),compressedRleMetaData);
-
- // write to compressedRleMetaData to outBuffer
- CompressionUtils.writeUint7(rleMetaData.limit()*2, outBuffer);
- CompressionUtils.writeUint7(encodedBufferIdx, outBuffer);
- CompressionUtils.writeUint7(compressedRleMetaData.limit(),outBuffer);
-
- outBuffer.put(compressedRleMetaData);
-
- /*
- * Depletion of the inBuffer cannot be confirmed because of the get(int
- * position) method use during encoding, hence enforcing:
- */
- inBuffer.position(inBuffer.limit());
- return encodedBuffer;
- }
-
-}
\ No newline at end of file
diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/ransnx16/RANSNx16Params.java b/src/main/java/htsjdk/samtools/cram/compression/rans/ransnx16/RANSNx16Params.java
deleted file mode 100644
index 93bd529f27..0000000000
--- a/src/main/java/htsjdk/samtools/cram/compression/rans/ransnx16/RANSNx16Params.java
+++ /dev/null
@@ -1,73 +0,0 @@
-package htsjdk.samtools.cram.compression.rans.ransnx16;
-
-import htsjdk.samtools.cram.compression.rans.RANSParams;
-
-public class RANSNx16Params implements RANSParams {
-
- // RANS Nx16 Bit Flags
- public static final int ORDER_FLAG_MASK = 0x01;
- public static final int N32_FLAG_MASK = 0x04;
- public static final int STRIPE_FLAG_MASK = 0x08;
- public static final int NOSZ_FLAG_MASK = 0x10;
- public static final int CAT_FLAG_MASK = 0x20;
- public static final int RLE_FLAG_MASK = 0x40;
- public static final int PACK_FLAG_MASK = 0x80;
-
- // format is the first byte of the compressed data stream,
- // which consists of all the bit-flags detailing the type of transformations
- // and entropy encoders to be combined
- private int formatFlags;
-
- private static final int FORMAT_FLAG_MASK = 0xFF;
-
- public RANSNx16Params(final int formatFlags) {
- this.formatFlags = formatFlags;
- }
-
- @Override
- public String toString() {
- return "RANSNx16Params{" + "formatFlags=" + formatFlags + "}";
- }
-
- @Override
- public ORDER getOrder() {
- // Rans Order ZERO or ONE encoding
- return ORDER.fromInt(formatFlags & ORDER_FLAG_MASK); //convert into order type
- }
-
- public int getFormatFlags(){
- // first byte of the encoded stream
- return formatFlags & FORMAT_FLAG_MASK;
- }
-
- public int getNumInterleavedRANSStates(){
- // Interleave N = 32 rANS states (else N = 4)
- return ((formatFlags & N32_FLAG_MASK) == 0) ? 4 : 32;
- }
-
- public boolean isStripe(){
- // multiway interleaving of byte streams
- return ((formatFlags & STRIPE_FLAG_MASK)!=0);
- }
-
- public boolean isNosz(){
- // original size is not recorded (for use by Stripe)
- return ((formatFlags & NOSZ_FLAG_MASK)!=0);
- }
-
- public boolean isCAT(){
- // Data is uncompressed
- return ((formatFlags & CAT_FLAG_MASK)!=0);
- }
-
- public boolean isRLE(){
- // Run length encoding, with runs and literals encoded separately
- return ((formatFlags & RLE_FLAG_MASK)!=0);
- }
-
- public boolean isPack(){
- // Pack 2, 4, 8 or infinite symbols per byte
- return ((formatFlags & PACK_FLAG_MASK)!=0);
- }
-
-}
\ No newline at end of file
diff --git a/src/main/java/htsjdk/samtools/cram/encoding/external/ByteArrayStopCodec.java b/src/main/java/htsjdk/samtools/cram/encoding/external/ByteArrayStopCodec.java
index c96ad011fc..b07b6d4498 100644
--- a/src/main/java/htsjdk/samtools/cram/encoding/external/ByteArrayStopCodec.java
+++ b/src/main/java/htsjdk/samtools/cram/encoding/external/ByteArrayStopCodec.java
@@ -1,42 +1,49 @@
package htsjdk.samtools.cram.encoding.external;
-import htsjdk.samtools.util.RuntimeIOException;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
+import htsjdk.samtools.cram.CRAMException;
+import htsjdk.samtools.cram.io.CRAMByteReader;
+import htsjdk.samtools.cram.io.CRAMByteWriter;
/**
- * Encode byte arrays by specifying a stop byte to separate the arrays.
- * This cannot be a byte that appears in the data.
+ * Encode/decode byte arrays delimited by a stop byte using an External Data Block.
+ * The stop byte must not appear in the data.
*/
final class ByteArrayStopCodec extends ExternalCodec<byte[]> {
private final int stop;
/**
- * Construct a Byte Array Stop Codec
+ * Create a codec that reads/writes byte arrays delimited by the given stop byte.
*
- * @param inputStream the input bytestream to read from
- * @param outputStream the output bytestream to write to
- * @param stopByte the byte used to mark array boundaries
+ * @param inputReader reader for the external data block (may be null if only writing)
+ * @param outputWriter writer for the external data block (may be null if only reading)
+ * @param stopByte the delimiter byte that terminates each encoded value
*/
- public ByteArrayStopCodec(final ByteArrayInputStream inputStream,
- final ByteArrayOutputStream outputStream,
+ public ByteArrayStopCodec(final CRAMByteReader inputReader,
+ final CRAMByteWriter outputWriter,
final byte stopByte) {
- super(inputStream, outputStream);
+ super(inputReader, outputWriter);
this.stop = 0xFF & stopByte;
}
@Override
public byte[] read() {
- final ByteArrayOutputStream readingBAOS = new ByteArrayOutputStream();
- int b;
- readingBAOS.reset();
- while ((b = inputStream.read()) != -1 && b != stop) {
- readingBAOS.write(b);
+ // Scan directly in the underlying byte[] for the stop byte instead of
+ // reading one byte at a time into a ByteArrayOutputStream.
+ final byte[] buf = inputReader.getBuffer();
+ final int startPos = inputReader.getPosition();
+ int scanPos = startPos;
+ while (scanPos < buf.length && (buf[scanPos] & 0xFF) != stop) {
+ scanPos++;
}
-
- return readingBAOS.toByteArray();
+ if (scanPos == buf.length) {
+ throw new CRAMException(
+ "Stop byte 0x" + Integer.toHexString(stop) +
+ " not found in external block (scanned " + (scanPos - startPos) + " bytes)");
+ }
+ final int len = scanPos - startPos;
+ final byte[] result = inputReader.readFully(len);
+ inputReader.read(); // consume the stop byte
+ return result;
}
@Override
@@ -46,11 +53,7 @@ public byte[] read(final int length) {
@Override
public void write(final byte[] value) {
- try {
- outputStream.write(value);
- outputStream.write(stop);
- } catch (IOException e) {
- throw new RuntimeIOException(e);
- }
+ outputWriter.write(value);
+ outputWriter.write(stop);
}
}
diff --git a/src/main/java/htsjdk/samtools/cram/encoding/external/ByteArrayStopEncoding.java b/src/main/java/htsjdk/samtools/cram/encoding/external/ByteArrayStopEncoding.java
index de1335f5cc..5b496c99af 100644
--- a/src/main/java/htsjdk/samtools/cram/encoding/external/ByteArrayStopEncoding.java
+++ b/src/main/java/htsjdk/samtools/cram/encoding/external/ByteArrayStopEncoding.java
@@ -19,13 +19,13 @@
import htsjdk.samtools.cram.encoding.CRAMCodec;
import htsjdk.samtools.cram.encoding.CRAMEncoding;
+import htsjdk.samtools.cram.io.CRAMByteReader;
+import htsjdk.samtools.cram.io.CRAMByteWriter;
import htsjdk.samtools.cram.io.ITF8;
import htsjdk.samtools.cram.structure.EncodingID;
import htsjdk.samtools.cram.structure.SliceBlocksReadStreams;
import htsjdk.samtools.cram.structure.SliceBlocksWriteStreams;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
@@ -71,9 +71,9 @@ public byte[] toSerializedEncodingParams() {
@Override
public CRAMCodec buildCodec(final SliceBlocksReadStreams sliceBlocksReadStreams, final SliceBlocksWriteStreams sliceBlocksWriteStreams) {
- final ByteArrayInputStream is = sliceBlocksReadStreams == null ? null : sliceBlocksReadStreams.getExternalInputStream(externalId);
- final ByteArrayOutputStream os = sliceBlocksWriteStreams == null ? null : sliceBlocksWriteStreams.getExternalOutputStream(externalId);
- return new ByteArrayStopCodec(is, os, stopByte);
+ final CRAMByteReader reader = sliceBlocksReadStreams == null ? null : sliceBlocksReadStreams.getExternalReader(externalId);
+ final CRAMByteWriter writer = sliceBlocksWriteStreams == null ? null : sliceBlocksWriteStreams.getExternalWriter(externalId);
+ return new ByteArrayStopCodec(reader, writer, stopByte);
}
@Override
diff --git a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteArrayCodec.java b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteArrayCodec.java
index 90f6313683..dc62db2022 100644
--- a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteArrayCodec.java
+++ b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteArrayCodec.java
@@ -1,61 +1,26 @@
-/**
- * ****************************************************************************
- * Copyright 2013 EMBL-EBI
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License inputStream distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ****************************************************************************
- */
package htsjdk.samtools.cram.encoding.external;
-import htsjdk.samtools.cram.io.InputStreamUtils;
-import htsjdk.samtools.util.RuntimeIOException;
-
-import java.io.ByteArrayOutputStream;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
+import htsjdk.samtools.cram.io.CRAMByteReader;
+import htsjdk.samtools.cram.io.CRAMByteWriter;
/**
- * Encode Byte Arrays using an External Data Block
+ * Encode/decode byte arrays using an External Data Block.
*/
public final class ExternalByteArrayCodec extends ExternalCodec<byte[]> {
- /**
- * Construct an External Codec for Byte Arrays
- *
- * @param inputStream the input bytestream to read from
- * @param outputStream the output bytestream to write to
- */
- public ExternalByteArrayCodec(final ByteArrayInputStream inputStream,
- final ByteArrayOutputStream outputStream) {
- super(inputStream, outputStream);
+
+ public ExternalByteArrayCodec(final CRAMByteReader inputReader, final CRAMByteWriter outputWriter) {
+ super(inputReader, outputWriter);
}
@Override
public byte[] read(final int length) {
- return InputStreamUtils.readFully(inputStream, length);
+ if (length == 0) return new byte[0];
+ return inputReader.readFully(length);
}
@Override
- public void write(final byte[] object) {
- try {
- outputStream.write(object);
- } catch (IOException e) {
- throw new RuntimeIOException(e);
- }
- }
+ public void write(final byte[] object) { outputWriter.write(object); }
@Override
- public byte[] read() {
- throw new RuntimeException("Cannot read byte array of unknown length.");
- }
-
+ public byte[] read() { throw new RuntimeException("Cannot read byte array of unknown length."); }
}
diff --git a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteArrayEncoding.java b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteArrayEncoding.java
index b0d68fa177..356d93c272 100644
--- a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteArrayEncoding.java
+++ b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteArrayEncoding.java
@@ -19,14 +19,13 @@
import htsjdk.samtools.cram.encoding.CRAMCodec;
import htsjdk.samtools.cram.encoding.CRAMEncoding;
+import htsjdk.samtools.cram.io.CRAMByteReader;
+import htsjdk.samtools.cram.io.CRAMByteWriter;
import htsjdk.samtools.cram.io.ITF8;
import htsjdk.samtools.cram.structure.EncodingID;
import htsjdk.samtools.cram.structure.SliceBlocksReadStreams;
import htsjdk.samtools.cram.structure.SliceBlocksWriteStreams;
-import java.io.ByteArrayOutputStream;
-import java.io.ByteArrayInputStream;
-
public final class ExternalByteArrayEncoding extends ExternalEncoding<byte[]> {
public ExternalByteArrayEncoding(final int externalBlockContentId) {
@@ -50,9 +49,9 @@ public byte[] toSerializedEncodingParams() {
@Override
public CRAMCodec buildCodec(final SliceBlocksReadStreams sliceBlocksReadStreams, final SliceBlocksWriteStreams sliceBlocksWriteStreams) {
- final ByteArrayInputStream is = sliceBlocksReadStreams == null ? null : sliceBlocksReadStreams.getExternalInputStream(externalBlockContentId);
- final ByteArrayOutputStream os = sliceBlocksWriteStreams == null ? null : sliceBlocksWriteStreams.getExternalOutputStream(externalBlockContentId);
- return new ExternalByteArrayCodec(is, os);
+ final CRAMByteReader reader = sliceBlocksReadStreams == null ? null : sliceBlocksReadStreams.getExternalReader(externalBlockContentId);
+ final CRAMByteWriter writer = sliceBlocksWriteStreams == null ? null : sliceBlocksWriteStreams.getExternalWriter(externalBlockContentId);
+ return new ExternalByteArrayCodec(reader, writer);
}
@Override
diff --git a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteCodec.java b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteCodec.java
index b37ecc26a2..ca729cbe8c 100644
--- a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteCodec.java
+++ b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteCodec.java
@@ -1,53 +1,23 @@
-/**
- * ****************************************************************************
- * Copyright 2013 EMBL-EBI
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License inputStream distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ****************************************************************************
- */
package htsjdk.samtools.cram.encoding.external;
-import java.io.ByteArrayOutputStream;
-import java.io.ByteArrayInputStream;
+import htsjdk.samtools.cram.io.CRAMByteReader;
+import htsjdk.samtools.cram.io.CRAMByteWriter;
/**
- * Encode Bytes using an External Data Block
+ * Encode/decode single bytes using an External Data Block.
*/
final class ExternalByteCodec extends ExternalCodec<Byte> {
- /**
- * Construct an External Codec for Bytes
- *
- * @param inputStream the input bytestream to read from
- * @param outputStream the output bytestream to write to
- */
- public ExternalByteCodec(final ByteArrayInputStream inputStream,
- final ByteArrayOutputStream outputStream) {
- super(inputStream, outputStream);
+ public ExternalByteCodec(final CRAMByteReader inputReader, final CRAMByteWriter outputWriter) {
+ super(inputReader, outputWriter);
}
@Override
- public Byte read() {
- return (byte) inputStream.read();
- }
+ public Byte read() { return (byte) inputReader.read(); }
@Override
- public void write(final Byte object) {
- outputStream.write(object);
- }
+ public void write(final Byte object) { outputWriter.write(object); }
@Override
- public Byte read(final int length) {
- throw new RuntimeException("Not implemented.");
- }
+ public Byte read(final int length) { throw new RuntimeException("Not implemented."); }
}
diff --git a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteEncoding.java b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteEncoding.java
index ad7f7e2d26..f8064a6417 100644
--- a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteEncoding.java
+++ b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalByteEncoding.java
@@ -18,13 +18,12 @@
package htsjdk.samtools.cram.encoding.external;
import htsjdk.samtools.cram.encoding.CRAMCodec;
+import htsjdk.samtools.cram.io.CRAMByteReader;
+import htsjdk.samtools.cram.io.CRAMByteWriter;
import htsjdk.samtools.cram.io.ITF8;
import htsjdk.samtools.cram.structure.SliceBlocksReadStreams;
import htsjdk.samtools.cram.structure.SliceBlocksWriteStreams;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-
public final class ExternalByteEncoding extends ExternalEncoding<Byte> {
public ExternalByteEncoding(final int externalBlockContentId) {
super(externalBlockContentId);
@@ -42,8 +41,8 @@ public static ExternalByteEncoding fromSerializedEncodingParams(byte[] serialize
@Override
public CRAMCodec buildCodec(final SliceBlocksReadStreams sliceBlocksReadStreams, final SliceBlocksWriteStreams sliceBlocksWriteStreams) {
- final ByteArrayInputStream is = sliceBlocksReadStreams == null ? null : sliceBlocksReadStreams.getExternalInputStream(externalBlockContentId);
- final ByteArrayOutputStream os = sliceBlocksWriteStreams == null ? null : sliceBlocksWriteStreams.getExternalOutputStream(externalBlockContentId);
- return new ExternalByteCodec(is, os);
+ final CRAMByteReader reader = sliceBlocksReadStreams == null ? null : sliceBlocksReadStreams.getExternalReader(externalBlockContentId);
+ final CRAMByteWriter writer = sliceBlocksWriteStreams == null ? null : sliceBlocksWriteStreams.getExternalWriter(externalBlockContentId);
+ return new ExternalByteCodec(reader, writer);
}
}
diff --git a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalCodec.java b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalCodec.java
index e0be8bfa4d..9fb2ad3f47 100644
--- a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalCodec.java
+++ b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalCodec.java
@@ -1,28 +1,30 @@
package htsjdk.samtools.cram.encoding.external;
import htsjdk.samtools.cram.encoding.CRAMCodec;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
+import htsjdk.samtools.cram.io.CRAMByteReader;
+import htsjdk.samtools.cram.io.CRAMByteWriter;
/**
- * Superclass of Codecs which operate on External Block byte streams
- * Contrast with {@link htsjdk.samtools.cram.encoding.core.CoreCodec} for Core Block bit streams
+ * Superclass of Codecs which operate on External Block byte streams.
+ * Uses unsynchronized {@link CRAMByteReader}/{@link CRAMByteWriter} instead of
+ * ByteArrayInputStream/ByteArrayOutputStream for performance.
+ *
+ *
 * <p>Contrast with {@link htsjdk.samtools.cram.encoding.core.CoreCodec} for Core Block bit streams.
*
* @param <T> data series type to be read or written
*/
abstract class ExternalCodec<T> implements CRAMCodec<T> {
- protected final ByteArrayInputStream inputStream;
- protected final ByteArrayOutputStream outputStream;
+ protected final CRAMByteReader inputReader;
+ protected final CRAMByteWriter outputWriter;
/**
- * Create new ExternalCodec with associated input and output byte streams
+ * Create new ExternalCodec with associated input and output byte streams.
*
- * @param inputStream byte stream for reading input
- * @param outputStream byte stream for writing output
+ * @param inputReader reader for decoding input (may be null if only writing)
+ * @param outputWriter writer for encoding output (may be null if only reading)
*/
- ExternalCodec(final ByteArrayInputStream inputStream, final ByteArrayOutputStream outputStream) {
- this.inputStream = inputStream;
- this.outputStream = outputStream;
+ ExternalCodec(final CRAMByteReader inputReader, final CRAMByteWriter outputWriter) {
+ this.inputReader = inputReader;
+ this.outputWriter = outputWriter;
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalIntegerCodec.java b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalIntegerCodec.java
index 94f0b7380f..36c3ed80b1 100644
--- a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalIntegerCodec.java
+++ b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalIntegerCodec.java
@@ -1,54 +1,24 @@
-/**
- * ****************************************************************************
- * Copyright 2013 EMBL-EBI
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License inputStream distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ****************************************************************************
- */
package htsjdk.samtools.cram.encoding.external;
+import htsjdk.samtools.cram.io.CRAMByteReader;
+import htsjdk.samtools.cram.io.CRAMByteWriter;
import htsjdk.samtools.cram.io.ITF8;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-
/**
- * Encode Integers using an External Data Block
+ * Encode/decode integers (ITF8 encoded) using an External Data Block.
*/
final class ExternalIntegerCodec extends ExternalCodec<Integer> {
- /**
- * Construct an External Codec for Integers
- *
- * @param inputStream the input bytestream to read from
- * @param outputStream the output bytestream to write to
- */
- public ExternalIntegerCodec(final ByteArrayInputStream inputStream,
- final ByteArrayOutputStream outputStream) {
- super(inputStream, outputStream);
+
+ public ExternalIntegerCodec(final CRAMByteReader inputReader, final CRAMByteWriter outputWriter) {
+ super(inputReader, outputWriter);
}
@Override
- public Integer read() {
- return ITF8.readUnsignedITF8(inputStream);
- }
+ public Integer read() { return ITF8.readUnsignedITF8(inputReader); }
@Override
- public void write(final Integer value) {
- ITF8.writeUnsignedITF8(value, outputStream);
- }
+ public void write(final Integer value) { ITF8.writeUnsignedITF8(value, outputWriter); }
@Override
- public Integer read(final int length) {
- throw new RuntimeException("Not implemented.");
- }
+ public Integer read(final int length) { throw new RuntimeException("Not implemented."); }
}
diff --git a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalIntegerEncoding.java b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalIntegerEncoding.java
index 8026917dac..5d34bacf34 100644
--- a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalIntegerEncoding.java
+++ b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalIntegerEncoding.java
@@ -18,13 +18,12 @@
package htsjdk.samtools.cram.encoding.external;
import htsjdk.samtools.cram.encoding.CRAMCodec;
+import htsjdk.samtools.cram.io.CRAMByteReader;
+import htsjdk.samtools.cram.io.CRAMByteWriter;
import htsjdk.samtools.cram.io.ITF8;
import htsjdk.samtools.cram.structure.SliceBlocksReadStreams;
import htsjdk.samtools.cram.structure.SliceBlocksWriteStreams;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-
public final class ExternalIntegerEncoding extends ExternalEncoding<Integer> {
public ExternalIntegerEncoding(final int externalBlockContentId) {
@@ -43,9 +42,9 @@ public static ExternalIntegerEncoding fromSerializedEncodingParams(byte[] serial
@Override
public CRAMCodec buildCodec(final SliceBlocksReadStreams sliceBlocksReadStreams, final SliceBlocksWriteStreams sliceBlocksWriteStreams) {
- final ByteArrayInputStream is = sliceBlocksReadStreams == null ? null : sliceBlocksReadStreams.getExternalInputStream(externalBlockContentId);
- final ByteArrayOutputStream os = sliceBlocksWriteStreams == null ? null : sliceBlocksWriteStreams.getExternalOutputStream(externalBlockContentId);
- return new ExternalIntegerCodec(is, os);
+ final CRAMByteReader reader = sliceBlocksReadStreams == null ? null : sliceBlocksReadStreams.getExternalReader(externalBlockContentId);
+ final CRAMByteWriter writer = sliceBlocksWriteStreams == null ? null : sliceBlocksWriteStreams.getExternalWriter(externalBlockContentId);
+ return new ExternalIntegerCodec(reader, writer);
}
}
diff --git a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalLongCodec.java b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalLongCodec.java
index 42b8a9dd83..c0b139a222 100644
--- a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalLongCodec.java
+++ b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalLongCodec.java
@@ -1,54 +1,24 @@
-/**
- * ****************************************************************************
- * Copyright 2013 EMBL-EBI
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License inputStream distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ****************************************************************************
- */
package htsjdk.samtools.cram.encoding.external;
+import htsjdk.samtools.cram.io.CRAMByteReader;
+import htsjdk.samtools.cram.io.CRAMByteWriter;
import htsjdk.samtools.cram.io.LTF8;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-
/**
- * Encode Longs using an External Data Block
+ * Encode/decode longs (LTF8 encoded) using an External Data Block.
*/
final class ExternalLongCodec extends ExternalCodec<Long> {
- /**
- * Construct an External Codec for Longs
- *
- * @param inputStream the input bytestream to read from
- * @param outputStream the output bytestream to write to
- */
- public ExternalLongCodec(final ByteArrayInputStream inputStream,
- final ByteArrayOutputStream outputStream) {
- super(inputStream, outputStream);
+
+ public ExternalLongCodec(final CRAMByteReader inputReader, final CRAMByteWriter outputWriter) {
+ super(inputReader, outputWriter);
}
@Override
- public Long read() {
- return LTF8.readUnsignedLTF8(inputStream);
- }
+ public Long read() { return LTF8.readUnsignedLTF8(inputReader); }
@Override
- public void write(final Long value) {
- LTF8.writeUnsignedLTF8(value, outputStream);
- }
+ public void write(final Long value) { LTF8.writeUnsignedLTF8(value, outputWriter); }
@Override
- public Long read(final int length) {
- throw new RuntimeException("Not implemented.");
- }
+ public Long read(final int length) { throw new RuntimeException("Not implemented."); }
}
diff --git a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalLongEncoding.java b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalLongEncoding.java
index 6d3e83a884..2e60f5de0d 100644
--- a/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalLongEncoding.java
+++ b/src/main/java/htsjdk/samtools/cram/encoding/external/ExternalLongEncoding.java
@@ -18,13 +18,12 @@
package htsjdk.samtools.cram.encoding.external;
import htsjdk.samtools.cram.encoding.CRAMCodec;
+import htsjdk.samtools.cram.io.CRAMByteReader;
+import htsjdk.samtools.cram.io.CRAMByteWriter;
import htsjdk.samtools.cram.io.ITF8;
import htsjdk.samtools.cram.structure.SliceBlocksReadStreams;
import htsjdk.samtools.cram.structure.SliceBlocksWriteStreams;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-
public final class ExternalLongEncoding extends ExternalEncoding<Long> {
public ExternalLongEncoding(final int externalBlockContentId) {
super(externalBlockContentId);
@@ -42,8 +41,8 @@ public static ExternalLongEncoding fromSerializedEncodingParams(byte[] serialize
@Override
public CRAMCodec buildCodec(final SliceBlocksReadStreams sliceBlocksReadStreams, final SliceBlocksWriteStreams sliceBlocksWriteStreams) {
- final ByteArrayInputStream is = sliceBlocksReadStreams == null ? null : sliceBlocksReadStreams.getExternalInputStream(externalBlockContentId);
- final ByteArrayOutputStream os = sliceBlocksWriteStreams == null ? null : sliceBlocksWriteStreams.getExternalOutputStream(externalBlockContentId);
- return new ExternalLongCodec(is, os);
+ final CRAMByteReader reader = sliceBlocksReadStreams == null ? null : sliceBlocksReadStreams.getExternalReader(externalBlockContentId);
+ final CRAMByteWriter writer = sliceBlocksWriteStreams == null ? null : sliceBlocksWriteStreams.getExternalWriter(externalBlockContentId);
+ return new ExternalLongCodec(reader, writer);
}
}
diff --git a/src/main/java/htsjdk/samtools/cram/encoding/reader/CramRecordReader.java b/src/main/java/htsjdk/samtools/cram/encoding/reader/CramRecordReader.java
index d80f3901a7..5f9a260ed8 100644
--- a/src/main/java/htsjdk/samtools/cram/encoding/reader/CramRecordReader.java
+++ b/src/main/java/htsjdk/samtools/cram/encoding/reader/CramRecordReader.java
@@ -73,6 +73,9 @@ public final class CramRecordReader {
private final SliceBlocksReadStreams sliceBlocksReadStreams;
protected final ValidationStringency validationStringency;
+ /** Pre-resolved tag key info, indexed by [tagIdList][position within that list]. */
+ private final TagKeyCache.TagKeyInfo[][] resolvedTagKeys;
+
/**
* Initialize a Cram Record Reader
*
@@ -135,6 +138,18 @@ public CramRecordReader(
DataSeriesType.BYTE_ARRAY,
mapEntry.getValue(),
sliceBlocksReadStreams)));
+
+ // Pre-resolve cached tag key info for each dictionary entry to avoid per-record lookups
+ final TagKeyCache tagKeyCache = compressionHeader.getTagKeyCache();
+ final byte[][][] dictionary = compressionHeader.getTagIDDictionary();
+ resolvedTagKeys = new TagKeyCache.TagKeyInfo[dictionary.length][];
+ for (int i = 0; i < dictionary.length; i++) {
+ final byte[][] ids = dictionary[i];
+ resolvedTagKeys[i] = new TagKeyCache.TagKeyInfo[ids.length];
+ for (int j = 0; j < ids.length; j++) {
+ resolvedTagKeys[i][j] = tagKeyCache.get(ReadTag.name3BytesToInt(ids[j]));
+ }
+ }
}
/**
@@ -210,14 +225,14 @@ public CRAMCompressionRecord readCRAMRecord(
}
List<ReadTag> readTags = null;
- final Integer tagIdList = tagIdListCodec.readData();
- final byte[][] ids = compressionHeader.getTagIDDictionary()[tagIdList];
- if (ids.length > 0) {
- readTags = new ArrayList<>(ids.length);
- for (int i = 0; i < ids.length; i++) {
- final int id = ReadTag.name3BytesToInt(ids[i]);
- final DataSeriesReader<byte[]> dataSeriesReader = tagValueCodecs.get(id);
- final ReadTag tag = new ReadTag(id, dataSeriesReader.readData(), validationStringency);
+ final int tagIdList = tagIdListCodec.readData();
+ final TagKeyCache.TagKeyInfo[] cachedKeys = resolvedTagKeys[tagIdList];
+ if (cachedKeys.length > 0) {
+ readTags = new ArrayList<>(cachedKeys.length);
+ for (int i = 0; i < cachedKeys.length; i++) {
+ final TagKeyCache.TagKeyInfo cached = cachedKeys[i];
+ final DataSeriesReader<byte[]> dataSeriesReader = tagValueCodecs.get(cached.keyType3BytesAsInt);
+ final ReadTag tag = new ReadTag(cached, dataSeriesReader.readData(), validationStringency);
readTags.add(tag);
}
}
diff --git a/src/main/java/htsjdk/samtools/cram/io/CRAMByteReader.java b/src/main/java/htsjdk/samtools/cram/io/CRAMByteReader.java
new file mode 100644
index 0000000000..c6bce9e1e8
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/io/CRAMByteReader.java
@@ -0,0 +1,85 @@
+package htsjdk.samtools.cram.io;
+
+/**
+ * Unsynchronized reader over a {@code byte[]} for CRAM codec decode operations. Replaces
+ * {@link java.io.ByteArrayInputStream} in the hot decode path to eliminate the overhead of
+ * synchronized {@code read()} methods (which showed up as ~10% of total decode CPU in profiling).
+ *
+ *
+ * <p>This is a final class (not an InputStream subclass) so the JIT can inline its methods.
+ * Thread safety is explicitly not provided — CRAM codec operations are single-threaded.
+ *
+ * @see CRAMByteWriter
+ */
+public final class CRAMByteReader {
+ private final byte[] buf;
+ private int pos;
+
+ /**
+ * Create a reader over the given byte array, starting at position 0.
+ *
+ * @param buf the data to read from (not copied — caller must not modify while reading)
+ */
+ public CRAMByteReader(final byte[] buf) {
+ this.buf = buf;
+ this.pos = 0;
+ }
+
+ /**
+ * Read one byte, returning it as an unsigned int (0-255), or -1 if at end of buffer.
+ * Matches the contract of {@link java.io.InputStream#read()}.
+ *
+ * @return the next byte as an unsigned int, or -1 at end of data
+ */
+ public int read() {
+ return pos < buf.length ? (buf[pos++] & 0xFF) : -1;
+ }
+
+ /**
+ * Read up to {@code len} bytes into the destination array.
+ *
+ * @param b destination array
+ * @param off offset in destination to start writing
+ * @param len maximum number of bytes to read
+ * @return the number of bytes actually read, or -1 if at end of data
+ */
+ public int read(final byte[] b, final int off, final int len) {
+ if (pos >= buf.length) return -1;
+ final int toRead = Math.min(len, buf.length - pos);
+ System.arraycopy(buf, pos, b, off, toRead);
+ pos += toRead;
+ return toRead;
+ }
+
+ /**
+ * Read exactly {@code len} bytes, returning them as a new array.
+ *
+ * @param len number of bytes to read
+ * @return a new byte array of length {@code len}
+ * @throws IllegalStateException if fewer than {@code len} bytes remain
+ */
+ public byte[] readFully(final int len) {
+ if (buf.length - pos < len) {
+ throw new IllegalStateException(
+ String.format("Attempted to read %d bytes but only %d remain", len, buf.length - pos));
+ }
+ final byte[] result = new byte[len];
+ System.arraycopy(buf, pos, result, 0, len);
+ pos += len;
+ return result;
+ }
+
+ /** Return the number of bytes remaining to be read. */
+ public int available() {
+ return buf.length - pos;
+ }
+
+ /** Return the current read position within the buffer. */
+ public int getPosition() {
+ return pos;
+ }
+
+ /** Return a reference to the underlying byte array. */
+ public byte[] getBuffer() {
+ return buf;
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/cram/io/CRAMByteWriter.java b/src/main/java/htsjdk/samtools/cram/io/CRAMByteWriter.java
new file mode 100644
index 0000000000..991d38ad46
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/io/CRAMByteWriter.java
@@ -0,0 +1,106 @@
+package htsjdk.samtools.cram.io;
+
+import java.util.Arrays;
+
+/**
+ * Unsynchronized growable byte writer for CRAM codec encode operations. Replaces
+ * {@link java.io.ByteArrayOutputStream} in the hot encode path to eliminate the overhead of
+ * synchronized {@code write()} methods.
+ *
+ * <p>The internal buffer doubles in size when full, matching the growth strategy of
+ * {@link java.io.ByteArrayOutputStream}. This is a final class (not an OutputStream subclass)
+ * so the JIT can inline its methods. Thread safety is explicitly not provided — CRAM codec
+ * operations are single-threaded.
+ *
+ * @see CRAMByteReader
+ */
+public final class CRAMByteWriter {
+ private byte[] buf;
+ private int pos;
+
+ /** Create a writer with a default initial capacity of 256 bytes. */
+ public CRAMByteWriter() {
+ this(256);
+ }
+
+ /**
+ * Create a writer with the specified initial capacity.
+ *
+ * @param initialCapacity initial buffer size in bytes
+ */
+ public CRAMByteWriter(final int initialCapacity) {
+ this.buf = new byte[initialCapacity];
+ this.pos = 0;
+ }
+
+ /**
+ * Write a single byte.
+ *
+ * @param b the byte to write (only the low 8 bits are used)
+ */
+ public void write(final int b) {
+ if (pos == buf.length) {
+ grow(pos + 1);
+ }
+ buf[pos++] = (byte) b;
+ }
+
+ /**
+ * Write all bytes from the given array.
+ *
+ * @param b the bytes to write
+ */
+ public void write(final byte[] b) {
+ write(b, 0, b.length);
+ }
+
+ /**
+ * Write {@code len} bytes from the given array starting at offset {@code off}.
+ *
+ * @param b source array
+ * @param off offset in source to start reading
+ * @param len number of bytes to write
+ */
+ public void write(final byte[] b, final int off, final int len) {
+ if (pos + len > buf.length) {
+ grow(pos + len);
+ }
+ System.arraycopy(b, off, buf, pos, len);
+ pos += len;
+ }
+
+ /**
+ * Return a copy of the bytes written so far. Matches the contract of
+ * {@link java.io.ByteArrayOutputStream#toByteArray()}.
+ *
+ * @return a new byte array containing the written data
+ */
+ public byte[] toByteArray() {
+ return Arrays.copyOf(buf, pos);
+ }
+
+ /** Return the number of bytes written so far. */
+ public int size() {
+ return pos;
+ }
+
+ /** Return the current write position (alias for {@link #size()}). */
+ public int getPosition() {
+ return pos;
+ }
+
+ /** Reset the writer to empty, reusing the existing buffer. */
+ public void reset() {
+ pos = 0;
+ }
+
+ private void grow(final int minCapacity) {
+ // Double the capacity, but never less than the requested minimum capacity
+ int newCapacity = Math.max(buf.length << 1, minCapacity);
+ if (newCapacity < 0) {
+ // Overflow — fall back to exact size needed
+ newCapacity = minCapacity;
+ }
+ buf = Arrays.copyOf(buf, newCapacity);
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/cram/io/ITF8.java b/src/main/java/htsjdk/samtools/cram/io/ITF8.java
index 48c7e2d226..008336ae3a 100644
--- a/src/main/java/htsjdk/samtools/cram/io/ITF8.java
+++ b/src/main/java/htsjdk/samtools/cram/io/ITF8.java
@@ -214,6 +214,75 @@ public static int writeUnsignedITF8(final int value, final ByteBuffer buffer) {
return 40;
}
+ /**
+ * Reads an unsigned ITF8 integer from a {@link CRAMByteReader}. Equivalent to the InputStream version
+ * but avoids synchronized method call overhead.
+ *
+ * @param reader the reader to read from
+ * @return the decoded value
+ */
+ public static int readUnsignedITF8(final CRAMByteReader reader) {
+ final int b1 = reader.read();
+ if (b1 == -1)
+ throw new RuntimeEOFException();
+
+ if ((b1 & 128) == 0)
+ return b1;
+
+ if ((b1 & 64) == 0)
+ return ((b1 & 127) << 8) | reader.read();
+
+ if ((b1 & 32) == 0) {
+ final int b2 = reader.read();
+ final int b3 = reader.read();
+ return ((b1 & 63) << 16) | b2 << 8 | b3;
+ }
+
+ if ((b1 & 16) == 0)
+ return ((b1 & 31) << 24) | reader.read() << 16 | reader.read() << 8 | reader.read();
+
+ return ((b1 & 15) << 28) | reader.read() << 20 | reader.read() << 12 | reader.read() << 4 | (15 & reader.read());
+ }
+
+ /**
+ * Writes an unsigned ITF8 integer to a {@link CRAMByteWriter}. Equivalent to the OutputStream version
+ * but avoids synchronized method call overhead.
+ *
+ * @param value the value to write
+ * @param writer the writer to write to
+ * @return number of bits written
+ */
+ public static int writeUnsignedITF8(final int value, final CRAMByteWriter writer) {
+ if ((value >>> 7) == 0) {
+ writer.write(value);
+ return 8;
+ }
+ if ((value >>> 14) == 0) {
+ writer.write(((value >> 8) | 0x80));
+ writer.write((value & 0xFF));
+ return 16;
+ }
+ if ((value >>> 21) == 0) {
+ writer.write(((value >> 16) | 0xC0));
+ writer.write(((value >> 8) & 0xFF));
+ writer.write((value & 0xFF));
+ return 24;
+ }
+ if ((value >>> 28) == 0) {
+ writer.write(((value >> 24) | 0xE0));
+ writer.write(((value >> 16) & 0xFF));
+ writer.write(((value >> 8) & 0xFF));
+ writer.write((value & 0xFF));
+ return 32;
+ }
+ writer.write(((value >> 28) | 0xF0));
+ writer.write(((value >> 20) & 0xFF));
+ writer.write(((value >> 12) & 0xFF));
+ writer.write(((value >> 4) & 0xFF));
+ writer.write((value & 0xFF));
+ return 40;
+ }
+
/**
* Writes an unsigned (32 bit) integer to a byte new array encoded as ITF8. The sign bit is interpreted as a value bit.
*
diff --git a/src/main/java/htsjdk/samtools/cram/io/LTF8.java b/src/main/java/htsjdk/samtools/cram/io/LTF8.java
index d9e12a0c86..946b04de65 100644
--- a/src/main/java/htsjdk/samtools/cram/io/LTF8.java
+++ b/src/main/java/htsjdk/samtools/cram/io/LTF8.java
@@ -104,6 +104,168 @@ public static long readUnsignedLTF8(final InputStream inputStream) {
}
}
+ /**
+ * Reads an unsigned LTF8 long from a {@link CRAMByteReader}. Equivalent to the InputStream version
+ * but avoids synchronized method call overhead.
+ *
+ * @param reader the reader to read from
+ * @return the decoded value
+ */
+ public static long readUnsignedLTF8(final CRAMByteReader reader) {
+ final int b1 = reader.read();
+ if (b1 == -1)
+ throw new RuntimeEOFException();
+
+ if ((b1 & 128) == 0)
+ return b1;
+
+ if ((b1 & 64) == 0)
+ return ((b1 & 127) << 8) | reader.read();
+
+ if ((b1 & 32) == 0)
+ return ((b1 & 63) << 16) | reader.read() << 8 | reader.read();
+
+ if ((b1 & 16) == 0) {
+ long result = ((long) (b1 & 31) << 24);
+ result |= reader.read() << 16;
+ result |= reader.read() << 8;
+ result |= reader.read();
+ return result;
+ }
+
+ if ((b1 & 8) == 0) {
+ long value = ((long) (b1 & 15) << 32);
+ value |= (0xFF & ((long) reader.read())) << 24;
+ value |= reader.read() << 16;
+ value |= reader.read() << 8;
+ value |= reader.read();
+ return value;
+ }
+
+ if ((b1 & 4) == 0) {
+ long result = ((long) (b1 & 7) << 40);
+ result |= (0xFF & ((long) reader.read())) << 32;
+ result |= (0xFF & ((long) reader.read())) << 24;
+ result |= reader.read() << 16;
+ result |= reader.read() << 8;
+ result |= reader.read();
+ return result;
+ }
+
+ if ((b1 & 2) == 0) {
+ long result = ((long) (b1 & 3) << 48);
+ result |= (0xFF & ((long) reader.read())) << 40;
+ result |= (0xFF & ((long) reader.read())) << 32;
+ result |= (0xFF & ((long) reader.read())) << 24;
+ result |= reader.read() << 16;
+ result |= reader.read() << 8;
+ result |= reader.read();
+ return result;
+ }
+
+ if ((b1 & 1) == 0) {
+ long result = (0xFF & ((long) reader.read())) << 48;
+ result |= (0xFF & ((long) reader.read())) << 40;
+ result |= (0xFF & ((long) reader.read())) << 32;
+ result |= (0xFF & ((long) reader.read())) << 24;
+ result |= reader.read() << 16;
+ result |= reader.read() << 8;
+ result |= reader.read();
+ return result;
+ }
+
+ long result = (0xFF & ((long) reader.read())) << 56;
+ result |= (0xFF & ((long) reader.read())) << 48;
+ result |= (0xFF & ((long) reader.read())) << 40;
+ result |= (0xFF & ((long) reader.read())) << 32;
+ result |= (0xFF & ((long) reader.read())) << 24;
+ result |= reader.read() << 16;
+ result |= reader.read() << 8;
+ result |= reader.read();
+ return result;
+ }
+
+ /**
+ * Writes an unsigned LTF8 long to a {@link CRAMByteWriter}. Equivalent to the OutputStream version
+ * but avoids synchronized method call overhead.
+ *
+ * @param value the value to write
+ * @param writer the writer to write to
+ * @return number of bits written
+ */
+ public static int writeUnsignedLTF8(final long value, final CRAMByteWriter writer) {
+ if ((value >>> 7) == 0) {
+ writer.write((int) value);
+ return 8;
+ }
+ if ((value >>> 14) == 0) {
+ writer.write((int) ((value >> 8) | 0x80));
+ writer.write((int) (value & 0xFF));
+ return 16;
+ }
+ if ((value >>> 21) == 0) {
+ writer.write((int) ((value >> 16) | 0xC0));
+ writer.write((int) ((value >> 8) & 0xFF));
+ writer.write((int) (value & 0xFF));
+ return 24;
+ }
+ if ((value >>> 28) == 0) {
+ writer.write((int) ((value >> 24) | 0xE0));
+ writer.write((int) ((value >> 16) & 0xFF));
+ writer.write((int) ((value >> 8) & 0xFF));
+ writer.write((int) (value & 0xFF));
+ return 32;
+ }
+ if ((value >>> 35) == 0) {
+ writer.write((int) ((value >> 32) | 0xF0));
+ writer.write((int) ((value >> 24) & 0xFF));
+ writer.write((int) ((value >> 16) & 0xFF));
+ writer.write((int) ((value >> 8) & 0xFF));
+ writer.write((int) (value & 0xFF));
+ return 40;
+ }
+ if ((value >>> 42) == 0) {
+ writer.write((int) ((value >> 40) | 0xF8));
+ writer.write((int) ((value >> 32) & 0xFF));
+ writer.write((int) ((value >> 24) & 0xFF));
+ writer.write((int) ((value >> 16) & 0xFF));
+ writer.write((int) ((value >> 8) & 0xFF));
+ writer.write((int) (value & 0xFF));
+ return 48;
+ }
+ if ((value >>> 49) == 0) {
+ writer.write((int) ((value >> 48) | 0xFC));
+ writer.write((int) ((value >> 40) & 0xFF));
+ writer.write((int) ((value >> 32) & 0xFF));
+ writer.write((int) ((value >> 24) & 0xFF));
+ writer.write((int) ((value >> 16) & 0xFF));
+ writer.write((int) ((value >> 8) & 0xFF));
+ writer.write((int) (value & 0xFF));
+ return 56;
+ }
+ if ((value >>> 56) == 0) {
+ writer.write(0xFE);
+ writer.write((int) ((value >> 48) & 0xFF));
+ writer.write((int) ((value >> 40) & 0xFF));
+ writer.write((int) ((value >> 32) & 0xFF));
+ writer.write((int) ((value >> 24) & 0xFF));
+ writer.write((int) ((value >> 16) & 0xFF));
+ writer.write((int) ((value >> 8) & 0xFF));
+ writer.write((int) (value & 0xFF));
+ return 64;
+ }
+ writer.write(0xFF);
+ writer.write((int) ((value >> 56) & 0xFF));
+ writer.write((int) ((value >> 48) & 0xFF));
+ writer.write((int) ((value >> 40) & 0xFF));
+ writer.write((int) ((value >> 32) & 0xFF));
+ writer.write((int) ((value >> 24) & 0xFF));
+ writer.write((int) ((value >> 16) & 0xFF));
+ writer.write((int) ((value >> 8) & 0xFF));
+ writer.write((int) (value & 0xFF));
+ return 72;
+ }
+
/**
* Writes an unsigned long value to the output stream. The sign bit is interpreted just as other bits in the value.
*
diff --git a/src/main/java/htsjdk/samtools/cram/structure/CRAMCodecModelContext.java b/src/main/java/htsjdk/samtools/cram/structure/CRAMCodecModelContext.java
index 9e2ac870ba..a9e74f78a7 100644
--- a/src/main/java/htsjdk/samtools/cram/structure/CRAMCodecModelContext.java
+++ b/src/main/java/htsjdk/samtools/cram/structure/CRAMCodecModelContext.java
@@ -1,7 +1,50 @@
package htsjdk.samtools.cram.structure;
+import java.util.List;
+
/**
- * Context model data/accumulators for use by CRAM 3.1 codec write implementations.
+ * Context model data for use by CRAM 3.1 codec write implementations that need per-record metadata
+ * beyond the raw byte stream. Populated during slice construction from the list of CRAM records,
+ * then passed through to {@link htsjdk.samtools.cram.compression.ExternalCompressor#compress} calls.
+ *
+ * Currently used by FQZComp, which needs per-record quality score lengths and BAM flags to
+ * properly compress quality scores with context modeling.
*/
public class CRAMCodecModelContext {
+
+ private int[] qualityScoreLengths;
+ private int[] bamFlags;
+
+ /**
+ * Populate this context from the records in a slice. Should be called during slice construction
+ * before records are written to blocks.
+ *
+ * @param records the CRAM records for this slice
+ */
+ public void populateFromRecords(final List<CRAMCompressionRecord> records) {
+ qualityScoreLengths = new int[records.size()];
+ bamFlags = new int[records.size()];
+ for (int i = 0; i < records.size(); i++) {
+ final CRAMCompressionRecord record = records.get(i);
+ qualityScoreLengths[i] = CRAMCompressionRecord.isForcePreserveQualityScores(record.getCRAMFlags())
+ ? record.getReadLength()
+ : 0;
+ bamFlags[i] = record.getBAMFlags();
+ }
+ }
+
+ /** @return per-record quality score lengths (one per record in the slice), or null if not populated */
+ public int[] getQualityScoreLengths() {
+ return qualityScoreLengths;
+ }
+
+ /** @return per-record BAM flags (one per record in the slice), or null if not populated */
+ public int[] getBamFlags() {
+ return bamFlags;
+ }
+
+ /** @return number of records, or 0 if not populated */
+ public int getNumRecords() {
+ return qualityScoreLengths != null ? qualityScoreLengths.length : 0;
+ }
}
diff --git a/src/main/java/htsjdk/samtools/cram/structure/CRAMCompressionProfile.java b/src/main/java/htsjdk/samtools/cram/structure/CRAMCompressionProfile.java
new file mode 100644
index 0000000000..cb10e2dc10
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/structure/CRAMCompressionProfile.java
@@ -0,0 +1,318 @@
+package htsjdk.samtools.cram.structure;
+
+import htsjdk.samtools.cram.common.CRAMVersion;
+import htsjdk.samtools.cram.common.CramVersions;
+import htsjdk.samtools.cram.compression.range.RangeParams;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Params;
+import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
+
+import java.util.EnumMap;
+
+/**
+ * Predefined CRAM compression profiles matching those in htslib/samtools. Each profile defines
+ * the CRAM version, compression level, reads-per-slice, and a per-{@link DataSeries} compressor
+ * assignment via {@link CompressorDescriptor}.
+ *
+ * <p>Usage:
+ * <pre>
+ * // Get a strategy for a specific profile:
+ * CRAMEncodingStrategy strategy = CRAMCompressionProfile.ARCHIVE.toStrategy();
+ *
+ * // Or apply a profile to an existing strategy:
+ * CRAMCompressionProfile.FAST.applyTo(existingStrategy);
+ * </pre>
+ *
+ * @see CRAMEncodingStrategy
+ * @see CompressorDescriptor
+ */
+public enum CRAMCompressionProfile {
+
+ /**
+ * Speed-optimized profile. Uses only GZIP at level 1. Writes CRAM 3.0 since no 3.1-specific
+ * codecs are used, avoiding the need for a 3.1-capable reader.
+ */
+ FAST(CramVersions.CRAM_v3, 1, 10_000),
+
+ /**
+ * Balanced profile (default). Uses rANS Nx16 for entropy-rich data series, FQZComp for quality
+ * scores, and Name Tokeniser for read names. Writes CRAM 3.1.
+ */
+ NORMAL(CramVersions.CRAM_v3_1, 5, 10_000),
+
+ /**
+ * Size-optimized profile. Uses GZIP at higher compression level with FQZComp for quality scores.
+ * Does not use Name Tokeniser or rANS (matching htslib SMALL behavior). Writes CRAM 3.1.
+ */
+ SMALL(CramVersions.CRAM_v3_1, 6, 25_000),
+
+ /**
+ * Maximum compression profile. Uses rANS Nx16 for entropy-rich data, FQZComp for quality scores,
+ * and Name Tokeniser for read names at higher compression settings. Writes CRAM 3.1.
+ *
+ * <p>This profile uses trial compression: multiple codecs are tried per block and the smallest
+ * result wins. Additional candidates include BZIP2, the Range (arithmetic) coder, and GZIP.
+ */
+ // Note: large slices increase tag dictionary size; CompressionHeader.internalWrite()
+ // uses a fixed 100KB buffer for the tag dictionary, which may need adjustment for very
+ // large slice sizes.
+ ARCHIVE(CramVersions.CRAM_v3_1, 7, 100_000);
+
+ private final CRAMVersion cramVersion;
+ private final int gzipLevel;
+ private final int readsPerSlice;
+
+ /**
+ * Look up a profile by name, ignoring case. For example, {@code "archive"}, {@code "ARCHIVE"},
+ * and {@code "Archive"} all return {@link #ARCHIVE}.
+ *
+ * @param name the profile name (case-insensitive)
+ * @return the matching profile
+ * @throws IllegalArgumentException if no profile matches
+ */
+ public static CRAMCompressionProfile valueOfCaseInsensitive(final String name) {
+ for (final CRAMCompressionProfile profile : values()) {
+ if (profile.name().equalsIgnoreCase(name)) {
+ return profile;
+ }
+ }
+ throw new IllegalArgumentException("Unknown CRAM compression profile: " + name +
+ ". Must be one of: fast, normal, small, archive");
+ }
+
+ CRAMCompressionProfile(final CRAMVersion cramVersion, final int gzipLevel, final int readsPerSlice) {
+ this.cramVersion = cramVersion;
+ this.gzipLevel = gzipLevel;
+ this.readsPerSlice = readsPerSlice;
+ }
+
+ /**
+ * Create a new {@link CRAMEncodingStrategy} configured with this profile's settings.
+ *
+ * @return a new strategy with this profile applied
+ */
+ public CRAMEncodingStrategy toStrategy() {
+ // Use the no-profile constructor to avoid infinite recursion (default constructor calls NORMAL.applyTo)
+ final CRAMEncodingStrategy strategy = new CRAMEncodingStrategy(false);
+ applyTo(strategy);
+ return strategy;
+ }
+
+ /**
+ * Apply this profile's settings to an existing strategy, overwriting the CRAM version,
+ * GZIP compression level, reads-per-slice, and compressor map.
+ *
+ * @param strategy the strategy to modify
+ */
+ public void applyTo(final CRAMEncodingStrategy strategy) {
+ strategy.setCramVersion(cramVersion);
+ strategy.setGZIPCompressionLevel(gzipLevel);
+ strategy.setReadsPerSlice(readsPerSlice);
+ strategy.setCompressorMap(buildCompressorMap());
+ strategy.setTrialCandidatesMap(buildTrialCandidatesMap());
+ }
+
+ /**
+ * Build the per-DataSeries compressor map for this profile. Only includes data series
+ * that are actually written by the htsjdk CRAM implementation (excludes obsolete TC, TN
+ * and unused BB, QQ series).
+ */
+ private EnumMap<DataSeries, CompressorDescriptor> buildCompressorMap() {
+ final EnumMap<DataSeries, CompressorDescriptor> map = new EnumMap<>(DataSeries.class);
+
+ switch (this) {
+ case FAST:
+ buildFastMap(map);
+ break;
+ case NORMAL:
+ buildNormalMap(map);
+ break;
+ case SMALL:
+ buildSmallMap(map);
+ break;
+ case ARCHIVE:
+ buildArchiveMap(map);
+ break;
+ }
+
+ return map;
+ }
+
+ /** FAST: all GZIP at level 1, no 3.1 codecs. */
+ private void buildFastMap(final EnumMap<DataSeries, CompressorDescriptor> map) {
+ final CompressorDescriptor gzip = new CompressorDescriptor(BlockCompressionMethod.GZIP, gzipLevel);
+ for (final DataSeries ds : getWrittenDataSeries()) {
+ map.put(ds, gzip);
+ }
+ }
+
+ /** NORMAL: rANS Nx16 for low-entropy data, GZIP for positional/byte-array data, FQZComp for QS, NameTok for RN. */
+ private void buildNormalMap(final EnumMap<DataSeries, CompressorDescriptor> map) {
+ final CompressorDescriptor gzip = new CompressorDescriptor(BlockCompressionMethod.GZIP, gzipLevel);
+ final CompressorDescriptor ransOrder0 = new CompressorDescriptor(BlockCompressionMethod.RANSNx16, RANSNx16Params.ORDER.ZERO.ordinal());
+ final CompressorDescriptor ransOrder1 = new CompressorDescriptor(BlockCompressionMethod.RANSNx16, RANSNx16Params.ORDER.ONE.ordinal());
+
+ // Default everything to GZIP — then override specific series with better codecs
+ for (final DataSeries ds : getWrittenDataSeries()) {
+ map.put(ds, gzip);
+ }
+
+ // rANS Nx16 Order 0 for position-like integer data with low entropy
+ map.put(DataSeries.AP_AlignmentPositionOffset, ransOrder0);
+ map.put(DataSeries.RI_RefId, ransOrder0);
+
+ // rANS Nx16 Order 1 for low-entropy integer data series where rANS outperforms GZIP
+ map.put(DataSeries.BA_Base, ransOrder1);
+ map.put(DataSeries.BF_BitFlags, ransOrder1);
+ map.put(DataSeries.BS_BaseSubstitutionCode, ransOrder1);
+ map.put(DataSeries.CF_CompressionBitFlags, ransOrder1);
+ map.put(DataSeries.FC_FeatureCode, ransOrder1);
+ map.put(DataSeries.FN_NumberOfReadFeatures, ransOrder1);
+ map.put(DataSeries.MF_MateBitFlags, ransOrder1);
+ map.put(DataSeries.MQ_MappingQualityScore, ransOrder1);
+ map.put(DataSeries.NS_NextFragmentReferenceSequenceID, ransOrder1);
+ map.put(DataSeries.RG_ReadGroup, ransOrder1);
+ map.put(DataSeries.RL_ReadLength, ransOrder1);
+ map.put(DataSeries.TL_TagIdList, ransOrder1);
+ map.put(DataSeries.TS_InsertSize, ransOrder1);
+
+ // Keep GZIP for high-entropy positional data where LZ77 helps
+ // NP (mate position), FP (feature position) — these have high variance
+ // IN (insertions), SC (soft clips) — byte arrays benefit from LZ77
+
+ // Specialized codecs
+ map.put(DataSeries.QS_QualityScore, new CompressorDescriptor(BlockCompressionMethod.FQZCOMP));
+ map.put(DataSeries.RN_ReadName, new CompressorDescriptor(BlockCompressionMethod.NAME_TOKENISER));
+ }
+
+ /** SMALL: Same codec assignments as NORMAL but at higher compression level. Trial compression
+ * adds BZIP2 alongside rANS/GZIP to let the trial pick the best per data series. */
+ private void buildSmallMap(final EnumMap<DataSeries, CompressorDescriptor> map) {
+ buildNormalMap(map);
+ }
+
+ /** ARCHIVE: Same primary codecs as NORMAL but at higher compression, plus larger slices.
+ * Trial compression candidates (BZIP2, Range coder) are provided via buildTrialCandidatesMap. */
+ private void buildArchiveMap(final EnumMap<DataSeries, CompressorDescriptor> map) {
+ buildNormalMap(map);
+ }
+
+ /**
+ * Build the trial compression candidates map for this profile. Only ARCHIVE and SMALL profiles
+ * currently use trial compression. For data series with trial candidates, the primary compressor
+ * (from buildCompressorMap) plus these additional candidates are all tried, and the smallest wins.
+ *
+ * @return the trial candidates map, or null if this profile doesn't use trial compression
+ */
+ private EnumMap<DataSeries, List<CompressorDescriptor>> buildTrialCandidatesMap() {
+ if (this != ARCHIVE && this != SMALL) {
+ return null;
+ }
+
+ final EnumMap<DataSeries, List<CompressorDescriptor>> trialMap = new EnumMap<>(DataSeries.class);
+
+ // BZIP2 as an alternative for general data series
+ final CompressorDescriptor bzip2 = new CompressorDescriptor(BlockCompressionMethod.BZIP2);
+
+ // Range (ARITH) coder variants as alternatives to rANS Nx16
+ final CompressorDescriptor arithOrder0 = new CompressorDescriptor(BlockCompressionMethod.ADAPTIVE_ARITHMETIC, 0);
+ final CompressorDescriptor arithOrder1 = new CompressorDescriptor(
+ BlockCompressionMethod.ADAPTIVE_ARITHMETIC, RangeParams.ORDER_FLAG_MASK);
+
+ // GZIP as a fallback candidate (may win for small blocks)
+ final CompressorDescriptor gzip = new CompressorDescriptor(BlockCompressionMethod.GZIP, gzipLevel);
+
+ if (this == ARCHIVE) {
+ // For entropy-rich data series that use rANS Nx16: also try Range coder and BZIP2
+ for (final DataSeries ds : new DataSeries[]{
+ DataSeries.BA_Base, DataSeries.BF_BitFlags, DataSeries.CF_CompressionBitFlags,
+ DataSeries.NS_NextFragmentReferenceSequenceID, DataSeries.RG_ReadGroup,
+ DataSeries.RL_ReadLength, DataSeries.TS_InsertSize}) {
+ trialMap.put(ds, java.util.List.of(arithOrder1, bzip2, gzip));
+ }
+ // Position-like data: also try Range order 0
+ for (final DataSeries ds : new DataSeries[]{
+ DataSeries.AP_AlignmentPositionOffset, DataSeries.RI_RefId}) {
+ trialMap.put(ds, java.util.List.of(arithOrder0, bzip2, gzip));
+ }
+ // GZIP-compressed data series: also try BZIP2 and rANS
+ final CompressorDescriptor ransOrder1 = new CompressorDescriptor(
+ BlockCompressionMethod.RANSNx16, RANSNx16Params.ORDER.ONE.ordinal());
+ for (final DataSeries ds : new DataSeries[]{
+ DataSeries.BS_BaseSubstitutionCode, DataSeries.DL_DeletionLength,
+ DataSeries.FC_FeatureCode, DataSeries.FN_NumberOfReadFeatures,
+ DataSeries.FP_FeaturePosition, DataSeries.HC_HardClip,
+ DataSeries.MF_MateBitFlags, DataSeries.MQ_MappingQualityScore,
+ DataSeries.NF_RecordsToNextFragment, DataSeries.NP_NextFragmentAlignmentStart,
+ DataSeries.PD_padding, DataSeries.RS_RefSkip, DataSeries.TL_TagIdList}) {
+ trialMap.put(ds, java.util.List.of(bzip2, ransOrder1));
+ }
+ } else if (this == SMALL) {
+ // SMALL: same as NORMAL primary codecs but with BZIP2 added to trial candidates.
+ // htslib SMALL (level 6, use_rans=1, use_bz2=1) trials GZIP + BZIP2 + all rANS variants.
+ // For rANS-primary series: also try BZIP2 and GZIP
+ for (final DataSeries ds : new DataSeries[]{
+ DataSeries.BA_Base, DataSeries.BF_BitFlags, DataSeries.CF_CompressionBitFlags,
+ DataSeries.NS_NextFragmentReferenceSequenceID, DataSeries.RG_ReadGroup,
+ DataSeries.RL_ReadLength, DataSeries.TS_InsertSize}) {
+ trialMap.put(ds, java.util.List.of(bzip2, gzip));
+ }
+ // For rANS Order 0 series: also try BZIP2 and GZIP
+ for (final DataSeries ds : new DataSeries[]{
+ DataSeries.AP_AlignmentPositionOffset, DataSeries.RI_RefId}) {
+ trialMap.put(ds, java.util.List.of(bzip2, gzip));
+ }
+ // For GZIP-primary series: also try BZIP2 and rANS
+ final CompressorDescriptor ransOrder1 = new CompressorDescriptor(
+ BlockCompressionMethod.RANSNx16, RANSNx16Params.ORDER.ONE.ordinal());
+ for (final DataSeries ds : new DataSeries[]{
+ DataSeries.BS_BaseSubstitutionCode, DataSeries.DL_DeletionLength,
+ DataSeries.FC_FeatureCode, DataSeries.FN_NumberOfReadFeatures,
+ DataSeries.FP_FeaturePosition, DataSeries.HC_HardClip,
+ DataSeries.MF_MateBitFlags, DataSeries.MQ_MappingQualityScore,
+ DataSeries.NF_RecordsToNextFragment, DataSeries.NP_NextFragmentAlignmentStart,
+ DataSeries.PD_padding, DataSeries.RS_RefSkip, DataSeries.TL_TagIdList,
+ DataSeries.IN_Insertion, DataSeries.SC_SoftClip}) {
+ trialMap.put(ds, java.util.List.of(bzip2, ransOrder1));
+ }
+ }
+
+ return trialMap;
+ }
+
+ /**
+ * The DataSeries values that are actually written by the htsjdk CRAM implementation.
+ * Excludes obsolete (TC, TN) and unused (BB, QQ) series.
+ */
+ private static final DataSeries[] WRITTEN_DATA_SERIES = {
+ DataSeries.AP_AlignmentPositionOffset,
+ DataSeries.BA_Base,
+ DataSeries.BF_BitFlags,
+ DataSeries.BS_BaseSubstitutionCode,
+ DataSeries.CF_CompressionBitFlags,
+ DataSeries.DL_DeletionLength,
+ DataSeries.FC_FeatureCode,
+ DataSeries.FN_NumberOfReadFeatures,
+ DataSeries.FP_FeaturePosition,
+ DataSeries.HC_HardClip,
+ DataSeries.IN_Insertion,
+ DataSeries.MF_MateBitFlags,
+ DataSeries.MQ_MappingQualityScore,
+ DataSeries.NF_RecordsToNextFragment,
+ DataSeries.NP_NextFragmentAlignmentStart,
+ DataSeries.NS_NextFragmentReferenceSequenceID,
+ DataSeries.PD_padding,
+ DataSeries.QS_QualityScore,
+ DataSeries.RG_ReadGroup,
+ DataSeries.RI_RefId,
+ DataSeries.RL_ReadLength,
+ DataSeries.RN_ReadName,
+ DataSeries.RS_RefSkip,
+ DataSeries.SC_SoftClip,
+ DataSeries.TL_TagIdList,
+ DataSeries.TS_InsertSize,
+ };
+
+ private static DataSeries[] getWrittenDataSeries() {
+ return WRITTEN_DATA_SERIES;
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/cram/structure/CRAMCompressionRecord.java b/src/main/java/htsjdk/samtools/cram/structure/CRAMCompressionRecord.java
index 95c32aadc5..9b2f2fa95f 100644
--- a/src/main/java/htsjdk/samtools/cram/structure/CRAMCompressionRecord.java
+++ b/src/main/java/htsjdk/samtools/cram/structure/CRAMCompressionRecord.java
@@ -25,6 +25,7 @@
package htsjdk.samtools.cram.structure;
import htsjdk.samtools.*;
+import htsjdk.samtools.SAMTag;
import htsjdk.samtools.cram.build.CRAMReferenceRegion;
import htsjdk.samtools.cram.common.CRAMVersion;
import htsjdk.samtools.cram.common.CramVersions;
@@ -72,7 +73,7 @@ public class CRAMCompressionRecord {
private final CRAMRecordReadFeatures readFeatures;
private final int mappingQuality;
private final int readGroupID;
- private final List<ReadTag> tags;
+ private List<ReadTag> tags;
private final long sequentialIndex; // 1 based sequential index of this record in the cram stream
private int bamFlags;
@@ -83,6 +84,7 @@ public class CRAMCompressionRecord {
// the contents hasher doesn't handle nulls
private byte[] readBases;
private byte[] qualityScores;
+ private Cigar cachedCigar; // populated by restoreBasesAndTags, used by toSAMRecord
private MutableInt tagIdsIndex = new MutableInt(0);
//mate info
@@ -200,14 +202,46 @@ public CRAMCompressionRecord(
NO_READGROUP_ID :
readGroupMap.get(readGroup.getId());
- if (samRecord.getAttributes().size() > 0) {
- tags = new ArrayList();
- for (final SAMRecord.SAMTagAndValue tagAndValue : samRecord.getAttributes()) {
- // Skip read group, since read group have a dedicated data series
- if (!SAMTag.RG.name().equals(tagAndValue.tag)) {
- tags.add(ReadTag.deriveTypeFromValue(tagAndValue.tag, tagAndValue.value));
+ // Tag handling: NM:i and MD:Z are stripped for mapped reads (matching htslib default)
+ // and regenerated from read features + reference during decode. If the stored NM/MD
+ // values don't match what would be recomputed (non-standard values), they are kept verbatim.
+ // RG is also skipped since read groups have a dedicated data series.
+ boolean stripNM = !samRecord.getReadUnmappedFlag() && !encodingStrategy.getStoreNM();
+ boolean stripMD = !samRecord.getReadUnmappedFlag() && !encodingStrategy.getStoreMD();
+
+ // Validate that stored NM/MD match recomputed values; keep non-standard values verbatim
+ if ((stripNM || stripMD) && referenceBases != null &&
+ samRecord.getCigar() != null && !samRecord.getCigar().isEmpty() &&
+ samRecord.getReadBases() != null && samRecord.getReadBases().length > 0) {
+ final htsjdk.samtools.util.Tuple<String, Integer> computed =
+ htsjdk.samtools.util.SequenceUtil.calculateMdAndNm(
+ samRecord.getCigar().getCigarElements(),
+ samRecord.getReadBases(),
+ referenceBases,
+ 0,
+ samRecord.getAlignmentStart());
+ if (stripNM && samRecord.getAttribute(SAMTag.NM) != null) {
+ final int storedNM = ((Number) samRecord.getAttribute(SAMTag.NM)).intValue();
+ if (storedNM != computed.b) {
+ stripNM = false;
+ }
+ }
+ if (stripMD && samRecord.getAttribute(SAMTag.MD) != null) {
+ final Object mdValue = samRecord.getAttribute(SAMTag.MD);
+ if (!(mdValue instanceof String) || !computed.a.equals(mdValue)) {
+ stripMD = false;
}
}
+ }
+
+ if (!samRecord.getAttributes().isEmpty()) {
+ tags = new ArrayList<>(samRecord.getAttributes().size());
+ for (final SAMRecord.SAMTagAndValue tagAndValue : samRecord.getAttributes()) {
+ if (SAMTag.RG.name().equals(tagAndValue.tag)) continue;
+ if (stripNM && SAMTag.NM.name().equals(tagAndValue.tag)) continue;
+ if (stripMD && SAMTag.MD.name().equals(tagAndValue.tag)) continue;
+ tags.add(ReadTag.deriveTypeFromValue(tagAndValue.tag, tagAndValue.value));
+ }
} else {
tags = null;
}
@@ -313,7 +347,7 @@ public SAMRecord toSAMRecord(final SAMFileHeader samFileHeader) {
if (isSegmentUnmapped()) {
samRecord.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR);
} else {
- samRecord.setCigar(readFeatures.getCigarForReadFeatures(readLength));
+ samRecord.setCigar(cachedCigar != null ? cachedCigar : readFeatures.getCigarForReadFeatures(readLength));
}
if (samRecord.getReadPairedFlag()) {
@@ -346,8 +380,19 @@ public SAMRecord toSAMRecord(final SAMFileHeader samFileHeader) {
return samRecord;
}
- //TODO: how to resolve readnames when we don’t save them for supplementary / secondary reads that don’t
- //appear near their primaries and don’t have a primary linking to them?
+ /**
+ * Assign a synthetic read name based on the sequential index if no name was decoded.
+ * Propagates the name to linked next/previous segments.
+ *
+ * <p>Note: supplementary and secondary reads are always DETACHED (never mate-linked), so they
+ * have no nextSegment/previousSegment and name propagation won’t help them. This is only safe
+ * because htsjdk always sets preserveReadNames=true when encoding (see
+ * {@link htsjdk.samtools.cram.build.CompressionHeaderFactory#createCompressionHeader}). If lossy
+ * read name mode were ever implemented, supplementary/secondary reads in different slices from
+ * their primaries would receive synthetic names that don’t match. htslib avoids this by forcing
+ * name preservation whenever SA tags are present and when not all template reads are in the
+ * same slice.
+ */
public void assignReadName() {
if (readName == null) {
readName = Long.toString(getSequentialIndex());
@@ -440,7 +485,7 @@ public void resolveQualityScores() {
* @param secondEnd second mate of the pair
* @return template length
*/
- private static int computeInsertSize(final CRAMCompressionRecord firstEnd, final CRAMCompressionRecord secondEnd) {
+ static int computeInsertSize(final CRAMCompressionRecord firstEnd, final CRAMCompressionRecord secondEnd) {
if (firstEnd.isSegmentUnmapped() ||
secondEnd.isSegmentUnmapped()||
firstEnd.referenceIndex != secondEnd.referenceIndex) {
@@ -477,9 +522,79 @@ public void restoreReadBases(CRAMReferenceRegion cramReferenceRegion, final Subs
}
}
+ /**
+ * Fused single-pass method: restore read bases from the reference + read features, build the
+ * CIGAR, and compute NM/MD tags, all in one iteration through the features. Replaces the
+ * previous separate calls to {@code restoreReadBases} + {@code restoreNmAndMd}.
+ *
+     * <p>The CIGAR is cached on this record for use by {@link #toSAMRecord()}.
+ */
+ void restoreBasesAndTags(final CRAMReferenceRegion cramReferenceRegion,
+ final SubstitutionMatrix substitutionMatrix) {
+ // Handle the cF internal tag from htslib's embed_ref=2 mode
+ boolean suppressMD = false;
+ boolean suppressNM = false;
+ if (tags != null) {
+ for (int i = tags.size() - 1; i >= 0; i--) {
+ if ("cF".equals(tags.get(i).getKey())) {
+ final int cf = ((Number) tags.get(i).getValue()).intValue();
+ suppressMD = (cf & 1) != 0;
+ suppressNM = (cf & 2) != 0;
+ tags.remove(i);
+ break;
+ }
+ }
+ }
+
+ // Determine if MD/NM computation is needed
+ boolean hasNM = false;
+ boolean hasMD = false;
+ if (tags != null) {
+ for (final ReadTag tag : tags) {
+ if ("NM".equals(tag.getKey())) hasNM = true;
+ if ("MD".equals(tag.getKey())) hasMD = true;
+ }
+ }
+ final boolean needMD = !hasMD && !suppressMD;
+ final boolean needNM = !hasNM && !suppressNM;
+
+ final boolean computeMdNm = (needMD || needNM) &&
+ readLength > 0 &&
+ readFeatures != null &&
+ cramReferenceRegion.getCurrentReferenceBases() != null;
+
+ final CRAMRecordReadFeatures.DecodeResult result = CRAMRecordReadFeatures.restoreBasesAndTags(
+ readFeatures == null ? Collections.emptyList() : readFeatures.getReadFeaturesList(),
+ isUnknownBases(),
+ alignmentStart,
+ readLength,
+ cramReferenceRegion,
+ substitutionMatrix,
+ computeMdNm);
+
+ this.readBases = result.readBases;
+ this.cachedCigar = result.cigar;
+
+ if (computeMdNm) {
+ if (tags == null) {
+ tags = new ArrayList<>(2);
+ }
+ if (needMD && result.mdString != null) {
+ tags.add(ReadTag.deriveTypeFromValue("MD", result.mdString));
+ }
+ if (needNM && result.nmCount >= 0) {
+ tags.add(ReadTag.deriveTypeFromValue("NM", result.nmCount));
+ }
+ }
+ }
+
//////////////////////////////////////
// Start Mate code
//////////////////////////////////////
+ /**
+ * Restore mate information by walking the linked list of segments starting from this record,
+ * setting mate fields on each segment pair, and computing template length.
+ */
public void restoreMateInfo() {
if (getNextSegment() == null) {
return;
@@ -499,12 +614,18 @@ public void restoreMateInfo() {
last.templateSize = -templateLength;
}
+ /** Mark this record as detached — mate info stored explicitly via MF, NS, NP, TS data series. */
public void setToDetachedState() {
setDetached(true);
setHasMateDownStream(false);
recordsToNextFragment = -1;
}
+ /** Set the NF (records-to-next-fragment) offset for attached mate pairs. */
+ void setRecordsToNextFragment(final int recordsToNextFragment) {
+ this.recordsToNextFragment = recordsToNextFragment;
+ }
+
private void setNextMate(final CRAMCompressionRecord next) {
mateAlignmentStart = next.alignmentStart;
setMateUnmapped(next.isSegmentUnmapped());
@@ -588,7 +709,9 @@ public List getReadFeatures() {
public int getMateAlignmentStart() { return mateAlignmentStart; }
public void setTagIdsIndex(MutableInt tagIdsIndex) {
- //TODO: why is this value deliberately shared across records
+ // The MutableInt is intentionally shared by reference across all records with the same tag
+ // combination. CompressionHeaderFactory.buildTagIdsFromCRAMRecords() groups records by their
+ // tag set and assigns a shared MutableInt to each group for counting and dictionary indexing.
this.tagIdsIndex = tagIdsIndex;
}
@@ -624,6 +747,7 @@ public void setPreviousSegment(CRAMCompressionRecord previousSegment) {
this.previousSegment = previousSegment;
}
+ /** Return true if this record has the secondary alignment BAM flag set. */
public boolean isSecondaryAlignment() {
return (bamFlags & SAMFlag.SECONDARY_ALIGNMENT.intValue()) != 0;
}
@@ -634,34 +758,43 @@ private void setSecondaryAlignment(final boolean secondaryAlignment) {
bamFlags & ~SAMFlag.SECONDARY_ALIGNMENT.intValue();
}
+ /** Return true if this record's CRAM flags indicate a downstream mate in the same slice. */
public boolean isHasMateDownStream() {
return isHasMateDownStream(cramFlags);
}
+ /** Test the has-mate-downstream bit in the given CRAM flags value. */
public static boolean isHasMateDownStream(final int cramFlags) {
return (cramFlags & CF_HAS_MATE_DOWNSTREAM) != 0;
}
+ /** Return true if this record stores mate information explicitly (detached state). */
public boolean isDetached() {
return isDetached(cramFlags);
}
+ /** Test the detached bit in the given CRAM flags value. */
public static boolean isDetached(final int cramFlags) { return (cramFlags & CF_DETACHED) != 0; }
+ /** Return true if quality scores are preserved as a full array for this record. */
public boolean isForcePreserveQualityScores() {
return isForcePreserveQualityScores(cramFlags);
}
+ /** Test the quality-scores-preserved-as-array bit in the given CRAM flags value. */
public static boolean isForcePreserveQualityScores(final int cramFlags) {return (cramFlags & CF_QS_PRESERVED_AS_ARRAY) != 0; }
+ /** Return true if the original sequence was unknown (SEQ="*"). */
public boolean isUnknownBases() {
return isUnknownBases(cramFlags);
}
+ /** Test the unknown-bases bit in the given CRAM flags value. */
public static boolean isUnknownBases(final int cramFlags) {
return (cramFlags & CF_UNKNOWN_BASES) != 0;
}
+ /** Return true if this record has the read-paired BAM flag set. */
public boolean isReadPaired() {
return (bamFlags & SAMFlag.READ_PAIRED.intValue()) != 0;
}
@@ -682,12 +815,14 @@ public boolean isSegmentUnmapped() {
return isSegmentUnmapped(bamFlags);
}
+ /** Test the segment-unmapped bit in the given BAM flags value. */
public static boolean isSegmentUnmapped(final int bamFlags) { return (bamFlags & SAMFlag.READ_UNMAPPED.intValue()) != 0; }
private void setSegmentUnmapped(final boolean segmentUnmapped) {
bamFlags = segmentUnmapped ? bamFlags | SAMFlag.READ_UNMAPPED.intValue() : bamFlags & ~SAMFlag.READ_UNMAPPED.intValue();
}
+ /** Return true if this record is the first segment in the template. */
public boolean isFirstSegment() {
return (bamFlags & SAMFlag.FIRST_OF_PAIR.intValue()) != 0;
}
@@ -696,6 +831,7 @@ private void setFirstSegment(final boolean firstSegment) {
bamFlags = firstSegment ? bamFlags | SAMFlag.FIRST_OF_PAIR.intValue() : bamFlags & ~SAMFlag.FIRST_OF_PAIR.intValue();
}
+ /** Return true if this record is the last segment in the template. */
public boolean isLastSegment() {
return (bamFlags & SAMFlag.SECOND_OF_PAIR.intValue()) != 0;
}
@@ -770,10 +906,12 @@ private void setMateNegativeStrand(final boolean mateNegativeStrand) {
bamFlags & ~SAMFlag.MATE_REVERSE_STRAND.intValue();
}
- private void setHasMateDownStream(final boolean hasMateDownStream) {
+ /** Set or clear the has-mate-downstream CRAM flag. */
+ void setHasMateDownStream(final boolean hasMateDownStream) {
cramFlags = hasMateDownStream ? cramFlags | CF_HAS_MATE_DOWNSTREAM : cramFlags & ~CF_HAS_MATE_DOWNSTREAM;
}
+ /** Set or clear the detached CRAM flag (mate info stored explicitly). */
public void setDetached(final boolean detached) {
cramFlags = detached ? cramFlags | CF_DETACHED : cramFlags & ~CF_DETACHED;
}
@@ -782,7 +920,8 @@ private void setUnknownBases(final boolean unknownBases) {
cramFlags = unknownBases ? cramFlags | CF_UNKNOWN_BASES : cramFlags & ~CF_UNKNOWN_BASES;
}
- private boolean isSupplementary() {
+ /** Return true if this record has the supplementary alignment BAM flag set. */
+ boolean isSupplementary() {
return (bamFlags & SAMFlag.SUPPLEMENTARY_ALIGNMENT.intValue()) != 0;
}
diff --git a/src/main/java/htsjdk/samtools/cram/structure/CRAMEncodingStrategy.java b/src/main/java/htsjdk/samtools/cram/structure/CRAMEncodingStrategy.java
index 4a2762187c..9713a2107c 100644
--- a/src/main/java/htsjdk/samtools/cram/structure/CRAMEncodingStrategy.java
+++ b/src/main/java/htsjdk/samtools/cram/structure/CRAMEncodingStrategy.java
@@ -25,57 +25,99 @@
package htsjdk.samtools.cram.structure;
import htsjdk.samtools.Defaults;
-import htsjdk.utils.ValidationUtils;
+import htsjdk.samtools.cram.common.CRAMVersion;
+import htsjdk.samtools.cram.common.CramVersions;
import htsjdk.samtools.cram.ref.ReferenceContextType;
+import htsjdk.utils.ValidationUtils;
+
+import java.util.EnumMap;
/**
- * Parameters that can be set to control the encoding strategy used when writing CRAM.
+ * Parameters that control the encoding strategy used when writing CRAM. Includes the CRAM version,
+ * compression level, container/slice sizing, and per-{@link DataSeries} compressor assignments.
+ *
+ * <p>The default constructor applies the {@link CRAMCompressionProfile#NORMAL} profile. Use
+ * {@link CRAMCompressionProfile#toStrategy()} or {@link CRAMCompressionProfile#applyTo(CRAMEncodingStrategy)}
+ * to configure a specific profile.
+ *
+ * @see CRAMCompressionProfile
+ * @see CompressorDescriptor
*/
public class CRAMEncodingStrategy {
- // Default value for the minimum number of reads we need to have seen to emit a single-reference slice.
- // If we've see fewer than this number, and we have more reads from a different reference context, we prefer to
- // switch to, and subsequently emit, a multiple reference slice, rather than a small single-reference
- // that contains fewer than this number of records.
public static final int DEFAULT_MINIMUM_SINGLE_REFERENCE_SLICE_THRESHOLD = 1000;
-
- // This number must be >= DEFAULT_MINIMUM_SINGLE_REFERENCE_SLICE_THRESHOLD (required by ContainerFactory).
public static final int DEFAULT_READS_PER_SLICE = 10000;
- // encoding strategies
- private CompressionHeaderEncodingMap customCompressionHeaderEncodingMap;
-
- //Note: should this have separate values for tags (separate from CRAMRecord data) ?
+ private CRAMVersion cramVersion = CramVersions.CRAM_v3_1;
private int gzipCompressionLevel = Defaults.COMPRESSION_LEVEL;
-
- // The minimum number of reads we need to have seen to emit a single-reference slice. If we've seen
- // fewer than this number, and we have more reads from a different reference context, we prefer to
- // switch to, and subsequently emit, a multiple reference slice, rather than a small single-reference
- // that contains fewer than this number of records. This number must be < readsPerSlice.
private int minimumSingleReferenceSliceSize = DEFAULT_MINIMUM_SINGLE_REFERENCE_SLICE_THRESHOLD;
private int readsPerSlice = DEFAULT_READS_PER_SLICE;
private int slicesPerContainer = 1;
+ private EnumMap<DataSeries, CompressorDescriptor> compressorMap;
+
+ // Optional: additional trial compression candidates per data series. When present for a data series,
+ // the primary compressor from compressorMap plus these additional candidates are wrapped in a
+ // TrialCompressor that tries all and picks the smallest output.
+ private EnumMap<DataSeries, List<CompressorDescriptor>> trialCandidatesMap;
+
+ // Whether to store NM:i and MD:Z tags verbatim. When false (default), these tags are stripped
+ // during encoding for mapped reads and regenerated from the reference during decoding. Matches
+ // htslib's store_nm/store_md options (both default to 0/false).
+ private boolean storeNM = false;
+ private boolean storeMD = false;
+
+ // Advanced override: a pre-built encoding map that bypasses the compressor map entirely.
+ // Used by tests that need low-level control over encoding descriptors.
+ private CompressionHeaderEncodingMap customCompressionHeaderEncodingMap;
+
/**
- * Create an encoding strategy that uses all default values.
+ * Create an encoding strategy with the {@link CRAMCompressionProfile#NORMAL} profile applied.
*/
public CRAMEncodingStrategy() {
- // use defaults;
+ CRAMCompressionProfile.NORMAL.applyTo(this);
}
/**
- * Set number of slices per container. In some cases, a container containing fewer slices than the
+ * Package-private constructor that skips profile application. Used by
+ * {@link CRAMCompressionProfile#toStrategy()} to avoid infinite recursion.
+ *
+ * @param applyDefaultProfile ignored — exists only to differentiate from the default constructor
+ */
+ CRAMEncodingStrategy(final boolean applyDefaultProfile) {
+ // no profile applied; caller is responsible for calling applyTo()
+ }
+
+ /** @return the CRAM version to write */
+ public CRAMVersion getCramVersion() {
+ return cramVersion;
+ }
+
+ /**
+ * Set the CRAM version to write.
+ *
+ * @param cramVersion the CRAM version (e.g., {@link CramVersions#CRAM_v3} or {@link CramVersions#CRAM_v3_1})
+ * @return this strategy for chaining
+ */
+ public CRAMEncodingStrategy setCramVersion(final CRAMVersion cramVersion) {
+ ValidationUtils.nonNull(cramVersion, "CRAM version must not be null");
+ this.cramVersion = cramVersion;
+ return this;
+ }
+
+ /**
+ * Set number of reads per slice. In some cases, a container containing fewer slices than the
* requested value will be produced in order to honor the specification rule that all slices in a
* container must have the same {@link ReferenceContextType}.
*
* Note: this value must be >= {@link #getMinimumSingleReferenceSliceSize}.
*
- * @param readsPerSlice number of slices written per container
+ * @param readsPerSlice number of reads written per slice
* @return updated CRAMEncodingStrategy
*/
public CRAMEncodingStrategy setReadsPerSlice(final int readsPerSlice) {
ValidationUtils.validateArg(
readsPerSlice > 0 && readsPerSlice >= minimumSingleReferenceSliceSize,
- String.format("Reads per slice must be > 0 and < minimum single reference slice size (%d)",
+ String.format("Reads per slice must be > 0 and >= minimum single reference slice size (%d)",
minimumSingleReferenceSliceSize));
this.readsPerSlice = readsPerSlice;
return this;
@@ -87,14 +129,15 @@ public CRAMEncodingStrategy setReadsPerSlice(final int readsPerSlice) {
* switch to, and subsequently emit, a multiple reference slice, rather than a small single-reference
* that contains fewer than this number of records.
*
- * This number must be < the value for {@link #getReadsPerSlice}
+ * This number must be <= the value for {@link #getReadsPerSlice}
*
- * @param minimumSingleReferenceSliceSize
+ * @param minimumSingleReferenceSliceSize the minimum slice size
+ * @return this strategy for chaining
*/
public CRAMEncodingStrategy setMinimumSingleReferenceSliceSize(int minimumSingleReferenceSliceSize) {
ValidationUtils.validateArg(
minimumSingleReferenceSliceSize <= readsPerSlice,
- String.format("Minimm single reference slice size must be < the reads per slice size (%d)", readsPerSlice));
+ String.format("Minimum single reference slice size must be <= the reads per slice size (%d)", readsPerSlice));
this.minimumSingleReferenceSliceSize = minimumSingleReferenceSliceSize;
return this;
}
@@ -103,9 +146,15 @@ public int getMinimumSingleReferenceSliceSize() {
return minimumSingleReferenceSliceSize;
}
+ /**
+ * Set the GZIP compression level used for data series compressed with GZIP.
+ *
+ * @param compressionLevel GZIP compression level (0-10)
+ * @return this strategy for chaining
+ */
public CRAMEncodingStrategy setGZIPCompressionLevel(final int compressionLevel) {
- ValidationUtils.validateArg(compressionLevel >=0 && compressionLevel <= 10,
- "cram gzip compression level must be > 0 and <= 10");
+ ValidationUtils.validateArg(compressionLevel >= 0 && compressionLevel <= 10,
+ "cram gzip compression level must be >= 0 and <= 10");
this.gzipCompressionLevel = compressionLevel;
return this;
}
@@ -113,26 +162,103 @@ public CRAMEncodingStrategy setGZIPCompressionLevel(final int compressionLevel)
/**
* Set the number of slices per container. If > 1, multiple slices will be placed in the same container
* if the slices share the same reference context (container records mapped to the same contig). MULTI-REF
- * slices are always emitted as a single contain to avoid conferring MULTI-REF on the next slice, which
+ * slices are always emitted as a single container to avoid conferring MULTI-REF on the next slice, which
* might otherwise be single-ref; the spec requires a MULTI_REF container to only contain multi-ref slices).
- * @param slicesPerContainer - requested number of slices per container
- * @return CRAMEncodingStrategy
+ *
+ * @param slicesPerContainer requested number of slices per container
+ * @return this strategy for chaining
*/
public CRAMEncodingStrategy setSlicesPerContainer(final int slicesPerContainer) {
- ValidationUtils.validateArg(slicesPerContainer >=0, "slicesPerContainer must be > 0");
+ ValidationUtils.validateArg(slicesPerContainer > 0, "slicesPerContainer must be > 0");
this.slicesPerContainer = slicesPerContainer;
return this;
}
/**
- * Set the {@link CompressionHeaderEncodingMap} to use.
+ * Set the per-DataSeries compressor map. Each entry maps a {@link DataSeries} to the
+ * {@link CompressorDescriptor} that should be used to compress its block.
*
- * @param encodingMap the encoding map to use
+ * @param compressorMap the compressor map (defensively copied)
+ * @return this strategy for chaining
+ */
+ public CRAMEncodingStrategy setCompressorMap(final EnumMap<DataSeries, CompressorDescriptor> compressorMap) {
+ ValidationUtils.nonNull(compressorMap, "compressor map must not be null");
+ this.compressorMap = new EnumMap<>(compressorMap);
+ return this;
+ }
+
+ /** @return the per-DataSeries compressor map, or null if not set */
+ public EnumMap<DataSeries, CompressorDescriptor> getCompressorMap() {
+ return compressorMap;
+ }
+
+ /**
+ * Set additional trial compression candidates per DataSeries. For data series with entries in
+ * this map, a {@link htsjdk.samtools.cram.compression.TrialCompressor} will be created that
+ * tries the primary compressor plus all listed candidates, selecting the smallest output.
+ *
+ * @param trialCandidatesMap map of data series to additional candidate descriptors
+ * @return this strategy for chaining
+ */
+ public CRAMEncodingStrategy setTrialCandidatesMap(
+ final EnumMap<DataSeries, List<CompressorDescriptor>> trialCandidatesMap) {
+ this.trialCandidatesMap = trialCandidatesMap != null ? new EnumMap<>(trialCandidatesMap) : null;
+ return this;
+ }
+
+ /** @return the trial candidates map, or null if trial compression is not configured */
+ public EnumMap<DataSeries, List<CompressorDescriptor>> getTrialCandidatesMap() {
+ return trialCandidatesMap;
+ }
+
+ /**
+ * Set a pre-built {@link CompressionHeaderEncodingMap} that bypasses the compressor map.
+ * This is an advanced override intended for tests that need low-level control over encoding
+ * descriptors. When set, {@link htsjdk.samtools.cram.build.CompressionHeaderFactory} will use
+ * this map directly instead of building one from the compressor map.
+ *
+ * @param encodingMap the encoding map to use, or null to use the compressor map
*/
public void setCustomCompressionHeaderEncodingMap(final CompressionHeaderEncodingMap encodingMap) {
this.customCompressionHeaderEncodingMap = encodingMap;
}
- public CompressionHeaderEncodingMap getCustomCompressionHeaderEncodingMap() { return customCompressionHeaderEncodingMap; }
+
+ /** @return the custom encoding map, or null if the compressor map should be used */
+ public CompressionHeaderEncodingMap getCustomCompressionHeaderEncodingMap() {
+ return customCompressionHeaderEncodingMap;
+ }
+
+ /**
+ * Set whether to store the NM:i tag verbatim. When false (default), NM is stripped during
+ * encoding for mapped reads and regenerated from features + reference during decoding.
+ * Matches htslib's {@code CRAM_OPT_STORE_NM} option.
+ *
+ * @param storeNM true to store NM verbatim, false to strip and regenerate
+ * @return this strategy for chaining
+ */
+ public CRAMEncodingStrategy setStoreNM(final boolean storeNM) {
+ this.storeNM = storeNM;
+ return this;
+ }
+
+ /** @return whether NM:i tags are stored verbatim (false = stripped and regenerated) */
+ public boolean getStoreNM() { return storeNM; }
+
+ /**
+ * Set whether to store the MD:Z tag verbatim. When false (default), MD is stripped during
+ * encoding for mapped reads and regenerated from features + reference during decoding.
+ * Matches htslib's {@code CRAM_OPT_STORE_MD} option.
+ *
+ * @param storeMD true to store MD verbatim, false to strip and regenerate
+ * @return this strategy for chaining
+ */
+ public CRAMEncodingStrategy setStoreMD(final boolean storeMD) {
+ this.storeMD = storeMD;
+ return this;
+ }
+
+ /** @return whether MD:Z tags are stored verbatim (false = stripped and regenerated) */
+ public boolean getStoreMD() { return storeMD; }
public int getGZIPCompressionLevel() { return gzipCompressionLevel; }
public int getReadsPerSlice() { return readsPerSlice; }
@@ -141,12 +267,13 @@ public void setCustomCompressionHeaderEncodingMap(final CompressionHeaderEncodin
@Override
public String toString() {
return "CRAMEncodingStrategy{" +
- ", customCompressionMap='" + customCompressionHeaderEncodingMap + '\'' +
+ "cramVersion=" + cramVersion +
", gzipCompressionLevel=" + gzipCompressionLevel +
", readsPerSlice=" + readsPerSlice +
", slicesPerContainer=" + slicesPerContainer +
'}';
}
+
@Override
public boolean equals(Object o) {
if (this == o) return true;
@@ -158,20 +285,20 @@ public boolean equals(Object o) {
if (getMinimumSingleReferenceSliceSize() != that.getMinimumSingleReferenceSliceSize()) return false;
if (getReadsPerSlice() != that.getReadsPerSlice()) return false;
if (getSlicesPerContainer() != that.getSlicesPerContainer()) return false;
- return getCustomCompressionHeaderEncodingMap() != null ?
- getCustomCompressionHeaderEncodingMap().equals(that.getCustomCompressionHeaderEncodingMap()) :
- that.getCustomCompressionHeaderEncodingMap() == null;
+ if (!cramVersion.equals(that.cramVersion)) return false;
+ return compressorMap != null ?
+ compressorMap.equals(that.compressorMap) :
+ that.compressorMap == null;
}
@Override
public int hashCode() {
- int result = getCustomCompressionHeaderEncodingMap() != null ?
- getCustomCompressionHeaderEncodingMap().hashCode() : 0;
+ int result = cramVersion.hashCode();
result = 31 * result + gzipCompressionLevel;
result = 31 * result + getMinimumSingleReferenceSliceSize();
result = 31 * result + getReadsPerSlice();
result = 31 * result + getSlicesPerContainer();
+ result = 31 * result + (compressorMap != null ? compressorMap.hashCode() : 0);
return result;
}
-
}
diff --git a/src/main/java/htsjdk/samtools/cram/structure/CRAMRecordReadFeatures.java b/src/main/java/htsjdk/samtools/cram/structure/CRAMRecordReadFeatures.java
index 1c081e6dc2..57c241ed96 100644
--- a/src/main/java/htsjdk/samtools/cram/structure/CRAMRecordReadFeatures.java
+++ b/src/main/java/htsjdk/samtools/cram/structure/CRAMRecordReadFeatures.java
@@ -41,6 +41,8 @@
* Class for handling the read features for a {@link CRAMCompressionRecord}.
*/
public class CRAMRecordReadFeatures {
+ private static final byte[] BAM_READ_BASE_LOOKUP = SequenceUtil.getBamReadBaseLookup();
+
final List<ReadFeature> readFeatures;
/**
@@ -129,6 +131,7 @@ public CRAMRecordReadFeatures(final SAMRecord samRecord, final byte[] bamReadBas
}
}
+ /** Return the list of read features for this record. */
public final List<ReadFeature> getReadFeaturesList() { return readFeatures; }
private void addSoftClip(
@@ -213,6 +216,13 @@ static void addMismatchReadFeatures(
}
}
+ /**
+ * Compute the alignment end position from the read features, alignment start, and read length.
+ *
+ * @param alignmentStart 1-based alignment start position
+ * @param readLength length of the read in bases
+ * @return 1-based alignment end position
+ */
public int getAlignmentEnd(int alignmentStart, int readLength) {
int alignmentSpan = readLength;
if (readFeatures != null) {
@@ -244,9 +254,10 @@ public int getAlignmentEnd(int alignmentStart, int readLength) {
}
/**
- * Get a Cigar fo this set of read features.
- * @param readLength
- * @return
+ * Build a {@link Cigar} from these read features and the given read length.
+ *
+ * @param readLength the length of the read in bases
+ * @return the reconstructed CIGAR
*/
public Cigar getCigarForReadFeatures(final int readLength) {
if (readFeatures == null) {
@@ -503,6 +514,342 @@ private static byte getByteOrDefault(final byte[] array, final int pos, final by
array[pos];
}
+ /**
+ * Result of the fused single-pass decode: read bases, CIGAR, and optionally MD string + NM count.
+ */
+ public static final class DecodeResult {
+ public final byte[] readBases;
+ public final Cigar cigar;
+ public final String mdString; // null if not computed
+ public final int nmCount; // -1 if not computed
+
+ DecodeResult(final byte[] readBases, final Cigar cigar, final String mdString, final int nmCount) {
+ this.readBases = readBases;
+ this.cigar = cigar;
+ this.mdString = mdString;
+ this.nmCount = nmCount;
+ }
+ }
+
+ /**
+ * Fused single-pass decode: restore read bases from the reference + read features, build the CIGAR,
+ * and optionally compute the MD string and NM edit distance, all in a single iteration through the
+ * features list. This replaces the previous 3-4 pass approach (restoreReadBases + getCigarForReadFeatures
+ * + calculateMdAndNm + toBamReadBasesInPlace).
+ *
+ * <p>Base normalization (upper-casing, replacing invalid bases with N) is done inline as bases are
+ * written, eliminating the need for a separate {@code toBamReadBasesInPlace} pass.
+ *
+ * @param readFeatures list of read features (may be null for pure reference matches)
+ * @param isUnknownBases true if the CF_UNKNOWN_BASES flag is set
+ * @param readAlignmentStart 1-based alignment start
+ * @param readLength read length
+ * @param cramReferenceRegion reference region covering this read's span
+ * @param substitutionMatrix substitution matrix for base resolution
+ * @param computeMdNm whether to compute MD string and NM count
+ * @return DecodeResult containing bases, CIGAR, and optionally MD/NM
+ */
+ public static DecodeResult restoreBasesAndTags(
+ final List readFeatures,
+ final boolean isUnknownBases,
+ final int readAlignmentStart,
+ final int readLength,
+ final CRAMReferenceRegion cramReferenceRegion,
+ final SubstitutionMatrix substitutionMatrix,
+ final boolean computeMdNm) {
+
+ if (readLength == 0) {
+ final Cigar cigar = new Cigar(Collections.singletonList(new CigarElement(readLength, CigarOperator.M)));
+ return new DecodeResult(SAMRecord.NULL_SEQUENCE, cigar, null, -1);
+ }
+
+ // When isUnknownBases (CF_UNKNOWN_BASES / seq '*'), we still need to process read features
+ // to reconstruct the CIGAR (e.g. soft clips stored in SC data series), but skip all base
+ // restoration, reference lookups, and MD/NM computation.
+ final byte[] bases = isUnknownBases ? null : new byte[readLength];
+ final int alignmentStart = readAlignmentStart - 1; // 0-based
+ final int refOffset = isUnknownBases ? 0 : cramReferenceRegion.getRegionStart();
+ final byte[] refBases = isUnknownBases ? null : cramReferenceRegion.getCurrentReferenceBases();
+ final boolean doBasesAndMdNm = !isUnknownBases;
+
+ // MD/NM state — mdActive tracks whether we're still within the reference boundary.
+ // Once we exceed the reference, we stop MD/NM computation (matching calculateMdAndNm's break behavior).
+ int nmCount = 0;
+ final boolean actuallyComputeMdNm = computeMdNm && doBasesAndMdNm;
+ final StringBuilder mdString = actuallyComputeMdNm ? new StringBuilder(readLength) : null;
+ int mdMatchRun = 0;
+ boolean mdActive = actuallyComputeMdNm;
+
+ // No features: pure reference match (fast path)
+ if (readFeatures == null || readFeatures.isEmpty()) {
+ if (isUnknownBases) {
+ final Cigar cigar = new Cigar(Collections.singletonList(new CigarElement(readLength, CigarOperator.M)));
+ return new DecodeResult(SAMRecord.NULL_SEQUENCE, cigar, null, -1);
+ }
+ final int srcStart = alignmentStart - refOffset;
+ final int copyLen = Math.min(readLength, Math.max(0, refBases.length - srcStart));
+ if (copyLen < readLength) {
+ Arrays.fill(bases, (byte) 'N');
+ if (copyLen > 0) System.arraycopy(refBases, srcStart, bases, 0, copyLen);
+ } else {
+ System.arraycopy(refBases, srcStart, bases, 0, readLength);
+ }
+
+ // Normalize bases and compute MD/NM — only within reference boundary
+ for (int i = 0; i < readLength; i++) {
+ final byte rawRef = bases[i];
+ bases[i] = BAM_READ_BASE_LOOKUP[rawRef & 0x7F];
+ if (computeMdNm && i < copyLen) {
+ if (SequenceUtil.basesEqual(bases[i], rawRef) || bases[i] == 0) {
+ mdMatchRun++;
+ } else {
+ mdString.append(mdMatchRun);
+ mdString.append((char) (rawRef & 0xFF));
+ mdMatchRun = 0;
+ nmCount++;
+ }
+ }
+ }
+
+ if (mdActive) mdString.append(mdMatchRun);
+ final Cigar cigar = new Cigar(Collections.singletonList(new CigarElement(readLength, CigarOperator.M)));
+ return new DecodeResult(bases, cigar,
+ computeMdNm ? mdString.toString() : null,
+ computeMdNm ? nmCount : -1);
+ }
+
+ // CIGAR building state
+ final List<CigarElement> cigarElements = new ArrayList<>();
+ CigarOperator lastCigOp = CigarOperator.MATCH_OR_MISMATCH;
+ int lastCigLen = 0;
+ int lastCigPos = 1;
+
+ // Position tracking (1-based read position, 0-based ref offset from alignment start)
+ int posInRead = 1;
+ int posInSeq = 0;
+
+ for (final ReadFeature feature : readFeatures) {
+ final int featurePos = feature.getPosition();
+
+ // Fill gap from reference (advance positions; fill bases only when not unknownBases)
+ if (doBasesAndMdNm) {
+ while (posInRead < featurePos) {
+ final int rp = alignmentStart + posInSeq - refOffset;
+ if (rp >= refBases.length) mdActive = false;
+ final byte rawRef = getByteOrDefault(refBases, rp, (byte) 'N');
+ final byte nb = BAM_READ_BASE_LOOKUP[rawRef & 0x7F];
+ bases[posInRead - 1] = nb;
+ if (mdActive) {
+ if (SequenceUtil.basesEqual(nb, rawRef) || nb == 0) { mdMatchRun++; }
+ else { mdString.append(mdMatchRun); mdString.append((char)(rawRef & 0xFF)); mdMatchRun = 0; nmCount++; }
+ }
+ posInRead++;
+ posInSeq++;
+ }
+ } else {
+ final int gap = featurePos - posInRead;
+ posInSeq += gap;
+ posInRead = featurePos;
+ }
+
+ // Deactivate MD/NM if the current reference position is beyond the reference boundary,
+ // flushing any accumulated match run first
+ if (mdActive && (alignmentStart + posInSeq - refOffset) >= refBases.length) {
+ mdString.append(mdMatchRun);
+ mdMatchRun = 0;
+ mdActive = false;
+ }
+
+ // CIGAR gap
+ final int gap = featurePos - (lastCigPos + lastCigLen);
+ if (gap > 0) {
+ if (lastCigOp != CigarOperator.MATCH_OR_MISMATCH) {
+ cigarElements.add(new CigarElement(lastCigLen, lastCigOp));
+ lastCigPos += lastCigLen;
+ lastCigLen = gap;
+ } else {
+ lastCigLen += gap;
+ }
+ lastCigOp = CigarOperator.MATCH_OR_MISMATCH;
+ }
+
+ CigarOperator featureCigOp;
+ int featureCigLen;
+
+ switch (feature.getOperator()) {
+ case Substitution.operator: {
+ if (doBasesAndMdNm) {
+ final int rp = alignmentStart + posInSeq - refOffset;
+ final byte rawRef = getByteOrDefault(refBases, rp, (byte) 'N');
+ final byte normRef = Utils.normalizeBase(rawRef);
+ bases[posInRead - 1] = BAM_READ_BASE_LOOKUP[
+ substitutionMatrix.base(normRef, ((Substitution) feature).getCode()) & 0x7F];
+ if (mdActive) { mdString.append(mdMatchRun); mdString.append((char)(rawRef & 0xFF)); mdMatchRun = 0; nmCount++; }
+ }
+ posInRead++;
+ posInSeq++;
+ featureCigOp = CigarOperator.MATCH_OR_MISMATCH;
+ featureCigLen = 1;
+ break;
+ }
+ case ReadBase.operator: {
+ if (doBasesAndMdNm) {
+ final byte readBase = BAM_READ_BASE_LOOKUP[((ReadBase) feature).getBase() & 0x7F];
+ bases[posInRead - 1] = readBase;
+ if (mdActive) {
+ final int rp = alignmentStart + posInSeq - refOffset;
+ final byte rawRef = getByteOrDefault(refBases, rp, (byte) 'N');
+ if (SequenceUtil.basesEqual(readBase, rawRef)) { mdMatchRun++; }
+ else { mdString.append(mdMatchRun); mdString.append((char)(rawRef & 0xFF)); mdMatchRun = 0; nmCount++; }
+ }
+ }
+ posInRead++;
+ posInSeq++;
+ featureCigOp = CigarOperator.MATCH_OR_MISMATCH;
+ featureCigLen = 1;
+ break;
+ }
+ case Bases.operator: {
+ final byte[] fb = ((Bases) feature).getBases();
+ if (doBasesAndMdNm) {
+ for (int i = 0; i < fb.length; i++) {
+ bases[posInRead - 1 + i] = BAM_READ_BASE_LOOKUP[fb[i] & 0x7F];
+ if (mdActive) {
+ final int rp = alignmentStart + posInSeq + i - refOffset;
+ final byte rawRef = getByteOrDefault(refBases, rp, (byte) 'N');
+ if (SequenceUtil.basesEqual(bases[posInRead - 1 + i], rawRef)) { mdMatchRun++; }
+ else { mdString.append(mdMatchRun); mdString.append((char)(rawRef & 0xFF)); mdMatchRun = 0; nmCount++; }
+ }
+ }
+ }
+ posInRead += fb.length;
+ posInSeq += fb.length;
+ continue; // Bases are within M region, no CIGAR update
+ }
+ case Insertion.operator: {
+ final byte[] seq = ((Insertion) feature).getSequence();
+ if (doBasesAndMdNm) {
+ for (int i = 0; i < seq.length; i++) bases[posInRead - 1 + i] = BAM_READ_BASE_LOOKUP[seq[i] & 0x7F];
+ if (mdActive) nmCount += seq.length;
+ }
+ posInRead += seq.length;
+ featureCigOp = CigarOperator.INSERTION;
+ featureCigLen = seq.length;
+ break;
+ }
+ case InsertBase.operator: {
+ if (doBasesAndMdNm) {
+ bases[posInRead - 1] = BAM_READ_BASE_LOOKUP[((InsertBase) feature).getBase() & 0x7F];
+ if (mdActive) nmCount++;
+ }
+ posInRead++;
+ featureCigOp = CigarOperator.INSERTION;
+ featureCigLen = 1;
+ break;
+ }
+ case SoftClip.operator: {
+ final byte[] seq = ((SoftClip) feature).getSequence();
+ if (doBasesAndMdNm) {
+ for (int i = 0; i < seq.length; i++) bases[posInRead - 1 + i] = BAM_READ_BASE_LOOKUP[seq[i] & 0x7F];
+ }
+ posInRead += seq.length;
+ featureCigOp = CigarOperator.SOFT_CLIP;
+ featureCigLen = seq.length;
+ break;
+ }
+ case Deletion.operator: {
+ final int delLen = ((Deletion) feature).getLength();
+ if (mdActive) {
+ mdString.append(mdMatchRun);
+ mdMatchRun = 0;
+ mdString.append('^');
+ for (int i = 0; i < delLen; i++) {
+ final int rp = alignmentStart + posInSeq + i - refOffset;
+ final byte rawRef = getByteOrDefault(refBases, rp, (byte) 'N');
+ mdString.append((char) (rawRef & 0xFF));
+ }
+ nmCount += delLen;
+ }
+ posInSeq += delLen;
+ featureCigOp = CigarOperator.DELETION;
+ featureCigLen = delLen;
+ break;
+ }
+ case RefSkip.operator:
+ posInSeq += ((RefSkip) feature).getLength();
+ featureCigOp = CigarOperator.SKIPPED_REGION;
+ featureCigLen = ((RefSkip) feature).getLength();
+ break;
+ case HardClip.operator:
+ featureCigOp = CigarOperator.HARD_CLIP;
+ featureCigLen = ((HardClip) feature).getLength();
+ break;
+ case Padding.operator:
+ featureCigOp = CigarOperator.PADDING;
+ featureCigLen = ((Padding) feature).getLength();
+ break;
+ case Scores.operator:
+ case BaseQualityScore.operator:
+ continue;
+ default:
+ throw new CRAMException(String.format("Unrecognized read feature code: %c", feature.getOperator()));
+ }
+
+ // Update CIGAR
+ if (lastCigOp != featureCigOp) {
+ if (lastCigLen > 0) cigarElements.add(new CigarElement(lastCigLen, lastCigOp));
+ lastCigOp = featureCigOp;
+ lastCigLen = featureCigLen;
+ lastCigPos = feature.getPosition();
+ } else {
+ lastCigLen += featureCigLen;
+ }
+ if (!featureCigOp.consumesReadBases()) lastCigPos -= featureCigLen;
+ }
+
+ // Fill trailing reference bases (skip when unknownBases -- just advance positions)
+ if (doBasesAndMdNm) {
+ while (posInRead <= readLength) {
+ final int rp = alignmentStart + posInSeq - refOffset;
+ if (rp >= refBases.length) {
+ if (mdActive) { mdString.append(mdMatchRun); mdMatchRun = 0; mdActive = false; }
+ while (posInRead <= readLength) { bases[posInRead - 1] = 'N'; posInRead++; }
+ break;
+ }
+ final byte rawRef = refBases[rp];
+ bases[posInRead - 1] = BAM_READ_BASE_LOOKUP[rawRef & 0x7F];
+ if (mdActive) {
+ if (SequenceUtil.basesEqual(bases[posInRead - 1], rawRef) || bases[posInRead - 1] == 0) { mdMatchRun++; }
+ else { mdString.append(mdMatchRun); mdString.append((char)(rawRef & 0xFF)); mdMatchRun = 0; nmCount++; }
+ }
+ posInRead++;
+ posInSeq++;
+ }
+ } else {
+ posInRead = readLength + 1;
+ }
+
+ // Finalize CIGAR
+ if (lastCigOp != CigarOperator.M) {
+ if (lastCigLen > 0) cigarElements.add(new CigarElement(lastCigLen, lastCigOp));
+ if (readLength >= lastCigPos + lastCigLen) {
+ cigarElements.add(new CigarElement(readLength - (lastCigLen + lastCigPos) + 1, CigarOperator.M));
+ }
+ } else if (readLength > lastCigPos - 1) {
+ cigarElements.add(new CigarElement(readLength - lastCigPos + 1, CigarOperator.M));
+ }
+
+ final Cigar cigar = cigarElements.isEmpty()
+ ? new Cigar(Collections.singletonList(new CigarElement(readLength, CigarOperator.M)))
+ : new Cigar(cigarElements);
+
+ if (mdActive) mdString.append(mdMatchRun);
+
+ return new DecodeResult(isUnknownBases ? SAMRecord.NULL_SEQUENCE : bases, cigar,
+ actuallyComputeMdNm ? mdString.toString() : null,
+ actuallyComputeMdNm ? nmCount : -1);
+ }
+
@Override
public boolean equals(Object o) {
if (this == o) return true;
diff --git a/src/main/java/htsjdk/samtools/cram/structure/CompressionHeader.java b/src/main/java/htsjdk/samtools/cram/structure/CompressionHeader.java
index 9f013662e4..a1c33068b3 100644
--- a/src/main/java/htsjdk/samtools/cram/structure/CompressionHeader.java
+++ b/src/main/java/htsjdk/samtools/cram/structure/CompressionHeader.java
@@ -52,6 +52,7 @@ public class CompressionHeader {
private final Map tagEncodingMap = new TreeMap<>();
private SubstitutionMatrix substitutionMatrix;
private byte[][][] tagIDDictionary;
+ private TagKeyCache tagKeyCache;
/**
* Create a CompressionHeader using the default {@link CRAMEncodingStrategy}
@@ -153,6 +154,15 @@ public byte[][][] getTagIDDictionary() {
public void setTagIdDictionary(final byte[][][] dictionary) {
this.tagIDDictionary = dictionary;
+ this.tagKeyCache = new TagKeyCache(dictionary);
+ }
+
+ /**
+ * Returns the {@link TagKeyCache} for looking up pre-computed tag key metadata.
+ * Built from the tag ID dictionary when the compression header is parsed.
+ */
+ public TagKeyCache getTagKeyCache() {
+ return tagKeyCache;
}
public void setSubstitutionMatrix(final SubstitutionMatrix substitutionMatrix) {
@@ -240,6 +250,7 @@ else if (TD_tagIdsDictionary.equals(key)) {
final byte[] dictionaryBytes = new byte[size];
buffer.get(dictionaryBytes);
tagIDDictionary = parseDictionary(dictionaryBytes);
+ tagKeyCache = new TagKeyCache(tagIDDictionary);
} else if (SM_substitutionMatrix.equals(key)) {
// parse subs matrix here:
final byte[] matrixBytes = new byte[SubstitutionMatrix.BASES_SIZE];
diff --git a/src/main/java/htsjdk/samtools/cram/structure/CompressionHeaderEncodingMap.java b/src/main/java/htsjdk/samtools/cram/structure/CompressionHeaderEncodingMap.java
index 4394b49eab..34643cef8d 100644
--- a/src/main/java/htsjdk/samtools/cram/structure/CompressionHeaderEncodingMap.java
+++ b/src/main/java/htsjdk/samtools/cram/structure/CompressionHeaderEncodingMap.java
@@ -26,7 +26,10 @@
import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.compression.ExternalCompressor;
-import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Params;
+import htsjdk.samtools.cram.compression.TrialCompressor;
+import htsjdk.samtools.cram.compression.nametokenisation.NameTokenisationDecode;
+import htsjdk.samtools.cram.compression.range.RangeParams;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Params;
import htsjdk.samtools.cram.encoding.CRAMEncoding;
import htsjdk.samtools.cram.encoding.external.ByteArrayStopEncoding;
import htsjdk.samtools.cram.encoding.external.ExternalByteEncoding;
@@ -38,6 +41,7 @@
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
import htsjdk.utils.ValidationUtils;
+import java.util.EnumMap;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -100,53 +104,84 @@ public class CompressionHeaderEncodingMap {
private final CompressorCache compressorCache = new CompressorCache();
/**
- * Constructor used to create the default encoding map for writing CRAMs. The encoding strategy
- * parameter values are used to set compression levels, etc, but any encoding map embedded is ignored
- * since this uses the default strategy.
+ * Constructor used to create the encoding map for writing CRAMs. The per-DataSeries compressor
+ * assignments are read from the strategy's compressor map (set by {@link CRAMCompressionProfile}).
*
- * @param encodingStrategy {@link CRAMEncodingStrategy} containing parameter values to use when creating
- * the encoding map
+ *
+ * <p>Most data series use a plain external encoding with the specified compressor. Special cases:
+ * <ul>
+ *     <li>{@code RN_ReadName} with {@code NAME_TOKENISER}: uses {@code ByteArrayStopEncoding} with
+ *     the name tokeniser separator</li>
+ *     <li>{@code IN_Insertion} and {@code SC_SoftClip}: use {@code ByteArrayStopEncoding} with tab delimiter</li>
+ *     <li>{@code RN_ReadName} without {@code NAME_TOKENISER}: uses {@code ByteArrayStopEncoding} with tab</li>
+ * </ul>
+ *
+ * @param encodingStrategy {@link CRAMEncodingStrategy} containing the compressor map and compression levels
*/
public CompressionHeaderEncodingMap(final CRAMEncodingStrategy encodingStrategy) {
ValidationUtils.nonNull(encodingStrategy, "An encoding strategy must be provided");
- ValidationUtils.validateArg(
- encodingStrategy.getCustomCompressionHeaderEncodingMap() == null,
- "A custom compression map cannot be used with this constructor");
-
- // NOTE: all of these encodings use external blocks and compressors for actual CRAM
- // data. The only use of core block encodings are as params for other (external)
- // encodings, i.e., the ByteArrayLenEncoding used for tag data uses a core (sub-)encoding
- // to store the length of the array that is stored in an external block.
- putExternalRansOrderZeroEncoding(DataSeries.AP_AlignmentPositionOffset);
- putExternalRansOrderOneEncoding(DataSeries.BA_Base);
- // the BB data series is not used by this implementation when writing CRAMs
- putExternalRansOrderOneEncoding(DataSeries.BF_BitFlags);
- putExternalGzipEncoding(encodingStrategy, DataSeries.BS_BaseSubstitutionCode);
- putExternalRansOrderOneEncoding(DataSeries.CF_CompressionBitFlags);
- putExternalGzipEncoding(encodingStrategy, DataSeries.DL_DeletionLength);
- putExternalGzipEncoding(encodingStrategy, DataSeries.FC_FeatureCode);
- putExternalGzipEncoding(encodingStrategy, DataSeries.FN_NumberOfReadFeatures);
- putExternalGzipEncoding(encodingStrategy, DataSeries.FP_FeaturePosition);
- putExternalGzipEncoding(encodingStrategy, DataSeries.HC_HardClip);
- putExternalByteArrayStopTabGzipEncoding(encodingStrategy, DataSeries.IN_Insertion);
- putExternalGzipEncoding(encodingStrategy, DataSeries.MF_MateBitFlags);
- putExternalGzipEncoding(encodingStrategy, DataSeries.MQ_MappingQualityScore);
- putExternalGzipEncoding(encodingStrategy, DataSeries.NF_RecordsToNextFragment);
- putExternalGzipEncoding(encodingStrategy, DataSeries.NP_NextFragmentAlignmentStart);
- putExternalRansOrderOneEncoding(DataSeries.NS_NextFragmentReferenceSequenceID);
- putExternalGzipEncoding(encodingStrategy, DataSeries.PD_padding);
- // the QQ data series is not used by this implementation when writing CRAMs
- putExternalRansOrderOneEncoding(DataSeries.QS_QualityScore);
- putExternalRansOrderOneEncoding(DataSeries.RG_ReadGroup);
- putExternalRansOrderZeroEncoding(DataSeries.RI_RefId);
- putExternalRansOrderOneEncoding(DataSeries.RL_ReadLength);
- putExternalByteArrayStopTabGzipEncoding(encodingStrategy, DataSeries.RN_ReadName);
- putExternalGzipEncoding(encodingStrategy, DataSeries.RS_RefSkip);
- putExternalByteArrayStopTabGzipEncoding(encodingStrategy, DataSeries.SC_SoftClip);
- // the TC data series is obsolete
- putExternalGzipEncoding(encodingStrategy, DataSeries.TL_TagIdList);
- // the TN data series is obsolete
- putExternalRansOrderOneEncoding(DataSeries.TS_InsertSize);
+ final EnumMap<DataSeries, CompressorDescriptor> compressorMap = encodingStrategy.getCompressorMap();
+ ValidationUtils.nonNull(compressorMap, "Encoding strategy must have a compressor map");
+ final EnumMap<DataSeries, java.util.List<CompressorDescriptor>> trialMap =
+ encodingStrategy.getTrialCandidatesMap();
+
+ for (final Map.Entry<DataSeries, CompressorDescriptor> entry : compressorMap.entrySet()) {
+ final DataSeries ds = entry.getKey();
+ final CompressorDescriptor desc = entry.getValue();
+
+ // Build the compressor, potentially wrapping in TrialCompressor if trial candidates exist
+ final ExternalCompressor compressor = buildCompressor(ds, desc, trialMap);
+
+ // Data series with special encoding types
+ if (ds == DataSeries.RN_ReadName) {
+ if (desc.method() == BlockCompressionMethod.NAME_TOKENISER) {
+ putExternalEncoding(ds,
+ new ByteArrayStopEncoding(NameTokenisationDecode.NAME_SEPARATOR,
+ ds.getExternalBlockContentId()).toEncodingDescriptor(),
+ compressor);
+ } else {
+ putExternalEncoding(ds,
+ new ByteArrayStopEncoding((byte) '\t',
+ ds.getExternalBlockContentId()).toEncodingDescriptor(),
+ compressor);
+ }
+ } else if (ds == DataSeries.IN_Insertion || ds == DataSeries.SC_SoftClip) {
+ putExternalEncoding(ds,
+ new ByteArrayStopEncoding((byte) '\t',
+ ds.getExternalBlockContentId()).toEncodingDescriptor(),
+ compressor);
+ } else {
+ putExternalEncoding(ds, compressor);
+ }
+ }
+ }
+
+ /**
+ * Build a compressor for a data series, wrapping in {@link TrialCompressor} if additional
+ * trial candidates are configured for that series.
+ */
+ private ExternalCompressor buildCompressor(
+ final DataSeries ds,
+ final CompressorDescriptor primaryDesc,
+ final EnumMap<DataSeries, java.util.List<CompressorDescriptor>> trialMap) {
+ final ExternalCompressor primary = compressorCache.getCompressorForMethod(primaryDesc.method(), primaryDesc.arg());
+
+ if (trialMap != null && trialMap.containsKey(ds)) {
+ final java.util.List<CompressorDescriptor> trialDescs = trialMap.get(ds);
+ if (trialDescs != null && !trialDescs.isEmpty()) {
+ final java.util.List<ExternalCompressor> candidates = new java.util.ArrayList<>();
+ candidates.add(primary);
+ for (final CompressorDescriptor td : trialDescs) {
+ candidates.add(compressorCache.getCompressorForMethod(td.method(), td.arg()));
+ }
+ return new TrialCompressor(candidates);
+ }
+ }
+
+ return primary;
}
/**
@@ -221,10 +256,27 @@ public EncodingDescriptor getEncodingDescriptorForDataSeries(final DataSeries da
public Block createCompressedBlockForStream(final CRAMCodecModelContext contextModel, final Integer contentId, final ByteArrayOutputStream outputStream) {
final ExternalCompressor compressor = externalCompressors.get(contentId);
final byte[] rawContent = outputStream.toByteArray();
+ // Compress first, then query the method — TrialCompressor determines its method
+ // during the first call to compress().
+ final byte[] compressedContent = compressor.compress(rawContent, contextModel);
+ return Block.createExternalBlock(
+ compressor.getMethod(),
+ contentId,
+ compressedContent,
+ rawContent.length);
+ }
+
+ /**
+ * Same as {@link #createCompressedBlockForStream} but accepts a {@link htsjdk.samtools.cram.io.CRAMByteWriter}.
+ */
+ public Block createCompressedBlockForWriter(final CRAMCodecModelContext contextModel, final Integer contentId, final htsjdk.samtools.cram.io.CRAMByteWriter writer) {
+ final ExternalCompressor compressor = externalCompressors.get(contentId);
+ final byte[] rawContent = writer.toByteArray();
+ final byte[] compressedContent = compressor.compress(rawContent, contextModel);
return Block.createExternalBlock(
compressor.getMethod(),
contentId,
- compressor.compress(rawContent, contextModel),
+ compressedContent,
rawContent.length);
}
@@ -287,13 +339,13 @@ public ExternalCompressor getBestExternalCompressor(final byte[] data, final CRA
final int gzipLen = gzip.compress(data, null).length;
final ExternalCompressor rans0 = compressorCache.getCompressorForMethod(
- BlockCompressionMethod.RANS,
- RANS4x8Params.ORDER.ZERO.ordinal());
+ BlockCompressionMethod.RANSNx16,
+ RANSNx16Params.ORDER.ZERO.ordinal());
final int rans0Len = rans0.compress(data,null).length;
final ExternalCompressor rans1 = compressorCache.getCompressorForMethod(
- BlockCompressionMethod.RANS,
- RANS4x8Params.ORDER.ONE.ordinal());
+ BlockCompressionMethod.RANSNx16,
+ RANSNx16Params.ORDER.ONE.ordinal());
final int rans1Len = rans1.compress(data, null).length;
// find the best of general purpose codecs:
@@ -368,7 +420,13 @@ private void putEncoding(final DataSeries dataSeries, final EncodingDescriptor e
encodingMap.put(dataSeries, encodingDescriptor);
}
- // add an external encoding and corresponding compressor
+ /**
+ * Add an external encoding and its corresponding compressor to the encoding map.
+ *
+ * @param dataSeries the data series to encode
+ * @param encodingDescriptor the encoding descriptor (must be an external encoding)
+ * @param compressor the external compressor to use for this data series' block
+ */
public void putExternalEncoding(final DataSeries dataSeries,
final EncodingDescriptor encodingDescriptor,
final ExternalCompressor compressor) {
@@ -386,6 +444,15 @@ private void putExternalByteArrayStopTabGzipEncoding(final CRAMEncodingStrategy
compressorCache.getCompressorForMethod(BlockCompressionMethod.GZIP, encodingStrategy.getGZIPCompressionLevel()));
}
+ private void putByteArrayStopNameTokEncoding(final CRAMEncodingStrategy encodingStrategy, final DataSeries dataSeries) {
+ // ByteArrayStopEncoding is paired with name tokenisation since using it with the
+ // NameTokenisationDecode.NAME_SEPARATOR conveniently writes the read name data in the NAME_SEPARATOR
+ // delimited/terminated format that is expected by the downstream tokenisation compressor code
+ putExternalEncoding(dataSeries,
+ new ByteArrayStopEncoding(NameTokenisationDecode.NAME_SEPARATOR, dataSeries.getExternalBlockContentId()).toEncodingDescriptor(),
+ compressorCache.getCompressorForMethod(BlockCompressionMethod.NAME_TOKENISER, 0));
+ }
+
// add an external encoding appropriate for the dataSeries value type, with a GZIP compressor
private void putExternalGzipEncoding(final CRAMEncodingStrategy encodingStrategy, final DataSeries dataSeries) {
putExternalEncoding(
@@ -397,14 +464,37 @@ private void putExternalGzipEncoding(final CRAMEncodingStrategy encodingStrategy
private void putExternalRansOrderOneEncoding(final DataSeries dataSeries) {
putExternalEncoding(
dataSeries,
- compressorCache.getCompressorForMethod(BlockCompressionMethod.RANS, RANS4x8Params.ORDER.ONE.ordinal()));
+ compressorCache.getCompressorForMethod(BlockCompressionMethod.RANSNx16, RANSNx16Params.ORDER.ONE.ordinal()));
}
// add an external encoding appropriate for the dataSeries value type, with a RANS order 0 compressor
private void putExternalRansOrderZeroEncoding(final DataSeries dataSeries) {
putExternalEncoding(
dataSeries,
- compressorCache.getCompressorForMethod(BlockCompressionMethod.RANS, RANS4x8Params.ORDER.ZERO.ordinal()));
+ compressorCache.getCompressorForMethod(BlockCompressionMethod.RANSNx16, RANSNx16Params.ORDER.ZERO.ordinal()));
+ }
+
+ // add an external encoding appropriate for the dataSeries value type, with a FQZComp quality score compressor
+ private void putExternalFQZCompEncoding(final DataSeries dataSeries) {
+ putExternalEncoding(
+ dataSeries,
+ compressorCache.getCompressorForMethod(BlockCompressionMethod.FQZCOMP, 0));
+ }
+
+ // add an external encoding appropriate for the dataSeries value type, with a Range (arithmetic) order 1 compressor
+ private void putExternalRangeOrderOneEncoding(final DataSeries dataSeries) {
+ putExternalEncoding(
+ dataSeries,
+ compressorCache.getCompressorForMethod(
+ BlockCompressionMethod.ADAPTIVE_ARITHMETIC,
+ RangeParams.ORDER_FLAG_MASK));
+ }
+
+ // add an external encoding appropriate for the dataSeries value type, with a Range (arithmetic) order 0 compressor
+ private void putExternalRangeOrderZeroEncoding(final DataSeries dataSeries) {
+ putExternalEncoding(
+ dataSeries,
+ compressorCache.getCompressorForMethod(BlockCompressionMethod.ADAPTIVE_ARITHMETIC, 0));
}
@Override
diff --git a/src/main/java/htsjdk/samtools/cram/structure/CompressorCache.java b/src/main/java/htsjdk/samtools/cram/structure/CompressorCache.java
index a7e28511d8..3139711b44 100644
--- a/src/main/java/htsjdk/samtools/cram/structure/CompressorCache.java
+++ b/src/main/java/htsjdk/samtools/cram/structure/CompressorCache.java
@@ -27,12 +27,12 @@
import htsjdk.samtools.cram.compression.ExternalCompressor;
import htsjdk.samtools.cram.compression.RANS4x8ExternalCompressor;
import htsjdk.samtools.cram.compression.RANSNx16ExternalCompressor;
-import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Decode;
-import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Encode;
-import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Params;
-import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Decode;
-import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Encode;
-import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Params;
+import htsjdk.samtools.cram.compression.rans.RANS4x8Decode;
+import htsjdk.samtools.cram.compression.rans.RANS4x8Encode;
+import htsjdk.samtools.cram.compression.rans.RANS4x8Params;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Decode;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Encode;
+import htsjdk.samtools.cram.compression.rans.RANSNx16Params;
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
import htsjdk.utils.ValidationUtils;
@@ -79,8 +79,8 @@ public ExternalCompressor getCompressorForMethod(
return getCachedCompressorForMethod(compressionMethod, compressorSpecificArg);
case RANS: {
- // for efficiency, we want to share the same underlying RANS object with both order-0 and
- // order-1 ExternalCompressors
+ // in previous implementations, we would cache separate order-0 and order-1 compressors for performance
+ // reasons; we no longer NEED to do so but retain this structure for now
final int ransArg = compressorSpecificArg == ExternalCompressor.NO_COMPRESSION_ARG ?
RANS4x8Params.ORDER.ZERO.ordinal() :
compressorSpecificArg;
@@ -103,8 +103,8 @@ public ExternalCompressor getCompressorForMethod(
}
case RANSNx16: {
- // for efficiency, we want to share the same underlying RANSNx16 object with both order-0 and
- // order-1 ExternalCompressors
+ // in previous implementations, we would cache separate order-0 and order-1 compressors for performance
+ // reasons; we no longer NEED to do so but retain this structure for now
final int ransArg = compressorSpecificArg == ExternalCompressor.NO_COMPRESSION_ARG ?
RANSNx16Params.ORDER.ZERO.ordinal() :
compressorSpecificArg;
diff --git a/src/main/java/htsjdk/samtools/cram/structure/CompressorDescriptor.java b/src/main/java/htsjdk/samtools/cram/structure/CompressorDescriptor.java
new file mode 100644
index 0000000000..be8b1935e7
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/structure/CompressorDescriptor.java
@@ -0,0 +1,26 @@
+package htsjdk.samtools.cram.structure;
+
+import htsjdk.samtools.cram.compression.ExternalCompressor;
+import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
+
+/**
+ * Describes which compression method and parameters to use for a CRAM data series block.
+ * Pairs a {@link BlockCompressionMethod} with an optional compressor-specific integer argument
+ * (e.g., GZIP compression level, rANS order). Maps 1:1 to
+ * {@link CompressorCache#getCompressorForMethod(BlockCompressionMethod, int)}.
+ *
+ * @param method the block compression method
+ * @param arg compressor-specific argument, or {@link ExternalCompressor#NO_COMPRESSION_ARG} if none
+ */
+public record CompressorDescriptor(BlockCompressionMethod method, int arg) {
+
+ /**
+ * Create a descriptor for a compression method that takes no argument
+ * (e.g., RAW, BZIP2, LZMA, NAME_TOKENISER, FQZCOMP).
+ *
+ * @param method the block compression method
+ */
+ public CompressorDescriptor(final BlockCompressionMethod method) {
+ this(method, ExternalCompressor.NO_COMPRESSION_ARG);
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/cram/structure/Container.java b/src/main/java/htsjdk/samtools/cram/structure/Container.java
index 890fb9db81..33aa77e7aa 100644
--- a/src/main/java/htsjdk/samtools/cram/structure/Container.java
+++ b/src/main/java/htsjdk/samtools/cram/structure/Container.java
@@ -317,11 +317,10 @@ public List getSAMRecords(
final SAMFileHeader samFileHeader) {
final List samRecords = new ArrayList<>(getContainerHeader().getNumberOfRecords());
for (final Slice slice : getSlices()) {
- final List cramCompressionRecords = slice.deserializeCRAMRecords(compressorCache, validationStringency);
// before we convert to SAMRecord, we need to normalize the CRAMCompressionRecord in each Slice
- slice.normalizeCRAMRecords(
- cramCompressionRecords,
- cramReferenceRegion);
+ final List<CRAMCompressionRecord> cramCompressionRecords = slice.deserializeCRAMRecords(compressorCache, validationStringency);
+ slice.normalizeCRAMRecords(cramCompressionRecords, cramReferenceRegion);
+
for (final CRAMCompressionRecord cramCompressionRecord : cramCompressionRecords) {
final SAMRecord samRecord = cramCompressionRecord.toSAMRecord(samFileHeader);
samRecord.setValidationStringency(validationStringency);
@@ -380,6 +379,9 @@ public List getBAIEntries(final CompressorCache compressorCache) {
* @throws CRAMException when the Container is in an invalid state
*/
private void distributeIndexingParametersToSlices() {
+ if (slices.isEmpty()) {
+ return;
+ }
final int lastSliceIndex = slices.size() - 1;
for (int i = 0; i < lastSliceIndex; i++) {
final Slice slice = slices.get(i);
diff --git a/src/main/java/htsjdk/samtools/cram/structure/DataSeries.java b/src/main/java/htsjdk/samtools/cram/structure/DataSeries.java
index 354d50cff2..a7589a8166 100644
--- a/src/main/java/htsjdk/samtools/cram/structure/DataSeries.java
+++ b/src/main/java/htsjdk/samtools/cram/structure/DataSeries.java
@@ -34,73 +34,75 @@
*/
public enum DataSeries {
- // in rough encoding/decoding order, by group
+ // Content IDs match htslib's cram_DS_ID enum (cram_structs.h) for easier cross-implementation
+ // debugging. These IDs are written into each container's compression header encoding map and
+ // are only used for newly written files — existing files encode their own ID mapping.
// Main
- BF_BitFlags (DataSeriesType.INT, "BF", 1),
- CF_CompressionBitFlags (DataSeriesType.INT, "CF", 2),
+ BF_BitFlags (DataSeriesType.INT, "BF", 15),
+ CF_CompressionBitFlags (DataSeriesType.INT, "CF", 16),
// Positional
- RI_RefId (DataSeriesType.INT, "RI", 3),
- RL_ReadLength (DataSeriesType.INT, "RL", 4),
- AP_AlignmentPositionOffset (DataSeriesType.INT, "AP", 5),
- RG_ReadGroup (DataSeriesType.INT, "RG", 6),
+ RI_RefId (DataSeriesType.INT, "RI", 33),
+ RL_ReadLength (DataSeriesType.INT, "RL", 25),
+ AP_AlignmentPositionOffset (DataSeriesType.INT, "AP", 17),
+ RG_ReadGroup (DataSeriesType.INT, "RG", 18),
// Read Name
- RN_ReadName (DataSeriesType.BYTE_ARRAY, "RN", 7),
+ RN_ReadName (DataSeriesType.BYTE_ARRAY, "RN", 11),
// Mate Record
- NF_RecordsToNextFragment (DataSeriesType.INT, "NF", 8),
- MF_MateBitFlags (DataSeriesType.INT, "MF", 9),
- NS_NextFragmentReferenceSequenceID (DataSeriesType.INT, "NS", 10),
- NP_NextFragmentAlignmentStart (DataSeriesType.INT, "NP", 11),
- TS_InsertSize (DataSeriesType.INT, "TS", 12),
+ NF_RecordsToNextFragment (DataSeriesType.INT, "NF", 24),
+ MF_MateBitFlags (DataSeriesType.INT, "MF", 21),
+ NS_NextFragmentReferenceSequenceID (DataSeriesType.INT, "NS", 20),
+ NP_NextFragmentAlignmentStart (DataSeriesType.INT, "NP", 23),
+ TS_InsertSize (DataSeriesType.INT, "TS", 22),
// Auxiliary Tags
- TL_TagIdList (DataSeriesType.INT, "TL", 13),
+ TL_TagIdList (DataSeriesType.INT, "TL", 32),
// Retained for backward compatibility on CRAM read. See https://github.com/samtools/hts-specs/issues/598
// https://github.com/samtools/htsjdk/issues/1571
- TC_TagCount (DataSeriesType.INT, "TC", 14),
- TN_TagNameAndType (DataSeriesType.INT, "TN", 15),
+ TC_TagCount (DataSeriesType.INT, "TC", 44),
+ TN_TagNameAndType (DataSeriesType.INT, "TN", 39),
// Mapped Reads
- MQ_MappingQualityScore (DataSeriesType.INT, "MQ", 16),
+ MQ_MappingQualityScore (DataSeriesType.INT, "MQ", 19),
// Read Feature Records
- FN_NumberOfReadFeatures (DataSeriesType.INT, "FN", 17),
- FP_FeaturePosition (DataSeriesType.INT, "FP", 18),
- FC_FeatureCode (DataSeriesType.BYTE, "FC", 19),
+ FN_NumberOfReadFeatures (DataSeriesType.INT, "FN", 26),
+ FP_FeaturePosition (DataSeriesType.INT, "FP", 28),
+ FC_FeatureCode (DataSeriesType.BYTE, "FC", 27),
// Read Feature Codes
- BB_Bases (DataSeriesType.BYTE_ARRAY, "BB", 20),
- QQ_scores (DataSeriesType.BYTE_ARRAY, "QQ", 21),
- BA_Base (DataSeriesType.BYTE, "BA", 22),
+ BB_Bases (DataSeriesType.BYTE_ARRAY, "BB", 37),
+ QQ_scores (DataSeriesType.BYTE_ARRAY, "QQ", 38),
+ BA_Base (DataSeriesType.BYTE, "BA", 30),
// NOTE: the CramRecordReader and CramRecordWriter split the QS_QualityScore into two separate
// DataSeriesReader/Writer(s), one uses the params described here (BYTE) and one uses BYTE_ARRAY
- QS_QualityScore (DataSeriesType.BYTE, "QS", 23),
- BS_BaseSubstitutionCode (DataSeriesType.BYTE, "BS", 24),
- IN_Insertion (DataSeriesType.BYTE_ARRAY, "IN", 25),
- DL_DeletionLength (DataSeriesType.INT, "DL", 26),
- RS_RefSkip (DataSeriesType.INT, "RS", 27),
- SC_SoftClip (DataSeriesType.BYTE_ARRAY, "SC", 28),
- PD_padding (DataSeriesType.INT, "PD", 29),
- HC_HardClip (DataSeriesType.INT, "HC", 30),
+ QS_QualityScore (DataSeriesType.BYTE, "QS", 12),
+ BS_BaseSubstitutionCode (DataSeriesType.BYTE, "BS", 31),
+ IN_Insertion (DataSeriesType.BYTE_ARRAY, "IN", 13),
+ DL_DeletionLength (DataSeriesType.INT, "DL", 29),
+ RS_RefSkip (DataSeriesType.INT, "RS", 34),
+ SC_SoftClip (DataSeriesType.BYTE_ARRAY, "SC", 14),
+ PD_padding (DataSeriesType.INT, "PD", 35),
+ HC_HardClip (DataSeriesType.INT, "HC", 36),
- // For Testing Only
+ // For Testing Only — IDs match htslib's DS_TM=45, DS_TV=46
// NOTE: these are not listed in the spec
- TM_TestMark (DataSeriesType.INT, "TM", 31),
- TV_TestMark (DataSeriesType.INT, "TV", 32);
+ TM_TestMark (DataSeriesType.INT, "TM", 45),
+ TV_TestMark (DataSeriesType.INT, "TV", 46);
private final DataSeriesType type;
private final String canonicalName;
diff --git a/src/main/java/htsjdk/samtools/cram/structure/ReadTag.java b/src/main/java/htsjdk/samtools/cram/structure/ReadTag.java
index 9d1a44127d..41a30574cf 100644
--- a/src/main/java/htsjdk/samtools/cram/structure/ReadTag.java
+++ b/src/main/java/htsjdk/samtools/cram/structure/ReadTag.java
@@ -48,6 +48,13 @@ public class ReadTag implements Comparable {
private short code;
private byte index;
+ /**
+ * Construct a ReadTag from a 3-byte tag ID and raw value bytes.
+ *
+ * @param id the tag ID packed as an int (2 bytes tag name + 1 byte type)
+ * @param dataAsByteArray the raw tag value bytes
+ * @param validationStringency validation stringency for parsing
+ */
public ReadTag(final int id, final byte[] dataAsByteArray, ValidationStringency validationStringency) {
this.type = (char) (0xFF & id);
key = new String(new char[]{(char) ((id >> 16) & 0xFF), (char) ((id >> 8) & 0xFF)});
@@ -59,6 +66,22 @@ public ReadTag(final int id, final byte[] dataAsByteArray, ValidationStringency
code = SAMTag.makeBinaryTag(this.key);
}
+ /**
+ * Construct a ReadTag using pre-cached key metadata to avoid repeated String allocation.
+ *
+ * @param cached pre-computed key metadata from the {@link TagKeyCache}
+ * @param dataAsByteArray the raw tag value bytes
+ * @param validationStringency validation stringency for parsing
+ */
+ public ReadTag(final TagKeyCache.TagKeyInfo cached, final byte[] dataAsByteArray, ValidationStringency validationStringency) {
+ this.type = cached.type;
+ this.key = cached.key;
+ this.keyType3Bytes = cached.keyType3Bytes;
+ this.keyType3BytesAsInt = cached.keyType3BytesAsInt;
+ this.code = cached.code;
+ this.value = restoreValueFromByteArray(type, dataAsByteArray, validationStringency);
+ }
+
private ReadTag(final String key, final char type, final Object value) {
if (key == null)
throw new NullPointerException("Tag key cannot be null.");
@@ -83,7 +106,12 @@ private ReadTag(final String key, final char type, final Object value) {
code = SAMTag.makeBinaryTag(this.key);
}
- // two bytes are tag name and one byte is type
+ /**
+ * Pack a 3-byte tag ID (2 bytes name + 1 byte type) into an int.
+ *
+ * @param name byte array of length 3 (tag name char 1, char 2, type char)
+ * @return the packed int representation
+ */
public static int name3BytesToInt(final byte[] name) {
int value = 0xFF & name[0];
value <<= 8;
@@ -94,6 +122,13 @@ public static int name3BytesToInt(final byte[] name) {
return value;
}
+ /**
+ * Pack a 2-character tag name and a type character into a 3-byte int.
+ *
+ * @param name two-character tag name (e.g. "NM")
+ * @param type single-character type code (e.g. 'i', 'Z')
+ * @return the packed int representation
+ */
public static int nameType3BytesToInt(final String name, final char type) {
int value = 0xFF & name.charAt(0);
value <<= 8;
@@ -104,28 +139,46 @@ public static int nameType3BytesToInt(final String name, final char type) {
return value;
}
- // two bytes are tag name and one byte is type
- public static String intToNameType3Bytes(final int value) {
+ /**
+ * Unpack a 3-byte tag ID int into a String. If {@code withColon} is false, returns a
+ * 3-character string like "NMi"; if true, returns a 4-character string like "NM:i".
+ *
+ * @param value the packed int
+ * @param withColon if true, insert ':' between the 2-char name and the type char
+ * @return unpacked tag ID string
+ */
+ public static String intToNameType(final int value, final boolean withColon) {
final byte b3 = (byte) (0xFF & value);
final byte b2 = (byte) (0xFF & (value >> 8));
final byte b1 = (byte) (0xFF & (value >> 16));
- return new String(new byte[]{b1, b2, b3});
+ return withColon
+ ? new String(new byte[]{b1, b2, ':', b3})
+ : new String(new byte[]{b1, b2, b3});
}
- //TODO: consolidate this with the method above, and add some tests
- public static String intToNameType4Bytes(final int value) {
- final byte b3 = (byte) (0xFF & value);
- final byte b2 = (byte) (0xFF & (value >> 8));
- final byte b1 = (byte) (0xFF & (value >> 16));
+ /** Shorthand for {@link #intToNameType(int, boolean) intToNameType(value, false)}. */
+ public static String intToNameType3Bytes(final int value) {
+ return intToNameType(value, false);
+ }
- return new String(new byte[]{b1, b2, ':', b3});
+ /** Shorthand for {@link #intToNameType(int, boolean) intToNameType(value, true)}. */
+ public static String intToNameType4Bytes(final int value) {
+ return intToNameType(value, true);
}
+ /** Create a {@link SAMTagAndValue} from this ReadTag's key and value. */
public SAMTagAndValue createSAMTag() {
return new SAMTagAndValue(key, value);
}
+ /**
+ * Create a ReadTag from a 4-character "XX:T" key-and-type string and a value.
+ *
+ * @param keyAndType 4-character string in "XX:T" format (e.g. "NM:i")
+ * @param value the tag value
+ * @return a new ReadTag
+ */
public static ReadTag deriveTypeFromKeyAndType(final String keyAndType, final Object value) {
if (keyAndType.length() != 4)
throw new RuntimeException("Tag key and type must be 4 char long: " + keyAndType);
@@ -133,6 +186,13 @@ public static ReadTag deriveTypeFromKeyAndType(final String keyAndType, final Ob
return new ReadTag(keyAndType.substring(0, 2), keyAndType.charAt(3), value);
}
+ /**
+ * Create a ReadTag by inferring the CRAM type code from the Java type of the value.
+ *
+ * @param key two-character tag name (e.g. "NM")
+ * @param value the tag value (String, Character, Number, or array)
+ * @return a new ReadTag
+ */
public static ReadTag deriveTypeFromValue(final String key, final Object value) {
if (key.length() != 2)
throw new RuntimeException("Tag key must be 2 char long: " + key);
@@ -161,6 +221,7 @@ public String getKeyAndType() {
return keyAndType;
}
+ /** Serialize this tag's value to a byte array using CRAM/BAM binary encoding. */
public byte[] getValueAsByteArray() {
return writeSingleValue((byte) type, value, false);
}
@@ -253,6 +314,14 @@ protected ByteBuffer initialValue() {
private static final Charset charset = Charset.forName("US-ASCII");
+ /**
+ * Serialize a single tag value to a byte array in BAM binary format.
+ *
+ * @param tagType the BAM type code (e.g. 'i', 'Z', 'B')
+ * @param value the value to serialize
+ * @param isUnsignedArray if true and the value is an array, use unsigned array sub-type codes
+ * @return the serialized bytes
+ */
public static byte[] writeSingleValue(final byte tagType, final Object value,
final boolean isUnsignedArray) {
final ByteBuffer buffer = bufferLocal.get();
@@ -348,6 +417,14 @@ private static void writeArray(final Object value,
+ value.getClass());
}
+ /**
+ * Read a single tag value from a ByteBuffer in BAM binary format.
+ *
+ * @param tagType the BAM type code (e.g. 'i', 'Z', 'B')
+ * @param byteBuffer little-endian ByteBuffer positioned at the start of the value
+ * @param validationStringency validation stringency for error handling
+ * @return the deserialized value as the appropriate Java type
+ */
public static Object readSingleValue(final byte tagType,
final ByteBuffer byteBuffer, ValidationStringency validationStringency) {
switch (tagType) {
diff --git a/src/main/java/htsjdk/samtools/cram/structure/Slice.java b/src/main/java/htsjdk/samtools/cram/structure/Slice.java
index 99b2e0dd02..15b0498890 100644
--- a/src/main/java/htsjdk/samtools/cram/structure/Slice.java
+++ b/src/main/java/htsjdk/samtools/cram/structure/Slice.java
@@ -24,7 +24,6 @@
import htsjdk.samtools.cram.build.CRAMReferenceRegion;
import htsjdk.samtools.cram.common.CRAMVersion;
import htsjdk.samtools.cram.common.CramVersions;
-import htsjdk.samtools.cram.digest.ContentDigests;
import htsjdk.samtools.cram.encoding.reader.CramRecordReader;
import htsjdk.samtools.cram.encoding.writer.CramRecordWriter;
import htsjdk.samtools.cram.io.CramIntArray;
@@ -201,7 +200,9 @@ public Slice(
this.compressionHeader = compressionHeader;
this.byteOffsetOfContainer = containerByteOffset;
- final ContentDigests hasher = ContentDigests.create(ContentDigests.ALL);
+ // htslib does not write content digest tags (BD/SD/B5/S5/B1/S1) into slice headers.
+ // These are optional per the spec, and computing SHA-512 + SHA-1 per record is very expensive.
+ // Block-level CRC32 (required by CRAM 3.0+) provides data integrity verification.
final Set<ReferenceContext> referenceContexts = new HashSet<>();
// ignore these values if we later determine this Slice is not single-ref
int singleRefAlignmentStart = Integer.MAX_VALUE;
@@ -209,7 +210,6 @@ public Slice(
int baseCount = 0;
for (final CRAMCompressionRecord record : records) {
- hasher.add(record);
baseCount += record.getReadLength();
if (record.isPlaced()) {
@@ -239,11 +239,19 @@ public Slice(
singleRefAlignmentStart,
singleRefAlignmentEnd);
- sliceTags = hasher.getAsTags();
+ sliceTags = null;
nRecords = records.size();
this.baseCount = baseCount;
this.globalRecordCounter = globalRecordCounter;
+ // Populate context model with per-record metadata needed by codecs like FQZComp
+ contextModel.populateFromRecords(records);
+
+ // Link mate pairs within this slice as "attached" instead of "detached".
+ // Attached mates only store a record offset (NF) instead of full mate info (MF, NS, NP, TS),
+ // significantly reducing the compressed size for coordinate-sorted paired-end data.
+ linkMatesWithinSlice(records);
+
final CramRecordWriter writer = new CramRecordWriter(this);
sliceBlocks = writer.writeToSliceBlocks(contextModel, records, alignmentContext.getAlignmentStart());
@@ -251,6 +259,66 @@ public Slice(
nSliceBlocks = caclulateNumberOfBlocks();
}
+ /**
+ * Scan records in this slice for mate pairs and link them as "attached" instead of "detached".
+ * For each pair of records with the same read name that are both paired and neither is
+ * secondary/supplementary, the earlier record is marked with {@code CF_HAS_MATE_DOWNSTREAM}
+ * and the NF (records-to-next-fragment) offset is set. The later record remains detached
+ * but will have its mate info restored from the linked record during decode.
+ *
+     * <p>Records without a mate in this slice, or that are secondary/supplementary, remain detached.
+ *
+ * @param records the CRAM records in this slice
+ */
+ private static void linkMatesWithinSlice(final List<CRAMCompressionRecord> records) {
+ // Map read name → index of first occurrence (for mate pairing)
+ final java.util.HashMap<String, Integer> readNameToIndex = new java.util.HashMap<>(records.size());
+
+ for (int i = 0; i < records.size(); i++) {
+ final CRAMCompressionRecord record = records.get(i);
+ if (!record.isReadPaired() || record.isSecondaryAlignment() || record.isSupplementary()) {
+ // Unpaired, secondary, or supplementary reads stay detached
+ continue;
+ }
+
+ final String readName = record.getReadName();
+ final Integer previousIndex = readNameToIndex.get(readName);
+
+ if (previousIndex == null) {
+ // First occurrence of this read name — remember it
+ readNameToIndex.put(readName, i);
+ } else {
+ // Second occurrence — attempt to link as attached mate pair
+ final CRAMCompressionRecord previous = records.get(previousIndex);
+
+ // Validate that TLEN is consistent — if the recomputed insert size
+ // would differ from the original, keep both records detached to preserve
+ // the original TLEN values (matching htslib's cross-validation behavior)
+ final int computedTlen = CRAMCompressionRecord.computeInsertSize(previous, record);
+ if (previous.getTemplateSize() != computedTlen ||
+ record.getTemplateSize() != -computedTlen) {
+ // TLEN mismatch — keep both detached
+ readNameToIndex.remove(readName);
+ continue;
+ }
+
+ // Mark the earlier record as having its mate downstream
+ previous.setDetached(false);
+ previous.setHasMateDownStream(true);
+ previous.setRecordsToNextFragment(i - previousIndex - 1);
+
+ // The later record is the downstream mate — it's not detached but also
+ // doesn't have a mate downstream (it IS the downstream mate)
+ record.setDetached(false);
+ record.setHasMateDownStream(false);
+
+ // Remove from map so we don't match a third record with the same name
+ // (supplementary/secondary reads are already filtered above)
+ readNameToIndex.remove(readName);
+ }
+ }
+ }
+
public CRAMVersion getCramVersion() { return cramVersion; }
// May be null
@@ -510,7 +578,7 @@ public void normalizeCRAMRecords(final List cramCompressi
record.getAlignmentStart() - 1, // 1 based to 0-based
record.getAlignmentEnd() - record.getAlignmentStart() + 1);
}
- record.restoreReadBases(
+ record.restoreBasesAndTags(
cramReferenceRegion,
getCompressionHeader().getSubstitutionMatrix());
}
@@ -665,9 +733,9 @@ private void validateAlignmentSpanForReference(final CRAMReferenceRegion cramRef
throw new CRAMException ("No reference bases found for mapped slice .");
}
- //TODO: CRAMComplianceTest/c1#bounds triggers this (the reads are mapped beyond reference length),
- // and CRAMEdgeCasesTest.testNullsAndBeyondRef seems to deliberately test that reads that extend
- // beyond the reference length should be ok ?
+ // Reads are permitted to extend beyond the reference length (tested by CRAMComplianceTest/c1#bounds
+ // and CRAMEdgeCasesTest.testNullsAndBeyondRef). This matches samtools/htslib behavior. Log a warning
+ // but don't fail, since BAMs produced by some aligners contain such reads.
if (((alignmentContext.getAlignmentStart()-1) < cramReferenceRegion.getRegionStart()) ||
(alignmentContext.getAlignmentSpan() > cramReferenceRegion.getRegionLength())) {
log.warn(String.format(
@@ -730,7 +798,8 @@ public void setReferenceMD5(final CRAMReferenceRegion cramReferenceRegion) {
validateAlignmentSpanForReference(cramReferenceRegion);
final byte[] referenceBases = cramReferenceRegion.getCurrentReferenceBases();
- //TODO: how can an alignment context have a start "< 1" ?
+ // Multi-ref and unmapped/unplaced slices can have alignmentStart < 1 (e.g. 0 for unmapped).
+ // In that case there's no meaningful reference span, so use a zeroed MD5.
if (! alignmentContext.getReferenceContext().isMappedSingleRef() && alignmentContext.getAlignmentStart() < 1) {
referenceMD5 = new byte[MD5_BYTE_SIZE];
} else {
diff --git a/src/main/java/htsjdk/samtools/cram/structure/SliceBlocksReadStreams.java b/src/main/java/htsjdk/samtools/cram/structure/SliceBlocksReadStreams.java
index 4befa833a9..705a18bcb3 100644
--- a/src/main/java/htsjdk/samtools/cram/structure/SliceBlocksReadStreams.java
+++ b/src/main/java/htsjdk/samtools/cram/structure/SliceBlocksReadStreams.java
@@ -1,31 +1,8 @@
-/*
- * Copyright (c) 2019 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
package htsjdk.samtools.cram.structure;
import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.io.BitInputStream;
+import htsjdk.samtools.cram.io.CRAMByteReader;
import htsjdk.samtools.cram.io.DefaultBitInputStream;
import htsjdk.samtools.cram.structure.block.Block;
import htsjdk.utils.ValidationUtils;
@@ -39,13 +16,16 @@
* Provides a layer over a {@link SliceBlocks} object and acts as a bridge between the DataSeries codecs
* and their underlying blocks when reading a CRAM stream by presenting a bit (core) or byte (external) stream
* for each block.
+ *
+ * <p>External block streams use unsynchronized {@link CRAMByteReader} instead of
+ * {@link ByteArrayInputStream} to eliminate synchronized method call overhead in the hot decode path.
*/
public class SliceBlocksReadStreams {
// bit input stream for the core block
private final BitInputStream coreBlockInputStream;
- // Map of ByteArrayInputStreams for all external contentIDs, including tag blocks, by content ID
- private final Map<Integer, ByteArrayInputStream> externalInputStreams = new HashMap<>();
+ // Map of CRAMByteReaders for all external contentIDs, including tag blocks, by content ID
+ private final Map<Integer, CRAMByteReader> externalReaders = new HashMap<>();
/**
* @param sliceBlocks {@link SliceBlocks} that have been populated from a CRAM stream
@@ -57,6 +37,7 @@ public SliceBlocksReadStreams(final SliceBlocks sliceBlocks, final CompressorCac
if (sliceBlocks.getCoreBlock() == null || sliceBlocks.getNumberOfExternalBlocks() == 0) {
throw new CRAMException("slice blocks must be initialized before being used with a reader");
}
+ // Core block still uses DefaultBitInputStream (bit-level access needed for Huffman codecs)
coreBlockInputStream = new DefaultBitInputStream(
new ByteArrayInputStream(
sliceBlocks.getCoreBlock().getUncompressedContent(compressorCache)));
@@ -64,12 +45,12 @@ public SliceBlocksReadStreams(final SliceBlocks sliceBlocks, final CompressorCac
final List<Integer> externalContentIDs = sliceBlocks.getExternalContentIDs();
for (final Integer contentID : externalContentIDs) {
final Block block = sliceBlocks.getExternalBlock(contentID);
- externalInputStreams.put(contentID, new ByteArrayInputStream(block.getUncompressedContent(compressorCache)));
+ externalReaders.put(contentID, new CRAMByteReader(block.getUncompressedContent(compressorCache)));
}
}
/**
- * Get the {@link BitInputStream} for this {@link SliceBlocks} core block
+ * Get the {@link BitInputStream} for this {@link SliceBlocks} core block.
* @return {@link BitInputStream} for the core block
*/
public BitInputStream getCoreBlockInputStream() {
@@ -77,9 +58,11 @@ public BitInputStream getCoreBlockInputStream() {
}
/**
- * Get the ByteArrayInputStream for the given contentID.
- * @param contentID
- * @return ByteArrayInputStream for contentID
+ * Get the {@link CRAMByteReader} for the given content ID.
+ * @param contentID the external block content ID
+ * @return CRAMByteReader for the content ID
*/
- public ByteArrayInputStream getExternalInputStream(final Integer contentID) { return externalInputStreams.get(contentID); }
+ public CRAMByteReader getExternalReader(final Integer contentID) {
+ return externalReaders.get(contentID);
+ }
}
diff --git a/src/main/java/htsjdk/samtools/cram/structure/SliceBlocksWriteStreams.java b/src/main/java/htsjdk/samtools/cram/structure/SliceBlocksWriteStreams.java
index 6209e18f3d..e1ef6292b8 100644
--- a/src/main/java/htsjdk/samtools/cram/structure/SliceBlocksWriteStreams.java
+++ b/src/main/java/htsjdk/samtools/cram/structure/SliceBlocksWriteStreams.java
@@ -1,31 +1,8 @@
-/*
- * Copyright (c) 2019 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
package htsjdk.samtools.cram.structure;
import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.io.BitOutputStream;
+import htsjdk.samtools.cram.io.CRAMByteWriter;
import htsjdk.samtools.cram.io.DefaultBitOutputStream;
import htsjdk.samtools.cram.structure.block.Block;
import htsjdk.samtools.util.RuntimeIOException;
@@ -41,6 +18,9 @@
* Provides a layer over a {@link SliceBlocks} object and acts as a bridge between the DataSeries codecs
* and their underlying blocks when writing a CRAM stream by presenting a bit (core) or byte (external) stream
* for each block.
+ *
+ * <p>External block streams use unsynchronized {@link CRAMByteWriter} instead of
+ * {@link ByteArrayOutputStream} to eliminate synchronized method call overhead in the hot encode path.
*/
public class SliceBlocksWriteStreams {
@@ -48,8 +28,8 @@ public class SliceBlocksWriteStreams {
private final ByteArrayOutputStream coreBlockByteOutputStream;
private final BitOutputStream coreBlockBitOutputStream;
- // content ID to ByteArrayOutputStream
- private final Map<Integer, ByteArrayOutputStream> externalOutputStreams = new TreeMap<>();
+ // content ID to CRAMByteWriter
+ private final Map<Integer, CRAMByteWriter> externalWriters = new TreeMap<>();
/**
* @param compressionHeader {@link CompressionHeader} for the container containing the slice
@@ -57,58 +37,53 @@ public class SliceBlocksWriteStreams {
public SliceBlocksWriteStreams(final CompressionHeader compressionHeader) {
this.compressionHeader = compressionHeader;
+ // Core block still uses DefaultBitOutputStream (bit-level access needed for Huffman codecs)
coreBlockByteOutputStream = new ByteArrayOutputStream();
coreBlockBitOutputStream = new DefaultBitOutputStream(coreBlockByteOutputStream);
- // Create an output stream for each external content ID in the encoding map
+ // Create a writer for each external content ID in the encoding map
for (final Integer contentID : compressionHeader.getEncodingMap().getExternalIDs()) {
- externalOutputStreams.put(contentID, new ByteArrayOutputStream());
+ externalWriters.put(contentID, new CRAMByteWriter());
}
}
/**
- * @return the {@link BitOutputStream} for the core block
+ * @return the {@link BitOutputStream} for the core block
*/
public BitOutputStream getCoreOutputStream() { return coreBlockBitOutputStream; }
/**
- * Get the ByteArrayOutputStream corresponding to the requested contentID
+ * Get the {@link CRAMByteWriter} corresponding to the requested content ID.
* @param contentID ID of content being requested
- * @return ByteArrayOutputStream for contentID
+ * @return CRAMByteWriter for the content ID
*/
- public ByteArrayOutputStream getExternalOutputStream(final Integer contentID) { return externalOutputStreams.get(contentID); }
+ public CRAMByteWriter getExternalWriter(final Integer contentID) { return externalWriters.get(contentID); }
/**
- * Compress and write each stream to a corresponding Block (note that this does not write
- * the blocks themselves to a container output stream - that can't happen until the slice is aggregated
- * into a container.
+ * Compress and write each stream to a corresponding Block.
*/
public SliceBlocks flushStreamsToBlocks(final CRAMCodecModelContext contextModel) {
- closeAllStreams();
+ closeCoreStream();
// core block is raw (no compression) and must be written first (prescribed by the spec)
final Block coreBlock = Block.createRawCoreDataBlock(coreBlockByteOutputStream.toByteArray());
final List<Block> externalBlocks = new ArrayList<>();
- externalOutputStreams.forEach((contentId, contentStream) -> {
+ externalWriters.forEach((contentId, writer) -> {
if (contentId.equals(Block.NO_CONTENT_ID)) {
throw new CRAMException("A valid content ID is required. Given: " + contentId);
}
- externalBlocks.add(compressionHeader.getEncodingMap().createCompressedBlockForStream(contextModel, contentId, contentStream));
+ externalBlocks.add(compressionHeader.getEncodingMap().createCompressedBlockForWriter(contextModel, contentId, writer));
});
return new SliceBlocks(coreBlock, externalBlocks);
}
- private void closeAllStreams() {
+ private void closeCoreStream() {
try {
getCoreOutputStream().close();
- for (ByteArrayOutputStream baos : externalOutputStreams.values()) {
- baos.close();
- }
} catch (final IOException e) {
throw new RuntimeIOException(e);
}
}
-
}
diff --git a/src/main/java/htsjdk/samtools/cram/structure/TagKeyCache.java b/src/main/java/htsjdk/samtools/cram/structure/TagKeyCache.java
new file mode 100644
index 0000000000..bb4bf1b8bd
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/cram/structure/TagKeyCache.java
@@ -0,0 +1,105 @@
+package htsjdk.samtools.cram.structure;
+
+import htsjdk.samtools.SAMTag;
+
+/**
+ * Caches the per-tag-ID metadata that is invariant across all records in a slice.
+ *
+ * <p>In CRAM, each tag is identified by a 3-byte ID (2 bytes tag name + 1 byte type) packed
+ * into an int. The tag ID dictionary in the compression header defines the small set of
+ * unique tag IDs used in a slice (typically 5-20). This class pre-computes and caches
+ * the derived String keys, binary tag codes, and type characters so they can be reused
+ * across millions of records without repeated allocation.
+ *
+ * <p>Internally uses parallel arrays with linear scan lookup, which is optimal for the
+ * small number of entries typical in CRAM slices (fits in 1-2 cache lines).
+ */
+public final class TagKeyCache {
+
+ /** Pre-computed metadata for a single tag ID. */
+ public static final class TagKeyInfo {
+ /** Two-character tag name, e.g. "NM", "MD", "RG". */
+ public final String key;
+ /** Three-character tag name + type, e.g. "NMi", "MDZ". */
+ public final String keyType3Bytes;
+ /** The 3-byte tag ID packed as an int (name high bytes, type low byte). */
+ public final int keyType3BytesAsInt;
+ /** Binary tag code as computed by {@link SAMTag#makeBinaryTag}. */
+ public final short code;
+ /** The single-character type code, e.g. 'i', 'Z', 'A'. */
+ public final char type;
+
+ private TagKeyInfo(final int id) {
+ final char c1 = (char) ((id >> 16) & 0xFF);
+ final char c2 = (char) ((id >> 8) & 0xFF);
+ this.type = (char) (id & 0xFF);
+ this.key = new String(new char[]{c1, c2});
+ this.keyType3Bytes = new String(new char[]{c1, c2, this.type});
+ this.keyType3BytesAsInt = id;
+ this.code = SAMTag.makeBinaryTag(this.key);
+ }
+ }
+
+ private final int[] ids;
+ private final TagKeyInfo[] infos;
+ private final int size;
+
+ /**
+ * Creates a TagKeyCache from a tag ID dictionary.
+ *
+ * @param tagIDDictionary the tag ID dictionary from the compression header, where each
+ * entry in the outer array is a combination of tag IDs (as 3-byte arrays)
+ * that appear together on records
+ */
+ public TagKeyCache(final byte[][][] tagIDDictionary) {
+ // Collect unique tag IDs across all dictionary entries
+ // Use a simple approach: accumulate into oversized arrays, then we'll use them directly.
+ // Worst case there are ~50 unique tags; typical is 5-20.
+ int capacity = 0;
+ for (final byte[][] entry : tagIDDictionary) {
+ capacity += entry.length;
+ }
+
+ final int[] tempIds = new int[capacity];
+ final TagKeyInfo[] tempInfos = new TagKeyInfo[capacity];
+ int count = 0;
+
+ for (final byte[][] entry : tagIDDictionary) {
+ for (final byte[] tagBytes : entry) {
+ final int id = ReadTag.name3BytesToInt(tagBytes);
+ // Check if we already have this ID (linear scan is fine for small N)
+ boolean found = false;
+ for (int i = 0; i < count; i++) {
+ if (tempIds[i] == id) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ tempIds[count] = id;
+ tempInfos[count] = new TagKeyInfo(id);
+ count++;
+ }
+ }
+ }
+
+ this.ids = tempIds;
+ this.infos = tempInfos;
+ this.size = count;
+ }
+
+ /**
+ * Looks up the cached metadata for the given 3-byte tag ID.
+ *
+ * @param id the tag ID as a packed int (2 bytes name + 1 byte type)
+ * @return the cached metadata, or {@code null} if the ID is not in the cache
+ */
+ public TagKeyInfo get(final int id) {
+ for (int i = 0; i < size; i++) {
+ if (ids[i] == id) {
+ return infos[i];
+ }
+ }
+ return null;
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/util/GzipCodec.java b/src/main/java/htsjdk/samtools/util/GzipCodec.java
new file mode 100644
index 0000000000..a891d41a85
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/util/GzipCodec.java
@@ -0,0 +1,377 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2024 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package htsjdk.samtools.util;
+
+import htsjdk.samtools.Defaults;
+import htsjdk.samtools.util.zip.DeflaterFactory;
+import htsjdk.samtools.util.zip.InflaterFactory;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.zip.CRC32;
+import java.util.zip.DataFormatException;
+import java.util.zip.Deflater;
+import java.util.zip.Inflater;
+
+/**
+ * A reusable codec for compressing and decompressing GZIP and BGZF data using direct
+ * {@link Deflater}/{@link Inflater} operations on {@link ByteBuffer}s. Designed to be
+ * allocated once and reused across many compress/decompress operations.
+ *
+ *
+ * <p>Supports two output formats for compression:
+ *
+ * <ul>
+ *     <li>{@link Format#GZIP} — standard 10-byte GZIP header (RFC 1952)
+ *     <li>{@link Format#BGZF} — BGZF header with BC extra subfield (SAM/BAM spec)
+ * </ul>
+ *
+ * <p>Decompression handles both formats transparently by parsing the FLG byte and
+ * skipping any optional GZIP fields.
+ *
+ * <p>Not thread-safe. Use one instance per thread.
+ */
+public class GzipCodec {
+
+ /** The output format for compression. */
+ public enum Format { GZIP, BGZF }
+
+ // Standard GZIP header: 10 bytes (RFC 1952)
+ private static final int GZIP_HEADER_SIZE = 10;
+
+ // BGZF header: 18 bytes (standard GZIP + FEXTRA with BC subfield)
+ private static final int BGZF_HEADER_SIZE = BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
+
+ // GZIP trailer: CRC32 (4 bytes) + ISIZE (4 bytes)
+ private static final int GZIP_TRAILER_SIZE = 8;
+
+ // GZIP magic bytes
+ private static final byte GZIP_ID1 = BlockCompressedStreamConstants.GZIP_ID1;
+ private static final byte GZIP_ID2 = (byte) BlockCompressedStreamConstants.GZIP_ID2;
+ private static final byte GZIP_CM_DEFLATE = BlockCompressedStreamConstants.GZIP_CM_DEFLATE;
+
+ // GZIP FLG bits
+ private static final int FTEXT = 1;
+ private static final int FHCRC = 2;
+ private static final int FEXTRA = 4;
+ private static final int FNAME = 8;
+ private static final int FCOMMENT = 16;
+
+ private final Deflater deflater;
+ private final Inflater inflater;
+ private final CRC32 crc32 = new CRC32();
+ private boolean checkCrcs = false;
+
+ /** Create a codec with the default compression level and default strategy. */
+ public GzipCodec() {
+ this(Defaults.COMPRESSION_LEVEL, Deflater.DEFAULT_STRATEGY);
+ }
+
+ /** Create a codec with the specified compression level and default strategy. */
+ public GzipCodec(final int compressionLevel) {
+ this(compressionLevel, Deflater.DEFAULT_STRATEGY);
+ }
+
+ /** Create a codec with the specified compression level and deflate strategy. */
+ public GzipCodec(final int compressionLevel, final int deflateStrategy) {
+ this(compressionLevel, deflateStrategy, new DeflaterFactory(), new InflaterFactory());
+ }
+
+ /**
+ * Create a codec with full control over compression parameters and factory implementations.
+ *
+ * @param compressionLevel deflate compression level (0-9)
+ * @param deflateStrategy deflate strategy (e.g., {@link Deflater#DEFAULT_STRATEGY}, {@link Deflater#FILTERED})
+ * @param deflaterFactory factory for creating Deflater instances
+ * @param inflaterFactory factory for creating Inflater instances
+ */
+ public GzipCodec(final int compressionLevel, final int deflateStrategy,
+ final DeflaterFactory deflaterFactory, final InflaterFactory inflaterFactory) {
+ // nowrap=true: we produce raw deflate and handle GZIP framing ourselves
+ this.deflater = deflaterFactory.makeDeflater(compressionLevel, true);
+ this.deflater.setStrategy(deflateStrategy);
+ this.inflater = inflaterFactory.makeInflater(true);
+ }
+
+ /** Enable or disable CRC32 validation during decompression. */
+ public void setCheckCrcs(final boolean check) {
+ this.checkCrcs = check;
+ }
+
+ // --------------------------------------------------------------------------------------------
+ // Compression
+ // --------------------------------------------------------------------------------------------
+
+ /**
+ * Compress data from {@code input} into {@code output} using standard GZIP format.
+ *
+ * @param input data to compress (from position to limit; position is advanced to limit)
+ * @param output buffer to write compressed data into (from position; position is advanced)
+ * @return number of bytes written to output
+ */
+ public int compress(final ByteBuffer input, final ByteBuffer output) {
+ return compress(input, output, Format.GZIP);
+ }
+
+ /**
+ * Compress data from {@code input} into {@code output} using the specified format.
+ *
+ * @param input data to compress (from position to limit; position is advanced to limit)
+ * @param output buffer to write compressed data into (from position; position is advanced)
+ * @param format the output format ({@link Format#GZIP} or {@link Format#BGZF})
+ * @return number of bytes written to output
+ */
+ public int compress(final ByteBuffer input, final ByteBuffer output, final Format format) {
+ final int outputStart = output.position();
+ final int inputSize = input.remaining();
+
+ // Compute CRC32 over the uncompressed input
+ crc32.reset();
+ final int inputPos = input.position();
+ // Use a slice to avoid disturbing input's position
+ final ByteBuffer crcSlice = input.slice();
+ crc32.update(crcSlice);
+
+ // Write header (reserves space; for BGZF the block size is patched after deflation)
+ final int headerSize = writeHeader(output, format);
+
+ // Extract input bytes for deflater (byte[] API for compatibility with LibdeflateDeflater)
+ final byte[] inputBytes;
+ final int inputOff;
+ input.position(inputPos);
+ if (input.hasArray()) {
+ inputBytes = input.array();
+ inputOff = input.arrayOffset() + inputPos;
+ } else {
+ inputBytes = new byte[inputSize];
+ input.get(inputBytes);
+ inputOff = 0;
+ }
+
+ // Deflate into a temporary byte[] then copy to output buffer
+ deflater.reset();
+ deflater.setInput(inputBytes, inputOff, inputSize);
+ deflater.finish();
+ while (!deflater.finished()) {
+ final int n = deflater.deflate(output.array(), output.arrayOffset() + output.position(), output.remaining());
+ output.position(output.position() + n);
+ }
+
+ // Write trailer: CRC32 + ISIZE (little-endian)
+ output.order(ByteOrder.LITTLE_ENDIAN);
+ output.putInt((int) crc32.getValue());
+ output.putInt(inputSize);
+
+ // For BGZF, patch the total block size into the header
+ if (format == Format.BGZF) {
+ final int totalBlockSize = output.position() - outputStart;
+ output.order(ByteOrder.LITTLE_ENDIAN);
+ output.putShort(outputStart + BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET,
+ (short) (totalBlockSize - 1));
+ }
+
+ return output.position() - outputStart;
+ }
+
+ /**
+ * Compress data and return a new ByteBuffer containing the compressed result.
+ *
+ * @param input data to compress (from position to limit; position is advanced to limit)
+ * @return a new ByteBuffer containing the compressed data, positioned at 0 with limit at the end
+ */
+ public ByteBuffer compress(final ByteBuffer input) {
+ return compress(input, Format.GZIP);
+ }
+
+ /**
+ * Compress data and return a new ByteBuffer containing the compressed result.
+ *
+ * @param input data to compress (from position to limit; position is advanced to limit)
+ * @param format the output format
+ * @return a new ByteBuffer containing the compressed data, positioned at 0 with limit at the end
+ */
+ public ByteBuffer compress(final ByteBuffer input, final Format format) {
+ // Worst case: incompressible data + header + trailer. Deflater overhead is at most
+ // 5 bytes per 32KB block + a few bytes for the zlib wrapper.
+ final int maxCompressed = input.remaining() + (input.remaining() / 16000 + 1) * 5 + 256;
+ final int headerSize = format == Format.BGZF ? BGZF_HEADER_SIZE : GZIP_HEADER_SIZE;
+ final ByteBuffer output = ByteBuffer.allocate(headerSize + maxCompressed + GZIP_TRAILER_SIZE);
+ compress(input, output, format);
+ output.flip();
+ return output;
+ }
+
+ /** Write a GZIP or BGZF header to the output buffer. Returns the header size. */
+ private int writeHeader(final ByteBuffer output, final Format format) {
+ if (format == Format.BGZF) {
+ output.put(GZIP_ID1);
+ output.put(GZIP_ID2);
+ output.put(GZIP_CM_DEFLATE);
+ output.put((byte) FEXTRA); // FLG: FEXTRA set
+ output.putInt(0); // MTIME
+ output.put((byte) 0); // XFL
+ output.put((byte) 0xFF); // OS: unknown
+ output.order(ByteOrder.LITTLE_ENDIAN);
+ output.putShort(BlockCompressedStreamConstants.GZIP_XLEN); // XLEN = 6
+ output.put(BlockCompressedStreamConstants.BGZF_ID1); // SI1 = 'B'
+ output.put(BlockCompressedStreamConstants.BGZF_ID2); // SI2 = 'C'
+ output.putShort(BlockCompressedStreamConstants.BGZF_LEN); // SLEN = 2
+ output.putShort((short) 0); // BSIZE placeholder — patched after deflation
+ return BGZF_HEADER_SIZE;
+ } else {
+ output.put(GZIP_ID1);
+ output.put(GZIP_ID2);
+ output.put(GZIP_CM_DEFLATE);
+ output.put((byte) 0); // FLG: no optional fields
+ output.putInt(0); // MTIME
+ output.put((byte) 0); // XFL
+ output.put((byte) 0xFF); // OS: unknown
+ return GZIP_HEADER_SIZE;
+ }
+ }
+
+ // --------------------------------------------------------------------------------------------
+ // Decompression
+ // --------------------------------------------------------------------------------------------
+
+ /**
+ * Decompress GZIP or BGZF data from {@code input} into {@code output}.
+ * Handles both standard GZIP and BGZF transparently.
+ *
+ * @param input compressed data (from position to limit; position is advanced)
+ * @param output buffer to write decompressed data into (from position; position is advanced)
+ * @return number of decompressed bytes written to output
+ */
+ public int decompress(final ByteBuffer input, final ByteBuffer output) {
+ input.order(ByteOrder.LITTLE_ENDIAN);
+
+ // Parse and validate the GZIP header
+ if (input.remaining() < GZIP_HEADER_SIZE + GZIP_TRAILER_SIZE) {
+ throw new IllegalArgumentException("Input too small to be a valid GZIP block");
+ }
+
+ final byte id1 = input.get();
+ final byte id2 = input.get();
+ final byte cm = input.get();
+ final int flg = input.get() & 0xFF;
+ if (id1 != GZIP_ID1 || id2 != GZIP_ID2 || cm != GZIP_CM_DEFLATE) {
+ throw new IllegalArgumentException("Invalid GZIP header");
+ }
+
+ input.position(input.position() + 6); // skip MTIME(4) + XFL(1) + OS(1)
+
+ // Handle optional GZIP fields based on FLG bits
+ if ((flg & FEXTRA) != 0) {
+ final int xlen = input.getShort() & 0xFFFF;
+ input.position(input.position() + xlen); // skip extra field (includes BGZF subfield if present)
+ }
+ if ((flg & FNAME) != 0) {
+ while (input.get() != 0) {} // skip null-terminated filename
+ }
+ if ((flg & FCOMMENT) != 0) {
+ while (input.get() != 0) {} // skip null-terminated comment
+ }
+ if ((flg & FHCRC) != 0) {
+ input.position(input.position() + 2); // skip header CRC16
+ }
+
+ // The deflated data is between the current position and 8 bytes before the end
+ final int deflatedStart = input.position();
+ final int deflatedEnd = input.limit() - GZIP_TRAILER_SIZE;
+ final int deflatedSize = deflatedEnd - deflatedStart;
+ if (deflatedSize < 0) {
+ throw new IllegalArgumentException("Invalid GZIP block: no room for deflated data and trailer");
+ }
+
+ // Extract deflated bytes for inflater (byte[] API for compatibility with LibdeflateInflater)
+ final byte[] deflatedBytes;
+ final int deflatedOff;
+ if (input.hasArray()) {
+ deflatedBytes = input.array();
+ deflatedOff = input.arrayOffset() + deflatedStart;
+ } else {
+ deflatedBytes = new byte[deflatedSize];
+ input.position(deflatedStart);
+ input.get(deflatedBytes);
+ deflatedOff = 0;
+ }
+
+ inflater.reset();
+ inflater.setInput(deflatedBytes, deflatedOff, deflatedSize);
+
+ // Inflate into output
+ try {
+ int totalInflated = 0;
+ while (!inflater.finished() && output.hasRemaining()) {
+ final int n = inflater.inflate(output.array(), output.arrayOffset() + output.position(), output.remaining());
+ output.position(output.position() + n);
+ totalInflated += n;
+ }
+
+ // Read trailer: CRC32 + ISIZE
+ input.position(deflatedEnd);
+ final int expectedCrc = input.getInt();
+ final int expectedSize = input.getInt();
+
+ if (totalInflated != expectedSize) {
+ throw new IllegalStateException(
+ String.format("GZIP ISIZE mismatch: expected %d, got %d", expectedSize, totalInflated));
+ }
+
+ // Validate CRC32 if enabled
+ if (checkCrcs) {
+ crc32.reset();
+ final ByteBuffer outputSlice = output.duplicate();
+ outputSlice.flip();
+ // Position to where we started writing
+ outputSlice.position(output.position() - totalInflated);
+ crc32.update(outputSlice);
+ if ((int) crc32.getValue() != expectedCrc) {
+ throw new IllegalStateException(
+ String.format("GZIP CRC32 mismatch: expected %08x, got %08x",
+ expectedCrc, (int) crc32.getValue()));
+ }
+ }
+
+ return totalInflated;
+ } catch (final DataFormatException e) {
+ throw new IllegalStateException("Error inflating GZIP data", e);
+ }
+ }
+
+ /**
+ * Decompress GZIP or BGZF data and return a new ByteBuffer containing the result.
+ * Reads the ISIZE field from the GZIP trailer to determine the output size.
+ *
+ * @param input compressed data (from position to limit; position is advanced)
+ * @return a new ByteBuffer containing the decompressed data, positioned at 0 with limit at the end
+ */
+ public ByteBuffer decompress(final ByteBuffer input) {
+ // Read ISIZE from the last 4 bytes of the GZIP block to size the output
+ final int isizeOffset = input.limit() - 4;
+ final int isize = input.duplicate().order(ByteOrder.LITTLE_ENDIAN).position(isizeOffset).getInt();
+ final ByteBuffer output = ByteBuffer.allocate(isize);
+ decompress(input, output);
+ output.flip();
+ return output;
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/util/SequenceUtil.java b/src/main/java/htsjdk/samtools/util/SequenceUtil.java
index 18e524ae35..3a75c2a068 100644
--- a/src/main/java/htsjdk/samtools/util/SequenceUtil.java
+++ b/src/main/java/htsjdk/samtools/util/SequenceUtil.java
@@ -63,9 +63,7 @@ public class SequenceUtil {
private static final int BASES_ARRAY_LENGTH = 127;
private static final int SHIFT_TO_LOWER_CASE = a - A;
- /**
- * A lookup table to find a corresponding BAM read base.
- */
+ /** Lookup table mapping any byte to its BAM-valid upper-case equivalent (or N if invalid). */
private static final byte[] bamReadBaseLookup = new byte[BASES_ARRAY_LENGTH];
static {
Arrays.fill(bamReadBaseLookup, N);
@@ -75,6 +73,21 @@ public class SequenceUtil {
}
}
+ /**
+ * Returns a defensive copy of the BAM read base lookup table. The table maps each byte
+ * value (indexed by {@code value & 0x7F}) to its BAM-valid upper-case base equivalent
+ * (one of A, C, G, T, N, M, R, W, S, Y, K, V, H, D, B), or 'N' if the input is not a
+ * recognized base. Both upper and lower case inputs map to the upper case base.
+ *
+ *
+ * <p>Callers that need repeated lookups on a hot path should store the returned array
+ * in a local or static field rather than calling this method repeatedly.
+ *
+ * @return a new copy of the 127-element lookup table
+ */
+ public static byte[] getBamReadBaseLookup() {
+ return bamReadBaseLookup.clone();
+ }
+
private static final byte A_MASK = 1;
private static final byte C_MASK = 2;
private static final byte G_MASK = 4;
@@ -939,59 +952,72 @@ public static String md5DigestToString(final byte[] digest) {
}
- public static byte[] calculateMD5(final byte[] data, final int offset, final int len) {
- final MessageDigest md5_MessageDigest;
+ private static final ThreadLocal<MessageDigest> md5Digest = ThreadLocal.withInitial(() -> {
try {
- md5_MessageDigest = MessageDigest.getInstance("MD5");
- md5_MessageDigest.reset();
-
- md5_MessageDigest.update(data, offset, len);
- return md5_MessageDigest.digest();
+ return MessageDigest.getInstance("MD5");
} catch (final NoSuchAlgorithmException e) {
throw new RuntimeException(e);
}
+ });
+
+ public static byte[] calculateMD5(final byte[] data, final int offset, final int len) {
+ final MessageDigest md = md5Digest.get();
+ md.reset();
+ md.update(data, offset, len);
+ return md.digest();
}
/**
- * Calculate MD and NM similarly to Samtools, except that N->N is a match.
+ * Compute MD string and NM count from a read's CIGAR, bases, and a reference sequence slice.
+ * This is the core implementation shared by {@link #calculateMdAndNmTags(SAMRecord, byte[], boolean, boolean)}
+ * and the CRAM decoder's NM/MD regeneration.
*
- * @param record Input record for which to calculate NM and MD.
- * The appropriate tags will be added/updated in the record
- * @param ref The reference bases for the sequence to which the record is mapped
- * @param calcMD A flag indicating whether to update the MD tag in the record
- * @param calcNM A flag indicating whether to update the NM tag in the record
+ *
+ * <p>The reference bases are accessed starting at {@code refOffset} — i.e., {@code referenceBases[0]}
+ * corresponds to the genomic position {@code refOffset + 1} (1-based). The read's alignment start
+ * (1-based) determines where in the reference to begin comparing.
+ *
+ * <p>Matches are determined by upper-casing both bases before comparison. N-to-N is treated as a match
+ * (matching samtools behavior).
+ *
+ * @param cigarElements the CIGAR elements for the read
+ * @param readBases the read's base sequence
+ * @param referenceBases the reference bases covering the read's alignment region
+ * @param refOffset the 0-based genomic offset of the first base in {@code referenceBases}
+ * @param alignmentStart the 1-based alignment start position of the read
+ * @return a Tuple of (MD string, NM count)
*/
- public static void calculateMdAndNmTags(final SAMRecord record, final byte[] ref,
- final boolean calcMD, final boolean calcNM) {
- if (!calcMD && !calcNM)
- return;
-
- final Cigar cigar = record.getCigar();
- final List<CigarElement> cigarElements = cigar.getCigarElements();
- final byte[] seq = record.getReadBases();
- final int alignmentStart = record.getAlignmentStart() - 1;
- int cigarIndex, blockRefPos, blockReadStart, matchCount = 0;
+ public static Tuple<String, Integer> calculateMdAndNm(
+ final List<CigarElement> cigarElements,
+ final byte[] readBases,
+ final byte[] referenceBases,
+ final int refOffset,
+ final int alignmentStart) {
+
+ // blockRefPos is the 0-based position in the reference array, adjusted for the offset
+ final int startInRef = alignmentStart - 1 - refOffset;
+ int blockRefPos = startInRef;
+ int blockReadStart = 0;
+ int matchCount = 0;
int nmCount = 0;
final StringBuilder mdString = new StringBuilder();
- final int nElements = cigarElements.size();
- for (cigarIndex = blockReadStart = 0, blockRefPos = alignmentStart; cigarIndex < nElements; ++cigarIndex) {
- final CigarElement ce = cigarElements.get(cigarIndex);
+ for (final CigarElement ce : cigarElements) {
int inBlockOffset;
final int blockLength = ce.getLength();
final CigarOperator op = ce.getOperator();
+
if (op == CigarOperator.MATCH_OR_MISMATCH || op == CigarOperator.EQ
|| op == CigarOperator.X) {
for (inBlockOffset = 0; inBlockOffset < blockLength; ++inBlockOffset) {
final int readOffset = blockReadStart + inBlockOffset;
+ final int refIdx = blockRefPos + inBlockOffset;
- if (ref.length <= blockRefPos + inBlockOffset) break; // out of boundary
+ if (refIdx >= referenceBases.length) break; // out of boundary
- final byte readBase = seq[readOffset];
- final byte refBase = ref[blockRefPos + inBlockOffset];
+ final byte readBase = readBases[readOffset];
+ final byte refBase = referenceBases[refIdx];
if ((bases[readBase] == bases[refBase]) || readBase == 0) {
- // a match
++matchCount;
} else {
mdString.append(matchCount);
@@ -1007,15 +1033,15 @@ public static void calculateMdAndNmTags(final SAMRecord record, final byte[] ref
mdString.append(matchCount);
mdString.append('^');
for (inBlockOffset = 0; inBlockOffset < blockLength; ++inBlockOffset) {
- if (ref[blockRefPos + inBlockOffset] == 0) break;
- mdString.appendCodePoint(ref[blockRefPos + inBlockOffset]);
+ final int refIdx = blockRefPos + inBlockOffset;
+ if (refIdx >= referenceBases.length || referenceBases[refIdx] == 0) break;
+ mdString.appendCodePoint(referenceBases[refIdx]);
}
matchCount = 0;
if (inBlockOffset < blockLength) break;
blockRefPos += blockLength;
nmCount += blockLength;
- } else if (op == CigarOperator.INSERTION
- || op == CigarOperator.SOFT_CLIP) {
+ } else if (op == CigarOperator.INSERTION || op == CigarOperator.SOFT_CLIP) {
blockReadStart += blockLength;
if (op == CigarOperator.INSERTION) nmCount += blockLength;
} else if (op == CigarOperator.SKIPPED_REGION) {
@@ -1024,8 +1050,33 @@ public static void calculateMdAndNmTags(final SAMRecord record, final byte[] ref
}
mdString.append(matchCount);
- if (calcMD) record.setAttribute(SAMTag.MD, mdString.toString());
- if (calcNM) record.setAttribute(SAMTag.NM, nmCount);
+ return new Tuple<>(mdString.toString(), nmCount);
+ }
+
+ /**
+ * Calculate MD and NM similarly to Samtools, except that N->N is a match.
+ *
+ * @param record Input record for which to calculate NM and MD.
+ * The appropriate tags will be added/updated in the record
+ * @param ref The reference bases for the entire contig to which the record is mapped
+ * (index 0 = position 1 on the contig)
+ * @param calcMD A flag indicating whether to update the MD tag in the record
+ * @param calcNM A flag indicating whether to update the NM tag in the record
+ */
+ public static void calculateMdAndNmTags(final SAMRecord record, final byte[] ref,
+ final boolean calcMD, final boolean calcNM) {
+ if (!calcMD && !calcNM)
+ return;
+
+ final Tuple<String, Integer> result = calculateMdAndNm(
+ record.getCigar().getCigarElements(),
+ record.getReadBases(),
+ ref,
+ 0, // ref array starts at position 1 on the contig
+ record.getAlignmentStart());
+
+ if (calcMD) record.setAttribute(SAMTag.MD, result.a);
+ if (calcNM) record.setAttribute(SAMTag.NM, result.b);
}
public static byte upperCase(final byte base) {
diff --git a/src/main/java/htsjdk/samtools/util/zip/LibdeflateDeflater.java b/src/main/java/htsjdk/samtools/util/zip/LibdeflateDeflater.java
index da73a24a11..8b26861b72 100644
--- a/src/main/java/htsjdk/samtools/util/zip/LibdeflateDeflater.java
+++ b/src/main/java/htsjdk/samtools/util/zip/LibdeflateDeflater.java
@@ -25,6 +25,7 @@
import com.fulcrumgenomics.jlibdeflate.LibdeflateCompressor;
+import java.nio.ByteBuffer;
import java.util.zip.Deflater;
/**
@@ -70,6 +71,19 @@ public void setInput(final byte[] input, final int off, final int len) {
this.done = false;
}
+ @Override
+ public void setInput(final ByteBuffer input) {
+ final int len = input.remaining();
+ if (input.hasArray()) {
+ setInput(input.array(), input.arrayOffset() + input.position(), len);
+ input.position(input.limit());
+ } else {
+ final byte[] bytes = new byte[len];
+ input.get(bytes);
+ setInput(bytes, 0, len);
+ }
+ }
+
@Override
public void finish() {
this.finishing = true;
@@ -95,6 +109,21 @@ public int deflate(final byte[] output, final int off, final int len) {
return compressed;
}
+ @Override
+ public int deflate(final ByteBuffer output) {
+ return deflate(output, Deflater.NO_FLUSH);
+ }
+
+ @Override
+ public int deflate(final ByteBuffer output, final int flush) {
+ if (!output.hasArray()) {
+ throw new UnsupportedOperationException("LibdeflateDeflater requires a heap-backed ByteBuffer for output");
+ }
+ final int n = deflate(output.array(), output.arrayOffset() + output.position(), output.remaining());
+ output.position(output.position() + n);
+ return n;
+ }
+
@Override
public boolean finished() {
return finishing && done;
diff --git a/src/main/java/htsjdk/samtools/util/zip/LibdeflateInflater.java b/src/main/java/htsjdk/samtools/util/zip/LibdeflateInflater.java
index dc0e396560..e3f596c00b 100644
--- a/src/main/java/htsjdk/samtools/util/zip/LibdeflateInflater.java
+++ b/src/main/java/htsjdk/samtools/util/zip/LibdeflateInflater.java
@@ -25,6 +25,7 @@
import com.fulcrumgenomics.jlibdeflate.LibdeflateDecompressor;
+import java.nio.ByteBuffer;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
@@ -66,6 +67,29 @@ public void setInput(final byte[] input, final int off, final int len) {
this.inputLen = len;
}
+ @Override
+ public void setInput(final ByteBuffer input) {
+ final int len = input.remaining();
+ if (input.hasArray()) {
+ setInput(input.array(), input.arrayOffset() + input.position(), len);
+ input.position(input.limit());
+ } else {
+ final byte[] bytes = new byte[len];
+ input.get(bytes);
+ setInput(bytes, 0, len);
+ }
+ }
+
+ @Override
+ public int inflate(final ByteBuffer output) throws DataFormatException {
+ if (!output.hasArray()) {
+ throw new UnsupportedOperationException("LibdeflateInflater requires a heap-backed ByteBuffer for output");
+ }
+ final int n = inflate(output.array(), output.arrayOffset() + output.position(), output.remaining());
+ output.position(output.position() + n);
+ return n;
+ }
+
@Override
public int inflate(final byte[] output, final int off, final int len) throws DataFormatException {
if (inputBuf == null || inputLen == 0 || len == 0) {
diff --git a/src/test/java/htsjdk/beta/codecs/reads/cram/HtsCRAMCodec31Test.java b/src/test/java/htsjdk/beta/codecs/reads/cram/HtsCRAMCodec31Test.java
index a5ddeeffd9..93504c7d2d 100644
--- a/src/test/java/htsjdk/beta/codecs/reads/cram/HtsCRAMCodec31Test.java
+++ b/src/test/java/htsjdk/beta/codecs/reads/cram/HtsCRAMCodec31Test.java
@@ -2,18 +2,28 @@
import htsjdk.HtsjdkTest;
import htsjdk.beta.codecs.reads.cram.cramV3_1.CRAMCodecV3_1;
+import htsjdk.beta.plugin.IOUtils;
import htsjdk.beta.plugin.reads.ReadsDecoderOptions;
+import htsjdk.beta.plugin.reads.ReadsEncoderOptions;
import htsjdk.beta.plugin.reads.ReadsFormats;
import htsjdk.beta.plugin.registry.HtsDefaultRegistry;
import htsjdk.io.HtsPath;
import htsjdk.io.IOPath;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.cram.cram31.CRAM31FidelityTestBase;
+import htsjdk.samtools.cram.common.CramVersions;
import htsjdk.samtools.util.CloseableIterator;
+import htsjdk.samtools.util.FileExtensions;
import htsjdk.utils.SamtoolsTestUtils;
import org.testng.Assert;
import org.testng.annotations.Test;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
public class HtsCRAMCodec31Test extends HtsjdkTest {
final IOPath TEST_DIR = new HtsPath("src/test/resources/htsjdk/samtools/");
@@ -45,8 +55,63 @@ public void testCRAMDecoder() {
try (final CloseableIterator<SAMRecord> iterator = cramDecoder.iterator()) {
Assert.assertTrue(iterator.hasNext());
final SAMRecord samRecord = iterator.next();
- Assert.assertEquals(samRecord.getReadName(), "20FUKAAXX100202:6:27:4968:125377");
+ Assert.assertEquals(samRecord.getReadName(), "20FUKAAXX100202:5:62:8987:1929");
+ }
+ }
+ }
+
+ @Test
+ public void testRoundTripCRAM31() throws IOException {
+ final IOPath sourceCRAMPath = new HtsPath(TEST_DIR + "cram/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram");
+ final IOPath referencePath = new HtsPath(TEST_DIR + "reference/human_g1k_v37.20.21.fasta.gz");
+ final IOPath tempCRAM31Path = IOUtils.createTempPath("htsCRAMCodecTemporary", FileExtensions.CRAM);
+
+ final ReadsDecoderOptions readsDecoderOptions =
+ new ReadsDecoderOptions().setCRAMDecoderOptions(
+ new CRAMDecoderOptions().setReferencePath(referencePath));
+ final ReadsEncoderOptions readsEncoderOptions =
+ new ReadsEncoderOptions()
+ .setPreSorted(true)
+ .setCRAMEncoderOptions(new CRAMEncoderOptions().setReferencePath(referencePath));
+
+ try (final CRAMDecoder cramDecoder = (CRAMDecoder)
+ HtsDefaultRegistry.getReadsResolver().getReadsDecoder(sourceCRAMPath, readsDecoderOptions);
+ final CRAMEncoder cram31Encoder = (CRAMEncoder)
+ HtsDefaultRegistry.getReadsResolver().getReadsEncoder(tempCRAM31Path, readsEncoderOptions)) {
+
+ Assert.assertNotNull(cramDecoder);
+ Assert.assertEquals(cramDecoder.getFileFormat(), ReadsFormats.CRAM);
+ Assert.assertTrue(cramDecoder.getDisplayName().contains(sourceCRAMPath.toString()));
+
+ Assert.assertNotNull(cram31Encoder);
+ Assert.assertEquals(cram31Encoder.getFileFormat(), ReadsFormats.CRAM);
+ Assert.assertTrue(cram31Encoder.getDisplayName().contains(tempCRAM31Path.toString()));
+
+ final SAMFileHeader samFileHeader = cramDecoder.getHeader();
+ cram31Encoder.setHeader(samFileHeader);
+ for (final SAMRecord samRec : cramDecoder) {
+ cram31Encoder.write(samRec);
+ }
+ }
+
+ // make sure we got a CRAM 3.1 file
+ Assert.assertEquals(CRAM31FidelityTestBase.getCRAMVersion(tempCRAM31Path), CramVersions.CRAM_v3_1);
+
+ final List<SAMRecord> recs30 = new ArrayList<>();
+ final List<SAMRecord> recs31 = new ArrayList<>();
+
+ try (final CRAMDecoder cram30Decoder = (CRAMDecoder)
+ HtsDefaultRegistry.getReadsResolver().getReadsDecoder(sourceCRAMPath, readsDecoderOptions);
+ final CRAMDecoder cram31Decoder = (CRAMDecoder)
+ HtsDefaultRegistry.getReadsResolver().getReadsDecoder(tempCRAM31Path, readsDecoderOptions)) {
+ final Iterator<SAMRecord> it31 = cram31Decoder.iterator();
+ for (final SAMRecord sam30Rec : cram30Decoder) {
+ final SAMRecord sam31Rec = it31.next();
+ recs30.add(sam30Rec);
+ recs31.add(sam31Rec);
+ Assert.assertEquals(sam30Rec, sam31Rec);
}
}
}
+
}
diff --git a/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java b/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java
index dc66b80ad5..25d74d7baa 100644
--- a/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java
+++ b/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java
@@ -3,6 +3,7 @@
import htsjdk.HtsjdkTest;
import htsjdk.samtools.cram.ref.ReferenceSource;
import htsjdk.samtools.reference.InMemoryReferenceSequenceFile;
+import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.CloseableIterator;
import org.testng.Assert;
import org.testng.annotations.Test;
@@ -35,6 +36,18 @@ private List<SAMRecord> createRecords(int count) {
}
list.addAll(builder.getRecords());
+ // Add NM/MD tags to match what CRAM decode will regenerate
+ final ReferenceSource refSource = createReferenceSource();
+ for (final SAMRecord rec : list) {
+ if (!rec.getReadUnmappedFlag() && rec.getReferenceIndex() >= 0) {
+ final byte[] refBases = refSource.getReferenceBases(
+ rec.getHeader().getSequence(rec.getReferenceIndex()), false);
+ if (refBases != null) {
+ SequenceUtil.calculateMdAndNmTags(rec, refBases, true, true);
+ }
+ }
+ }
+
Collections.sort(list, new SAMRecordCoordinateComparator());
return list;
diff --git a/src/test/java/htsjdk/samtools/CRAMFileBAIIndexTest.java b/src/test/java/htsjdk/samtools/CRAMFileBAIIndexTest.java
index 1deab7c074..f110dc49a7 100644
--- a/src/test/java/htsjdk/samtools/CRAMFileBAIIndexTest.java
+++ b/src/test/java/htsjdk/samtools/CRAMFileBAIIndexTest.java
@@ -25,7 +25,11 @@
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
+import java.util.Iterator;
import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.stream.Stream;
/**
* A collection of tests for CRAM BAI index write/read that use BAMFileIndexTest/index_test.bam file as the
@@ -38,6 +42,12 @@ public class CRAMFileBAIIndexTest extends HtsjdkTest {
private static final int NUMBER_OF_MAPPED_READS = 9721;
private static final int NUMBER_OF_READS = 10000;
+ // Caches to avoid rebuilding CRAM/BAI bytes for the same encoding strategy across DataProvider rows.
+ // Tests are read-only against this data so sharing is safe.
+ private static final ConcurrentMap<Integer, byte[]> cramBytesCache = new ConcurrentHashMap<>();
+ private static final ConcurrentMap<Integer, byte[]> baiBytesCache = new ConcurrentHashMap<>();
+ private static final ConcurrentMap<Integer, File> cramFileCache = new ConcurrentHashMap<>();
+
private final static String TEST_QUERY_ALIGNMENT_CONTIG = "chrM";
private final static int TEST_QUERY_ALIGNMENT_START = 1519;
@@ -52,33 +62,23 @@ public class CRAMFileBAIIndexTest extends HtsjdkTest {
@DataProvider(name="filesWithContainerAndSlicePartitioningVariations")
public Object[][] getFilesWithContainerAndSlicePartitioningVariations() throws IOException {
return new Object[][] {
- // in order to set reads/slice to a small number, we must do the same for minimumSingleReferenceSliceSize
- //{ getCRAMFileForBAMFile(BAM_FILE, referenceSource, new CRAMEncodingStrategy()) },
- { getCRAMFileForBAMFile(BAM_FILE, referenceSource,
- new CRAMEncodingStrategy().setMinimumSingleReferenceSliceSize(100).setReadsPerSlice(100)) },
- { getCRAMFileForBAMFile(BAM_FILE, referenceSource,
- new CRAMEncodingStrategy().setMinimumSingleReferenceSliceSize(150).setReadsPerSlice(150)) },
- { getCRAMFileForBAMFile(BAM_FILE, referenceSource,
- new CRAMEncodingStrategy().setMinimumSingleReferenceSliceSize(200).setReadsPerSlice(200)) },
- { getCRAMFileForBAMFile(BAM_FILE, referenceSource,
- new CRAMEncodingStrategy().setMinimumSingleReferenceSliceSize(300).setReadsPerSlice(300)) },
+ // Smallest and largest reads/slice to exercise different container/slice boundaries.
+ // Intermediate values (150, 200) removed since CRAMIndexPermutationsTests covers
+ // 12 strategy permutations with a different BAM.
+ { getCachedCRAMFile(100) },
+ { getCachedCRAMFile(300) },
};
}
@DataProvider(name="bytesWithContainerAndSlicePartitioningVariations")
- public Object[][] getBytesWithContainerAndSlicePartitioningVariations() throws IOException {
- return new Object[][] {
- // in order to set reads/slice to a small number, we must do the same for minimumSingleReferenceSliceSize
- //{ getCRAMBytesForBAMFile(BAM_FILE, referenceSource, new CRAMEncodingStrategy()) },
- { getCRAMBytesForBAMFile(BAM_FILE, referenceSource,
- new CRAMEncodingStrategy().setMinimumSingleReferenceSliceSize(100).setReadsPerSlice(100)) },
- { getCRAMBytesForBAMFile(BAM_FILE, referenceSource,
- new CRAMEncodingStrategy().setMinimumSingleReferenceSliceSize(150).setReadsPerSlice(150)) },
- { getCRAMBytesForBAMFile(BAM_FILE, referenceSource,
- new CRAMEncodingStrategy().setMinimumSingleReferenceSliceSize(200).setReadsPerSlice(200)) },
- { getCRAMBytesForBAMFile(BAM_FILE, referenceSource,
- new CRAMEncodingStrategy().setMinimumSingleReferenceSliceSize(300).setReadsPerSlice(300)) },
- };
+ public Iterator