From 881163d75301976b17864b09a08d8d1e5b31adb9 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 24 Feb 2025 09:02:37 -0500 Subject: [PATCH 1/4] Enable a naive CRAM 3.1 write profile. --- .../reads/cram/cramV3_1/CRAMEncoderV3_1.java | 1 - .../beta/plugin/registry/ReadsResolver.java | 26 ------------ .../samtools/cram/common/CramVersions.java | 2 +- .../CompressionHeaderEncodingMap.java | 26 ++++++++---- .../cram/structure/CompressorCache.java | 8 ++-- .../htsjdk/samtools/cram/CRAM31Tests.java | 40 ++++++++++++++++--- .../htsjdk/samtools/cram/CRAMVersionTest.java | 4 +- .../structure/SliceBlockWriteStreamTest.java | 8 +++- 8 files changed, 67 insertions(+), 48 deletions(-) diff --git a/src/main/java/htsjdk/beta/codecs/reads/cram/cramV3_1/CRAMEncoderV3_1.java b/src/main/java/htsjdk/beta/codecs/reads/cram/cramV3_1/CRAMEncoderV3_1.java index fdcc53a8cc..4ae82e91e1 100644 --- a/src/main/java/htsjdk/beta/codecs/reads/cram/cramV3_1/CRAMEncoderV3_1.java +++ b/src/main/java/htsjdk/beta/codecs/reads/cram/cramV3_1/CRAMEncoderV3_1.java @@ -22,7 +22,6 @@ public class CRAMEncoderV3_1 extends CRAMEncoder { */ public CRAMEncoderV3_1(final Bundle outputBundle, final ReadsEncoderOptions readsEncoderOptions) { super(outputBundle, readsEncoderOptions); - throw new CRAMException("CRAM v3.1 encoding is not yet supported"); } @Override diff --git a/src/main/java/htsjdk/beta/plugin/registry/ReadsResolver.java b/src/main/java/htsjdk/beta/plugin/registry/ReadsResolver.java index 4e67a4dffd..b0df99899a 100644 --- a/src/main/java/htsjdk/beta/plugin/registry/ReadsResolver.java +++ b/src/main/java/htsjdk/beta/plugin/registry/ReadsResolver.java @@ -209,30 +209,4 @@ public ReadsEncoder getReadsEncoder( .getEncoder(outputBundle, readsEncoderOptions); } - /** - * Temporarily override to remove the CRAM 3.1 codec from the list of candidate codecs when the request is for - * the newest version, since it has no write implementation yet. - */ - @Override - protected List filterByVersion(final List candidateCodecs, final HtsVersion htsVersion) { - final List preFilteredCodecs; - if (htsVersion.equals(HtsVersion.NEWEST_VERSION)) { - // if the request is for the newest version, then pre-filter out the CRAM 3.1 codec since it has no - // write implementation yet, and then delegate to the superclass to let it find the newest version among - // the remaining codecs - preFilteredCodecs = candidateCodecs.stream().filter( - c -> !(c.getFileFormat().equals(ReadsFormats.CRAM) - && c.getVersion().equals(CRAMCodecV3_1.VERSION_3_1))) - .collect(Collectors.toList()); - final HtsVersion newestVersion = preFilteredCodecs.stream() - .map(c -> c.getVersion()) - .reduce(candidateCodecs.get(0).getVersion(), - (HtsVersion a, HtsVersion b) -> a.compareTo(b) > 0 ? a : b); - return candidateCodecs.stream().filter( - c -> c.getVersion().equals(newestVersion)).collect(Collectors.toList()); - } else { - preFilteredCodecs = candidateCodecs; - } - return super.filterByVersion(preFilteredCodecs, htsVersion); - } } diff --git a/src/main/java/htsjdk/samtools/cram/common/CramVersions.java b/src/main/java/htsjdk/samtools/cram/common/CramVersions.java index 852a135563..be0c86f909 100644 --- a/src/main/java/htsjdk/samtools/cram/common/CramVersions.java +++ b/src/main/java/htsjdk/samtools/cram/common/CramVersions.java @@ -17,7 +17,7 @@ public final class CramVersions { /** * The default CRAM version when creating a new CRAM output file or stream. */ - public static final CRAMVersion DEFAULT_CRAM_VERSION = CRAM_v3; + public static final CRAMVersion DEFAULT_CRAM_VERSION = CRAM_v3_1; /** * Return true if {@code candidateVersion} is a supported CRAM version. diff --git a/src/main/java/htsjdk/samtools/cram/structure/CompressionHeaderEncodingMap.java b/src/main/java/htsjdk/samtools/cram/structure/CompressionHeaderEncodingMap.java index 4394b49eab..a7be0d938f 100644 --- a/src/main/java/htsjdk/samtools/cram/structure/CompressionHeaderEncodingMap.java +++ b/src/main/java/htsjdk/samtools/cram/structure/CompressionHeaderEncodingMap.java @@ -26,7 +26,8 @@ import htsjdk.samtools.cram.CRAMException; import htsjdk.samtools.cram.compression.ExternalCompressor; -import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Params; +import htsjdk.samtools.cram.compression.nametokenisation.NameTokenisationDecode; +import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Params; import htsjdk.samtools.cram.encoding.CRAMEncoding; import htsjdk.samtools.cram.encoding.external.ByteArrayStopEncoding; import htsjdk.samtools.cram.encoding.external.ExternalByteEncoding; @@ -140,7 +141,7 @@ public CompressionHeaderEncodingMap(final CRAMEncodingStrategy encodingStrategy) putExternalRansOrderOneEncoding(DataSeries.RG_ReadGroup); putExternalRansOrderZeroEncoding(DataSeries.RI_RefId); putExternalRansOrderOneEncoding(DataSeries.RL_ReadLength); - putExternalByteArrayStopTabGzipEncoding(encodingStrategy, DataSeries.RN_ReadName); + putByteArrayStopNameTokEncoding(encodingStrategy, DataSeries.RN_ReadName); putExternalGzipEncoding(encodingStrategy, DataSeries.RS_RefSkip); putExternalByteArrayStopTabGzipEncoding(encodingStrategy, DataSeries.SC_SoftClip); // the TC data series is obsolete @@ -287,13 +288,13 @@ public ExternalCompressor getBestExternalCompressor(final byte[] data, final CRA final int gzipLen = gzip.compress(data, null).length; final ExternalCompressor rans0 = compressorCache.getCompressorForMethod( - BlockCompressionMethod.RANS, - RANS4x8Params.ORDER.ZERO.ordinal()); + BlockCompressionMethod.RANSNx16, + RANSNx16Params.ORDER.ZERO.ordinal()); final int rans0Len = rans0.compress(data,null).length; final ExternalCompressor rans1 = compressorCache.getCompressorForMethod( - BlockCompressionMethod.RANS, - RANS4x8Params.ORDER.ONE.ordinal()); + BlockCompressionMethod.RANSNx16, + RANSNx16Params.ORDER.ONE.ordinal()); final int rans1Len = rans1.compress(data, null).length; // find the best of general purpose codecs: @@ -386,6 +387,15 @@ private void putExternalByteArrayStopTabGzipEncoding(final CRAMEncodingStrategy compressorCache.getCompressorForMethod(BlockCompressionMethod.GZIP, encodingStrategy.getGZIPCompressionLevel())); } + private void putByteArrayStopNameTokEncoding(final CRAMEncodingStrategy encodingStrategy, final DataSeries dataSeries) { + // ByteArrayStopEncoding is paired with name tokenisation since using it with the + // NameTokenisationDecode.NAME_SEPARATOR conveniently writes the read name data in the NAME_SEPARATOR + // delimited/terminated format that is expected by the downstream tokenisation compressor code + putExternalEncoding(dataSeries, + new ByteArrayStopEncoding(NameTokenisationDecode.NAME_SEPARATOR, dataSeries.getExternalBlockContentId()).toEncodingDescriptor(), + compressorCache.getCompressorForMethod(BlockCompressionMethod.NAME_TOKENISER, 0)); + } + // add an external encoding appropriate for the dataSeries value type, with a GZIP compressor private void putExternalGzipEncoding(final CRAMEncodingStrategy encodingStrategy, final DataSeries dataSeries) { putExternalEncoding( @@ -397,14 +407,14 @@ private void putExternalGzipEncoding(final CRAMEncodingStrategy encodingStrategy private void putExternalRansOrderOneEncoding(final DataSeries dataSeries) { putExternalEncoding( dataSeries, - compressorCache.getCompressorForMethod(BlockCompressionMethod.RANS, RANS4x8Params.ORDER.ONE.ordinal())); + compressorCache.getCompressorForMethod(BlockCompressionMethod.RANSNx16, RANSNx16Params.ORDER.ONE.ordinal())); } // add an external encoding appropriate for the dataSeries value type, with a RANS order 0 compressor private void putExternalRansOrderZeroEncoding(final DataSeries dataSeries) { putExternalEncoding( dataSeries, - compressorCache.getCompressorForMethod(BlockCompressionMethod.RANS, RANS4x8Params.ORDER.ZERO.ordinal())); + compressorCache.getCompressorForMethod(BlockCompressionMethod.RANSNx16, RANSNx16Params.ORDER.ZERO.ordinal())); } @Override diff --git a/src/main/java/htsjdk/samtools/cram/structure/CompressorCache.java b/src/main/java/htsjdk/samtools/cram/structure/CompressorCache.java index a7e28511d8..a0087452c9 100644 --- a/src/main/java/htsjdk/samtools/cram/structure/CompressorCache.java +++ b/src/main/java/htsjdk/samtools/cram/structure/CompressorCache.java @@ -79,8 +79,8 @@ public ExternalCompressor getCompressorForMethod( return getCachedCompressorForMethod(compressionMethod, compressorSpecificArg); case RANS: { - // for efficiency, we want to share the same underlying RANS object with both order-0 and - // order-1 ExternalCompressors + // in previous implementations, we would cache separate order-0 and order-1 compressors for performance + // reasons; we no longer NEED to do so but retain this structure for now final int ransArg = compressorSpecificArg == ExternalCompressor.NO_COMPRESSION_ARG ? RANS4x8Params.ORDER.ZERO.ordinal() : compressorSpecificArg; @@ -103,8 +103,8 @@ public ExternalCompressor getCompressorForMethod( } case RANSNx16: { - // for efficiency, we want to share the same underlying RANSNx16 object with both order-0 and - // order-1 ExternalCompressors + // in previous implementations, we would cache separate order-0 and order-1 compressors for performance + // reasons; we no longer NEED to do so but retain this structure for now final int ransArg = compressorSpecificArg == ExternalCompressor.NO_COMPRESSION_ARG ? RANSNx16Params.ORDER.ZERO.ordinal() : compressorSpecificArg; diff --git a/src/test/java/htsjdk/samtools/cram/CRAM31Tests.java b/src/test/java/htsjdk/samtools/cram/CRAM31Tests.java index b193e41a4b..7397ca4cbd 100644 --- a/src/test/java/htsjdk/samtools/cram/CRAM31Tests.java +++ b/src/test/java/htsjdk/samtools/cram/CRAM31Tests.java @@ -24,7 +24,7 @@ public class CRAM31Tests extends HtsjdkTest { private static final String TEST_DATA_DIR = "src/test/resources/htsjdk/samtools/cram/"; - // use samtools to convert to CRAM3.1 using samtools profiles fast, normanl, small, and archive, then + // use samtools to convert to CRAM3.1 using samtools profiles fast, normal, small, and archive, then // read with htsjdk and compare the results with the original @DataProvider(name="cram31FidelityTests") @@ -98,11 +98,12 @@ public Object[][] cram31FidelityTests() { }; } + //because conversions are expensive, this tests both htsjdk cram 31.1 reading and writing together @Test(dataProvider = "cram31FidelityTests") - public void testCRAM31RoundTrip( + public void testCRAM31SamtoolsFidelity( final IOPath testInput, final IOPath testReference, - final String samtoolsCommandLineArgs) { + final String samtoolsCommandLineArgs) throws IOException { // for testing CRAM 3.1, we use CRAM 3.0 as our ground-truth for comparison, so check to make // sure that our input test file is CRAM 3.0 and not 3.1 @@ -110,12 +111,41 @@ public void testCRAM31RoundTrip( // use samtools to convert the input to CRAM 3.1, then compare the result of that with the original final IOPath cram31Path = IOUtils.createTempPath("cram31Test", "cram"); + + // test htsjdk cram 3.1 reading by using samtools to convert the input to CRAM 3.1, and then consuming it + // with htsjdk and comparing the results to the original + final IOPath cramSamtools31Path = IOUtils.createTempPath("cram31SamtoolsWriteTest", ".cram"); SamtoolsTestUtils.convertToCRAM( testInput, - cram31Path, + cramSamtools31Path, testReference, samtoolsCommandLineArgs); - doCRAMCompare(testInput.toPath(), cram31Path.toPath(), testReference.toPath()); + Assert.assertEquals(getCRAMVersion(cramSamtools31Path), CramVersions.CRAM_v3_1); + + // compare the original CRAM 3.0 test input with the samtools-generated 3.1 output + doCRAMCompare(testInput.toPath(), cramSamtools31Path.toPath(), testReference.toPath()); + + // now test htsjdk CRAM 3.1 writing by using htsjdk to write CRAM 3.1 and use samtools to consume it and write + // it back to another CRAM (3.1), and then read that result back in and compare it to the original + final IOPath cramHtsjdk31Path = IOUtils.createTempPath("cram31HtsjdkWriteTest", ".cram"); + final SamReaderFactory samReaderFactory = + SamReaderFactory.makeDefault() + .referenceSequence(testReference.toPath()) + .validationStringency(ValidationStringency.LENIENT); + try (final SamReader reader = samReaderFactory.open(testInput.toPath()); + final SAMFileWriter writer = new SAMFileWriterFactory() + .makeWriter(reader.getFileHeader().clone(), true, cramHtsjdk31Path.toPath(), testReference.toPath())) { + for (final SAMRecord rec : reader) { + writer.addAlignment(rec); + } + } + Assert.assertEquals(getCRAMVersion(cramHtsjdk31Path), CramVersions.CRAM_v3_1); + + // compare the original test input with the htsjdk-generated 3.1 output + doCRAMCompare(testInput.toPath(), cramHtsjdk31Path.toPath(), testReference.toPath()); + + // for completeness, compare the htsjdk-generated cram 3.1 output with the samtools-generated 3.1 output + doCRAMCompare(cramHtsjdk31Path.toPath(), cramSamtools31Path.toPath(), testReference.toPath()); } public static void doCRAMCompare( diff --git a/src/test/java/htsjdk/samtools/cram/CRAMVersionTest.java b/src/test/java/htsjdk/samtools/cram/CRAMVersionTest.java index e785ce4e6e..3c74b7cc10 100644 --- a/src/test/java/htsjdk/samtools/cram/CRAMVersionTest.java +++ b/src/test/java/htsjdk/samtools/cram/CRAMVersionTest.java @@ -37,11 +37,11 @@ public class CRAMVersionTest extends HtsjdkTest { * @throws IOException */ @Test - public void test_V3() throws IOException { + public void test_V3_1() throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); ReferenceSource source = new ReferenceSource((File) null); SAMFileHeader samFileHeader = new SAMFileHeader(); - CRAMVersion cramVersion = CramVersions.CRAM_v3; + CRAMVersion cramVersion = CramVersions.CRAM_v3_1; CRAMFileWriter w = new CRAMFileWriter(baos, source, samFileHeader, null); SAMRecord record = new SAMRecord(samFileHeader); record.setReadName("name"); diff --git a/src/test/java/htsjdk/samtools/cram/structure/SliceBlockWriteStreamTest.java b/src/test/java/htsjdk/samtools/cram/structure/SliceBlockWriteStreamTest.java index 2c0cd213cc..69dfd6b124 100644 --- a/src/test/java/htsjdk/samtools/cram/structure/SliceBlockWriteStreamTest.java +++ b/src/test/java/htsjdk/samtools/cram/structure/SliceBlockWriteStreamTest.java @@ -1,6 +1,7 @@ package htsjdk.samtools.cram.structure; import htsjdk.HtsjdkTest; +import htsjdk.samtools.cram.compression.nametokenisation.NameTokenisationTest; import htsjdk.samtools.cram.io.BitOutputStream; import org.testng.Assert; import org.testng.annotations.Test; @@ -32,7 +33,12 @@ public void testSliceBlocksWriteStreamsRoundTrip() throws IOException { // and write the name of each data series to the corresponding external stream for (final DataSeries dataSeries : DataSeries.values()) { if (!StructureTestUtils.DATASERIES_NOT_WRITTEN_BY_HTSJDK.contains(dataSeries)) { - final String uncompressedContent = dataSeries.getCanonicalName(); + // artificially prepare read name data since the name tokenizer requires structured data and cannot + // handle the raw,unstructured data which this test uses + final String uncompressedContent = + dataSeries == DataSeries.RN_ReadName ? + dataSeries.getCanonicalName() + NameTokenisationTest.LOCAL_NAME_SEPARATOR_CHARSEQUENCE : + dataSeries.getCanonicalName(); expectedExternalContent.put(dataSeries.getExternalBlockContentId(), uncompressedContent); sliceBlocksWriteStreams .getExternalOutputStream(dataSeries.getExternalBlockContentId()) From d516bce7b70e0b40e6a48fd33ce97440eafbc16b Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 7 Apr 2025 14:17:37 -0400 Subject: [PATCH 2/4] Changes based on instrumented trial runs. --- .../NameTokenisationEncode.java | 35 ++++++++++++++----- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java index d17220d1fc..3322ae6407 100644 --- a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java +++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java @@ -342,15 +342,20 @@ private static ByteBuffer tryCompress(final ByteBuffer nameTokenStream, final bo ByteBuffer compressedByteBuffer = null; if (useArith == true) { // use the range encoder + // this code path is never executed by the default write profile, since we don't turn on the + // range coder, but it is used by the test suite final int[] rangeEncoderFlagsSets = { - 0, - RangeParams.ORDER_FLAG_MASK, + // based on a few observations (using ransNx16, not range), RLE, PACK, and ORDER/PACK seem to + // yield the best results; this to be validated for the range encoder but for now use the same + // flags as for ransNx16 + 0, // no flags, just use arith RangeParams.RLE_FLAG_MASK, //64 - RangeParams.RLE_FLAG_MASK | RangeParams.ORDER_FLAG_MASK, //65 RangeParams.PACK_FLAG_MASK, //128, RangeParams.PACK_FLAG_MASK | RangeParams.ORDER_FLAG_MASK, //129 + //RangeParams.RLE_FLAG_MASK | RangeParams.ORDER_FLAG_MASK, //65 + //RangeParams.ORDER_FLAG_MASK, // we don't include stripe here since it's not implemented for write - RangeParams.PACK_FLAG_MASK | RangeParams.RLE_FLAG_MASK | RangeParams.ORDER_FLAG_MASK // 193+8 + //RangeParams.PACK_FLAG_MASK | RangeParams.RLE_FLAG_MASK | RangeParams.ORDER_FLAG_MASK // 193+8 }; for (int rangeEncoderFlagSet : rangeEncoderFlagsSets) { if ((rangeEncoderFlagSet & RangeParams.ORDER_FLAG_MASK) != 0 && nameTokenStream.remaining() < 100) { @@ -368,16 +373,24 @@ private static ByteBuffer tryCompress(final ByteBuffer nameTokenStream, final bo compressedByteBuffer = tmpByteBuffer; } } + if (bestCompressedLength > nameTokenStream.limit()) { + // compression doesn't buy us anything; just use CAT + final RangeEncode rangeEncode = new RangeEncode(); + nameTokenStream.rewind(); + compressedByteBuffer = rangeEncode.compress(nameTokenStream, new RangeParams(RANSNx16Params.CAT_FLAG_MASK)); + } } else { final int[] ransNx16FlagsSets = { - 0, - RANSNx16Params.ORDER_FLAG_MASK, + 0, // no flags, just use RANSNx16 RANSNx16Params.RLE_FLAG_MASK, //64 - RANSNx16Params.RLE_FLAG_MASK | RANSNx16Params.ORDER_FLAG_MASK, //65 RANSNx16Params.PACK_FLAG_MASK, //128, RANSNx16Params.PACK_FLAG_MASK | RANSNx16Params.ORDER_FLAG_MASK, //129 + // based on a few observations using ransNx16, RLE, PACK, and ORDER/PACK seem to yield the + // best results; this needs more validation but for now don't try the remaining combinations + //RANSNx16Params.ORDER_FLAG_MASK, + //RANSNx16Params.RLE_FLAG_MASK | RANSNx16Params.ORDER_FLAG_MASK, //65 // we don't include stripe here since it's not implemented for write - RANSNx16Params.PACK_FLAG_MASK | RANSNx16Params.RLE_FLAG_MASK | RANSNx16Params.ORDER_FLAG_MASK // 193+8 + //RANSNx16Params.PACK_FLAG_MASK | RANSNx16Params.RLE_FLAG_MASK | RANSNx16Params.ORDER_FLAG_MASK // 193+8 }; for (int ransNx16FlagSet : ransNx16FlagsSets) { if ((ransNx16FlagSet & RANSNx16Params.ORDER_FLAG_MASK) != 0 && nameTokenStream.remaining() < 100) { @@ -395,6 +408,12 @@ private static ByteBuffer tryCompress(final ByteBuffer nameTokenStream, final bo compressedByteBuffer = tmpByteBuffer; } } + if (bestCompressedLength > nameTokenStream.limit()) { + // compression doesn't buy us anything; just use CAT + final RANSNx16Encode ransEncode = new RANSNx16Encode(); + nameTokenStream.rewind(); + compressedByteBuffer = ransEncode.compress(nameTokenStream, new RANSNx16Params(RANSNx16Params.CAT_FLAG_MASK)); + } } return compressedByteBuffer; } From 0135312a462f3db8a9012c8f7bb15235e87d2538 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 14 Apr 2025 17:31:45 -0400 Subject: [PATCH 3/4] CRAM 3.1 write tests and code cleanup. --- .../beta/plugin/registry/ReadsResolver.java | 5 -- .../cram/compression/CompressionUtils.java | 2 +- .../codecs/reads/cram/HtsCRAMCodec31Test.java | 63 +++++++++++++++++++ .../htsjdk/samtools/cram/CRAM31Tests.java | 30 ++++----- 4 files changed, 80 insertions(+), 20 deletions(-) diff --git a/src/main/java/htsjdk/beta/plugin/registry/ReadsResolver.java b/src/main/java/htsjdk/beta/plugin/registry/ReadsResolver.java index b0df99899a..b89ffc4a10 100644 --- a/src/main/java/htsjdk/beta/plugin/registry/ReadsResolver.java +++ b/src/main/java/htsjdk/beta/plugin/registry/ReadsResolver.java @@ -1,6 +1,5 @@ package htsjdk.beta.plugin.registry; -import htsjdk.beta.codecs.reads.cram.cramV3_1.CRAMCodecV3_1; import htsjdk.beta.exception.HtsjdkException; import htsjdk.beta.exception.HtsjdkPluginException; import htsjdk.beta.plugin.HtsVersion; @@ -12,13 +11,9 @@ import htsjdk.beta.plugin.reads.ReadsDecoderOptions; import htsjdk.beta.plugin.reads.ReadsEncoder; import htsjdk.beta.plugin.reads.ReadsEncoderOptions; -import htsjdk.beta.plugin.reads.ReadsFormats; import htsjdk.io.IOPath; import htsjdk.utils.ValidationUtils; -import java.util.List; -import java.util.stream.Collectors; - /** * Class with methods for resolving inputs and outputs to reads encoders and decoders. *

diff --git a/src/main/java/htsjdk/samtools/cram/compression/CompressionUtils.java b/src/main/java/htsjdk/samtools/cram/compression/CompressionUtils.java index c8a5bf04d9..e96dfdc08f 100644 --- a/src/main/java/htsjdk/samtools/cram/compression/CompressionUtils.java +++ b/src/main/java/htsjdk/samtools/cram/compression/CompressionUtils.java @@ -156,7 +156,7 @@ public static ByteBuffer allocateOutputBuffer(final int inSize) { final int compressedSize = (int) (inSize + 257 * 257 * 3 + 9); final ByteBuffer outputBuffer = allocateByteBuffer(compressedSize); if (outputBuffer.remaining() < compressedSize) { - throw new CRAMException("Failed to allocate sufficient buffer size for RANS coder."); + throw new CRAMException("Failed to allocate sufficient buffer size for CRAM codec."); } return outputBuffer; } diff --git a/src/test/java/htsjdk/beta/codecs/reads/cram/HtsCRAMCodec31Test.java b/src/test/java/htsjdk/beta/codecs/reads/cram/HtsCRAMCodec31Test.java index a5ddeeffd9..3de23ea898 100644 --- a/src/test/java/htsjdk/beta/codecs/reads/cram/HtsCRAMCodec31Test.java +++ b/src/test/java/htsjdk/beta/codecs/reads/cram/HtsCRAMCodec31Test.java @@ -2,18 +2,28 @@ import htsjdk.HtsjdkTest; import htsjdk.beta.codecs.reads.cram.cramV3_1.CRAMCodecV3_1; +import htsjdk.beta.plugin.IOUtils; import htsjdk.beta.plugin.reads.ReadsDecoderOptions; +import htsjdk.beta.plugin.reads.ReadsEncoderOptions; import htsjdk.beta.plugin.reads.ReadsFormats; import htsjdk.beta.plugin.registry.HtsDefaultRegistry; import htsjdk.io.HtsPath; import htsjdk.io.IOPath; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.cram.CRAM31Tests; +import htsjdk.samtools.cram.common.CramVersions; import htsjdk.samtools.util.CloseableIterator; +import htsjdk.samtools.util.FileExtensions; import htsjdk.utils.SamtoolsTestUtils; import org.testng.Assert; import org.testng.annotations.Test; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + public class HtsCRAMCodec31Test extends HtsjdkTest { final IOPath TEST_DIR = new HtsPath("src/test/resources/htsjdk/samtools/"); @@ -49,4 +59,57 @@ public void testCRAMDecoder() { } } } + + @Test + public void testRoundTripCRAM31() throws IOException { + final IOPath sourceCRAMPath = new HtsPath(TEST_DIR + "cram/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram"); + final IOPath referencePath = new HtsPath(TEST_DIR + "reference/human_g1k_v37.20.21.fasta.gz"); + final IOPath tempCRAM31Path = IOUtils.createTempPath("htsCRAMCodecTemporary", FileExtensions.CRAM); + + final ReadsDecoderOptions readsDecoderOptions = + new ReadsDecoderOptions().setCRAMDecoderOptions( + new CRAMDecoderOptions().setReferencePath(referencePath)); + final ReadsEncoderOptions readsEncoderOptions = + new ReadsEncoderOptions().setCRAMEncoderOptions(new CRAMEncoderOptions().setReferencePath(referencePath)); + + try (final CRAMDecoder cramDecoder = (CRAMDecoder) + HtsDefaultRegistry.getReadsResolver().getReadsDecoder(sourceCRAMPath, readsDecoderOptions); + final CRAMEncoder cram31Encoder = (CRAMEncoder) + HtsDefaultRegistry.getReadsResolver().getReadsEncoder(tempCRAM31Path, readsEncoderOptions)) { + + Assert.assertNotNull(cramDecoder); + Assert.assertEquals(cramDecoder.getFileFormat(), ReadsFormats.CRAM); + Assert.assertTrue(cramDecoder.getDisplayName().contains(sourceCRAMPath.toString())); + + Assert.assertNotNull(cram31Encoder); + Assert.assertEquals(cram31Encoder.getFileFormat(), ReadsFormats.CRAM); + Assert.assertTrue(cram31Encoder.getDisplayName().contains(tempCRAM31Path.toString())); + + final SAMFileHeader samFileHeader = cramDecoder.getHeader(); + cram31Encoder.setHeader(samFileHeader); + for (final SAMRecord samRec : cramDecoder) { + cram31Encoder.write(samRec); + } + } + + // make sure we got a CRAM 3.1 file + Assert.assertEquals(CRAM31Tests.getCRAMVersion(tempCRAM31Path), CramVersions.CRAM_v3_1); + + final List recs30 = new ArrayList<>(); + final List recs31 = new ArrayList<>(); + + try (final CRAMDecoder cram30Decoder = (CRAMDecoder) + HtsDefaultRegistry.getReadsResolver().getReadsDecoder(sourceCRAMPath, readsDecoderOptions); + final CRAMDecoder cram31Decoder = (CRAMDecoder) + HtsDefaultRegistry.getReadsResolver().getReadsDecoder(tempCRAM31Path, readsDecoderOptions)) { + final Iterator it31 = cram31Decoder.iterator(); + for (final SAMRecord sam30Rec : cram30Decoder) { + final SAMRecord sam31Rec = it31.next(); + recs30.add(sam30Rec); + recs31.add(sam31Rec); + Assert.assertEquals(sam30Rec, sam31Rec); + } + } + } + } diff --git a/src/test/java/htsjdk/samtools/cram/CRAM31Tests.java b/src/test/java/htsjdk/samtools/cram/CRAM31Tests.java index 7397ca4cbd..5275bb758c 100644 --- a/src/test/java/htsjdk/samtools/cram/CRAM31Tests.java +++ b/src/test/java/htsjdk/samtools/cram/CRAM31Tests.java @@ -110,8 +110,6 @@ public void testCRAM31SamtoolsFidelity( Assert.assertEquals(getCRAMVersion(testInput), CramVersions.CRAM_v3); // use samtools to convert the input to CRAM 3.1, then compare the result of that with the original - final IOPath cram31Path = IOUtils.createTempPath("cram31Test", "cram"); - // test htsjdk cram 3.1 reading by using samtools to convert the input to CRAM 3.1, and then consuming it // with htsjdk and comparing the results to the original final IOPath cramSamtools31Path = IOUtils.createTempPath("cram31SamtoolsWriteTest", ".cram"); @@ -128,17 +126,7 @@ public void testCRAM31SamtoolsFidelity( // now test htsjdk CRAM 3.1 writing by using htsjdk to write CRAM 3.1 and use samtools to consume it and write // it back to another CRAM (3.1), and then read that result back in and compare it to the original final IOPath cramHtsjdk31Path = IOUtils.createTempPath("cram31HtsjdkWriteTest", ".cram"); - final SamReaderFactory samReaderFactory = - SamReaderFactory.makeDefault() - .referenceSequence(testReference.toPath()) - .validationStringency(ValidationStringency.LENIENT); - try (final SamReader reader = samReaderFactory.open(testInput.toPath()); - final SAMFileWriter writer = new SAMFileWriterFactory() - .makeWriter(reader.getFileHeader().clone(), true, cramHtsjdk31Path.toPath(), testReference.toPath())) { - for (final SAMRecord rec : reader) { - writer.addAlignment(rec); - } - } + doHTSJDKWriteCRAM(testInput, cramHtsjdk31Path, testReference); Assert.assertEquals(getCRAMVersion(cramHtsjdk31Path), CramVersions.CRAM_v3_1); // compare the original test input with the htsjdk-generated 3.1 output @@ -178,7 +166,21 @@ public static void doCRAMCompare( Assert.assertEquals(diffCount, 0); } - private static CRAMVersion getCRAMVersion(final IOPath cramPath) { + public static void doHTSJDKWriteCRAM(final IOPath inputPath, final IOPath outputPath, final IOPath referencePath) throws IOException{ + final SamReaderFactory samReaderFactory = + SamReaderFactory.makeDefault() + .referenceSequence(referencePath.toPath()) + .validationStringency(ValidationStringency.LENIENT); + try (final SamReader reader = samReaderFactory.open(inputPath.toPath()); + final SAMFileWriter writer = new SAMFileWriterFactory() + .makeWriter(reader.getFileHeader().clone(), true, outputPath.toPath(), referencePath.toPath())) { + for (final SAMRecord rec : reader) { + writer.addAlignment(rec); + } + } + } + + public static CRAMVersion getCRAMVersion(final IOPath cramPath) { try (final InputStream fis = Files.newInputStream(cramPath.toPath())) { final CramHeader cramHeader = CramIO.readCramHeader(fis); return cramHeader.getCRAMVersion(); From 7ff46c4c02d1c6f89e7f6e4eb39691324c15d548 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 14 Apr 2025 18:48:06 -0400 Subject: [PATCH 4/4] Temp fix for preSorted=false default. --- .../htsjdk/beta/codecs/reads/cram/HtsCRAMCodec31Test.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/test/java/htsjdk/beta/codecs/reads/cram/HtsCRAMCodec31Test.java b/src/test/java/htsjdk/beta/codecs/reads/cram/HtsCRAMCodec31Test.java index 3de23ea898..e1ff709b5c 100644 --- a/src/test/java/htsjdk/beta/codecs/reads/cram/HtsCRAMCodec31Test.java +++ b/src/test/java/htsjdk/beta/codecs/reads/cram/HtsCRAMCodec31Test.java @@ -70,7 +70,9 @@ public void testRoundTripCRAM31() throws IOException { new ReadsDecoderOptions().setCRAMDecoderOptions( new CRAMDecoderOptions().setReferencePath(referencePath)); final ReadsEncoderOptions readsEncoderOptions = - new ReadsEncoderOptions().setCRAMEncoderOptions(new CRAMEncoderOptions().setReferencePath(referencePath)); + new ReadsEncoderOptions() + .setPreSorted(true) + .setCRAMEncoderOptions(new CRAMEncoderOptions().setReferencePath(referencePath)); try (final CRAMDecoder cramDecoder = (CRAMDecoder) HtsDefaultRegistry.getReadsResolver().getReadsDecoder(sourceCRAMPath, readsDecoderOptions);