diff --git a/core/src/main/java/org/apache/iceberg/MetricsUtil.java b/core/src/main/java/org/apache/iceberg/MetricsUtil.java
index 72c57a8bebcf..052a13bb88f4 100644
--- a/core/src/main/java/org/apache/iceberg/MetricsUtil.java
+++ b/core/src/main/java/org/apache/iceberg/MetricsUtil.java
@@ -21,6 +21,7 @@
 import static org.apache.iceberg.types.Types.NestedField.optional;
 
 import java.nio.ByteBuffer;
+import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
@@ -479,6 +480,86 @@ public void set(int pos, T value) {
     }
   }
 
+  /** Returns value counts by field ID as an unmodifiable map, or null if none are present. */
+  static Map<Integer, Long> valueCounts(ContentStats stats) {
+    if (stats == null) {
+      return null;
+    }
+
+    Map<Integer, Long> result = Maps.newHashMap();
+    for (FieldStats<?> fs : stats.fieldStats()) {
+      if (fs != null && fs.valueCount() != null) {
+        result.put(fs.fieldId(), fs.valueCount());
+      }
+    }
+
+    return result.isEmpty() ? null : Collections.unmodifiableMap(result);
+  }
+
+  /** Returns null value counts by field ID as an unmodifiable map, or null if none are present. */
+  static Map<Integer, Long> nullValueCounts(ContentStats stats) {
+    if (stats == null) {
+      return null;
+    }
+
+    Map<Integer, Long> result = Maps.newHashMap();
+    for (FieldStats<?> fs : stats.fieldStats()) {
+      if (fs != null && fs.nullValueCount() != null) {
+        result.put(fs.fieldId(), fs.nullValueCount());
+      }
+    }
+
+    return result.isEmpty() ? null : Collections.unmodifiableMap(result);
+  }
+
+  /** Returns NaN value counts by field ID as an unmodifiable map, or null if none are present. */
+  static Map<Integer, Long> nanValueCounts(ContentStats stats) {
+    if (stats == null) {
+      return null;
+    }
+
+    Map<Integer, Long> result = Maps.newHashMap();
+    for (FieldStats<?> fs : stats.fieldStats()) {
+      if (fs != null && fs.nanValueCount() != null) {
+        result.put(fs.fieldId(), fs.nanValueCount());
+      }
+    }
+
+    return result.isEmpty() ? null : Collections.unmodifiableMap(result);
+  }
+
+  /** Returns serialized lower bounds by field ID, or null if none are present. */
+  static Map<Integer, ByteBuffer> lowerBounds(ContentStats stats) {
+    if (stats == null) {
+      return null;
+    }
+
+    Map<Integer, ByteBuffer> result = Maps.newHashMap();
+    for (FieldStats<?> fs : stats.fieldStats()) {
+      if (fs != null && fs.lowerBound() != null && fs.type() != null) {
+        result.put(fs.fieldId(), Conversions.toByteBuffer(fs.type(), fs.lowerBound()));
+      }
+    }
+
+    return result.isEmpty() ? null : Collections.unmodifiableMap(result);
+  }
+
+  /** Returns serialized upper bounds by field ID, or null if none are present. */
+  static Map<Integer, ByteBuffer> upperBounds(ContentStats stats) {
+    if (stats == null) {
+      return null;
+    }
+
+    Map<Integer, ByteBuffer> result = Maps.newHashMap();
+    for (FieldStats<?> fs : stats.fieldStats()) {
+      if (fs != null && fs.upperBound() != null && fs.type() != null) {
+        result.put(fs.fieldId(), Conversions.toByteBuffer(fs.type(), fs.upperBound()));
+      }
+    }
+
+    return result.isEmpty() ? null : Collections.unmodifiableMap(result);
+  }
+
   static ContentStats fromMetrics(Schema schema, Metrics metrics) {
     if (null == metrics) {
       return null;
diff --git a/core/src/main/java/org/apache/iceberg/TrackedFileAdapters.java b/core/src/main/java/org/apache/iceberg/TrackedFileAdapters.java
new file mode 100644
index 000000000000..525357439a4d
--- /dev/null
+++ b/core/src/main/java/org/apache/iceberg/TrackedFileAdapters.java
@@ -0,0 +1,478 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+
+/**
+ * Adapts {@link TrackedFile} entries to the {@link DataFile} and {@link DeleteFile} APIs.
+ *
+ * <p>V4 colocates deletion vectors with data file entries in {@link TrackedFile}. Rather than
+ * extending {@link DataFile} with deletion vector fields, DVs are extracted as separate {@link
+ * DeleteFile} objects via {@link #asDVDeleteFile(TrackedFile, Map)}. This matches the v3 convention
+ * where DVs are tracked as {@link DeleteFile} entries in delete manifests and keeps the existing
+ * {@link FileScanTask} contract ({@code file()} + {@code deletes()}) unchanged.
+ */
+class TrackedFileAdapters {
+
+  private TrackedFileAdapters() {}
+
+  /**
+   * Adapts a DATA entry to the {@link DataFile} API.
+   *
+   * @param file a tracked file whose content type is {@link FileContent#DATA}
+   * @param specsById partition specs keyed by spec ID, used to resolve the file's spec
+   * @return a {@link DataFile} view backed by the tracked file
+   * @throws IllegalArgumentException if the file is null, is not a DATA entry, or its spec cannot
+   *     be resolved
+   */
+  static DataFile asDataFile(TrackedFile file, Map<Integer, PartitionSpec> specsById) {
+    Preconditions.checkArgument(file != null, "Invalid tracked file: null");
+    Preconditions.checkArgument(
+        file.contentType() == FileContent.DATA,
+        "Invalid content type for DataFile: %s",
+        file.contentType());
+    return new TrackedDataFile(file, resolveSpec(file, specsById));
+  }
+
+  /**
+   * Adapts the deletion vector of a DATA entry to the {@link DeleteFile} API.
+   *
+   * @param file a tracked file whose content type is {@link FileContent#DATA}
+   * @param specsById partition specs keyed by spec ID, used to resolve the file's spec
+   * @return a {@link DeleteFile} view of the entry's deletion vector
+   * @throws IllegalArgumentException if the file is null, is not a DATA entry, has no deletion
+   *     vector, or its spec cannot be resolved
+   */
+  static DeleteFile asDVDeleteFile(TrackedFile file, Map<Integer, PartitionSpec> specsById) {
+    Preconditions.checkArgument(file != null, "Invalid tracked file: null");
+    Preconditions.checkArgument(
+        file.contentType() == FileContent.DATA,
+        "Invalid content type for DV delete file: %s",
+        file.contentType());
+    return new TrackedDVDeleteFile(file, resolveSpec(file, specsById));
+  }
+
+  /**
+   * Adapts an EQUALITY_DELETES entry to the {@link DeleteFile} API.
+   *
+   * @param file a tracked file whose content type is {@link FileContent#EQUALITY_DELETES}
+   * @param specsById partition specs keyed by spec ID, used to resolve the file's spec
+   * @return a {@link DeleteFile} view backed by the tracked file
+   * @throws IllegalArgumentException if the file is null, is not an EQUALITY_DELETES entry, or its
+   *     spec cannot be resolved
+   */
+  static DeleteFile asEqualityDeleteFile(TrackedFile file, Map<Integer, PartitionSpec> specsById) {
+    Preconditions.checkArgument(file != null, "Invalid tracked file: null");
+    Preconditions.checkArgument(
+        file.contentType() == FileContent.EQUALITY_DELETES,
+        "Invalid content type for equality delete file: %s",
+        file.contentType());
+    return new TrackedEqualityDeleteFile(file, resolveSpec(file, specsById));
+  }
+
+  /**
+   * Resolves the partition spec for a tracked file.
+   *
+   * <p>Uses the file's spec ID when it is set. Otherwise, falls back to the first unpartitioned
+   * spec found in the map.
+   *
+   * @throws IllegalArgumentException if the map is null or no matching spec is found
+   */
+  private static PartitionSpec resolveSpec(
+      TrackedFile file, Map<Integer, PartitionSpec> specsById) {
+    Preconditions.checkArgument(specsById != null, "Invalid partition specs: null");
+    Integer specId = file.specId();
+    if (specId != null) {
+      PartitionSpec spec = specsById.get(specId);
+      Preconditions.checkArgument(
+          spec != null, "Cannot find partition spec for spec ID: %s", specId);
+      return spec;
+    }
+
+    for (PartitionSpec spec : specsById.values()) {
+      if (spec.isUnpartitioned()) {
+        return spec;
+      }
+    }
+
+    throw new IllegalArgumentException(
+        "Cannot find unpartitioned spec in specs: " + specsById.keySet());
+  }
+
+  /**
+   * Shared base for all tracked file adapters. Holds the common fields and implements the methods
+   * that delegate to {@link TrackedFile} and {@link PartitionSpec}.
+   */
+  private abstract static class TrackedFileAdapter<F extends ContentFile<F>>
+      implements ContentFile<F> {
+    private final TrackedFile file;
+    private final PartitionSpec spec;
+
+    private TrackedFileAdapter(TrackedFile file, PartitionSpec spec) {
+      this.file = file;
+      this.spec = spec;
+    }
+
+    protected TrackedFile file() {
+      return file;
+    }
+
+    protected PartitionSpec spec() {
+      return spec;
+    }
+
+    protected Tracking tracking() {
+      return file.tracking();
+    }
+
+    @Override
+    public Long pos() {
+      Tracking tracking = tracking();
+      return tracking != null ? tracking.manifestPos() : null;
+    }
+
+    @Override
+    public String manifestLocation() {
+      Tracking tracking = tracking();
+      return tracking != null ? tracking.manifestLocation() : null;
+    }
+
+    @Override
+    public int specId() {
+      Integer specId = file.specId();
+      return specId != null ? specId : spec.specId();
+    }
+
+    // TODO: return a real partition tuple (https://github.com/apache/iceberg/issues/16222)
+    @Override
+    public StructLike partition() {
+      return null;
+    }
+
+    @Override
+    public Long dataSequenceNumber() {
+      Tracking tracking = tracking();
+      return tracking != null ? tracking.dataSequenceNumber() : null;
+    }
+
+    @Override
+    public Long fileSequenceNumber() {
+      Tracking tracking = tracking();
+      return tracking != null ? tracking.fileSequenceNumber() : null;
+    }
+  }
+
+  /** Shared base for adapters that delegate to a {@link TrackedFile} for content file fields. */
+  private abstract static class TrackedContentFile<F extends ContentFile<F>>
+      extends TrackedFileAdapter<F> {
+    private TrackedContentFile(TrackedFile file, PartitionSpec spec) {
+      super(file, spec);
+    }
+
+    @SuppressWarnings("deprecation")
+    @Override
+    public CharSequence path() {
+      return file().location();
+    }
+
+    @Override
+    public String location() {
+      return file().location();
+    }
+
+    @Override
+    public FileFormat format() {
+      return file().fileFormat();
+    }
+
+    @Override
+    public long recordCount() {
+      return file().recordCount();
+    }
+
+    @Override
+    public long fileSizeInBytes() {
+      return file().fileSizeInBytes();
+    }
+
+    @Override
+    public Integer sortOrderId() {
+      return file().sortOrderId();
+    }
+
+    @Override
+    public ByteBuffer keyMetadata() {
+      return file().keyMetadata();
+    }
+
+    @Override
+    public List<Long> splitOffsets() {
+      return file().splitOffsets();
+    }
+
+    @Override
+    public Map<Integer, Long> columnSizes() {
+      return null;
+    }
+
+    @Override
+    public Map<Integer, Long> valueCounts() {
+      return MetricsUtil.valueCounts(file().contentStats());
+    }
+
+    @Override
+    public Map<Integer, Long> nullValueCounts() {
+      return MetricsUtil.nullValueCounts(file().contentStats());
+    }
+
+    @Override
+    public Map<Integer, Long> nanValueCounts() {
+      return MetricsUtil.nanValueCounts(file().contentStats());
+    }
+
+    @Override
+    public Map<Integer, ByteBuffer> lowerBounds() {
+      return MetricsUtil.lowerBounds(file().contentStats());
+    }
+
+    @Override
+    public Map<Integer, ByteBuffer> upperBounds() {
+      return MetricsUtil.upperBounds(file().contentStats());
+    }
+  }
+
+  /** Adapts a TrackedFile DATA entry to the {@link DataFile} interface. */
+  private static class TrackedDataFile extends TrackedContentFile<DataFile> implements DataFile {
+    private TrackedDataFile(TrackedFile file, PartitionSpec spec) {
+      super(file, spec);
+    }
+
+    @Override
+    public FileContent content() {
+      return FileContent.DATA;
+    }
+
+    @Override
+    public Long firstRowId() {
+      Tracking tracking = tracking();
+      return tracking != null ? tracking.firstRowId() : null;
+    }
+
+    @Override
+    public DataFile copy() {
+      return new TrackedDataFile(file().copy(), spec());
+    }
+
+    @Override
+    public DataFile copy(boolean withStats) {
+      return withStats ? copy() : copyWithoutStats();
+    }
+
+    @Override
+    public DataFile copyWithoutStats() {
+      return new TrackedDataFile(file().copyWithoutStats(), spec());
+    }
+
+    @Override
+    public DataFile copyWithStats(Set<Integer> requestedColumnIds) {
+      return new TrackedDataFile(file().copyWithStats(requestedColumnIds), spec());
+    }
+  }
+
+  /** Adapts a TrackedFile EQUALITY_DELETES entry to the {@link DeleteFile} interface. */
+  private static class TrackedEqualityDeleteFile extends TrackedContentFile<DeleteFile>
+      implements DeleteFile {
+    private TrackedEqualityDeleteFile(TrackedFile file, PartitionSpec spec) {
+      super(file, spec);
+    }
+
+    @Override
+    public FileContent content() {
+      return FileContent.EQUALITY_DELETES;
+    }
+
+    @Override
+    public List<Integer> equalityFieldIds() {
+      return file().equalityIds();
+    }
+
+    @Override
+    public DeleteFile copy() {
+      return new TrackedEqualityDeleteFile(file().copy(), spec());
+    }
+
+    @Override
+    public DeleteFile copy(boolean withStats) {
+      return withStats ? copy() : copyWithoutStats();
+    }
+
+    @Override
+    public DeleteFile copyWithoutStats() {
+      return new TrackedEqualityDeleteFile(file().copyWithoutStats(), spec());
+    }
+
+    @Override
+    public DeleteFile copyWithStats(Set<Integer> requestedColumnIds) {
+      return new TrackedEqualityDeleteFile(file().copyWithStats(requestedColumnIds), spec());
+    }
+  }
+
+  /**
+   * Adapts the deletion vector from a TrackedFile DATA entry to the {@link DeleteFile} interface.
+   *
+   * <p>The DV blob metadata is mapped to the DeleteFile DV fields: {@link
+   * DeleteFile#referencedDataFile()} is the data file location, and {@link
+   * DeleteFile#contentOffset()} / {@link DeleteFile#contentSizeInBytes()} point to the blob within
+   * the Puffin file.
+   */
+  private static class TrackedDVDeleteFile extends TrackedFileAdapter<DeleteFile>
+      implements DeleteFile {
+    private final DeletionVector dv;
+
+    private TrackedDVDeleteFile(TrackedFile file, PartitionSpec spec) {
+      super(file, spec);
+      DeletionVector deletionVector = file.deletionVector();
+      Preconditions.checkArgument(
+          deletionVector != null, "Cannot create DV delete file: no deletion vector");
+      this.dv = deletionVector;
+    }
+
+    @Override
+    public FileContent content() {
+      return FileContent.POSITION_DELETES;
+    }
+
+    @SuppressWarnings("deprecation")
+    @Override
+    public CharSequence path() {
+      return dv.location();
+    }
+
+    @Override
+    public String location() {
+      return dv.location();
+    }
+
+    @Override
+    public FileFormat format() {
+      return FileFormat.PUFFIN;
+    }
+
+    @Override
+    public long recordCount() {
+      return dv.cardinality();
+    }
+
+    // Returns the DV blob size, not the full Puffin file size. The DeletionVector metadata does not
+    // include the Puffin file size, so this is the best approximation available. Space accounting
+    // that sums fileSizeInBytes() was already imprecise in v3 (multiple DVs sharing a Puffin file
+    // each reported the full file size).
+    @Override
+    public long fileSizeInBytes() {
+      return dv.sizeInBytes();
+    }
+
+    // From the spec: position deletes are required to be sorted by file and position, not a table
+    // order, and should set sort order id to null
+    @Override
+    public Integer sortOrderId() {
+      return null;
+    }
+
+    @Override
+    public ByteBuffer keyMetadata() {
+      return null;
+    }
+
+    @Override
+    public List<Integer> equalityFieldIds() {
+      return null;
+    }
+
+    @Override
+    public String referencedDataFile() {
+      return file().location();
+    }
+
+    @Override
+    public Long contentOffset() {
+      return dv.offset();
+    }
+
+    @Override
+    public Long contentSizeInBytes() {
+      return dv.sizeInBytes();
+    }
+
+    @Override
+    public Map<Integer, Long> columnSizes() {
+      return null;
+    }
+
+    @Override
+    public Map<Integer, Long> valueCounts() {
+      return null;
+    }
+
+    @Override
+    public Map<Integer, Long> nullValueCounts() {
+      return null;
+    }
+
+    @Override
+    public Map<Integer, Long> nanValueCounts() {
+      return null;
+    }
+
+    @Override
+    public Map<Integer, ByteBuffer> lowerBounds() {
+      return null;
+    }
+
+    @Override
+    public Map<Integer, ByteBuffer> upperBounds() {
+      return null;
+    }
+
+    @Override
+    public DeleteFile copy() {
+      return new TrackedDVDeleteFile(file().copyWithoutStats(), spec());
+    }
+
+    @Override
+    public DeleteFile copy(boolean withStats) {
+      return copy();
+    }
+
+    @Override
+    public DeleteFile copyWithoutStats() {
+      return copy();
+    }
+
+    @Override
+    public DeleteFile copyWithStats(Set<Integer> requestedColumnIds) {
+      return copy();
+    }
+  }
+}
diff --git a/core/src/test/java/org/apache/iceberg/TestTrackedFileAdapters.java b/core/src/test/java/org/apache/iceberg/TestTrackedFileAdapters.java
new file mode 100644
index 000000000000..b04b5c04dd23
--- /dev/null
+++ b/core/src/test/java/org/apache/iceberg/TestTrackedFileAdapters.java
@@ -0,0 +1,431 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Map;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.types.Conversions;
+import org.apache.iceberg.types.Types;
+import org.junit.jupiter.api.Test;
+
+class TestTrackedFileAdapters {
+
+  private static final String MANIFEST_LOCATION = "s3://bucket/table/manifest.parquet";
+
+  private static final Map<Integer, PartitionSpec> UNPARTITIONED =
+      ImmutableMap.of(0, PartitionSpec.unpartitioned());
+
+  private static Map<Integer, PartitionSpec> specsById(PartitionSpec spec) {
+    return ImmutableMap.of(spec.specId(), spec);
+  }
+
+  @Test
+  void testDataFileAdapterDelegatesAllFields() {
+    TrackingStruct tracking = createTracking(3L);
+    ContentStats stats = createContentStats();
+
+    TrackedFileStruct file =
+        new TrackedFileStruct(
+            tracking,
+            FileContent.DATA,
+            "s3://bucket/data/file.parquet",
+            FileFormat.PARQUET,
+            null,
+            100L,
+            1024L);
+    file.set(6, 0);
+    file.set(8, stats);
+    file.set(9, 3);
+    file.set(12, ByteBuffer.wrap(new byte[] {1, 2, 3}));
+    file.set(13, ImmutableList.of(50L, 100L));
+
+    DataFile dataFile = TrackedFileAdapters.asDataFile(file, UNPARTITIONED);
+
+    assertThat(dataFile.pos()).isEqualTo(3L);
+    assertThat(dataFile.specId()).isEqualTo(0);
+    assertThat(dataFile.content()).isEqualTo(FileContent.DATA);
+    assertThat(dataFile.location()).isEqualTo("s3://bucket/data/file.parquet");
+    assertThat(dataFile.format()).isEqualTo(FileFormat.PARQUET);
+    assertThat(dataFile.recordCount()).isEqualTo(100L);
+    assertThat(dataFile.fileSizeInBytes()).isEqualTo(1024L);
+    assertThat(dataFile.sortOrderId()).isEqualTo(3);
+    assertThat(dataFile.dataSequenceNumber()).isEqualTo(10L);
+    assertThat(dataFile.fileSequenceNumber()).isEqualTo(11L);
+    assertThat(dataFile.firstRowId()).isEqualTo(1000L);
+    assertThat(dataFile.keyMetadata()).isEqualTo(ByteBuffer.wrap(new byte[] {1, 2, 3}));
+    assertThat(dataFile.splitOffsets()).containsExactly(50L, 100L);
+    assertThat(dataFile.manifestLocation()).isEqualTo(MANIFEST_LOCATION);
+    assertThat(dataFile.equalityFieldIds()).isNull();
+    assertThat(dataFile.columnSizes()).isNull();
+    assertThat(dataFile.valueCounts()).containsOnly(entry(1, 100L), entry(2, 200L));
+    assertThat(dataFile.nullValueCounts()).containsOnly(entry(1, 5L), entry(2, 10L));
+    assertThat(dataFile.nanValueCounts()).containsOnly(entry(2, 3L));
+    assertThat(dataFile.lowerBounds())
+        .containsEntry(1, Conversions.toByteBuffer(Types.IntegerType.get(), 1))
+        .containsEntry(2, Conversions.toByteBuffer(Types.FloatType.get(), 1.0f));
+    assertThat(dataFile.upperBounds())
+        .containsEntry(1, Conversions.toByteBuffer(Types.IntegerType.get(), 1000))
+        .containsEntry(2, Conversions.toByteBuffer(Types.FloatType.get(), 100.0f));
+  }
+
+  @Test
+  void testDataFileAdapterRejectsNonData() {
+    TrackedFileStruct file =
+        new TrackedFileStruct(
+            null,
+            FileContent.EQUALITY_DELETES,
+            "s3://bucket/delete.avro",
+            FileFormat.AVRO,
+            null,
+            50L,
+            512L);
+    file.set(6, 0);
+
+    assertThatThrownBy(() -> TrackedFileAdapters.asDataFile(file, UNPARTITIONED))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessage("Invalid content type for DataFile: %s", FileContent.EQUALITY_DELETES);
+  }
+
+  @Test
+  void testEqualityDeleteFileAdapterDelegatesAllFields() {
+    TrackingStruct tracking = createTracking(5L);
+    PartitionSpec spec = PartitionSpec.builderFor(new Schema()).withSpecId(1).build();
+    ContentStats stats = createContentStats();
+
+    TrackedFileStruct file =
+        new TrackedFileStruct(
+            tracking,
+            FileContent.EQUALITY_DELETES,
+            "s3://bucket/eq-delete.avro",
+            FileFormat.AVRO,
+            null,
+            50L,
+            512L);
+    file.set(6, 1);
+    file.set(8, stats);
+    file.set(9, 5);
+    file.set(12, ByteBuffer.wrap(new byte[] {4, 5}));
+    file.set(13, ImmutableList.of(200L));
+    file.set(14, ImmutableList.of(1, 2, 3));
+
+    DeleteFile deleteFile = TrackedFileAdapters.asEqualityDeleteFile(file, specsById(spec));
+
+    assertThat(deleteFile.pos()).isEqualTo(5L);
+    assertThat(deleteFile.specId()).isEqualTo(1);
+    assertThat(deleteFile.content()).isEqualTo(FileContent.EQUALITY_DELETES);
+    assertThat(deleteFile.location()).isEqualTo("s3://bucket/eq-delete.avro");
+    assertThat(deleteFile.format()).isEqualTo(FileFormat.AVRO);
+    assertThat(deleteFile.recordCount()).isEqualTo(50L);
+    assertThat(deleteFile.fileSizeInBytes()).isEqualTo(512L);
+    assertThat(deleteFile.sortOrderId()).isEqualTo(5);
+    assertThat(deleteFile.dataSequenceNumber()).isEqualTo(10L);
+    assertThat(deleteFile.fileSequenceNumber()).isEqualTo(11L);
+    assertThat(deleteFile.firstRowId()).isNull();
+    assertThat(deleteFile.keyMetadata()).isEqualTo(ByteBuffer.wrap(new byte[] {4, 5}));
+    assertThat(deleteFile.splitOffsets()).containsExactly(200L);
+    assertThat(deleteFile.manifestLocation()).isEqualTo(MANIFEST_LOCATION);
+    assertThat(deleteFile.equalityFieldIds()).containsExactly(1, 2, 3);
+    assertThat(deleteFile.columnSizes()).isNull();
+    assertThat(deleteFile.valueCounts()).containsOnly(entry(1, 100L), entry(2, 200L));
+    assertThat(deleteFile.nullValueCounts()).containsOnly(entry(1, 5L), entry(2, 10L));
+    assertThat(deleteFile.nanValueCounts()).containsOnly(entry(2, 3L));
+    assertThat(deleteFile.lowerBounds())
+        .containsEntry(1, Conversions.toByteBuffer(Types.IntegerType.get(), 1))
+        .containsEntry(2, Conversions.toByteBuffer(Types.FloatType.get(), 1.0f));
+    assertThat(deleteFile.upperBounds())
+        .containsEntry(1, Conversions.toByteBuffer(Types.IntegerType.get(), 1000))
+        .containsEntry(2, Conversions.toByteBuffer(Types.FloatType.get(), 100.0f));
+  }
+
+  @Test
+  void testEqualityDeleteFileAdapterRejectsNonEqualityDeletes() {
+    TrackedFileStruct file =
+        new TrackedFileStruct(
+            null,
+            FileContent.DATA,
+            "s3://bucket/data.parquet",
+            FileFormat.PARQUET,
+            null,
+            100L,
+            1024L);
+    file.set(6, 0);
+
+    assertThatThrownBy(() -> TrackedFileAdapters.asEqualityDeleteFile(file, UNPARTITIONED))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessage("Invalid content type for equality delete file: %s", FileContent.DATA);
+  }
+
+  @Test
+  void testDVDeleteFileAdapterDelegatesAllFields() {
+    TrackingStruct tracking = createTracking(7L);
+    PartitionSpec spec = PartitionSpec.builderFor(new Schema()).withSpecId(2).build();
+
+    TrackedFileStruct file = createDataFileWithDV(tracking, 2);
+
+    DeleteFile dvFile = TrackedFileAdapters.asDVDeleteFile(file, specsById(spec));
+
+    // DV-specific fields from DeletionVector
+    assertThat(dvFile.content()).isEqualTo(FileContent.POSITION_DELETES);
+    assertThat(dvFile.location()).isEqualTo("s3://bucket/puffin/dv-file.bin");
+    assertThat(dvFile.format()).isEqualTo(FileFormat.PUFFIN);
+    assertThat(dvFile.recordCount()).isEqualTo(10L);
+    assertThat(dvFile.fileSizeInBytes()).isEqualTo(256L);
+    assertThat(dvFile.referencedDataFile()).isEqualTo("s3://bucket/data/file.parquet");
+    assertThat(dvFile.contentOffset()).isEqualTo(128L);
+    assertThat(dvFile.contentSizeInBytes()).isEqualTo(256L);
+
+    // fields delegated from TrackedFile / Tracking
+    assertThat(dvFile.pos()).isEqualTo(7L);
+    assertThat(dvFile.specId()).isEqualTo(2);
+    assertThat(dvFile.dataSequenceNumber()).isEqualTo(10L);
+    assertThat(dvFile.fileSequenceNumber()).isEqualTo(11L);
+    assertThat(dvFile.manifestLocation()).isEqualTo(MANIFEST_LOCATION);
+
+    // fields that should be null for DVs
+    assertThat(dvFile.sortOrderId()).isNull();
+    assertThat(dvFile.firstRowId()).isNull();
+    assertThat(dvFile.keyMetadata()).isNull();
+    assertThat(dvFile.splitOffsets()).isNull();
+    assertThat(dvFile.equalityFieldIds()).isNull();
+    assertThat(dvFile.columnSizes()).isNull();
+    assertThat(dvFile.valueCounts()).isNull();
+    assertThat(dvFile.nullValueCounts()).isNull();
+    assertThat(dvFile.nanValueCounts()).isNull();
+    assertThat(dvFile.lowerBounds()).isNull();
+    assertThat(dvFile.upperBounds()).isNull();
+  }
+
+  @Test
+  void testDVDeleteFileAdapterRejectsNonData() {
+    TrackedFileStruct file =
+        new TrackedFileStruct(
+            null,
+            FileContent.EQUALITY_DELETES,
+            "s3://bucket/eq-delete.avro",
+            FileFormat.AVRO,
+            null,
+            50L,
+            512L);
+    file.set(6, 0);
+    file.set(10, createDeletionVector());
+
+    assertThatThrownBy(() -> TrackedFileAdapters.asDVDeleteFile(file, UNPARTITIONED))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessage("Invalid content type for DV delete file: %s", FileContent.EQUALITY_DELETES);
+  }
+
+  @Test
+  void testDVDeleteFileAdapterRejectsNullDV() {
+    TrackedFileStruct file =
+        new TrackedFileStruct(
+            null,
+            FileContent.DATA,
+            "s3://bucket/data.parquet",
+            FileFormat.PARQUET,
+            null,
+            100L,
+            1024L);
+    file.set(6, 0);
+
+    assertThatThrownBy(() -> TrackedFileAdapters.asDVDeleteFile(file, UNPARTITIONED))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessage("Cannot create DV delete file: no deletion vector");
+  }
+
+  @Test
+  void testNullContentStatsReturnsNullStats() {
+    TrackedFileStruct file =
+        new TrackedFileStruct(
+            null,
+            FileContent.DATA,
+            "s3://bucket/data.parquet",
+            FileFormat.PARQUET,
+            null,
+            100L,
+            1024L);
+    file.set(6, 0);
+
+    DataFile dataFile = TrackedFileAdapters.asDataFile(file, UNPARTITIONED);
+
+    assertThat(dataFile.valueCounts()).isNull();
+    assertThat(dataFile.nullValueCounts()).isNull();
+    assertThat(dataFile.nanValueCounts()).isNull();
+    assertThat(dataFile.lowerBounds()).isNull();
+    assertThat(dataFile.upperBounds()).isNull();
+  }
+
+  @Test
+  void testNullSpecIdResolvesToUnpartitionedSpec() {
+    TrackedFileStruct file =
+        new TrackedFileStruct(
+            null,
+            FileContent.DATA,
+            "s3://bucket/data.parquet",
+            FileFormat.PARQUET,
+            null,
+            100L,
+            1024L);
+
+    PartitionSpec spec = PartitionSpec.builderFor(new Schema()).withSpecId(5).build();
+    DataFile dataFile = TrackedFileAdapters.asDataFile(file, specsById(spec));
+    assertThat(dataFile.specId()).isEqualTo(5);
+  }
+
+  @Test
+  void testNullSpecIdThrowsWhenNoUnpartitionedSpec() {
+    Schema schema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get()));
+    PartitionSpec partitioned = PartitionSpec.builderFor(schema).identity("id").build();
+
+    TrackedFileStruct file =
+        new TrackedFileStruct(
+            null,
+            FileContent.DATA,
+            "s3://bucket/data.parquet",
+            FileFormat.PARQUET,
+            null,
+            100L,
+            1024L);
+
+    assertThatThrownBy(() -> TrackedFileAdapters.asDataFile(file, specsById(partitioned)))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageContaining("Cannot find unpartitioned spec in specs");
+  }
+
+  @Test
+  void testUnknownSpecIdThrows() {
+    TrackedFileStruct file =
+        new TrackedFileStruct(
+            null,
+            FileContent.DATA,
+            "s3://bucket/data.parquet",
+            FileFormat.PARQUET,
+            null,
+            100L,
+            1024L);
+    file.set(6, 99);
+
+    assertThatThrownBy(() -> TrackedFileAdapters.asDataFile(file, ImmutableMap.of()))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageContaining("Cannot find partition spec for spec ID");
+  }
+
+  @Test
+  void testNullTrackedFileThrows() {
+    assertThatThrownBy(() -> TrackedFileAdapters.asDataFile(null, UNPARTITIONED))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessage("Invalid tracked file: null");
+  }
+
+  private static TrackingStruct createTracking(long manifestPos) {
+    TrackingStruct tracking =
+        TrackingStruct.builder()
+            .status(EntryStatus.ADDED)
+            .snapshotId(42L)
+            .dataSequenceNumber(10L)
+            .fileSequenceNumber(11L)
+            .firstRowId(1000L)
+            .build();
+    // manifestLocation and manifestPos are set by manifest readers, not written to manifests
+    tracking.setManifestLocation(MANIFEST_LOCATION);
+    tracking.set(8, manifestPos);
+    return tracking;
+  }
+
+  private static TrackedFileStruct createDataFileWithDV(Tracking tracking, int specId) {
+    TrackedFileStruct file =
+        new TrackedFileStruct(
+            tracking,
+            FileContent.DATA,
+            "s3://bucket/data/file.parquet",
+            FileFormat.PARQUET,
+            null,
+            100L,
+            1024L);
+    file.set(6, specId);
+    file.set(10, createDeletionVector());
+    return file;
+  }
+
+  private static DeletionVectorStruct createDeletionVector() {
+    return DeletionVectorStruct.builder()
+        .location("s3://bucket/puffin/dv-file.bin")
+        .offset(128L)
+        .sizeInBytes(256L)
+        .cardinality(10L)
+        .build();
+  }
+
+  private static ContentStats createContentStats() {
+    Types.StructType statsStruct =
+        Types.StructType.of(
+            Types.NestedField.optional(
+                10000,
+                "1",
+                Types.StructType.of(
+                    Types.NestedField.optional(10001, "value_count", Types.LongType.get()),
+                    Types.NestedField.optional(10002, "null_value_count", Types.LongType.get()),
+                    Types.NestedField.optional(10003, "nan_value_count", Types.LongType.get()),
+                    Types.NestedField.optional(10006, "lower_bound", Types.IntegerType.get()),
+                    Types.NestedField.optional(10007, "upper_bound", Types.IntegerType.get()))),
+            Types.NestedField.optional(
+                20000,
+                "2",
+                Types.StructType.of(
+                    Types.NestedField.optional(20001, "value_count", Types.LongType.get()),
+                    Types.NestedField.optional(20002, "null_value_count", Types.LongType.get()),
+                    Types.NestedField.optional(20003, "nan_value_count", Types.LongType.get()),
+                    Types.NestedField.optional(20006, "lower_bound", Types.FloatType.get()),
+                    Types.NestedField.optional(20007, "upper_bound", Types.FloatType.get()))));
+
+    List<FieldStats<?>> fieldStatsList =
+        ImmutableList.of(
+            BaseFieldStats.builder()
+                .fieldId(1)
+                .type(Types.IntegerType.get())
+                .valueCount(100L)
+                .nullValueCount(5L)
+                .lowerBound(1)
+                .upperBound(1000)
+                .build(),
+            BaseFieldStats.builder()
+                .fieldId(2)
+                .type(Types.FloatType.get())
+                .valueCount(200L)
+                .nullValueCount(10L)
+                .nanValueCount(3L)
+                .lowerBound(1.0f)
+                .upperBound(100.0f)
+                .build());
+
+    return BaseContentStats.builder()
+        .withStatsStruct(statsStruct)
+        .withFieldStats(fieldStatsList)
+        .build();
+  }
+
+  private static Map.Entry<Integer, Long> entry(int key, long value) {
+    return Map.entry(key, value);
+  }
+}