Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions core/src/main/java/org/apache/iceberg/DeletionVectorStruct.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.io.Serializable;
import org.apache.iceberg.avro.SupportsIndexProjection;
import org.apache.iceberg.relocated.com.google.common.base.MoreObjects;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.types.Types;

/** Mutable {@link StructLike} implementation of {@link DeletionVector}. */
Expand Down Expand Up @@ -49,6 +50,14 @@ private DeletionVectorStruct(DeletionVectorStruct toCopy) {
this.cardinality = toCopy.cardinality;
}

private DeletionVectorStruct(String location, long offset, long sizeInBytes, long cardinality) {
super(BASE_TYPE, BASE_TYPE);
Comment thread
anoopj marked this conversation as resolved.
this.location = location;
this.offset = offset;
this.sizeInBytes = sizeInBytes;
this.cardinality = cardinality;
}

@Override
public String location() {
return location;
Expand Down Expand Up @@ -115,6 +124,10 @@ protected <T> void internalSet(int pos, T value) {
}
}

static Builder builder() {
return new Builder();
}

@Override
public String toString() {
return MoreObjects.toStringHelper(this)
Expand All @@ -124,4 +137,41 @@ public String toString() {
.add("cardinality", cardinality)
.toString();
}

Comment thread
anoopj marked this conversation as resolved.
static class Builder {
private String location = null;
Copy link
Copy Markdown
Contributor

@nastra nastra Apr 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are these all valid values? because we're currently not preventing the builder from creating an instance where e.g. only location is set

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not a valid value. I was initially on the fence about adding validations because the builders are only used for explicit construction (mainly in tests, and not in the read/write path, which uses set(pos, value) directly).

But adding validation in build() makes sense. I'll add checks that required fields have been set, along with tests.

private long offset = -1L;
private long sizeInBytes = -1L;
private long cardinality = -1L;

Builder location(String dvLocation) {
this.location = dvLocation;
return this;
}

Builder offset(long dvOffset) {
this.offset = dvOffset;
return this;
}

Builder sizeInBytes(long dvSizeInBytes) {
this.sizeInBytes = dvSizeInBytes;
return this;
}

Builder cardinality(long dvCardinality) {
this.cardinality = dvCardinality;
return this;
}

DeletionVectorStruct build() {
Preconditions.checkArgument(location != null, "Invalid location: null");
Preconditions.checkArgument(offset >= 0, "Invalid offset: %s (must be >= 0)", offset);
Preconditions.checkArgument(
sizeInBytes >= 0, "Invalid size in bytes: %s (must be >= 0)", sizeInBytes);
Preconditions.checkArgument(
cardinality >= 0, "Invalid cardinality: %s (must be >= 0)", cardinality);
return new DeletionVectorStruct(location, offset, sizeInBytes, cardinality);
}
}
}
153 changes: 153 additions & 0 deletions core/src/main/java/org/apache/iceberg/ManifestInfoStruct.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.util.Arrays;
import org.apache.iceberg.avro.SupportsIndexProjection;
import org.apache.iceberg.relocated.com.google.common.base.MoreObjects;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.ByteBuffers;

Expand Down Expand Up @@ -73,6 +74,32 @@ private ManifestInfoStruct(ManifestInfoStruct toCopy) {
this.dvCardinality = toCopy.dvCardinality;
}

private ManifestInfoStruct(
int addedFilesCount,
int existingFilesCount,
int deletedFilesCount,
int replacedFilesCount,
long addedRowsCount,
long existingRowsCount,
long deletedRowsCount,
long replacedRowsCount,
long minSequenceNumber,
byte[] dv,
Long dvCardinality) {
super(BASE_TYPE, BASE_TYPE);
this.addedFilesCount = addedFilesCount;
this.existingFilesCount = existingFilesCount;
this.deletedFilesCount = deletedFilesCount;
this.replacedFilesCount = replacedFilesCount;
this.addedRowsCount = addedRowsCount;
this.existingRowsCount = existingRowsCount;
this.deletedRowsCount = deletedRowsCount;
this.replacedRowsCount = replacedRowsCount;
this.minSequenceNumber = minSequenceNumber;
this.dv = dv;
this.dvCardinality = dvCardinality;
}

@Override
public int addedFilesCount() {
return addedFilesCount;
Expand Down Expand Up @@ -208,6 +235,10 @@ protected <T> void internalSet(int pos, T value) {
}
}

static Builder builder() {
return new Builder();
}

@Override
public String toString() {
return MoreObjects.toStringHelper(this)
Expand All @@ -224,4 +255,126 @@ public String toString() {
.add("dv_cardinality", dvCardinality == null ? "null" : dvCardinality)
.toString();
}

static class Builder {
private int addedFilesCount = -1;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as in the other builder. Is -1 a valid value across all of these? If not, then we need to add some validation checks inside the build() method

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above. It is not a valid value - added validation

private int existingFilesCount = -1;
private int deletedFilesCount = -1;
private int replacedFilesCount = -1;
private long addedRowsCount = -1L;
private long existingRowsCount = -1L;
private long deletedRowsCount = -1L;
private long replacedRowsCount = -1L;
private long minSequenceNumber = -1L;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in other places we default minSequenceNumber to 0L, so might be worth considering doing that here too. On the other hand, defaulting to -1L forces a caller to always set the minSequenceNumber

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we follow the conventions of BaseFile where we default to -1 to make sure this is set by callers. This was a feedback from @rdblue on #15854

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok then let's go with -1 as the default here, thanks for confirming

private byte[] dv = null;
private Long dvCardinality = null;

Builder addedFilesCount(int count) {
this.addedFilesCount = count;
return this;
}

Builder existingFilesCount(int count) {
this.existingFilesCount = count;
return this;
}

Builder deletedFilesCount(int count) {
this.deletedFilesCount = count;
return this;
}

Builder replacedFilesCount(int count) {
this.replacedFilesCount = count;
return this;
}

Builder addedRowsCount(long count) {
this.addedRowsCount = count;
return this;
}

Builder existingRowsCount(long count) {
this.existingRowsCount = count;
return this;
}

Builder deletedRowsCount(long count) {
this.deletedRowsCount = count;
return this;
}

Builder replacedRowsCount(long count) {
this.replacedRowsCount = count;
return this;
}

Builder minSequenceNumber(long sequenceNumber) {
this.minSequenceNumber = sequenceNumber;
return this;
}

Builder dv(ByteBuffer buffer) {
Comment thread
stevenzwu marked this conversation as resolved.
this.dv = buffer != null ? ByteBuffers.toByteArray(buffer) : null;
return this;
}

Builder dv(byte[] buffer) {
this.dv = buffer;
return this;
}

Builder dvCardinality(Long cardinality) {
this.dvCardinality = cardinality;
return this;
}

ManifestInfoStruct build() {
Preconditions.checkArgument(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For builders, we usually fail as early as possible. For this, it means moving these checks into addedFilesCount(int) and similar methods. This keeps the code more focused: problems that are specific to a value are checked when setting that value, and checks in build are for consistency. For instance, we could have a check here that addedRowsCount == 0 || addedFilesCount > 0

Another good side effect is the caller gets a stack trace with addedFilesCount rather than build. This is minor because we typically see builders configured and completed in one place, but there are some cases like parsing that pass builders around the call stack and it is best to fail immediately to point the caller to the right place in their own code.

I think the rationale for adding the checks here was that we use -1 as a sentinel value that means "not set", which has to be checked in build. But that leads to bad error messages:

ManifestInfoStruct.builder().build()

This fails with Invalid added files count: -1 (must be >= 0) instead of Missing required value: added files count. I appreciate the adherence to doing what BaseFile was doing, but in this case I think the right solution is to use Integer and null for the default, then check the nulls here.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense. Fixing it in #16408

addedFilesCount >= 0, "Invalid added files count: %s (must be >= 0)", addedFilesCount);
Preconditions.checkArgument(
existingFilesCount >= 0,
"Invalid existing files count: %s (must be >= 0)",
existingFilesCount);
Preconditions.checkArgument(
deletedFilesCount >= 0,
"Invalid deleted files count: %s (must be >= 0)",
deletedFilesCount);
Preconditions.checkArgument(
replacedFilesCount >= 0,
"Invalid replaced files count: %s (must be >= 0)",
replacedFilesCount);
Preconditions.checkArgument(
addedRowsCount >= 0, "Invalid added rows count: %s (must be >= 0)", addedRowsCount);
Preconditions.checkArgument(
existingRowsCount >= 0,
"Invalid existing rows count: %s (must be >= 0)",
existingRowsCount);
Preconditions.checkArgument(
deletedRowsCount >= 0, "Invalid deleted rows count: %s (must be >= 0)", deletedRowsCount);
Preconditions.checkArgument(
replacedRowsCount >= 0,
"Invalid replaced rows count: %s (must be >= 0)",
replacedRowsCount);
Preconditions.checkArgument(
minSequenceNumber >= 0,
"Invalid min sequence number: %s (must be >= 0)",
minSequenceNumber);
Preconditions.checkArgument(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A check that the dvCardinality is positive?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed.

(dv == null) == (dvCardinality == null),
"Invalid DV and cardinality: must both be null or non-null");
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the kind of consistency check that belongs in the builder.

Is it okay to set cardinality to 0?

Maybe it doesn't matter because I think we're removing the MDV cardinality.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a validation on dvCardinality for now.

return new ManifestInfoStruct(
addedFilesCount,
existingFilesCount,
deletedFilesCount,
replacedFilesCount,
addedRowsCount,
existingRowsCount,
deletedRowsCount,
replacedRowsCount,
minSequenceNumber,
dv,
dvCardinality);
Comment thread
anoopj marked this conversation as resolved.
}
}
}
102 changes: 98 additions & 4 deletions core/src/main/java/org/apache/iceberg/TrackingStruct.java
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,6 @@ class TrackingStruct extends SupportsIndexProjection implements Tracking, Serial
super(BASE_TYPE, type);
}

TrackingStruct() {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why was this constructor removed? Isn't it needed?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need it for the serialization path. Added back.

super(BASE_TYPE.fields().size());
}

private TrackingStruct(TrackingStruct toCopy) {
super(toCopy);
this.status = toCopy.status;
Expand All @@ -83,6 +79,26 @@ private TrackingStruct(TrackingStruct toCopy) {
this.manifestPos = toCopy.manifestPos;
}

private TrackingStruct(
EntryStatus status,
Long snapshotId,
Long dataSequenceNumber,
Long fileSequenceNumber,
Long dvSnapshotId,
Long firstRowId,
byte[] deletedPositions,
byte[] replacedPositions) {
super(BASE_TYPE, BASE_TYPE);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this going to compare BASE_TYPE to itself and build the same position map that you would get by calling super(BASE_TYPE.fields().size())?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thats correct. Fixed.

this.status = status;
this.snapshotId = snapshotId;
this.dataSequenceNumber = dataSequenceNumber;
this.fileSequenceNumber = fileSequenceNumber;
this.dvSnapshotId = dvSnapshotId;
this.firstRowId = firstRowId;
this.deletedPositions = deletedPositions;
this.replacedPositions = replacedPositions;
}

void inheritFrom(Tracking manifestTracking) {
if (manifestTracking != null) {
if (snapshotId == null) {
Expand Down Expand Up @@ -233,6 +249,10 @@ protected <T> void internalSet(int pos, T value) {
}
}

static Builder builder() {
return new Builder();
}

@Override
public String toString() {
return MoreObjects.toStringHelper(this)
Expand All @@ -246,4 +266,78 @@ public String toString() {
.add("replaced_positions", replacedPositions == null ? "null" : "(binary)")
.toString();
}

static class Builder {
private EntryStatus status = null;
private Long snapshotId = null;
private Long dataSequenceNumber = null;
private Long fileSequenceNumber = null;
private Long dvSnapshotId = null;
private Long firstRowId = null;
Comment thread
nastra marked this conversation as resolved.
private byte[] deletedPositions = null;
private byte[] replacedPositions = null;

Builder status(EntryStatus entryStatus) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Entry status has a specific lifecycle: when files are written, they are ADDED. When read and written again, they move to EXISTING. And when either an ADDED or EXISTING entry is deleted, it is set to DELETED. Also, the snapshotId field is related: it is required for ADDED, must not be modified for EXISTING, and again must be supplied for DELETED. (I think REPLACED will have similar behavior and it is easier to think about the API by omitting it for now.)

I think that it makes more sense to have the builder handle these cases than to allow creating possibly misconfigured Tracking structs. I would handle these requirements in a way that match the lifecycle:

  Builder added(long snapshotId); // creates a new Tracking for an addition
  Builder existing(Tracking); // creates Tracking for an existing file based on Tracking read from a manifest
  Builder deleted(Tracking, long snapshotId); // creates Tracking to delete a file

Passing the existing tracking in makes it so the builder can keep values that must be preserved. And this approach allows the builder to use the initial intent (added/existing/deleted) to perform better validation. For instance, you can't call dvSnapshotId for a deleted entry -- it has to come from the existing/added entry and can't be changed.

I should also note that a builder may not be the right way to create Tracking instances. It could be that we have a builder for ADDED, but then use instance methods to change from one state to the next, like toExisting(...). I think going through the thought exercise of restricting what you can do through the builder is the right path forward. If it doesn't really work then we can consider other approaches.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just wrote this elsewhere and I think it is a good addition to understand the why for my comment above: We don't just want to validate that a single Tracking object is self-consistent. We want to guide how one Tracking object evolves into the next for a TrackedFile. That means we don't want a simple builder to set every field.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Restructured along those lines. The common builder() is replaced with the entry points above. The existing/deleted builders carry sequence numbers etc from source.

this.status = entryStatus;
return this;
}

Builder snapshotId(Long id) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be long right? It is not valid to pass a null into this builder, although I am fine with leaving it null by not calling snapshotId.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Made it long in the added/existing() builders

this.snapshotId = id;
return this;
}

Builder dataSequenceNumber(Long sequenceNumber) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I already commented on another review: #16100 (comment)

To summarize, I don't think that this method or fileSequenceNumber should be part of the builder because those should never be set by callers. There does need to be a way to set dataSequenceNumber when re-sequencing files, but that is a minor case that I think we should handle when updating the table operations. For now, we should remove these.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed both dataSequenceNumber() and fileSequenceNumber() from the builder.

this.dataSequenceNumber = sequenceNumber;
return this;
}

Builder fileSequenceNumber(Long sequenceNumber) {
this.fileSequenceNumber = sequenceNumber;
return this;
}

Builder dvSnapshotId(Long id) {
this.dvSnapshotId = id;
return this;
}

Builder firstRowId(Long rowId) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is another one that should never be set through the builder. It can only be set through inheritance.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed.

this.firstRowId = rowId;
return this;
}

Builder deletedPositions(ByteBuffer positions) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we have an overloaded method that takes byte[] instead of the ByteBuffer? Seems like all calling sites wrap a byte[] into a ByteBuffer just to extract it here again

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense. Done

this.deletedPositions = positions != null ? ByteBuffers.toByteArray(positions) : null;
return this;
}

Builder deletedPositions(byte[] positions) {
this.deletedPositions = positions;
return this;
}

Builder replacedPositions(ByteBuffer positions) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems to me that this is premature. We should know whether we are going to use ByteBuffer or byte[] before making this decision right?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dropped the byte[] overload. The setter takes ByteBuffer only, matching what Tracking returns.

this.replacedPositions = positions != null ? ByteBuffers.toByteArray(positions) : null;
return this;
}

Builder replacedPositions(byte[] positions) {
this.replacedPositions = positions;
return this;
}

TrackingStruct build() {
Preconditions.checkArgument(status != null, "Invalid status: null");
return new TrackingStruct(
status,
snapshotId,
dataSequenceNumber,
fileSequenceNumber,
dvSnapshotId,
firstRowId,
deletedPositions,
replacedPositions);
}
}
}
Loading
Loading