Skip to content

Commit 4b85a22

Browse files
authored
HBASE-29039 Seek past delete markers instead of skipping one at a time (#8001)
When a DeleteColumn or DeleteFamily marker is encountered during a normal user scan, the matcher currently returns SKIP, forcing the scanner to advance one cell at a time. This causes read latency to degrade linearly with the number of accumulated delete markers for the same row or column.

Since these are range deletes that mask all remaining versions of the column, seek past the entire column immediately via columns.getNextRowOrNextColumn(). This is safe because cells arrive in timestamp descending order, so any puts newer than the delete have already been processed.

For DeleteFamily, also fix getKeyForNextColumn in ScanQueryMatcher to bypass the empty-qualifier guard (HBASE-18471) when the cell is a DeleteFamily marker. Without this, the seek barely advances past the current cell instead of jumping to the first real qualified column.

The optimization is only applied with plain ScanDeleteTracker, and skipped when:
- seePastDeleteMarkers is true (KEEP_DELETED_CELLS)
- newVersionBehavior is enabled (sequence IDs determine visibility)
- visibility labels are in use (delete/put label mismatch)

---

Seeking is more expensive than skipping. When each row has only one DeleteFamily or DeleteColumn marker (common case), the seek overhead adds up across many rows, causing performance regression.

Introduce a counter that tracks consecutive range delete markers per row. Only switch from SKIP to SEEK after seeing SEEK_ON_DELETE_MARKER_THRESHOLD (default 10) markers, indicating actual accumulation. This preserves skip performance for the common case while still optimizing the accumulation case.

Signed-off-by: Charles Connell <cconnell@apache.org>
1 parent 7b27d09 commit 4b85a22

3 files changed

Lines changed: 196 additions & 6 deletions

File tree

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,11 @@
1818
package org.apache.hadoop.hbase.regionserver.querymatcher;
1919

2020
import java.io.IOException;
21+
import org.apache.hadoop.hbase.CellUtil;
2122
import org.apache.hadoop.hbase.ExtendedCell;
2223
import org.apache.hadoop.hbase.KeepDeletedCells;
24+
import org.apache.hadoop.hbase.KeyValue;
25+
import org.apache.hadoop.hbase.KeyValueUtil;
2326
import org.apache.hadoop.hbase.PrivateCellUtil;
2427
import org.apache.hadoop.hbase.client.Scan;
2528
import org.apache.hadoop.hbase.regionserver.ScanInfo;
@@ -31,6 +34,14 @@
3134
@InterfaceAudience.Private
3235
public abstract class NormalUserScanQueryMatcher extends UserScanQueryMatcher {
3336

37+
/**
38+
* Number of consecutive range delete markers (DeleteColumn/DeleteFamily) seen before switching
39+
* to seek. Seeking is more expensive than skipping for a single marker, but much faster when
40+
* markers accumulate. This threshold avoids the seek overhead for the common case (one delete per
41+
* row/column) while still kicking in when markers pile up.
42+
*/
43+
static final int SEEK_ON_DELETE_MARKER_THRESHOLD = 10;
44+
3445
/** Keeps track of deletes */
3546
private final DeleteTracker deletes;
3647

@@ -40,18 +51,32 @@ public abstract class NormalUserScanQueryMatcher extends UserScanQueryMatcher {
4051
/** whether time range queries can see rows "behind" a delete */
4152
protected final boolean seePastDeleteMarkers;
4253

54+
/** Whether seek optimization for range delete markers is applicable */
55+
private final boolean canSeekOnDeleteMarker;
56+
57+
/** Count of consecutive range delete markers seen for the same column */
58+
private int rangeDeleteCount;
59+
60+
/** Last range delete cell, for qualifier comparison across consecutive markers */
61+
private ExtendedCell lastDelete;
62+
4363
protected NormalUserScanQueryMatcher(Scan scan, ScanInfo scanInfo, ColumnTracker columns,
4464
boolean hasNullColumn, DeleteTracker deletes, long oldestUnexpiredTS, long now) {
4565
super(scan, scanInfo, columns, hasNullColumn, oldestUnexpiredTS, now);
4666
this.deletes = deletes;
4767
this.get = scan.isGetScan();
4868
this.seePastDeleteMarkers = scanInfo.getKeepDeletedCells() != KeepDeletedCells.FALSE;
69+
this.canSeekOnDeleteMarker =
70+
!seePastDeleteMarkers && deletes.getClass() == ScanDeleteTracker.class;
4971
}
5072

5173
@Override
5274
public void beforeShipped() throws IOException {
5375
super.beforeShipped();
5476
deletes.beforeShipped();
77+
if (lastDelete != null) {
78+
lastDelete = KeyValueUtil.toNewKeyCell(lastDelete);
79+
}
5580
}
5681

5782
@Override
@@ -71,8 +96,31 @@ public MatchCode match(ExtendedCell cell) throws IOException {
7196
if (includeDeleteMarker) {
7297
this.deletes.add(cell);
7398
}
99+
100+
// A DeleteColumn or DeleteFamily masks all remaining cells for this column/family.
101+
// Seek past them instead of skipping one cell at a time, but only after seeing
102+
// enough consecutive markers for the same column to justify the seek overhead.
103+
// Only safe with plain ScanDeleteTracker. Not safe with newVersionBehavior (sequence
104+
// IDs determine visibility), visibility labels (delete/put label mismatch), or
105+
// seePastDeleteMarkers (KEEP_DELETED_CELLS).
106+
if (
107+
canSeekOnDeleteMarker && (typeByte == KeyValue.Type.DeleteFamily.getCode()
108+
|| (typeByte == KeyValue.Type.DeleteColumn.getCode() && cell.getQualifierLength() > 0))
109+
) {
110+
if (lastDelete != null && !CellUtil.matchingQualifier(cell, lastDelete)) {
111+
rangeDeleteCount = 0;
112+
}
113+
lastDelete = cell;
114+
if (++rangeDeleteCount >= SEEK_ON_DELETE_MARKER_THRESHOLD) {
115+
rangeDeleteCount = 0;
116+
return columns.getNextRowOrNextColumn(cell);
117+
}
118+
} else {
119+
rangeDeleteCount = 0;
120+
}
74121
return MatchCode.SKIP;
75122
}
123+
rangeDeleteCount = 0;
76124
returnCode = checkDeleted(deletes, cell);
77125
if (returnCode != null) {
78126
return returnCode;
@@ -83,6 +131,8 @@ public MatchCode match(ExtendedCell cell) throws IOException {
83131
@Override
84132
protected void reset() {
85133
deletes.reset();
134+
rangeDeleteCount = 0;
135+
lastDelete = null;
86136
}
87137

88138
@Override

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -292,12 +292,11 @@ public void setToNewRow(ExtendedCell currentRow) {
292292
public abstract boolean moreRowsMayExistAfter(ExtendedCell cell);
293293

294294
public ExtendedCell getKeyForNextColumn(ExtendedCell cell) {
295-
// We aren't sure whether any DeleteFamily cells exist, so we can't skip to next column.
296-
// TODO: Current way disable us to seek to next column quickly. Is there any better solution?
297-
// see HBASE-18471 for more details
298-
// see TestFromClientSide3#testScanAfterDeletingSpecifiedRow
299-
// see TestFromClientSide3#testScanAfterDeletingSpecifiedRowV2
300-
if (cell.getQualifierLength() == 0) {
295+
// For cells with empty qualifier, we generally can't skip to the next column because
296+
// DeleteFamily cells might exist that we haven't seen yet (see HBASE-18471).
297+
// However, if the cell itself IS a DeleteFamily marker, we know we've already processed it,
298+
// so we can safely seek to the next real column.
299+
if (cell.getQualifierLength() == 0 && !PrivateCellUtil.isDeleteFamily(cell)) {
301300
ExtendedCell nextKey = PrivateCellUtil.createNextOnRowCol(cell);
302301
if (nextKey != cell) {
303302
return nextKey;

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.apache.hadoop.hbase.HConstants;
3131
import org.apache.hadoop.hbase.KeepDeletedCells;
3232
import org.apache.hadoop.hbase.KeyValue;
33+
import org.apache.hadoop.hbase.KeyValue.Type;
3334
import org.apache.hadoop.hbase.PrivateCellUtil;
3435
import org.apache.hadoop.hbase.client.Scan;
3536
import org.apache.hadoop.hbase.filter.FilterBase;
@@ -396,4 +397,144 @@ scanWithFilter, new ScanInfo(this.conf, fam2, 0, 5, ttl, KeepDeletedCells.FALSE,
396397
Cell nextCell = qm.getKeyForNextColumn(lastCell);
397398
assertArrayEquals(nextCell.getQualifierArray(), col4);
398399
}
400+
401+
/**
402+
* After enough consecutive range delete markers, the matcher should switch from SKIP to
403+
* SEEK_NEXT_COL. Point deletes and KEEP_DELETED_CELLS always SKIP.
404+
*/
405+
@Test
406+
public void testSeekOnRangeDelete() throws IOException {
407+
int n = NormalUserScanQueryMatcher.SEEK_ON_DELETE_MARKER_THRESHOLD;
408+
409+
// DeleteColumn: first N-1 SKIP, N-th triggers SEEK_NEXT_COL
410+
assertSeekAfterThreshold(KeepDeletedCells.FALSE, Type.DeleteColumn, n);
411+
412+
// DeleteFamily: same threshold behavior
413+
assertSeekAfterThreshold(KeepDeletedCells.FALSE, Type.DeleteFamily, n);
414+
415+
// Delete (version): always SKIP (point delete, not range)
416+
assertAllSkip(KeepDeletedCells.FALSE, Type.Delete, n + 1);
417+
418+
// KEEP_DELETED_CELLS=TRUE: always SKIP
419+
assertAllSkip(KeepDeletedCells.TRUE, Type.DeleteColumn, n + 1);
420+
}
421+
422+
/**
423+
* DeleteColumn with empty qualifier must not cause seeking past a subsequent DeleteFamily.
424+
* DeleteFamily masks all columns, so it must be tracked by the delete tracker.
425+
*/
426+
@Test
427+
public void testDeleteColumnEmptyQualifierDoesNotSkipDeleteFamily() throws IOException {
428+
long now = EnvironmentEdgeManager.currentTime();
429+
byte[] e = HConstants.EMPTY_BYTE_ARRAY;
430+
UserScanQueryMatcher qm = UserScanQueryMatcher.create(scan, new ScanInfo(this.conf, fam1, 0, 1,
431+
ttl, KeepDeletedCells.FALSE, HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator, false), null,
432+
now - ttl, now, null);
433+
434+
int n = NormalUserScanQueryMatcher.SEEK_ON_DELETE_MARKER_THRESHOLD;
435+
// Feed DCs with empty qualifier past the threshold, then a DF.
436+
// The DF must NOT be seeked past -- it must be SKIP'd so the tracker picks it up.
437+
qm.setToNewRow(new KeyValue(row1, fam1, e, now, Type.DeleteColumn));
438+
for (int i = 0; i < n + 1; i++) {
439+
// Empty qualifier DCs should never trigger seek, regardless of threshold
440+
assertEquals("DC at i=" + i, MatchCode.SKIP,
441+
qm.match(new KeyValue(row1, fam1, e, now - i, Type.DeleteColumn)));
442+
}
443+
KeyValue df = new KeyValue(row1, fam1, e, now - n - 1, Type.DeleteFamily);
444+
KeyValue put = new KeyValue(row1, fam1, col1, now - n - 1, Type.Put, data);
445+
// DF must be processed (SKIP), not seeked past
446+
assertEquals(MatchCode.SKIP, qm.match(df));
447+
// Put in col1 at t=now-n-1 should be masked by DF@t=now-n-1
448+
MatchCode putCode = qm.match(put);
449+
assertEquals(MatchCode.SEEK_NEXT_COL, putCode);
450+
}
451+
452+
/**
453+
* DeleteColumn markers for different qualifiers should not accumulate the seek counter. Only
454+
* consecutive markers for the same qualifier should trigger seeking.
455+
*/
456+
@Test
457+
public void testDeleteColumnDifferentQualifiersDoNotSeek() throws IOException {
458+
long now = EnvironmentEdgeManager.currentTime();
459+
UserScanQueryMatcher qm = UserScanQueryMatcher.create(scan, new ScanInfo(this.conf, fam1, 0, 1,
460+
ttl, KeepDeletedCells.FALSE, HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator, false), null,
461+
now - ttl, now, null);
462+
463+
// DCs for different qualifiers: counter resets on qualifier change, never seeks
464+
qm.setToNewRow(new KeyValue(row1, fam1, col1, now, Type.DeleteColumn));
465+
assertEquals(MatchCode.SKIP, qm.match(new KeyValue(row1, fam1, col1, now, Type.DeleteColumn)));
466+
assertEquals(MatchCode.SKIP,
467+
qm.match(new KeyValue(row1, fam1, col2, now - 1, Type.DeleteColumn)));
468+
assertEquals(MatchCode.SKIP,
469+
qm.match(new KeyValue(row1, fam1, col3, now - 2, Type.DeleteColumn)));
470+
assertEquals(MatchCode.SKIP,
471+
qm.match(new KeyValue(row1, fam1, col4, now - 3, Type.DeleteColumn)));
472+
assertEquals(MatchCode.SKIP,
473+
qm.match(new KeyValue(row1, fam1, col5, now - 4, Type.DeleteColumn)));
474+
}
475+
476+
/**
477+
* Delete markers outside the scan's time range (includeDeleteMarker=false) should still
478+
* accumulate the seek counter and trigger SEEK_NEXT_COL after the threshold.
479+
*/
480+
@Test
481+
public void testSeekOnRangeDeleteOutsideTimeRange() throws IOException {
482+
long now = EnvironmentEdgeManager.currentTime();
483+
long futureTs = now + 1_000_000;
484+
Scan scanWithTimeRange = new Scan(scan).setTimeRange(futureTs, Long.MAX_VALUE);
485+
486+
UserScanQueryMatcher qm = UserScanQueryMatcher.create(scanWithTimeRange,
487+
new ScanInfo(this.conf, fam1, 0, 1, ttl, KeepDeletedCells.FALSE, HConstants.DEFAULT_BLOCKSIZE,
488+
0, rowComparator, false),
489+
null, now - ttl, now, null);
490+
491+
int n = NormalUserScanQueryMatcher.SEEK_ON_DELETE_MARKER_THRESHOLD;
492+
qm.setToNewRow(new KeyValue(row1, fam1, col1, now, Type.DeleteColumn));
493+
// All DCs have timestamps below the time range, so includeDeleteMarker is false.
494+
// The seek counter should still accumulate.
495+
for (int i = 0; i < n - 1; i++) {
496+
assertEquals("DC at i=" + i, MatchCode.SKIP,
497+
qm.match(new KeyValue(row1, fam1, col1, now - i, Type.DeleteColumn)));
498+
}
499+
assertEquals(MatchCode.SEEK_NEXT_COL,
500+
qm.match(new KeyValue(row1, fam1, col1, now - n + 1, Type.DeleteColumn)));
501+
}
502+
503+
private UserScanQueryMatcher createDeleteMatcher(KeepDeletedCells keepDeletedCells)
504+
throws IOException {
505+
long now = EnvironmentEdgeManager.currentTime();
506+
return UserScanQueryMatcher.create(scan, new ScanInfo(this.conf, fam1, 0, 1, ttl,
507+
keepDeletedCells, HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator, false), null, now - ttl,
508+
now, null);
509+
}
510+
511+
/** First n-1 markers SKIP, n-th triggers SEEK_NEXT_COL. */
512+
private void assertSeekAfterThreshold(KeepDeletedCells keepDeletedCells, Type type, int n)
513+
throws IOException {
514+
long now = EnvironmentEdgeManager.currentTime();
515+
UserScanQueryMatcher qm = createDeleteMatcher(keepDeletedCells);
516+
boolean familyLevel = type == Type.DeleteFamily || type == Type.DeleteFamilyVersion;
517+
byte[] qual = familyLevel ? HConstants.EMPTY_BYTE_ARRAY : col1;
518+
qm.setToNewRow(new KeyValue(row1, fam1, qual, now, type));
519+
for (int i = 0; i < n - 1; i++) {
520+
assertEquals("Mismatch at index " + i, MatchCode.SKIP,
521+
qm.match(new KeyValue(row1, fam1, qual, now - i, type)));
522+
}
523+
assertEquals("Expected SEEK_NEXT_COL at index " + (n - 1), MatchCode.SEEK_NEXT_COL,
524+
qm.match(new KeyValue(row1, fam1, qual, now - n + 1, type)));
525+
}
526+
527+
/** All markers should SKIP regardless of count. */
528+
private void assertAllSkip(KeepDeletedCells keepDeletedCells, Type type, int count)
529+
throws IOException {
530+
long now = EnvironmentEdgeManager.currentTime();
531+
UserScanQueryMatcher qm = createDeleteMatcher(keepDeletedCells);
532+
boolean familyLevel = type == Type.DeleteFamily || type == Type.DeleteFamilyVersion;
533+
byte[] qual = familyLevel ? HConstants.EMPTY_BYTE_ARRAY : col1;
534+
qm.setToNewRow(new KeyValue(row1, fam1, qual, now, type));
535+
for (int i = 0; i < count; i++) {
536+
assertEquals("Mismatch at index " + i, MatchCode.SKIP,
537+
qm.match(new KeyValue(row1, fam1, qual, now - i, type)));
538+
}
539+
}
399540
}

0 commit comments

Comments
 (0)