Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions deepdoc/parser/mineru_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,6 @@ def _line_tag(self, bx):
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott)

def crop(self, text, ZM=1, need_position=False):
imgs = []
poss = self.extract_positions(text)
if not poss:
if need_position:
Expand Down Expand Up @@ -416,7 +415,12 @@ def crop(self, text, ZM=1, need_position=False):
)

positions = []
imgs = []
head_ctx_end = 0
tail_ctx_start = 0
for ii, (pns, left, right, top, bottom) in enumerate(poss):
if ii + 1 == len(poss):
tail_ctx_start = len(imgs)
right = left + max_width

if bottom <= top:
Expand Down Expand Up @@ -468,6 +472,9 @@ def crop(self, text, ZM=1, need_position=False):
positions.append((pn + self.page_from, x0, x1, y0, y1))
bottom -= page.size[1]

if ii == 0:
head_ctx_end = len(imgs)

if not imgs:
if need_position:
return None, None
Expand All @@ -481,7 +488,7 @@ def crop(self, text, ZM=1, need_position=False):
pic = Image.new("RGB", (width, height), (245, 245, 245))
height = 0
for ii, img in enumerate(imgs):
if ii == 0 or ii + 1 == len(imgs):
if ii < head_ctx_end or ii >= tail_ctx_start:
img = img.convert("RGBA")
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
overlay.putalpha(128)
Expand Down
128 changes: 128 additions & 0 deletions test/unit_test/deepdoc/parser/test_mineru_crop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Unit tests for MinerUParser.crop() overlay logic.

Regression tests for issue #14197: content images were incorrectly darkened
when the top context strip had zero height and was skipped.
"""

import logging

import pytest
from PIL import Image

from deepdoc.parser.mineru_parser import MinerUParser
Comment thread
coderabbitai[bot] marked this conversation as resolved.


def _make_parser(page_images):
    """Build a MinerUParser for crop() tests without running ``__init__``.

    The instance is allocated with ``__new__`` and then given only the
    attributes assigned here: ``page_images`` (the caller's list),
    ``page_from`` (0) and a ``logger``.
    """
    stub = MinerUParser.__new__(MinerUParser)
    stub.logger = logging.getLogger("test_mineru_crop")
    stub.page_from = 0
    stub.page_images = page_images
    return stub


def _solid_image(color, size=(500, 800)):
    """Return a new RGB image of ``size`` filled entirely with ``color``."""
    mode = "RGB"
    return Image.new(mode, size, color)


def _sample_center(img):
    """Return the pixel value at the integer midpoint of ``img``.

    The midpoint is ``(width // 2, height // 2)`` taken from ``img.size``.
    """
    width, height = img.size
    midpoint = (width // 2, height // 2)
    return img.getpixel(midpoint)


class TestCropOverlay:
    """Checks which parts of the ``MinerUParser.crop()`` output get the dark overlay."""

    @pytest.mark.p2
    def test_content_not_darkened_when_top_context_skipped(self):
        """Issue #14197: content at the very top of a page stays undarkened.

        With top=0 the context strip above the content has zero height and
        is skipped; the content image itself must not receive the overlay.
        """
        RED = (255, 0, 0)
        page = _solid_image(RED)
        mineru = _make_parser([page])

        # top=0 -> zero-height strip above the content, which gets skipped.
        position_tag = "@@1\t50.0\t450.0\t0.0\t200.0##"
        cropped = mineru.crop(position_tag)

        assert cropped is not None
        center_pixel = _sample_center(cropped)
        r, _g, _b = center_pixel
        # An applied overlay would roughly halve the red channel (~127).
        assert r > 200, f"Content image darkened (r={r}); overlay incorrectly applied to content"

    @pytest.mark.p2
    def test_content_not_darkened_when_image_near_top(self):
        """Content within GAP (6px) of the page top also has a zero-height top strip."""
        RED = (255, 0, 0)
        page = _solid_image(RED)
        mineru = _make_parser([page])

        # top=4 -> max(4 - 6, 0) == 0, so the top context strip is zero-height.
        position_tag = "@@1\t50.0\t450.0\t4.0\t200.0##"
        cropped = mineru.crop(position_tag)

        assert cropped is not None
        center_pixel = _sample_center(cropped)
        r, _g, _b = center_pixel
        assert r > 200, f"Content image darkened (r={r}); overlay incorrectly applied to content"

    @pytest.mark.p2
    def test_context_strips_are_darkened(self):
        """The context strips surrounding the content must receive the overlay."""
        WHITE = (255, 255, 255)
        page = _solid_image(WHITE, size=(500, 800))
        mineru = _make_parser([page])

        # Content sits mid-page, so strips above and below are both non-empty.
        position_tag = "@@1\t50.0\t450.0\t300.0\t400.0##"
        cropped = mineru.crop(position_tag)

        assert cropped is not None
        # Row 0 of the result belongs to the darkened top context strip.
        top_row_probe = (cropped.size[0] // 2, 0)
        r, _g, _b = cropped.getpixel(top_row_probe)
        assert r < 200, f"Top context strip not darkened (r={r})"

    @pytest.mark.p1
    def test_single_image_not_darkened_when_both_context_strips_skipped(self):
        """Core case of #14197: a lone content image must not be darkened.

        When both context strips are skipped, ``len(imgs) == 1``. The
        original check (``ii == 0 or ii + 1 == len(imgs)``) matched that
        single image and therefore always darkened it.
        """
        RED = (255, 0, 0)
        # The page is only 200px tall and the content covers all of it, so
        # both the top and the bottom context strips are zero-height.
        page = _solid_image(RED, size=(500, 200))
        mineru = _make_parser([page])

        # top=0 and bottom=200 (the page height) -> both strips skipped.
        position_tag = "@@1\t50.0\t450.0\t0.0\t200.0##"
        cropped = mineru.crop(position_tag)

        assert cropped is not None
        center_pixel = _sample_center(cropped)
        r, _g, _b = center_pixel
        # Darkened (pre-fix) gives r of about 127; clear (post-fix) about 255.
        assert r > 200, f"Single content image darkened (r={r}); both-strips-skipped bug not fixed"

    @pytest.mark.p2
    def test_multi_page_content_not_darkened(self):
        """Content that spans more than one page must not be darkened."""
        RED = (255, 0, 0)
        pages = [_solid_image(RED), _solid_image(RED)]
        mineru = _make_parser(pages)

        # Starts at top=0 on page 1 and continues onto page 2.
        position_tag = "@@1-2\t50.0\t450.0\t0.0\t100.0##"
        cropped = mineru.crop(position_tag)

        assert cropped is not None
        center_pixel = _sample_center(cropped)
        r, _g, _b = center_pixel
        assert r > 200, f"Multi-page content image darkened (r={r})"
Loading