From 2f81f987cd258c462d7cb0be278206b570bbedd4 Mon Sep 17 00:00:00 2001 From: web-dev0521 Date: Sun, 19 Apr 2026 19:20:40 -0700 Subject: [PATCH 1/7] Fix: MinerU parsed images are obscured when using MinerU as a parser. (#14197) --- deepdoc/parser/mineru_parser.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 25a0627ff41..f8bc9992b7a 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -416,7 +416,11 @@ def crop(self, text, ZM=1, need_position=False): ) positions = [] + # Each entry is (PIL.Image, is_context) so overlay logic is decoupled from final index. + imgs_with_flags = [] + poss_last_idx = len(poss) - 1 for ii, (pns, left, right, top, bottom) in enumerate(poss): + is_context = ii == 0 or ii == poss_last_idx right = left + max_width if bottom <= top: @@ -443,8 +447,8 @@ def crop(self, text, ZM=1, need_position=False): if x1 <= x0 or y1 <= y0: continue crop0 = img0.crop((x0, y0, x1, y1)) - imgs.append(crop0) - if 0 < ii < len(poss) - 1: + imgs_with_flags.append((crop0, is_context)) + if 0 < ii < poss_last_idx: positions.append((pns[0] + self.page_from, x0, x1, y0, y1)) bottom -= img0.size[1] @@ -463,25 +467,25 @@ def crop(self, text, ZM=1, need_position=False): bottom -= page.size[1] continue cimgp = page.crop((x0, y0, x1, y1)) - imgs.append(cimgp) - if 0 < ii < len(poss) - 1: + imgs_with_flags.append((cimgp, is_context)) + if 0 < ii < poss_last_idx: positions.append((pn + self.page_from, x0, x1, y0, y1)) bottom -= page.size[1] - if not imgs: + if not imgs_with_flags: if need_position: return None, None return height = 0 - for img in imgs: + for img, _ in imgs_with_flags: height += img.size[1] + GAP height = int(height) - width = int(np.max([i.size[0] for i in imgs])) + width = int(np.max([i.size[0] for i, _ in imgs_with_flags])) pic = Image.new("RGB", (width, height), (245, 245, 245)) height = 0 - for ii, img in enumerate(imgs): - if ii == 0 or ii + 1 == len(imgs): + for img, is_context in imgs_with_flags: + if is_context: img = img.convert("RGBA") overlay = Image.new("RGBA", img.size, (0, 0, 0, 0)) overlay.putalpha(128) From 11fd3839696f2451cf6979bc1934ba6fb5720874 Mon Sep 17 00:00:00 2001 From: web-dev0521 Date: Sun, 19 Apr 2026 19:38:24 -0700 Subject: [PATCH 2/7] Fix: MinerU parsed images are obscured when using MinerU as a parser. (#14197) --- deepdoc/parser/mineru_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index f8bc9992b7a..987056f4206 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -447,6 +447,7 @@ def crop(self, text, ZM=1, need_position=False): if x1 <= x0 or y1 <= y0: continue crop0 = img0.crop((x0, y0, x1, y1)) + self.logger.debug(f"[MinerU] crop: page={pns[0]} coords=({x0},{y0},{x1},{y1}) is_context={is_context} page_count={page_count}") imgs_with_flags.append((crop0, is_context)) if 0 < ii < poss_last_idx: positions.append((pns[0] + self.page_from, x0, x1, y0, y1)) From 53b81e7d1360b0648460cad9d1d0f9f4d46cc131 Mon Sep 17 00:00:00 2001 From: web-dev0521 Date: Sun, 19 Apr 2026 21:22:34 -0700 Subject: [PATCH 3/7] Fix: remove unused imgs variable flagged by ruff --- deepdoc/parser/mineru_parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 987056f4206..72b16de9866 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -358,7 +358,6 @@ def _line_tag(self, bx): return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott) def crop(self, text, ZM=1, need_position=False): - imgs = [] poss = self.extract_positions(text) if not poss: if need_position: From 937a76cc15fad6153df628dce3b2deabf2a0364a Mon Sep 17 00:00:00 2001 From: web-dev0521 Date: Sun, 19 Apr 2026 23:02:16 -0700 Subject: [PATCH 4/7] Fix: simplify overlay logic using boundary indices per reviewer suggestion --- deepdoc/parser/mineru_parser.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 72b16de9866..ba5d6b2c32e 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -415,11 +415,12 @@ def crop(self, text, ZM=1, need_position=False): ) positions = [] - # Each entry is (PIL.Image, is_context) so overlay logic is decoupled from final index. - imgs_with_flags = [] - poss_last_idx = len(poss) - 1 + imgs = [] + head_ctx_end = 0 + tail_ctx_start = 0 for ii, (pns, left, right, top, bottom) in enumerate(poss): - is_context = ii == 0 or ii == poss_last_idx + if ii + 1 == len(poss): + tail_ctx_start = len(imgs) right = left + max_width if bottom <= top: @@ -446,9 +447,8 @@ def crop(self, text, ZM=1, need_position=False): if x1 <= x0 or y1 <= y0: continue crop0 = img0.crop((x0, y0, x1, y1)) - self.logger.debug(f"[MinerU] crop: page={pns[0]} coords=({x0},{y0},{x1},{y1}) is_context={is_context} page_count={page_count}") - imgs_with_flags.append((crop0, is_context)) - if 0 < ii < poss_last_idx: + imgs.append(crop0) + if 0 < ii < len(poss) - 1: positions.append((pns[0] + self.page_from, x0, x1, y0, y1)) bottom -= img0.size[1] @@ -467,25 +467,28 @@ def crop(self, text, ZM=1, need_position=False): bottom -= page.size[1] continue cimgp = page.crop((x0, y0, x1, y1)) - imgs_with_flags.append((cimgp, is_context)) - if 0 < ii < poss_last_idx: + imgs.append(cimgp) + if 0 < ii < len(poss) - 1: positions.append((pn + self.page_from, x0, x1, y0, y1)) bottom -= page.size[1] - if not imgs_with_flags: + if ii == 0: + head_ctx_end = len(imgs) + + if not imgs: if need_position: return None, None return height = 0 - for img, _ in imgs_with_flags: + for img in imgs: height += img.size[1] + GAP height = int(height) - width = int(np.max([i.size[0] for i, _ in imgs_with_flags])) + width = int(np.max([i.size[0] for i in imgs])) pic = Image.new("RGB", (width, height), (245, 245, 245)) height = 0 - for img, is_context in imgs_with_flags: - if is_context: + for ii, img in enumerate(imgs): + if ii < head_ctx_end or ii >= tail_ctx_start: img = img.convert("RGBA") overlay = Image.new("RGBA", img.size, (0, 0, 0, 0)) overlay.putalpha(128) From 0ea4150eaca536a93b6368967b22fd9ce64e14fc Mon Sep 17 00:00:00 2001 From: web-dev0521 Date: Sun, 19 Apr 2026 23:38:24 -0700 Subject: [PATCH 5/7] Test: add unit tests covering both-strips-skipped scenario for #14197 --- .../deepdoc/parser/test_mineru_crop.py | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 test/unit_test/deepdoc/parser/test_mineru_crop.py diff --git a/test/unit_test/deepdoc/parser/test_mineru_crop.py b/test/unit_test/deepdoc/parser/test_mineru_crop.py new file mode 100644 index 00000000000..35279a7c9fb --- /dev/null +++ b/test/unit_test/deepdoc/parser/test_mineru_crop.py @@ -0,0 +1,124 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Unit tests for MinerUParser.crop() overlay logic. + +Regression tests for issue #14197: content images were incorrectly darkened +when the top context strip had zero height and was skipped. +""" + +import logging +from unittest.mock import MagicMock + +import pytest +from PIL import Image + +from deepdoc.parser.mineru_parser import MinerUParser + + +def _make_parser(page_images): + parser = MinerUParser.__new__(MinerUParser) + parser.page_images = page_images + parser.page_from = 0 + parser.logger = logging.getLogger("test_mineru_crop") + return parser + + +def _solid_image(color, size=(500, 800)): + return Image.new("RGB", size, color) + + +def _sample_center(img): + w, h = img.size + return img.getpixel((w // 2, h // 2)) + + +class TestCropOverlay: + def test_content_not_darkened_when_top_context_skipped(self): + """ + Bug #14197: image at top of page (top=0) produces a zero-height top + context strip that is skipped. The content image must NOT be darkened. + """ + RED = (255, 0, 0) + parser = _make_parser([_solid_image(RED)]) + + # top=0 → context strip above has zero height → skipped + tag = "@@1\t50.0\t450.0\t0.0\t200.0##" + result = parser.crop(tag) + + assert result is not None + r, g, b = _sample_center(result) + # darkened overlay would halve the red channel to ~127 + assert r > 200, f"Content image darkened (r={r}); overlay incorrectly applied to content" + + def test_content_not_darkened_when_image_near_top(self): + """Image within GAP(6px) of page top also produces zero-height top strip.""" + RED = (255, 0, 0) + parser = _make_parser([_solid_image(RED)]) + + # top=4 → max(4-6, 0)=0 → zero-height top context strip + tag = "@@1\t50.0\t450.0\t4.0\t200.0##" + result = parser.crop(tag) + + assert result is not None + r, g, b = _sample_center(result) + assert r > 200, f"Content image darkened (r={r}); overlay incorrectly applied to content" + + def test_context_strips_are_darkened(self): + """Context strips above and below content must receive the overlay.""" + WHITE = (255, 255, 255) + parser = _make_parser([_solid_image(WHITE, size=(500, 800))]) + + # Content in the middle of the page — both context strips are valid + tag = "@@1\t50.0\t450.0\t300.0\t400.0##" + result = parser.crop(tag) + + assert result is not None + # Top-most pixel row should be from the darkened top context strip + r, g, b = result.getpixel((result.size[0] // 2, 0)) + assert r < 200, f"Top context strip not darkened (r={r})" + + def test_single_image_not_darkened_when_both_context_strips_skipped(self): + """ + Core bug from #14197: when both context strips are skipped (len(imgs)==1), + the single content image must NOT receive the overlay. + Original code: ii==0 AND ii+1==len(imgs) both True → always darkened. + """ + RED = (255, 0, 0) + # Page height = 200px; content fills full page → both context strips zero-height + parser = _make_parser([_solid_image(RED, size=(500, 200))]) + + # top=0, bottom=200 = page height → both context strips are zero-height → skipped + tag = "@@1\t50.0\t450.0\t0.0\t200.0##" + result = parser.crop(tag) + + assert result is not None + r, g, b = _sample_center(result) + # Before fix: r≈127 (darkened). After fix: r≈255 (clear). + assert r > 200, f"Single content image darkened (r={r}); both-strips-skipped bug not fixed" + + def test_multi_page_content_not_darkened(self): + """Content spanning multiple pages must not be darkened.""" + RED = (255, 0, 0) + parser = _make_parser([_solid_image(RED), _solid_image(RED)]) + + # top=0 on page 1, spans to page 2 + tag = "@@1-2\t50.0\t450.0\t0.0\t100.0##" + result = parser.crop(tag) + + assert result is not None + r, g, b = _sample_center(result) + assert r > 200, f"Multi-page content image darkened (r={r})" From b6eb172cb5ab0d35b48b6f126f127f2c6dd3b60f Mon Sep 17 00:00:00 2001 From: web-dev0521 Date: Sun, 19 Apr 2026 23:40:36 -0700 Subject: [PATCH 6/7] Fix: remove unused imports flagged by ruff in test_mineru_crop.py --- test/unit_test/deepdoc/parser/test_mineru_crop.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/unit_test/deepdoc/parser/test_mineru_crop.py b/test/unit_test/deepdoc/parser/test_mineru_crop.py index 35279a7c9fb..01553817b9d 100644 --- a/test/unit_test/deepdoc/parser/test_mineru_crop.py +++ b/test/unit_test/deepdoc/parser/test_mineru_crop.py @@ -21,9 +21,7 @@ """ import logging -from unittest.mock import MagicMock -import pytest from PIL import Image from deepdoc.parser.mineru_parser import MinerUParser From e62b21d6b2a69ca862f8b0291040f89d102f51c9 Mon Sep 17 00:00:00 2001 From: web-dev0521 Date: Mon, 20 Apr 2026 00:05:28 -0700 Subject: [PATCH 7/7] Fix: add pytest priority markers to test_mineru_crop.py --- test/unit_test/deepdoc/parser/test_mineru_crop.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/unit_test/deepdoc/parser/test_mineru_crop.py b/test/unit_test/deepdoc/parser/test_mineru_crop.py index 01553817b9d..dfd0e68812e 100644 --- a/test/unit_test/deepdoc/parser/test_mineru_crop.py +++ b/test/unit_test/deepdoc/parser/test_mineru_crop.py @@ -22,6 +22,7 @@ import logging +import pytest from PIL import Image from deepdoc.parser.mineru_parser import MinerUParser @@ -45,6 +46,7 @@ def _sample_center(img): class TestCropOverlay: + @pytest.mark.p2 def test_content_not_darkened_when_top_context_skipped(self): """ Bug #14197: image at top of page (top=0) produces a zero-height top @@ -62,6 +64,7 @@ def test_content_not_darkened_when_top_context_skipped(self): # darkened overlay would halve the red channel to ~127 assert r > 200, f"Content image darkened (r={r}); overlay incorrectly applied to content" + @pytest.mark.p2 def test_content_not_darkened_when_image_near_top(self): """Image within GAP(6px) of page top also produces zero-height top strip.""" RED = (255, 0, 0) @@ -75,6 +78,7 @@ def test_content_not_darkened_when_image_near_top(self): r, g, b = _sample_center(result) assert r > 200, f"Content image darkened (r={r}); overlay incorrectly applied to content" + @pytest.mark.p2 def test_context_strips_are_darkened(self): """Context strips above and below content must receive the overlay.""" WHITE = (255, 255, 255) @@ -89,6 +93,7 @@ def test_context_strips_are_darkened(self): r, g, b = result.getpixel((result.size[0] // 2, 0)) assert r < 200, f"Top context strip not darkened (r={r})" + @pytest.mark.p1 def test_single_image_not_darkened_when_both_context_strips_skipped(self): """ Core bug from #14197: when both context strips are skipped (len(imgs)==1), @@ -108,6 +113,7 @@ def test_single_image_not_darkened_when_both_context_strips_skipped(self): # Before fix: r≈127 (darkened). After fix: r≈255 (clear). assert r > 200, f"Single content image darkened (r={r}); both-strips-skipped bug not fixed" + @pytest.mark.p2 def test_multi_page_content_not_darkened(self): """Content spanning multiple pages must not be darkened.""" RED = (255, 0, 0)