From 2f81f987cd258c462d7cb0be278206b570bbedd4 Mon Sep 17 00:00:00 2001
From: web-dev0521 <jasonpette1783@gmail.com>
Date: Sun, 19 Apr 2026 19:20:40 -0700
Subject: [PATCH 1/7] Fix: MinerU parsed images are obscured when using MinerU
 as a parser. (#14197)

---
 deepdoc/parser/mineru_parser.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py
index 25a0627ff41..f8bc9992b7a 100644
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -416,7 +416,11 @@ def crop(self, text, ZM=1, need_position=False):
         )
 
         positions = []
+        # Each entry is (PIL.Image, is_context) so overlay logic is decoupled from final index.
+        imgs_with_flags = []
+        poss_last_idx = len(poss) - 1
         for ii, (pns, left, right, top, bottom) in enumerate(poss):
+            is_context = ii == 0 or ii == poss_last_idx
             right = left + max_width
 
             if bottom <= top:
@@ -443,8 +447,8 @@ def crop(self, text, ZM=1, need_position=False):
             if x1 <= x0 or y1 <= y0:
                 continue
             crop0 = img0.crop((x0, y0, x1, y1))
-            imgs.append(crop0)
-            if 0 < ii < len(poss) - 1:
+            imgs_with_flags.append((crop0, is_context))
+            if 0 < ii < poss_last_idx:
                 positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
 
             bottom -= img0.size[1]
@@ -463,25 +467,25 @@ def crop(self, text, ZM=1, need_position=False):
                     bottom -= page.size[1]
                     continue
                 cimgp = page.crop((x0, y0, x1, y1))
-                imgs.append(cimgp)
-                if 0 < ii < len(poss) - 1:
+                imgs_with_flags.append((cimgp, is_context))
+                if 0 < ii < poss_last_idx:
                     positions.append((pn + self.page_from, x0, x1, y0, y1))
                 bottom -= page.size[1]
 
-        if not imgs:
+        if not imgs_with_flags:
             if need_position:
                 return None, None
             return
 
         height = 0
-        for img in imgs:
+        for img, _ in imgs_with_flags:
             height += img.size[1] + GAP
         height = int(height)
-        width = int(np.max([i.size[0] for i in imgs]))
+        width = int(np.max([i.size[0] for i, _ in imgs_with_flags]))
         pic = Image.new("RGB", (width, height), (245, 245, 245))
         height = 0
-        for ii, img in enumerate(imgs):
-            if ii == 0 or ii + 1 == len(imgs):
+        for img, is_context in imgs_with_flags:
+            if is_context:
                 img = img.convert("RGBA")
                 overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
                 overlay.putalpha(128)

From 11fd3839696f2451cf6979bc1934ba6fb5720874 Mon Sep 17 00:00:00 2001
From: web-dev0521 <jasonpette1783@gmail.com>
Date: Sun, 19 Apr 2026 19:38:24 -0700
Subject: [PATCH 2/7] Chore: log crop coordinates and is_context flag in
 MinerU crop() for debugging (#14197)

---
 deepdoc/parser/mineru_parser.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py
index f8bc9992b7a..987056f4206 100644
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -447,6 +447,7 @@ def crop(self, text, ZM=1, need_position=False):
             if x1 <= x0 or y1 <= y0:
                 continue
             crop0 = img0.crop((x0, y0, x1, y1))
+            self.logger.debug(f"[MinerU] crop: page={pns[0]} coords=({x0},{y0},{x1},{y1}) is_context={is_context} page_count={page_count}")
             imgs_with_flags.append((crop0, is_context))
             if 0 < ii < poss_last_idx:
                 positions.append((pns[0] + self.page_from, x0, x1, y0, y1))

From 53b81e7d1360b0648460cad9d1d0f9f4d46cc131 Mon Sep 17 00:00:00 2001
From: web-dev0521 <jasonpette1783@gmail.com>
Date: Sun, 19 Apr 2026 21:22:34 -0700
Subject: [PATCH 3/7] Fix: remove unused imgs variable flagged by ruff

---
 deepdoc/parser/mineru_parser.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py
index 987056f4206..72b16de9866 100644
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -358,7 +358,6 @@ def _line_tag(self, bx):
         return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott)
 
     def crop(self, text, ZM=1, need_position=False):
-        imgs = []
         poss = self.extract_positions(text)
         if not poss:
             if need_position:

From 937a76cc15fad6153df628dce3b2deabf2a0364a Mon Sep 17 00:00:00 2001
From: web-dev0521 <jasonpette1783@gmail.com>
Date: Sun, 19 Apr 2026 23:02:16 -0700
Subject: [PATCH 4/7] Fix: simplify overlay logic using boundary indices per
 reviewer suggestion

---
 deepdoc/parser/mineru_parser.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py
index 72b16de9866..ba5d6b2c32e 100644
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -415,11 +415,12 @@ def crop(self, text, ZM=1, need_position=False):
         )
 
         positions = []
-        # Each entry is (PIL.Image, is_context) so overlay logic is decoupled from final index.
-        imgs_with_flags = []
-        poss_last_idx = len(poss) - 1
+        imgs = []
+        head_ctx_end = 0
+        tail_ctx_start = 0
         for ii, (pns, left, right, top, bottom) in enumerate(poss):
-            is_context = ii == 0 or ii == poss_last_idx
+            if ii + 1 == len(poss):
+                tail_ctx_start = len(imgs)
             right = left + max_width
 
             if bottom <= top:
@@ -446,9 +447,8 @@ def crop(self, text, ZM=1, need_position=False):
             if x1 <= x0 or y1 <= y0:
                 continue
             crop0 = img0.crop((x0, y0, x1, y1))
-            self.logger.debug(f"[MinerU] crop: page={pns[0]} coords=({x0},{y0},{x1},{y1}) is_context={is_context} page_count={page_count}")
-            imgs_with_flags.append((crop0, is_context))
-            if 0 < ii < poss_last_idx:
+            imgs.append(crop0)
+            if 0 < ii < len(poss) - 1:
                 positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
 
             bottom -= img0.size[1]
@@ -467,25 +467,28 @@ def crop(self, text, ZM=1, need_position=False):
                     bottom -= page.size[1]
                     continue
                 cimgp = page.crop((x0, y0, x1, y1))
-                imgs_with_flags.append((cimgp, is_context))
-                if 0 < ii < poss_last_idx:
+                imgs.append(cimgp)
+                if 0 < ii < len(poss) - 1:
                     positions.append((pn + self.page_from, x0, x1, y0, y1))
                 bottom -= page.size[1]
 
-        if not imgs_with_flags:
+            if ii == 0:
+                head_ctx_end = len(imgs)
+
+        if not imgs:
             if need_position:
                 return None, None
             return
 
         height = 0
-        for img, _ in imgs_with_flags:
+        for img in imgs:
             height += img.size[1] + GAP
         height = int(height)
-        width = int(np.max([i.size[0] for i, _ in imgs_with_flags]))
+        width = int(np.max([i.size[0] for i in imgs]))
         pic = Image.new("RGB", (width, height), (245, 245, 245))
         height = 0
-        for img, is_context in imgs_with_flags:
-            if is_context:
+        for ii, img in enumerate(imgs):
+            if ii < head_ctx_end or ii >= tail_ctx_start:
                 img = img.convert("RGBA")
                 overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
                 overlay.putalpha(128)

From 0ea4150eaca536a93b6368967b22fd9ce64e14fc Mon Sep 17 00:00:00 2001
From: web-dev0521 <jasonpette1783@gmail.com>
Date: Sun, 19 Apr 2026 23:38:24 -0700
Subject: [PATCH 5/7] Test: add unit tests covering both-strips-skipped
 scenario for #14197

---
 .../deepdoc/parser/test_mineru_crop.py        | 124 ++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 test/unit_test/deepdoc/parser/test_mineru_crop.py

diff --git a/test/unit_test/deepdoc/parser/test_mineru_crop.py b/test/unit_test/deepdoc/parser/test_mineru_crop.py
new file mode 100644
index 00000000000..35279a7c9fb
--- /dev/null
+++ b/test/unit_test/deepdoc/parser/test_mineru_crop.py
@@ -0,0 +1,124 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""Unit tests for MinerUParser.crop() overlay logic.
+
+Regression tests for issue #14197: content images were incorrectly darkened
+when a context strip (top and/or bottom) had zero height and was skipped.
+"""
+
+import logging
+from unittest.mock import MagicMock
+
+import pytest
+from PIL import Image
+
+from deepdoc.parser.mineru_parser import MinerUParser
+
+
+def _make_parser(page_images):
+    parser = MinerUParser.__new__(MinerUParser)
+    parser.page_images = page_images
+    parser.page_from = 0
+    parser.logger = logging.getLogger("test_mineru_crop")
+    return parser
+
+
+def _solid_image(color, size=(500, 800)):
+    return Image.new("RGB", size, color)
+
+
+def _sample_center(img):
+    w, h = img.size
+    return img.getpixel((w // 2, h // 2))
+
+
+class TestCropOverlay:
+    def test_content_not_darkened_when_top_context_skipped(self):
+        """
+        Bug #14197: image at top of page (top=0) produces a zero-height top
+        context strip that is skipped. The content image must NOT be darkened.
+        """
+        RED = (255, 0, 0)
+        parser = _make_parser([_solid_image(RED)])
+
+        # top=0 → context strip above has zero height → skipped
+        tag = "@@1\t50.0\t450.0\t0.0\t200.0##"
+        result = parser.crop(tag)
+
+        assert result is not None
+        r, g, b = _sample_center(result)
+        # darkened overlay would halve the red channel to ~127
+        assert r > 200, f"Content image darkened (r={r}); overlay incorrectly applied to content"
+
+    def test_content_not_darkened_when_image_near_top(self):
+        """Image within GAP(6px) of page top also produces zero-height top strip."""
+        RED = (255, 0, 0)
+        parser = _make_parser([_solid_image(RED)])
+
+        # top=4 → max(4-6, 0)=0 → zero-height top context strip
+        tag = "@@1\t50.0\t450.0\t4.0\t200.0##"
+        result = parser.crop(tag)
+
+        assert result is not None
+        r, g, b = _sample_center(result)
+        assert r > 200, f"Content image darkened (r={r}); overlay incorrectly applied to content"
+
+    def test_context_strips_are_darkened(self):
+        """Context strips above and below content must receive the overlay."""
+        WHITE = (255, 255, 255)
+        parser = _make_parser([_solid_image(WHITE, size=(500, 800))])
+
+        # Content in the middle of the page — both context strips are valid
+        tag = "@@1\t50.0\t450.0\t300.0\t400.0##"
+        result = parser.crop(tag)
+
+        assert result is not None
+        # Top-most pixel row should be from the darkened top context strip
+        r, g, b = result.getpixel((result.size[0] // 2, 0))
+        assert r < 200, f"Top context strip not darkened (r={r})"
+
+    def test_single_image_not_darkened_when_both_context_strips_skipped(self):
+        """
+        Core bug from #14197: when both context strips are skipped (len(imgs)==1),
+        the single content image must NOT receive the overlay.
+        Original code: ii==0 AND ii+1==len(imgs) both True → always darkened.
+        """
+        RED = (255, 0, 0)
+        # Page height = 200px; content fills full page → both context strips zero-height
+        parser = _make_parser([_solid_image(RED, size=(500, 200))])
+
+        # top=0, bottom=200 = page height → both context strips are zero-height → skipped
+        tag = "@@1\t50.0\t450.0\t0.0\t200.0##"
+        result = parser.crop(tag)
+
+        assert result is not None
+        r, g, b = _sample_center(result)
+        # Before fix: r≈127 (darkened). After fix: r≈255 (clear).
+        assert r > 200, f"Single content image darkened (r={r}); both-strips-skipped bug not fixed"
+
+    def test_multi_page_content_not_darkened(self):
+        """Content spanning multiple pages must not be darkened."""
+        RED = (255, 0, 0)
+        parser = _make_parser([_solid_image(RED), _solid_image(RED)])
+
+        # top=0 on page 1, spans to page 2
+        tag = "@@1-2\t50.0\t450.0\t0.0\t100.0##"
+        result = parser.crop(tag)
+
+        assert result is not None
+        r, g, b = _sample_center(result)
+        assert r > 200, f"Multi-page content image darkened (r={r})"

From b6eb172cb5ab0d35b48b6f126f127f2c6dd3b60f Mon Sep 17 00:00:00 2001
From: web-dev0521 <jasonpette1783@gmail.com>
Date: Sun, 19 Apr 2026 23:40:36 -0700
Subject: [PATCH 6/7] Fix: remove unused imports flagged by ruff in
 test_mineru_crop.py

---
 test/unit_test/deepdoc/parser/test_mineru_crop.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/unit_test/deepdoc/parser/test_mineru_crop.py b/test/unit_test/deepdoc/parser/test_mineru_crop.py
index 35279a7c9fb..01553817b9d 100644
--- a/test/unit_test/deepdoc/parser/test_mineru_crop.py
+++ b/test/unit_test/deepdoc/parser/test_mineru_crop.py
@@ -21,9 +21,7 @@
 """
 
 import logging
-from unittest.mock import MagicMock
 
-import pytest
 from PIL import Image
 
 from deepdoc.parser.mineru_parser import MinerUParser

From e62b21d6b2a69ca862f8b0291040f89d102f51c9 Mon Sep 17 00:00:00 2001
From: web-dev0521 <jasonpette1783@gmail.com>
Date: Mon, 20 Apr 2026 00:05:28 -0700
Subject: [PATCH 7/7] Fix: add pytest priority markers to test_mineru_crop.py

---
 test/unit_test/deepdoc/parser/test_mineru_crop.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/unit_test/deepdoc/parser/test_mineru_crop.py b/test/unit_test/deepdoc/parser/test_mineru_crop.py
index 01553817b9d..dfd0e68812e 100644
--- a/test/unit_test/deepdoc/parser/test_mineru_crop.py
+++ b/test/unit_test/deepdoc/parser/test_mineru_crop.py
@@ -22,6 +22,7 @@
 
 import logging
 
+import pytest
 from PIL import Image
 
 from deepdoc.parser.mineru_parser import MinerUParser
@@ -45,6 +46,7 @@ def _sample_center(img):
 
 
 class TestCropOverlay:
+    @pytest.mark.p2
     def test_content_not_darkened_when_top_context_skipped(self):
         """
         Bug #14197: image at top of page (top=0) produces a zero-height top
@@ -62,6 +64,7 @@ def test_content_not_darkened_when_top_context_skipped(self):
         # darkened overlay would halve the red channel to ~127
         assert r > 200, f"Content image darkened (r={r}); overlay incorrectly applied to content"
 
+    @pytest.mark.p2
     def test_content_not_darkened_when_image_near_top(self):
         """Image within GAP(6px) of page top also produces zero-height top strip."""
         RED = (255, 0, 0)
@@ -75,6 +78,7 @@ def test_content_not_darkened_when_image_near_top(self):
         r, g, b = _sample_center(result)
         assert r > 200, f"Content image darkened (r={r}); overlay incorrectly applied to content"
 
+    @pytest.mark.p2
     def test_context_strips_are_darkened(self):
         """Context strips above and below content must receive the overlay."""
         WHITE = (255, 255, 255)
@@ -89,6 +93,7 @@ def test_context_strips_are_darkened(self):
         r, g, b = result.getpixel((result.size[0] // 2, 0))
         assert r < 200, f"Top context strip not darkened (r={r})"
 
+    @pytest.mark.p1
     def test_single_image_not_darkened_when_both_context_strips_skipped(self):
         """
         Core bug from #14197: when both context strips are skipped (len(imgs)==1),
@@ -108,6 +113,7 @@ def test_single_image_not_darkened_when_both_context_strips_skipped(self):
         # Before fix: r≈127 (darkened). After fix: r≈255 (clear).
         assert r > 200, f"Single content image darkened (r={r}); both-strips-skipped bug not fixed"
 
+    @pytest.mark.p2
     def test_multi_page_content_not_darkened(self):
         """Content spanning multiple pages must not be darkened."""
         RED = (255, 0, 0)