From 1070315a0be555cab02d70781bbb38ef3de3bc26 Mon Sep 17 00:00:00 2001 From: JSap0914 Date: Tue, 16 Jun 2026 18:48:56 +0900 Subject: [PATCH] fix: prevent MarkdownHeaderLevelInferrer regex from consuming newlines The header pattern used `\s+` between the `#` characters and the heading text, which matches any whitespace including newlines. When a header line had no text after the `#` characters (e.g. `## `, or a bare `##`), the `\s+` could match through the end of that line *and* the following newline, after which `(.+?)` lazily expanded over the next line. This caused an empty pseudo-header and the following real header to be treated as a single match, rewriting `## \n## Section\nContent` to the invalid `# ## Section\nContent` (a hash character embedded inside the heading text). Fix: replace `\s+` / `(?:\s*)` with `[ \t]+` / `(?:[ \t]*)` so that the pattern is restricted to horizontal whitespace only and cannot span line boundaries. All 14 existing tests continue to pass; two new regression tests cover the fixed cases. 
--- .../preprocessors/md_header_level_inferrer.py | 6 ++-- .../test_markdown_header_level_inferrer.py | 28 +++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/haystack_experimental/components/preprocessors/md_header_level_inferrer.py b/haystack_experimental/components/preprocessors/md_header_level_inferrer.py index 4625bbca..adfbf815 100644 --- a/haystack_experimental/components/preprocessors/md_header_level_inferrer.py +++ b/haystack_experimental/components/preprocessors/md_header_level_inferrer.py @@ -39,8 +39,10 @@ class MarkdownHeaderLevelInferrer: def __init__(self): """Initializes the MarkdownHeaderLevelInferrer.""" - # handles headers with optional trailing spaces and empty content - self._header_pattern = re.compile(r"(?m)^(#{1,6})\s+(.+?)(?:\s*)$") + # handles headers with optional trailing spaces; uses [ \t]+ (not \s+) to avoid + # consuming newlines, which would incorrectly merge a whitespace-only header line + # with the following header and produce invalid Markdown (e.g. "# # Title"). + self._header_pattern = re.compile(r"(?m)^(#{1,6})[ \t]+(.+?)(?:[ \t]*)$") @component.output_types(documents=list[Document]) def run(self, documents: list[Document]) -> dict: diff --git a/test/components/preprocessors/test_markdown_header_level_inferrer.py b/test/components/preprocessors/test_markdown_header_level_inferrer.py index 5aeaadf8..64522952 100644 --- a/test/components/preprocessors/test_markdown_header_level_inferrer.py +++ b/test/components/preprocessors/test_markdown_header_level_inferrer.py @@ -160,3 +160,31 @@ def test_very_long_content(): assert content.startswith("# Header 0") assert "# Header 1" in content assert len(content.split("\n")) == len(text.split("\n")) + + +def test_whitespace_only_header_does_not_merge_with_next_header(): + """Regression: a header line with only whitespace after the '#' characters (e.g. '## ') + must not be merged with the following header line. 
+ + Before the fix, the regex used \\s+ which could consume a trailing newline, causing + '## \\n## Section\\nContent' to produce '# ## Section\\nContent' (invalid Markdown). + """ + inferrer = MarkdownHeaderLevelInferrer() + text = "## \n## Section\nContent" + result = inferrer.run([Document(content=text)]) + output = result["documents"][0].content + # The whitespace-only first line is not a valid header and must be left untouched. + # '## Section' is the first matchable header, so it becomes level 1. + assert "# ## Section" not in output, f"Whitespace-only header merged with next header. Got: {output!r}" + assert output == "## \n# Section\nContent", f"Unexpected output: {output!r}" + + +def test_whitespace_only_header_does_not_produce_invalid_markdown(): + """Regression: '# \\n# Second' must not produce '# # Second' (a hash in heading text).""" + inferrer = MarkdownHeaderLevelInferrer() + text = "# \n# Second" + result = inferrer.run([Document(content=text)]) + output = result["documents"][0].content + assert output != "# # Second", f"Regex consumed newline and merged headers. Got: {output!r}" + # '# Second' is the first real header; it should remain at level 1. + assert output == "# \n# Second", f"Unexpected output: {output!r}"