From 1070315a0be555cab02d70781bbb38ef3de3bc26 Mon Sep 17 00:00:00 2001
From: JSap0914 <JSap0914@users.noreply.github.com>
Date: Tue, 16 Jun 2026 18:48:56 +0900
Subject: [PATCH] fix: prevent MarkdownHeaderLevelInferrer regex from consuming
 newlines

The header pattern used `\s+` between the `#` characters and the heading
text, and `\s` matches any whitespace, including newlines.  When a line
consisted only of `#` characters followed by whitespace (e.g. `## `), the
`\s+` consumed both the trailing space *and* the following newline, after
which the lazy `(.+?)` group captured the next header line as the heading
text.  The whitespace-only pseudo-header and the real header after it were
therefore treated as a single match, rewriting `## \n## Section\nContent`
to the incorrect `# ## Section\nContent` (the second header's `##` markers
embedded in the heading text).

Fix: replace `\s+` / `(?:\s*)` with `[ \t]+` / `(?:[ \t]*)` so that
the pattern is restricted to horizontal whitespace only and cannot span line
boundaries.  All 14 existing tests continue to pass; two new regression tests
cover the fixed cases.
---
 .../preprocessors/md_header_level_inferrer.py |  6 ++--
 .../test_markdown_header_level_inferrer.py    | 28 +++++++++++++++++++
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/haystack_experimental/components/preprocessors/md_header_level_inferrer.py b/haystack_experimental/components/preprocessors/md_header_level_inferrer.py
index 4625bbca..adfbf815 100644
--- a/haystack_experimental/components/preprocessors/md_header_level_inferrer.py
+++ b/haystack_experimental/components/preprocessors/md_header_level_inferrer.py
@@ -39,8 +39,10 @@ class MarkdownHeaderLevelInferrer:
 
     def __init__(self):
         """Initializes the MarkdownHeaderLevelInferrer."""
-        # handles headers with optional trailing spaces and empty content
-        self._header_pattern = re.compile(r"(?m)^(#{1,6})\s+(.+?)(?:\s*)$")
+        # handles headers with optional trailing spaces; uses [ \t]+ (not \s+) so the match
+        # cannot cross a newline, which would merge a whitespace-only header line with the
+        # following header and capture its '#' markers as heading text (e.g. "# # Title").
+        self._header_pattern = re.compile(r"(?m)^(#{1,6})[ \t]+(.+?)(?:[ \t]*)$")
 
     @component.output_types(documents=list[Document])
     def run(self, documents: list[Document]) -> dict:
diff --git a/test/components/preprocessors/test_markdown_header_level_inferrer.py b/test/components/preprocessors/test_markdown_header_level_inferrer.py
index 5aeaadf8..64522952 100644
--- a/test/components/preprocessors/test_markdown_header_level_inferrer.py
+++ b/test/components/preprocessors/test_markdown_header_level_inferrer.py
@@ -160,3 +160,31 @@ def test_very_long_content():
     assert content.startswith("# Header 0")
     assert "# Header 1" in content
     assert len(content.split("\n")) == len(text.split("\n"))
+
+
+def test_whitespace_only_header_does_not_merge_with_next_header():
+    """Regression: a header line with only whitespace after the '#' characters (e.g. '## ')
+    must not be merged with the following header line.
+
+    Before the fix, the regex used \\s+ which could consume a trailing newline, causing
+    '## \\n## Section\\nContent' to produce '# ## Section\\nContent' (invalid Markdown).
+    """
+    inferrer = MarkdownHeaderLevelInferrer()
+    text = "## \n## Section\nContent"
+    result = inferrer.run([Document(content=text)])
+    output = result["documents"][0].content
+    # The whitespace-only first line has no heading text, so the pattern does not match it
+    # and it must be left untouched.  '## Section' is the first matched header: level 1.
+    assert "# ## Section" not in output, f"Whitespace-only header merged with next header. Got: {output!r}"
+    assert output == "## \n# Section\nContent", f"Unexpected output: {output!r}"
+
+
+def test_whitespace_only_header_does_not_produce_invalid_markdown():
+    """Regression: '# \\n# Second' must not produce '# # Second' (a hash in heading text)."""
+    inferrer = MarkdownHeaderLevelInferrer()
+    text = "# \n# Second"
+    result = inferrer.run([Document(content=text)])
+    output = result["documents"][0].content
+    assert output != "# # Second", f"Regex consumed newline and merged headers. Got: {output!r}"
+    # '# Second' is the first real header; it should remain at level 1.
+    assert output == "# \n# Second", f"Unexpected output: {output!r}"