9 changes: 8 additions & 1 deletion docs/classes.rst
@@ -5,7 +5,7 @@ Lark
----

.. autoclass:: lark.Lark
:members: open, parse, parse_interactive, lex, save, load, get_terminal, open_from_package
:members: open, parse, parse_interactive, scan, lex, save, load, get_terminal, open_from_package


Using Unicode character classes with ``regex``
@@ -101,3 +101,10 @@ TextSlice
---------

.. autoclass:: lark.utils.TextSlice

ScanMatch
---------

A single match yielded by :meth:`Lark.scan`.

.. autoclass:: lark.parser_frontends.ScanMatch
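
A minimal sketch of consuming these attributes (assuming ``parser`` is any LALR ``Lark`` instance):

.. code-block:: python

    for m in parser.scan("... some text ..."):
        start, end = m.range   # indices into the scanned text
        result = m.value       # a Tree, or the transformer's output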
1 change: 1 addition & 0 deletions docs/features.md
@@ -22,6 +22,7 @@
[Read more about the parsers](parsers.md)

## Extra features
- `Lark.scan()` for finding non-overlapping grammar matches embedded in arbitrary text (LALR only — see [recipes](recipes.html#extract-grammar-matches-from-arbitrary-text-with-lark-scan))
- Support for external regex module ([see here](classes.html#using-unicode-character-classes-with-regex))
- Import grammars from Nearley.js ([read more](tools.html#importing-grammars-from-nearleyjs))
- CYK parser
41 changes: 41 additions & 0 deletions docs/recipes.md
@@ -204,3 +204,44 @@ the `Indenter` class. Take a look at the [indented tree example][indent] as well

[indent]: examples/indented_tree.html
[python]: https://github.com/lark-parser/lark/blob/master/lark/grammars/python.lark


## Extract grammar matches from arbitrary text with `Lark.scan`

Sometimes parsing the entire document isn't feasible. The `scan` method
lets you find every instance of your grammar that appears inside arbitrary text:
dates in prose, JSON payloads in log lines, or configuration placeholders
in templates.

`Lark.scan()` walks the input from left to right, yielding a `ScanMatch` for each non-overlapping snippet that your grammar accepts.
Text that doesn't match is silently skipped.

Example:
```python
from lark import Lark

parser = Lark(r"""
date: MONTH DAY "," YEAR
MONTH: "January"|"February"|"March"|"April"|"May"|"June"
|"July"|"August"|"September"|"October"|"November"|"December"
DAY: /\d+/
YEAR: /\d+/
%ignore /\s+/
""", parser="lalr", start="date")

text = "Python 0.9.0 was released on February 20, 1991, and 1.0 on January 26, 1994."

for match in parser.scan(text):
    print("Found:", match)
```

Prints:

```
Found: ScanMatch(range=(29, 46), value=Tree(Token('RULE', 'date'), [Token('MONTH', 'February'), Token('DAY', '20'), Token('YEAR', '1991')]))
Found: ScanMatch(range=(59, 75), value=Tree(Token('RULE', 'date'), [Token('MONTH', 'January'), Token('DAY', '26'), Token('YEAR', '1994')]))
```
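
Each `ScanMatch` carries the `(start, end)` indices of the match, so you can slice
the raw snippet back out of the original text. A minimal follow-on to the example above:

```python
for match in parser.scan(text):
    start, end = match.range
    print(text[start:end])   # e.g. "February 20, 1991"
```

Matching is greedy: when several end positions would be valid, `scan` yields the longest one.
It is also fastest when the first terminal of a match is rare in the surrounding text, since
every occurrence of a plausible starting terminal triggers a parse attempt.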

For an example scanning text for JSON, see [`examples/advanced/scan_json.py`][scan_json].

[scan_json]: examples/advanced/scan_json.html
84 changes: 84 additions & 0 deletions examples/advanced/scan_json.py
@@ -0,0 +1,84 @@
"""
Extracting JSON payloads from arbitrary text with ``Lark.scan``
================================================================

Shows ``Lark.scan`` finding JSON objects and arrays embedded in text.
JSON has a recursive structure, which makes this task very hard for regex,
but straightforward for a parser.

The grammar and ``Transformer`` here are essentially the same as in
``examples/json_parser.py``.
"""

from lark import Lark, Transformer, v_args


example_text = r'''
[2024-05-10 12:00:01] INFO login: {"user": "alice", "tags": ["admin", "active"]}
[2024-05-10 12:00:05] ERROR auth failed: {
"user": "bob",
"attempt": 3,
"blocked": false,
"reason": "rate-limited"
}
... that was unexpected; checking next event ...
[2024-05-10 12:00:12] INFO cart updated: {
"cart_id": 42,
"items": [
{"sku": "X1", "qty": 2, "price": 9.99},
{"sku": "Y2", "qty": 1, "price": 14.5e1}
],
"promo": null
}
[2024-05-10 12:00:18] INFO batch: ["a", "b", "c"]
[2024-05-10 12:00:30] DEBUG no payload
'''


json_grammar = r"""
?start: object | array

?value: object
| array
| string
| SIGNED_NUMBER -> number
| "true" -> true
| "false" -> false
| "null" -> null

array : "[" (value ("," value)*)? "]"
object : "{" (pair ("," pair)*)? "}"
pair : string ":" value

string : ESCAPED_STRING

%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS

%ignore WS
"""


class TreeToJson(Transformer):
    @v_args(inline=True)
    def string(self, s):
        return s[1:-1].replace('\\"', '"')

    array = list
    pair = tuple
    object = dict
    number = v_args(inline=True)(float)

    null = lambda self, _: None
    true = lambda self, _: True
    false = lambda self, _: False


parser = Lark(json_grammar, parser="lalr", transformer=TreeToJson())


print(f"{'Range':<14} Payload")
print("-" * 90)
for match in parser.scan(example_text):
    print(f"{str(match.range):<14} {match.value!r}")
33 changes: 32 additions & 1 deletion lark/lark.py
@@ -13,7 +13,7 @@
from .tree import ParseTree
from .visitors import Transformer
from typing import Literal
from .parser_frontends import ParsingFrontend
from .parser_frontends import ParsingFrontend, ScanMatch

from .exceptions import ConfigurationError, assert_config, UnexpectedInput
from .utils import Serialize, SerializeMemoizer, FS, logger, TextOrSlice, LarkInput
@@ -736,5 +736,36 @@ def parse(self, text: LarkInput, start: Optional[str]=None, on_error: 'Optional[
raise NotImplementedError("The on_error option is only implemented for the LALR(1) parser.")
return self.parser.parse(text, start=start, on_error=on_error)

def scan(self, text: TextOrSlice, start: Optional[str]=None) -> Iterable['ScanMatch[_Return_T]']:
"""Scan the input text for non-overlapping matches of this grammar.
Only works when ``parser='lalr'`` and without ``postlex``.

Greedy parsing: Where multiple end positions are valid, the longest is returned.

Does not raise on lex or parse errors — invalid input is silently skipped.
Exceptions from user callbacks may still propagate.
When a lexer callback raises ``ValueError``, scan() will treat it as a lex error and abort scanning the match.

For best performance, ensure the first terminal(s) that can be matched by the grammar are unique
in the text and always indicate the start of a match.

A returned match will never start or end with an ignored terminal.

User ``lexer_callbacks`` must preserve source positions on returned tokens — use
``Token.update()`` rather than constructing a fresh ``Token``.

Parameters:
text (TextOrSlice): Text to be scanned, as ``str``, ``bytes``, or a ``TextSlice`` instance.
start (str, optional): Start symbol. Required if Lark was initialized with multiple start symbols.

Yields:
``ScanMatch`` instances, each with a ``range`` (a (start, end) tuple)
and a ``value`` attribute. ``value`` is a ``Tree`` by default, or
whatever the ``transformer`` returns when one was supplied.

See Also: ``Lark.parse()``
"""
return self.parser.scan(text, start=start)


###}
33 changes: 31 additions & 2 deletions lark/lexer.py
@@ -254,13 +254,15 @@ def new_borrow_pos(cls: Type[_T], type_: str, value: Any, borrow_t: 'Token') ->
        return cls(type_, value, borrow_t.start_pos, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos)

    def __reduce__(self):
        return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column))
        return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column,
                                 self.end_line, self.end_column, self.end_pos))

    def __repr__(self):
        return 'Token(%r, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.start_pos, self.line, self.column)
        return Token(self.type, self.value, self.start_pos, self.line, self.column,
                     self.end_line, self.end_column, self.end_pos)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
@@ -396,6 +398,16 @@ def fullmatch(self, text: str) -> Optional[str]:
                return m.lastgroup
        return None

    def search(self, text: TextSlice, pos: int):
        "Find the earliest match, starting at pos"
        # self._mres may hold several compiled regexes (terminals are batched to
        # respect the stdlib re group limit), so take the match that starts earliest.
        best = None
        for mre in self._mres:
            m = mre.search(text.text, pos, text.end)
            if m and (best is None or m.start() < best.start()):
                best = m
        if best is not None:
            return (best.group(0), best.lastgroup), best.start()

def _regexp_has_newline(r: str):
    r"""Expressions that may indicate newlines in a regexp:
        - newlines (\n)
@@ -485,6 +497,9 @@ class Lexer(ABC):
    def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
        return NotImplemented

    def search_start(self, text: TextSlice, start_state: Any, pos: int) -> Optional[Token]:
        raise TypeError("This lexer cannot be used for searching in text")

    def make_lexer_state(self, text: str):
        "Deprecated"
        return LexerState(TextSlice.cast_from(text))
@@ -645,6 +660,17 @@ def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
        # EOF
        raise EOFError(self)

    def search_start(self, text: TextSlice, start_state: Any, pos: int) -> Optional[Token]:
        while True:
            res = self.scanner.search(text, pos)
            if not res:
                return None
            (value, type_), actual_pos = res
            if type_ in self.ignore_types:
                # An ignored terminal can't begin a match; skip past it and keep searching
                pos = actual_pos + len(value)
                continue
            return Token(type_, value, actual_pos, end_pos=actual_pos + len(value))


class ContextualLexer(Lexer):
    lexers: Dict[int, AbstractBasicLexer]
@@ -699,4 +725,7 @@ def lex(self, lexer_state: LexerState, parser_state: 'ParserState') -> Iterator[
            except UnexpectedCharacters:
                raise e  # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set.

    def search_start(self, text: TextSlice, start_state: Any, pos: int) -> Optional[Token]:
        # Delegate to the lexer specialized for the given parse-table start state
        return self.lexers[start_state].search_start(text, start_state, pos)

###}
88 changes: 86 additions & 2 deletions lark/parser_frontends.py
@@ -1,6 +1,8 @@
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING
from copy import copy
from dataclasses import dataclass
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING, Generic, Iterable, Tuple, TypeVar

from .exceptions import ConfigurationError, GrammarError, assert_config
from .exceptions import ConfigurationError, GrammarError, LexError, UnexpectedInput, assert_config
from .utils import get_regexp_width, Serialize, TextOrSlice, TextSlice, LarkInput
from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
from .parsers import earley, xearley, cyk
@@ -14,6 +16,21 @@

###{standalone

T = TypeVar('T')

@dataclass(frozen=True)
class ScanMatch(Generic[T]):
    """A non-overlapping match found by ``Lark.scan()``.

    Attributes:
        range: A ``(start, end)`` tuple of the indices in the input text.
        value: The parse result. A ``Tree`` by default, or whatever the
            ``transformer`` returns when one was supplied to Lark.
    """
    range: Tuple[int, int]
    value: T


def _wrap_lexer(lexer_class):
    future_interface = getattr(lexer_class, '__future_interface__', 0)
    if future_interface == 2:
@@ -139,6 +156,73 @@ def parse_interactive(self, text: Optional[TextOrSlice]=None, start=None):
        stream = self._make_lexer_thread(text)
        return self.parser.parse_interactive(stream, chosen_start)

    def scan(self, text: TextOrSlice, start: Optional[str]=None) -> Iterable[ScanMatch]:
        """See ``Lark.scan``."""
        if self.parser_conf.parser_type != 'lalr':
            raise ConfigurationError("scan() requires parser='lalr'")
        if self.skip_lexer:
            raise ConfigurationError("scan() does not support lexer='dynamic'/'dynamic_complete'")
        if self.lexer_conf.postlex is not None:
            # postlex carries state across the stream (indent depth, paren nesting); mid-stream parses break it.
            raise ConfigurationError("scan() does not support postlex")
        chosen_start = self._verify_start(start)
        start_state = self.parser._parse_table.start_states[chosen_start]
        text_slice = TextSlice.cast_from(text)
        pos = text_slice.start
        while True:
            # Search for a plausible start
            first_token = self.lexer.search_start(text_slice, start_state, pos)
            if first_token is None:
                return
            assert first_token.start_pos is not None and first_token.start_pos >= text_slice.start
            assert first_token.end_pos is not None and first_token.end_pos <= text_slice.end

            # Parse without callbacks, to keep the value stack minimal and avoid expensive deepcopies.
            # Aim for the longest possible match, and save the tokens we lex for later replay.
            stunted_ip = self.parse_interactive(text_slice.start_from(first_token.start_pos), start=chosen_start)
            stunted_ip.parser_state.parse_conf = copy(stunted_ip.parser_state.parse_conf)
            stunted_ip.parser_state.parse_conf.callbacks = {}
            matched_tokens = []
            longest_match = 0  # number of tokens in the longest accepted prefix
            token_stream = stunted_ip.lexer_thread.lex(stunted_ip.parser_state)
            try:
                for token in token_stream:
                    stunted_ip.feed_token(token)
                    matched_tokens.append(token)
                    # Test if we reached a possible completed parse
                    if '$END' in stunted_ip.choices():
                        tmp_ip = stunted_ip.copy(deepcopy_values=False)
                        try:
                            tmp_ip.feed_eof(token)
                        except UnexpectedInput:
                            continue
                        longest_match = len(matched_tokens)
                        # Keep going and testing for candidates, until the parse ends or fails
            except UnexpectedInput:
                # Parse failed
                pass
            except ValueError:
                # A user lexer-callback raised an error
                pass

            if longest_match:
                # Match found! Replay the tokens with the real callbacks, and yield the result
                last = matched_tokens[longest_match - 1]
                if last.end_pos is None:
                    raise LexError(
                        f"Lexer callback for {last.type!r} did not preserve token positions; "
                        f"scan() requires source positions on every token (use Token.update() in callbacks).")
                replay_ip = self.parse_interactive(start=chosen_start)
                for t in matched_tokens[:longest_match]:
                    replay_ip.feed_token(t)
                res = replay_ip.feed_eof(last)
                yield ScanMatch((first_token.start_pos, last.end_pos), res)
                # Resume from the end of the match (no overlaps)
                pos = last.end_pos
            else:
                # No match found. Scan again from the next character
                pos = first_token.start_pos + 1


def _validate_frontend_args(parser, lexer) -> None:
    assert_config(parser, ('lalr', 'earley', 'cyk'))
3 changes: 3 additions & 0 deletions lark/utils.py
@@ -215,6 +215,9 @@ def cast_from(cls, text: 'TextOrSlice') -> 'TextSlice[AnyStr]':
    def is_complete_text(self):
        return self.start == 0 and self.end == len(self.text)

    def start_from(self, pos: int) -> 'TextSlice[AnyStr]':
        return TextSlice(self.text, pos, self.end)

    def __len__(self):
        return self.end - self.start

1 change: 1 addition & 0 deletions tests/__main__.py
@@ -13,6 +13,7 @@
from .test_tree_forest_transformer import TestTreeForestTransformer
from .test_lexer import TestLexer
from .test_python_grammar import TestPythonParser
from .test_scan import TestScan
from .test_tree_templates import * # We define __all__ to list which TestSuites to run

try: