9 changes: 8 additions & 1 deletion docs/classes.rst
@@ -5,7 +5,7 @@ Lark
----

.. autoclass:: lark.Lark
:members: open, parse, parse_interactive, lex, save, load, get_terminal, open_from_package
:members: open, parse, parse_interactive, scan, lex, save, load, get_terminal, open_from_package


Using Unicode character classes with ``regex``
@@ -101,3 +101,10 @@ TextSlice
---------

.. autoclass:: lark.utils.TextSlice

ScanMatch
---------

A single match yielded by :meth:`Lark.scan`.

.. autoclass:: lark.parser_frontends.ScanMatch
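
A minimal sketch of consuming these attributes (assuming ``parser`` is any LALR ``Lark`` instance):

.. code-block:: python

    for m in parser.scan("... some text ..."):
        start, end = m.range   # indices into the scanned text
        result = m.value       # a Tree, or the transformer's output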
1 change: 1 addition & 0 deletions docs/features.md
@@ -22,6 +22,7 @@
[Read more about the parsers](parsers.md)

## Extra features
- `Lark.scan()` for finding non-overlapping grammar matches embedded in arbitrary text (LALR only — see [recipes](recipes.html#extract-grammar-matches-from-arbitrary-text-with-lark-scan))
- Support for external regex module ([see here](classes.html#using-unicode-character-classes-with-regex))
- Import grammars from Nearley.js ([read more](tools.html#importing-grammars-from-nearleyjs))
- CYK parser
41 changes: 41 additions & 0 deletions docs/recipes.md
@@ -204,3 +204,44 @@ the `Indenter` class. Take a look at the [indented tree example][indent] as well

[indent]: examples/indented_tree.html
[python]: https://github.com/lark-parser/lark/blob/master/lark/grammars/python.lark


## Extract grammar matches from arbitrary text with `Lark.scan`

Sometimes parsing the entire document isn't feasible. The `scan` method
lets you find every instance of your grammar that appears inside arbitrary text:
dates in prose, JSON payloads in log lines, or configuration placeholders
in templates.

`Lark.scan()` walks the input from left to right, yielding a `ScanMatch` for each non-overlapping snippet that your grammar accepts.
Text that doesn't match is silently skipped.

Example:
```python
from lark import Lark

parser = Lark(r"""
date: MONTH DAY "," YEAR
MONTH: "January"|"February"|"March"|"April"|"May"|"June"
|"July"|"August"|"September"|"October"|"November"|"December"
DAY: /\d+/
YEAR: /\d+/
%ignore /\s+/
""", parser="lalr", start="date")

text = "Python 0.9.0 was released on February 20, 1991, and 1.0 on January 26, 1994."

for match in parser.scan(text):
    print("Found:", match)
```

Prints:

```
Found: ScanMatch(range=(29, 46), value=Tree(Token('RULE', 'date'), [Token('MONTH', 'February'), Token('DAY', '20'), Token('YEAR', '1991')]))
Found: ScanMatch(range=(59, 75), value=Tree(Token('RULE', 'date'), [Token('MONTH', 'January'), Token('DAY', '26'), Token('YEAR', '1994')]))
```
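
Each `ScanMatch` carries the `(start, end)` indices of the match, so you can slice
the raw snippet back out of the original text. A minimal follow-on to the example above:

```python
for match in parser.scan(text):
    start, end = match.range
    print(text[start:end])   # e.g. "February 20, 1991"
```

Matching is greedy: when several end positions would be valid, `scan` yields the longest one.
It is also fastest when the first terminal of a match is rare in the surrounding text, since
every occurrence of a plausible starting terminal triggers a parse attempt.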

For an example scanning text for JSON, see [`examples/advanced/scan_json.py`][scan_json].

[scan_json]: examples/advanced/scan_json.html
84 changes: 84 additions & 0 deletions examples/advanced/scan_json.py
@@ -0,0 +1,84 @@
"""
Extracting JSON payloads from arbitrary text with ``Lark.scan``
================================================================

Shows ``Lark.scan`` finding JSON objects and arrays embedded in text.
JSON has a recursive structure, which makes this task very hard for regex,
but straightforward for a parser.

The grammar and ``Transformer`` here are essentially the same as in
``examples/json_parser.py``.
"""

from lark import Lark, Transformer, v_args


example_text = r'''
[2024-05-10 12:00:01] INFO login: {"user": "alice", "tags": ["admin", "active"]}
[2024-05-10 12:00:05] ERROR auth failed: {
"user": "bob",
"attempt": 3,
"blocked": false,
"reason": "rate-limited"
}
... that was unexpected; checking next event ...
[2024-05-10 12:00:12] INFO cart updated: {
"cart_id": 42,
"items": [
{"sku": "X1", "qty": 2, "price": 9.99},
{"sku": "Y2", "qty": 1, "price": 14.5e1}
],
"promo": null
}
[2024-05-10 12:00:18] INFO batch: ["a", "b", "c"]
[2024-05-10 12:00:30] DEBUG no payload
'''


json_grammar = r"""
?start: object | array

?value: object
| array
| string
| SIGNED_NUMBER -> number
| "true" -> true
| "false" -> false
| "null" -> null

array : "[" (value ("," value)*)? "]"
object : "{" (pair ("," pair)*)? "}"
pair : string ":" value

string : ESCAPED_STRING

%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS

%ignore WS
"""


class TreeToJson(Transformer):
    @v_args(inline=True)
    def string(self, s):
        return s[1:-1].replace('\\"', '"')

    array = list
    pair = tuple
    object = dict
    number = v_args(inline=True)(float)

    null = lambda self, _: None
    true = lambda self, _: True
    false = lambda self, _: False


parser = Lark(json_grammar, parser="lalr", transformer=TreeToJson())


print(f"{'Range':<14} Payload")
print("-" * 90)
for match in parser.scan(example_text):
    print(f"{str(match.range):<14} {match.value!r}")
33 changes: 32 additions & 1 deletion lark/lark.py
@@ -13,7 +13,7 @@
from .tree import ParseTree
from .visitors import Transformer
from typing import Literal
from .parser_frontends import ParsingFrontend
from .parser_frontends import ParsingFrontend, ScanMatch

from .exceptions import ConfigurationError, assert_config, UnexpectedInput
from .utils import Serialize, SerializeMemoizer, FS, logger, TextOrSlice, LarkInput
@@ -736,5 +736,36 @@ def parse(self, text: LarkInput, start: Optional[str]=None, on_error: 'Optional[
raise NotImplementedError("The on_error option is only implemented for the LALR(1) parser.")
return self.parser.parse(text, start=start, on_error=on_error)

def scan(self, text: TextOrSlice, start: Optional[str]=None) -> Iterable['ScanMatch[_Return_T]']:
"""Scan the input text for non-overlapping matches of this grammar.
Only works when ``parser='lalr'`` and without ``postlex``.

Greedy parsing: Where multiple end positions are valid, the longest is returned.

Does not raise on lex or parse errors — invalid input is silently skipped.
Exceptions from user callbacks may still propagate.
When a lexer callback raises ``ValueError``, scan() will treat it as a lex error and abort scanning the match.

For best performance, ensure the first terminal(s) that can be matched by the grammar are unique
in the text and always indicate the start of a match.

A returned match will never start or end with an ignored terminal.

User ``lexer_callbacks`` must preserve source positions on returned tokens — use
``Token.update()`` rather than constructing a fresh ``Token``.

Parameters:
text (TextOrSlice): Text to be scanned, as ``str``, ``bytes``, or a ``TextSlice`` instance.
start (str, optional): Start symbol. Required if Lark was initialized with multiple start symbols.

Yields:
``ScanMatch`` instances, each with a ``range`` (a (start, end) tuple)
and a ``value`` attribute. ``value`` is a ``Tree`` by default, or
whatever the ``transformer`` returns when one was supplied.

See Also: ``Lark.parse()``
"""
return self.parser.scan(text, start=start)


###}
33 changes: 31 additions & 2 deletions lark/lexer.py
@@ -254,13 +254,15 @@ def new_borrow_pos(cls: Type[_T], type_: str, value: Any, borrow_t: 'Token') ->
        return cls(type_, value, borrow_t.start_pos, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos)

    def __reduce__(self):
        return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column))
        return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column,
                                 self.end_line, self.end_column, self.end_pos))

    def __repr__(self):
        return 'Token(%r, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.start_pos, self.line, self.column)
        return Token(self.type, self.value, self.start_pos, self.line, self.column,
                     self.end_line, self.end_column, self.end_pos)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
@@ -396,6 +398,16 @@ def fullmatch(self, text: str) -> Optional[str]:
                return m.lastgroup
        return None

    def search(self, text: TextSlice, pos: int):
        "Find the earliest match, starting at pos"
        # self._mres may hold several compiled regexes (terminals are batched to
        # respect the stdlib re group limit), so take the match that starts earliest.
        best = None
        for mre in self._mres:
            m = mre.search(text.text, pos, text.end)
            if m and (best is None or m.start() < best.start()):
                best = m
        if best is not None:
            return (best.group(0), best.lastgroup), best.start()

def _regexp_has_newline(r: str):
    r"""Expressions that may indicate newlines in a regexp:
        - newlines (\n)
@@ -485,6 +497,9 @@ class Lexer(ABC):
    def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
        return NotImplemented

    def search_start(self, text: TextSlice, start_state: Any, pos: int) -> Optional[Token]:
        raise TypeError("This lexer cannot be used for searching in text")

    def make_lexer_state(self, text: str):
        "Deprecated"
        return LexerState(TextSlice.cast_from(text))
@@ -645,6 +660,17 @@ def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
        # EOF
        raise EOFError(self)

    def search_start(self, text: TextSlice, start_state: Any, pos: int) -> Optional[Token]:
        while True:
            res = self.scanner.search(text, pos)
            if not res:
                return None
            (value, type_), actual_pos = res
            if type_ in self.ignore_types:
                # An ignored terminal can't begin a match; skip past it and keep searching
                pos = actual_pos + len(value)
                continue
            return Token(type_, value, actual_pos, end_pos=actual_pos + len(value))


class ContextualLexer(Lexer):
    lexers: Dict[int, AbstractBasicLexer]
@@ -699,4 +725,7 @@ def lex(self, lexer_state: LexerState, parser_state: 'ParserState') -> Iterator[
            except UnexpectedCharacters:
                raise e  # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set.

    def search_start(self, text: TextSlice, start_state: Any, pos: int) -> Optional[Token]:
        # Delegate to the lexer specialized for the given parse-table start state
        return self.lexers[start_state].search_start(text, start_state, pos)

###}
88 changes: 86 additions & 2 deletions lark/parser_frontends.py
@@ -1,6 +1,8 @@
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING
from copy import copy
from dataclasses import dataclass
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING, Generic, Iterable, Tuple, TypeVar

from .exceptions import ConfigurationError, GrammarError, assert_config
from .exceptions import ConfigurationError, GrammarError, LexError, UnexpectedInput, assert_config
from .utils import get_regexp_width, Serialize, TextOrSlice, TextSlice, LarkInput
from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
from .parsers import earley, xearley, cyk
@@ -14,6 +16,21 @@

###{standalone

T = TypeVar('T')

@dataclass(frozen=True)
class ScanMatch(Generic[T]):
    """A non-overlapping match found by ``Lark.scan()``.

    Attributes:
        range: A ``(start, end)`` tuple of the indices in the input text.
        value: The parse result. A ``Tree`` by default, or whatever the
            ``transformer`` returns when one was supplied to Lark.
    """
    range: Tuple[int, int]
    value: T


def _wrap_lexer(lexer_class):
    future_interface = getattr(lexer_class, '__future_interface__', 0)
    if future_interface == 2:
@@ -139,6 +156,73 @@ def parse_interactive(self, text: Optional[TextOrSlice]=None, start=None):
        stream = self._make_lexer_thread(text)
        return self.parser.parse_interactive(stream, chosen_start)

    def scan(self, text: TextOrSlice, start: Optional[str]=None) -> Iterable[ScanMatch]:
        """See ``Lark.scan``."""
        if self.parser_conf.parser_type != 'lalr':
            raise ConfigurationError("scan() requires parser='lalr'")
        if self.skip_lexer:
            raise ConfigurationError("scan() does not support lexer='dynamic'/'dynamic_complete'")
        if self.lexer_conf.postlex is not None:
            # postlex carries state across the stream (indent depth, paren nesting); mid-stream parses break it.
            raise ConfigurationError("scan() does not support postlex")
        chosen_start = self._verify_start(start)
        start_state = self.parser._parse_table.start_states[chosen_start]
        text_slice = TextSlice.cast_from(text)
        pos = text_slice.start
        while True:
            # Search for a plausible start
            first_token = self.lexer.search_start(text_slice, start_state, pos)
            if first_token is None:
                return
            assert first_token.start_pos is not None and first_token.start_pos >= text_slice.start
            assert first_token.end_pos is not None and first_token.end_pos <= text_slice.end

            # Parse without callbacks, to keep the value stack minimal and avoid expensive deepcopies.
            # Aim for the longest possible match, and save the tokens we lex for later replay.
            stunted_ip = self.parse_interactive(text_slice.start_from(first_token.start_pos), start=chosen_start)
            stunted_ip.parser_state.parse_conf = copy(stunted_ip.parser_state.parse_conf)
            stunted_ip.parser_state.parse_conf.callbacks = {}
            matched_tokens = []
            longest_match = 0  # number of tokens in the longest accepted prefix
            token_stream = stunted_ip.lexer_thread.lex(stunted_ip.parser_state)
            try:
                for token in token_stream:
                    stunted_ip.feed_token(token)
                    matched_tokens.append(token)
                    # Test if we reached a possible completed parse
                    if '$END' in stunted_ip.choices():
                        tmp_ip = stunted_ip.copy(deepcopy_values=False)
                        try:
                            tmp_ip.feed_eof(token)
                        except UnexpectedInput:
                            continue
                        longest_match = len(matched_tokens)
                        # Keep going and testing for candidates, until the parse ends or fails
            except UnexpectedInput:
                # Parse failed
                pass
            except ValueError:
                # A user lexer-callback raised an error
                pass

            if longest_match:
                # Match found! Replay the tokens with the real callbacks, and yield the result
                last = matched_tokens[longest_match - 1]
                if last.end_pos is None:
                    raise LexError(
                        f"Lexer callback for {last.type!r} did not preserve token positions; "
                        f"scan() requires source positions on every token (use Token.update() in callbacks).")
                replay_ip = self.parse_interactive(start=chosen_start)
                for t in matched_tokens[:longest_match]:
                    replay_ip.feed_token(t)
                res = replay_ip.feed_eof(last)
                yield ScanMatch((first_token.start_pos, last.end_pos), res)
                # Resume from the end of the match (no overlaps)
                pos = last.end_pos
            else:
                # No match found. Scan again from the next character
                pos = first_token.start_pos + 1


def _validate_frontend_args(parser, lexer) -> None:
    assert_config(parser, ('lalr', 'earley', 'cyk'))
3 changes: 3 additions & 0 deletions lark/utils.py
@@ -215,6 +215,9 @@ def cast_from(cls, text: 'TextOrSlice') -> 'TextSlice[AnyStr]':
    def is_complete_text(self):
        return self.start == 0 and self.end == len(self.text)

    def start_from(self, pos: int) -> 'TextSlice[AnyStr]':
        return TextSlice(self.text, pos, self.end)

    def __len__(self):
        return self.end - self.start

1 change: 1 addition & 0 deletions tests/__main__.py
@@ -13,6 +13,7 @@
from .test_tree_forest_transformer import TestTreeForestTransformer
from .test_lexer import TestLexer
from .test_python_grammar import TestPythonParser
from .test_scan import TestScan
from .test_tree_templates import * # We define __all__ to list which TestSuites to run

try: