Scans a given text and returns tokens based on the provided configuration.
Example
from lectes import Rule, Configuration, Regex, Scanner
config = Configuration(
[
Rule(name="FOR", regex=Regex("for")),
Rule(name="INT", regex=Regex("[1-9]+")),
Rule(name="ID", regex=Regex("[a-zA-Z][a-zA-Z0-9]*")),
Rule(name="WHITESPACE", regex=Regex("( )")),
]
)
scanner = Scanner(config)
program = "somevar in othervar for 9 let"
for token in scanner.scan(program):
print(token)
Source code in src/lectes/scanner/scanner.py
| class Scanner:
"""
Scans a given text and returns tokens based on the provided configuration.
## Example
```python
from lectes import Rule, Configuration, Regex, Scanner
config = Configuration(
[
Rule(name="FOR", regex=Regex("for")),
Rule(name="INT", regex=Regex("[1-9]+")),
Rule(name="ID", regex=Regex("[a-zA-Z][a-zA-Z0-9]*")),
Rule(name="WHITESPACE", regex=Regex("( )")),
]
)
scanner = Scanner(config)
program = "somevar in othervar for 9 let"
for token in scanner.scan(program):
print(token)
```
"""
def __init__(self, configuration: Configuration, debug: bool = False) -> None:
self.configuration = configuration
self.set_text("")
self._unmatched_handler = self._handle_unmatched
self._matched_handlers = {
rule: self._handle_matched for rule in configuration.rules
}
self._debug = debug
self._logger = None
self._match = None
self._matched_rule = None
def scan(self, text: str) -> Generator[Token]:
"""
Scan the given text and yield tokens as they are recognized.
"""
if len(text) == 0:
return
self.set_text(text)
for character in text:
self.logger().debug(f"character: '{character}'")
current_string = self.current_string()
self.logger().debug(f"current_string: '{current_string}'")
lookahead_string = self.lookahead_string()
self.logger().debug(f"lookahead_string: '{lookahead_string}'")
for rule in self.configuration.rules:
if not self._is_last_char() and rule.regex.fullmatch(lookahead_string):
self.logger().debug(
f"rule {rule.name} fullmatched lookahead_string: '{lookahead_string}'"
)
break
if match := rule.regex.search(current_string):
self.logger().debug(
f"rule {rule.name} matched current_string: '{current_string}'"
)
self._update_matched_state(rule, match)
if self._match is not None:
if self._match.unmatched is not None:
self._unmatched_handler(self._match.unmatched)
if self._matched_rule is not None:
result = self._matched_handlers[self._matched_rule](
self._match.string, self._matched_rule
)
if result is not None:
yield result
self.last_position = self.current_position
self.current_position += 1
self._reset_matched_state()
def set_unmatched_handler(self, handler: Callable[[str], None]) -> None:
"""
Set the given function as the handler that executes when a string is not
matched to a configured rule.
The handler receives the string as argument and does returns None.
"""
self._unmatched_handler = handler
def set_handler(self, rule: Rule, handler: Callable[[str, Rule], Any]) -> None:
"""
Set the given function as the handler that executes when a string is matched
against rule.
The handler should receive the matched string literal and the rule as arguments.
"""
self._matched_handlers[rule] = handler
def set_text(self, text: str) -> None:
"""
Set the text to scan.
"""
self.text = text
self.current_position = 1
self.last_position = 0
def current_string(self) -> str:
"""
Return the string that the scanner is currently reading; that is,
the characters from the last matched string up to the character that
the scanner is currently reading.
"""
return self.text[self.last_position : self.current_position]
def lookahead_string(self) -> str:
"""
Return the string that the scanner is currently reading plus one character.
"""
return self.text[self.last_position : self.current_position + 1]
def logger(self) -> Logger:
"""
Return the scanner's logger instance.
"""
if self._logger is None:
self._logger = self._build_logger()
return self._logger
def _build_logger(self) -> Logger:
logger = Logger()
if self._debug:
logger.set_level(LogLevel.DEBUG)
return logger
def _is_last_char(self) -> bool:
return self.current_position == len(self.text)
def _update_matched_state(self, rule: Rule, match: Match) -> None:
if self._match is None or len(match) > len(self._match):
self.logger().debug(f"updating match from {self._match} to {match.string}")
self._matched_rule = rule
self._match = match
def _reset_matched_state(self) -> None:
self._match = None
self._matched_rule = None
@staticmethod
def _handle_unmatched(unmatched: str) -> None:
print(f"unmatched: {unmatched}")
@staticmethod
def _handle_matched(matched: str, rule: Rule) -> Token:
return Token(rule=rule, literal=matched)
|
current_string()
Return the string that the scanner is currently reading; that is,
the characters from the last matched string up to the character that
the scanner is currently reading.
Source code in src/lectes/scanner/scanner.py
| def current_string(self) -> str:
"""
Return the string that the scanner is currently reading; that is,
the characters from the last matched string up to the character that
the scanner is currently reading.
"""
return self.text[self.last_position : self.current_position]
|
logger()
Return the scanner's logger instance.
Source code in src/lectes/scanner/scanner.py
| def logger(self) -> Logger:
"""
Return the scanner's logger instance.
"""
if self._logger is None:
self._logger = self._build_logger()
return self._logger
|
lookahead_string()
Return the string that the scanner is currently reading plus one character.
Source code in src/lectes/scanner/scanner.py
| def lookahead_string(self) -> str:
"""
Return the string that the scanner is currently reading plus one character.
"""
return self.text[self.last_position : self.current_position + 1]
|
scan(text)
Scan the given text and yield tokens as they are recognized.
Source code in src/lectes/scanner/scanner.py
| def scan(self, text: str) -> Generator[Token]:
"""
Scan the given text and yield tokens as they are recognized.
"""
if len(text) == 0:
return
self.set_text(text)
for character in text:
self.logger().debug(f"character: '{character}'")
current_string = self.current_string()
self.logger().debug(f"current_string: '{current_string}'")
lookahead_string = self.lookahead_string()
self.logger().debug(f"lookahead_string: '{lookahead_string}'")
for rule in self.configuration.rules:
if not self._is_last_char() and rule.regex.fullmatch(lookahead_string):
self.logger().debug(
f"rule {rule.name} fullmatched lookahead_string: '{lookahead_string}'"
)
break
if match := rule.regex.search(current_string):
self.logger().debug(
f"rule {rule.name} matched current_string: '{current_string}'"
)
self._update_matched_state(rule, match)
if self._match is not None:
if self._match.unmatched is not None:
self._unmatched_handler(self._match.unmatched)
if self._matched_rule is not None:
result = self._matched_handlers[self._matched_rule](
self._match.string, self._matched_rule
)
if result is not None:
yield result
self.last_position = self.current_position
self.current_position += 1
self._reset_matched_state()
|
set_handler(rule, handler)
Set the given function as the handler that executes when a string is matched
against rule.
The handler should receive the matched string literal and the rule as arguments.
Source code in src/lectes/scanner/scanner.py
| def set_handler(self, rule: Rule, handler: Callable[[str, Rule], Any]) -> None:
"""
Set the given function as the handler that executes when a string is matched
against rule.
The handler should receive the matched string literal and the rule as arguments.
"""
self._matched_handlers[rule] = handler
|
set_text(text)
Set the text to scan.
Source code in src/lectes/scanner/scanner.py
| def set_text(self, text: str) -> None:
"""
Set the text to scan.
"""
self.text = text
self.current_position = 1
self.last_position = 0
|
set_unmatched_handler(handler)
Set the given function as the handler that executes when a string is not
matched to a configured rule.
The handler receives the string as argument and does returns None.
Source code in src/lectes/scanner/scanner.py
| def set_unmatched_handler(self, handler: Callable[[str], None]) -> None:
"""
Set the given function as the handler that executes when a string is not
matched to a configured rule.
The handler receives the string as argument and does returns None.
"""
self._unmatched_handler = handler
|