17#ifndef SCILEX_LEXER_HPP
18#define SCILEX_LEXER_HPP
32#include <unordered_set>
36#include <real/dfa.hpp>
37#include <real/real.hpp>
134 : std::runtime_error(message),
176 std::vector<frame>& stack)
185 if (stack.size() == 1) {
186 throw lex_error(
"cannot pop the mode stack: already at the root mode", start);
191 stack.back().mode_id = r.
action->target_id;
263 explicit lexer(std::vector<rule> rules,
264 std::unordered_set<std::string> insignificant_modes = {},
265 std::unordered_set<std::string> dfa_modes = {},
268 :
rules_(std::move(rules)),
299 [[nodiscard]] std::vector<token>
tokenize(std::string_view source,
302 std::vector<token> out;
304 std::vector<frame> stack {
frame {.
mode_id = 0, .entry_pos = cursor}};
306 while (
scan_next(source, cursor, stack, next)) {
345 [[nodiscard]]
const std::string&
mode_name(std::size_t
id)
const noexcept
359 std::vector<std::string> active;
375 return std::to_string(where.
line) +
":" + std::to_string(where.
column);
399 std::vector<frame>& stack,
402 while (cursor.
offset < source.size()) {
403 const std::string_view rest {source.substr(cursor.
offset)};
404 const std::size_t mode {stack.back().mode_id};
419 while (cursor.
offset < source.size()
433 +
"' (rule never advances)", cursor);
436 const std::size_t best_idx {m.idx};
437 const std::size_t best_len {m.len};
439 advance(source, cursor, best_len);
443 if (!
rules_[best_idx].skip) {
446 out =
token {
rules_[best_idx].
kind, source.substr(start.offset, best_len), start, mode};
451 if (stack.size() > 1) {
454 +
position_label(stack.back().entry_pos) +
")", stack.back().entry_pos);
482 const real::regex& pattern {
rules_[idx].pattern};
484 if (!pattern.has_first_byte_set()) {
487 else if (
const std::optional<unsigned char>
byte {pattern.unique_first_byte()}) {
488 target.first_byte_index[*byte].push_back(idx);
491 for (
int candidate {0}; candidate < 256; ++candidate) {
492 if (pattern.may_start_with(
static_cast<unsigned char>(candidate))) {
493 target.first_byte_index[
static_cast<unsigned char>(candidate)].push_back(idx);
504 && std::all_of(d.first_byte_index.begin(), d.first_byte_index.end(),
505 [](
const std::vector<std::size_t>& bucket) { return bucket.empty(); });
514 if (!candidate.action) {
517 if (candidate.pattern.pattern().empty()) {
518 throw std::invalid_argument(
"a transition rule must consume input (empty pattern)");
522 throw std::invalid_argument(
"a transition targets the empty mode '"
523 + candidate.action->target +
"' (no rule is active in it)");
540 for (
const std::string& name : candidate.in_mode) {
543 if (
const std::optional<mode_action> action {candidate.action};
550 for (std::size_t idx {0}; idx <
rules_.size(); ++idx) {
551 if (
rules_[idx].in_mode.empty()) {
555 for (
const std::string& name :
rules_[idx].in_mode) {
567 candidate.action->target_id =
mode_id_.at(candidate.action->target);
578 if (insignificant_modes.empty()) {
582 for (
const std::string& name : insignificant_modes) {
583 const auto found {
mode_id_.find(name)};
585 throw std::invalid_argument(
"insignificant_modes names an unknown mode: " + name);
612 std::string_view rest,
613 unsigned char lead)
const
616 if (
const std::optional<real::dfa_match> matched {
per_mode_dfa_[mode]->dfa.match(rest)}) {
618 .len = matched->length};
634 unsigned char byte)
const
636 return !
per_mode_[mode].first_byte_index[byte].empty();
644 std::string_view source,
645 std::size_t offset)
const
647 const unsigned char lead {
static_cast<unsigned char>(source[offset])};
651 return munch_at(mode, source.substr(offset), lead).
have;
661 for (std::size_t i {0}; i < n; ++i) {
662 if (source[cursor.
offset] ==
'\n') {
679 const unsigned char b0 {
static_cast<unsigned char>(s[off])};
685 if ((b0 & 0xE0U) == 0xC0U) {
689 else if ((b0 & 0xF0U) == 0xE0U) {
693 else if ((b0 & 0xF8U) == 0xF0U) {
700 if (off + len > s.size()) {
703 for (std::size_t i {1}; i < len; ++i) {
704 const unsigned char bi {
static_cast<unsigned char>(s[off + i])};
705 if ((bi & 0xC0U) != 0x80U) {
708 cp = (cp << 6U) | (bi & 0x3FU);
710 static constexpr unsigned int min_for_len[5] {0, 0, 0x80U, 0x800U, 0x10000U};
711 if (cp < min_for_len[len] || (cp >= 0xD800U && cp <= 0xDFFFU) || cp > 0x10FFFFU) {
730 const unsigned char byte {
static_cast<unsigned char>(source[off])};
731 if ((
byte & 0xC0U) == 0x80U) {
735 for (std::size_t back {1}; back <= 3 && back <= off; ++back) {
754 std::string_view rest,
755 unsigned char lead)
const
757 std::size_t best_len {0};
758 std::size_t best_idx {0};
760 const auto consider {[&](std::size_t idx) {
768 const auto matched {
rules_[idx].pattern.match(rest)};
774 && (!have || matched.end() > best_len
775 || (matched.end() == best_len && idx < best_idx))) {
776 best_len = matched.end();
782 for (
const std::size_t idx : active.first_byte_index[lead]) {
785 for (
const std::size_t idx : active.general) {
788 return {.have = have, .idx = best_idx, .len = best_len};
794 std::size_t mode)
const
796 const std::vector<std::string>& modes {
rules_[idx].in_mode};
800 for (
const std::string& name : modes) {
812 std::vector<std::string>
audit_probes(
const std::vector<std::size_t>& to_global)
const
814 std::array<bool, 256> seen {};
815 std::vector<unsigned char> alpha;
816 const auto add {[&](
unsigned char b) {
822 for (
const std::size_t g : to_global) {
823 for (
int b {0}; b < 256; ++b) {
824 if (
rules_[g].pattern.may_start_with(
static_cast<unsigned char>(b))) {
825 add(
static_cast<unsigned char>(b));
829 for (
const char structural : std::string_view {
" \t\n\"'/*-+=<>()[]{};.:,aAz09_"}) {
830 add(
static_cast<unsigned char>(structural));
837 std::vector<std::string> probes;
838 for (
const unsigned char b : alpha) {
839 for (
const std::size_t n : std::array<std::size_t, 5> {1, 2, 3, 6, 8}) {
840 probes.emplace_back(n,
static_cast<char>(b));
847 std::mt19937 rng {0x5C11EFU};
848 std::uniform_int_distribution<std::size_t> len_d {1, 48};
849 std::uniform_int_distribution<std::size_t> sym_d {0, alpha.size() - 1};
850 for (
int batch {0}; batch < 256; ++batch) {
852 const std::size_t len {len_d(rng)};
853 for (std::size_t i {0}; i < len; ++i) {
854 input.push_back(
static_cast<char>(alpha[sym_d(rng)]));
856 probes.push_back(std::move(input));
865 const std::vector<std::size_t>& to_global,
866 std::size_t mode)
const
868 const std::vector<std::string> probes {
audit_probes(to_global)};
869 for (
const std::string& probe : probes) {
870 const std::string_view rest {probe};
871 const std::optional<real::dfa_match> hit {candidate.match(rest)};
873 std::size_t dfa_idx {0};
874 std::size_t dfa_len {0};
875 if (hit.has_value()) {
876 dfa_idx = to_global[hit->rule_index];
877 dfa_len = hit->length;
881 if (std::tuple {hit.has_value(), dfa_idx, dfa_len} != std::tuple {pike.have, pike.idx, pike.len}) {
900 std::vector<real::detail::program_view> programs;
901 programs.reserve(to_global.size());
902 for (
const std::size_t g : to_global) {
903 programs.push_back(
rules_[g].pattern.raw_program());
906 real::dfa candidate {std::span<const real::detail::program_view>(programs)};
910 return mode_dfa {.
dfa = std::move(candidate), .to_global = std::move(to_global)};
912 catch (
const real::dfa_error&) {
928 for (
const std::string& name : dfa_modes) {
929 const auto found {
mode_id_.find(name)};
931 throw std::invalid_argument(
"dfa_modes names an unknown mode: " + name);
933 const std::size_t mode {found->second};
934 std::vector<std::size_t> to_global;
935 for (std::size_t idx {0}; idx <
rules_.size(); ++idx) {
937 to_global.push_back(idx);
941 per_mode_dfa_[mode] = std::make_shared<const mode_dfa>(std::move(*built));
990 std::string_view source,
1042 return !(*
this == other);
1091 std::string_view source,
Thrown when no rule matches at a position (a lexical error).
lex_error(const std::string &message, position where)
Builds the error.
position where() const noexcept
Returns the position of the unmatched byte.
position where_
Where tokenization failed.
A lexer built from an ordered list of rules.
void validate_transitions() const
Fail-fast transition checks: a transition rule must consume input, and a push/set target must be a de...
void add_to_mode(std::size_t m, std::size_t idx)
Adds rule idx to mode m's dispatch via REAL's exact first-byte API — the same 3-way split (nullable →...
std::vector< dispatch > per_mode_
Dispatch index, one per mode (by id).
column_unit columns() const noexcept
The unit this lexer counts position::column in (positions do not carry it, so a consumer that needs t...
void build_dispatch()
Builds the per-mode first-byte dispatch from rules_ (once, at construction). "default" is mode 0; eve...
scilex::column_unit columns_
The unit position::column is counted in.
std::vector< std::string > audit_probes(const std::vector< std::size_t > &to_global) const
The bounded, deterministic probe inputs for the audit: every active rule's possible first bytes ∪ str...
std::vector< rule > rules_
The ordered token rules.
std::vector< bool > mode_significant_
Layout policy (empty = all significant).
std::optional< mode_dfa > try_build_mode_dfa(std::vector< std::size_t > to_global, std::size_t mode)
Builds the mode_dfa for one mode, or std::nullopt if the mode cannot take the DFA fast path....
static std::size_t valid_utf8_len(std::string_view s, std::size_t off)
The length (1–4) of a valid UTF-8 codepoint starting at off in s, or 0 when the byte there is not a v...
bool mode_is_empty(std::size_t m) const
Whether mode m has no active rule (so nothing can match in it).
bool starts_a_match(std::size_t mode, std::string_view source, std::size_t offset) const
Does a rule in mode match at offset in source? The error-recovery loop's stop test — the smallest suc...
munch_result munch_at(std::size_t mode, std::string_view rest, unsigned char lead) const
The winning munch in mode at the start of rest (lead is rest's first byte), dispatching to the mode's...
const std::vector< bool > & mode_significant() const noexcept
The per-mode-id layout-significance policy (see scilex::layout). Index by a token's mode_id; false ma...
const std::string & mode_name(std::size_t id) const noexcept
The name of mode id (0 is "default"), for labelling tokens.
static std::string position_label(position where)
Formats a position as "line:column" for diagnostics.
bool audit_passes(const real::dfa &candidate, const std::vector< std::size_t > &to_global, std::size_t mode) const
The candidate DFA must reproduce the Pike munch on every probe: catches divergences the bytecode cann...
void build_significance(const std::unordered_set< std::string > &insignificant_modes)
Builds the layout-significance policy from the insignificant-mode names (validated against the intern...
std::size_t intern_mode(const std::string &name)
Interns a mode name to its id, assigning the next id on first sight.
bool rule_active_in_mode(std::size_t idx, std::size_t mode) const
Whether rule idx is active in mode mode (mirrors build_dispatch, an empty in_mode is the default mode...
bool scan_next(std::string_view source, position &cursor, std::vector< frame > &stack, token &out) const
Advances cursor to and past the next non-skipped token in the active mode, applying the winning rule'...
std::map< std::string, std::size_t > mode_id_
Mode name -> id.
std::vector< token > tokenize(std::string_view source, eof_policy policy=eof_policy::omit) const
Tokenizes source into the sequence of non-skipped tokens.
token_range scan(std::string_view source, eof_policy policy=eof_policy::omit) const &&=delete
Deleted: the range would point into a temporary lexer.
munch_result pike_munch_in_mode(std::size_t mode, std::string_view rest, unsigned char lead) const
The per-rule Pike + first-byte-dispatch munch in mode at the start of rest (lead is rest's first byte...
static std::size_t column_step(std::string_view source, std::size_t off, scilex::column_unit unit)
How much the column advances when the byte at off in source is consumed, under unit....
void advance(std::string_view source, position &cursor, std::size_t n) const
Advances cursor by n bytes of source, maintaining the 1-based line/column tracker (a newline resets t...
std::vector< std::shared_ptr< const mode_dfa > > per_mode_dfa_
Per-mode DFA fast path (nullptr = Pike).
std::vector< std::string > dfa_modes_active() const
The modes actually accelerated by a DFA fast path.
lexer(std::vector< rule > rules, std::unordered_set< std::string > insignificant_modes={}, std::unordered_set< std::string > dfa_modes={}, error_policy errors=error_policy::raise, column_unit columns=column_unit::bytes)
Builds a lexer from rules (taken by value, then moved in).
error_policy errors_
What to do at an unmatched byte.
void build_dfa_modes(const std::unordered_set< std::string > &dfa_modes)
Opts the named dfa_modes into the DFA fast path (called once, after build_dispatch)....
token_range scan(std::string_view source, eof_policy policy=eof_policy::omit) const &
Returns a lazy range over the non-skipped tokens of source.
std::vector< std::string > mode_names_
Mode id -> name ("default" is id 0).
bool may_start(std::size_t mode, unsigned char byte) const
O(1) pre-filter for error recovery: can a fixed-lead rule in mode begin with byte?...
Forward (single-pass) iterator yielding one token at a time.
token_iterator(const lexer &owner, std::string_view source, eof_policy policy)
Constructs a begin iterator over source for owner.
std::ptrdiff_t difference_type
Required typedef.
void operator++(int)
Post-increment (single-pass: no useful copy is returned).
std::vector< frame > stack_
Mode stack (top = active).
reference operator*() const
Returns the current token.
token current_
The current token.
bool operator==(const token_iterator &other) const
Equality: both exhausted, or both at the same offset.
pointer operator->() const
Member access to the current token.
position cursor_
Current scan position.
bool done_
True once exhausted (end sentinel).
const lexer * owner_
Rules provider (not owned).
std::string_view source_
Text being scanned.
token_iterator & operator++()
Advances to the next token.
token_iterator()=default
Constructs the end sentinel.
void advance()
Produces the next token, or marks the iterator exhausted.
bool operator!=(const token_iterator &other) const
Inequality.
std::input_iterator_tag iterator_category
Single-pass.
bool eof_done_
End-of-input token already yielded.
eof_policy policy_
End-of-input policy.
A lazy range of tokens, returned by lexer::scan.
token_iterator end() const
End sentinel.
std::string_view source_
Text being scanned.
token_iterator begin() const
Begin iterator (produces the first token).
const lexer * owner_
Rules provider (not owned).
token_range(const lexer &owner, std::string_view source, eof_policy policy)
Builds the range.
eof_policy policy_
End-of-input policy.
The SciLex public API (scilex::lexer, scilex::rule, scilex::token).
void apply_transition(const rule &r, position start, std::vector< frame > &stack)
Applies rule r's mode transition (if any) to stack — the per-scan mode-stack mutation,...
constexpr int error
Reserved token kind for a lexical-error run under scilex::error_policy::token.
column_unit
The unit a token's position::column is counted in.
@ codepoints
One column per Unicode scalar value (a valid UTF-8 codepoint).
@ bytes
One column per byte (the default; column == byte offset within the line + 1).
@ utf16
One column per UTF-16 code unit (BMP = 1, astral = 2) — the LSP unit.
eof_policy
Whether tokenization appends a synthetic end-of-input token.
@ append
Append one end_of_input token at the end position.
@ omit
Stop at the last real token (default).
constexpr int end_of_input
Reserved token kind for the synthetic end-of-input token.
error_policy
What a lexer does when it reaches a byte that no rule in the active mode can begin.
One entry on the per-scan mode stack: the active mode and where it was entered (the entry position fe...
std::size_t mode_id
Id of the active mode.
position entry_pos
Where this mode was entered.
Per-mode dispatch index: the first-byte buckets scoped to one mode.
std::vector< std::size_t > general
Nullable rules (tried everywhere).
std::array< std::vector< std::size_t >, 256 > first_byte_index
Rule indices by leading byte.
An adopted per-mode DFA: the automaton plus its local→global rule map.
std::vector< std::size_t > to_global
DFA local rule index -> global rules_ index.
real::dfa dfa
Recognizes the mode's rules in one pass.
A munch decision: whether a rule matched, which (global index), how many bytes — the small value scan...
A mode transition, fired when its rule wins, acting on the scan's mode stack: enter a nested mode,...
std::string target
The mode push/set enters; ignored (and omittable) for pop.
op operation
Which transition to perform.
std::size_t target_id
The interned id of target, resolved once when the lexer is built (see scilex::lexer::build_dispatch) ...
op
The kind of transition.
@ push
Enter target, remembering the mode below it (a nested context).
@ pop
Leave the current mode, returning to the one beneath it.
@ set
Replace the current mode with target (stack depth unchanged).
A location in the source text.
std::size_t offset
0-based byte offset from the start of the source.
std::size_t column
1-based byte column within the line.
std::size_t line
1-based line number.
A token rule: a kind, the pattern that recognizes it, whether matches are discarded (whitespace,...
int kind
Kind assigned to tokens this rule produces.
std::optional< mode_action > action
Mode transition fired when this rule wins.
bool skip
If true, matches are consumed but not emitted.
real::regex pattern
The recognizer (a linear-time REAL regex; its flags are the author's — see above).
std::vector< std::string > in_mode
Modes this rule is active in; empty ⇒ {"default"}.
One lexical token: a typed slice of the source.
int kind
Caller-defined token kind (e.g. an enum value).
The token produced by the lexer and its source position.