SciLex
A header-only C++20 lexer built on REAL
Loading...
Searching...
No Matches
lexer.hpp
Go to the documentation of this file.
1
17#ifndef SCILEX_LEXER_HPP
18#define SCILEX_LEXER_HPP
19
20#include <algorithm>
21#include <array>
22#include <iterator>
23#include <map>
24#include <memory>
25#include <optional>
26#include <random>
27#include <span>
28#include <stdexcept>
29#include <string>
30#include <string_view>
31#include <tuple>
32#include <unordered_set>
33#include <utility>
34#include <vector>
35
36#include <real/dfa.hpp>
37#include <real/real.hpp>
38
39#include "token.hpp"
40
41namespace scilex {
42
43 class token_iterator;
44 class token_range;
45
53 enum class eof_policy
54 {
55 omit,
56 append,
57 };
58
64 {
66 enum class op
67 {
68 push,
69 pop,
70 set,
71 };
72
74 std::string target {};
75
80 std::size_t target_id {0};
81 };
82
109 struct rule
110 {
111 int kind;
112 real::regex pattern;
113 bool skip {false};
114 std::vector<std::string> in_mode {};
115 std::optional<mode_action> action {};
116 };
117
123 class lex_error : public std::runtime_error
124 {
125 public:
126
132 lex_error(const std::string& message,
134 : std::runtime_error(message),
136 {}
137
141 [[nodiscard]] position where() const noexcept
142 {
143 return where_;
144 }
145
146 private:
147
149 };
150
155 struct frame
156 {
157 std::size_t mode_id;
159 };
160
174 inline void apply_transition(const rule& r,
175 position start,
176 std::vector<frame>& stack)
177 {
178 if (!r.action) {
179 return;
180 }
181 if (r.action->operation == mode_action::op::push) {
182 stack.push_back(frame {.mode_id = r.action->target_id, .entry_pos = start});
183 }
184 else if (r.action->operation == mode_action::op::pop) {
185 if (stack.size() == 1) {
186 throw lex_error("cannot pop the mode stack: already at the root mode", start);
187 }
188 stack.pop_back();
189 }
190 else { // set: replace the active mode in place (depth unchanged)
191 stack.back().mode_id = r.action->target_id;
192 }
193 }
194
200 enum class error_policy
201 {
202 raise,
207 token,
208 };
209
221 enum class column_unit
222 {
223 bytes,
224 codepoints,
225 utf16,
226 };
227
235 class lexer
236 {
237 public:
238
263 explicit lexer(std::vector<rule> rules,
264 std::unordered_set<std::string> insignificant_modes = {},
265 std::unordered_set<std::string> dfa_modes = {},
268 : rules_(std::move(rules)),
269 errors_(errors),
271 {
273 build_significance(insignificant_modes);
274 build_dfa_modes(dfa_modes);
275 }
276
279 [[nodiscard]] column_unit columns() const noexcept
280 {
281 return columns_;
282 }
283
299 [[nodiscard]] std::vector<token> tokenize(std::string_view source,
300 eof_policy policy = eof_policy::omit) const
301 {
302 std::vector<token> out;
303 position cursor {0, 1, 1};
304 std::vector<frame> stack {frame {.mode_id = 0, .entry_pos = cursor}}; // start in "default"
305 token next {};
306 while (scan_next(source, cursor, stack, next)) {
307 out.push_back(next);
308 }
309 if (policy == eof_policy::append) {
310 // The cursor now sits at the end position (past any trailing trivia).
311 out.push_back(token {end_of_input, source.substr(cursor.offset), cursor});
312 }
313 return out;
314 }
315
329 [[nodiscard]] token_range scan(std::string_view source,
330 eof_policy policy = eof_policy::omit) const&;
331
333 token_range scan(std::string_view source,
334 eof_policy policy = eof_policy::omit) const&& = delete;
335
339 [[nodiscard]] const std::vector<bool>& mode_significant() const noexcept
340 {
341 return mode_significant_;
342 }
343
345 [[nodiscard]] const std::string& mode_name(std::size_t id) const noexcept
346 {
347 return mode_names_[id];
348 }
349
357 [[nodiscard]] std::vector<std::string> dfa_modes_active() const
358 {
359 std::vector<std::string> active;
360 for (std::size_t m {0}; m < per_mode_dfa_.size(); ++m) {
361 if (per_mode_dfa_[m]) {
362 active.push_back(mode_names_[m]);
363 }
364 }
365 return active;
366 }
367
368 private:
369
370 friend class token_iterator;
371
373 static std::string position_label(position where)
374 {
375 return std::to_string(where.line) + ":" + std::to_string(where.column);
376 }
377
397 bool scan_next(std::string_view source,
398 position& cursor,
399 std::vector<frame>& stack,
400 token& out) const
401 {
402 while (cursor.offset < source.size()) {
403 const std::string_view rest {source.substr(cursor.offset)};
404 const std::size_t mode {stack.back().mode_id};
405 const munch_result m {munch_at(mode, rest, static_cast<unsigned char>(source[cursor.offset]))};
406
407 if (!m.have) {
409 throw lex_error("no rule matches in mode '" + mode_names_[mode] + "' (entered at "
410 + position_label(stack.back().entry_pos) + ")", cursor); // #1
411 }
412 // Recovery (error_policy::token): accumulate the maximal run of bytes that no rule in this
413 // mode can begin into ONE reserved-kind error token, then resume — no throw, no transition
414 // (the run stays in its mode). The run ends at the first position where a rule matches (>0);
415 // may_start is an O(1) first-byte pre-filter that skips the bulk of the noise without a full
416 // match attempt. The lexeme is the exact offending bytes.
417 const position err_start {cursor};
418 advance(source, cursor, 1); // the byte at err_start is unmatched by definition
419 while (cursor.offset < source.size()
420 && !starts_a_match(mode, source, cursor.offset)) {
421 advance(source, cursor, 1);
422 }
423 out = token {scilex::error, source.substr(err_start.offset, cursor.offset - err_start.offset),
424 err_start, mode};
425 return true;
426 }
427 if (m.len == 0) {
428 // A rule won with a zero-length match (a nullable rule and no longer match at this
429 // position). Advancing by 0 would spin forever, so report it as a lexical error — fatal
430 // under either policy (recovery cannot make progress here). The shared advance point, so
431 // both the Pike and DFA paths are covered.
432 throw lex_error("zero-length match in mode '" + mode_names_[mode]
433 + "' (rule never advances)", cursor); // #4
434 }
435
436 const std::size_t best_idx {m.idx};
437 const std::size_t best_len {m.len};
438 const position start {cursor};
439 advance(source, cursor, best_len);
440
441 apply_transition(rules_[best_idx], start, stack); // advances, then transitions (#2 on a bad pop)
442
443 if (!rules_[best_idx].skip) {
444 // Tag the token with the mode it was lexed in (captured before the
445 // transition above) — Layout Awareness reads it; the scan is untouched.
446 out = token {rules_[best_idx].kind, source.substr(start.offset, best_len), start, mode};
447 return true;
448 }
449 // Skip rule: keep scanning for the next emitted token (possibly in a new mode).
450 }
451 if (stack.size() > 1) {
453 throw lex_error("unterminated mode '" + mode_names_[stack.back().mode_id] + "' (entered at "
454 + position_label(stack.back().entry_pos) + ")", stack.back().entry_pos); // #3
455 }
456 // Recovery (error_policy::token): a mode was still pushed at end of input. Emit one zero-width
457 // error token positioned at the EOF (the partial tokens already emitted stay), then unwind to
458 // the root so the next call reports a clean end of input.
459 out = token {scilex::error, source.substr(cursor.offset, 0), cursor, stack.back().mode_id};
460 stack.resize(1);
461 return true;
462 }
463 return false;
464 }
465
467 std::size_t intern_mode(const std::string& name)
468 {
469 const auto [it, inserted] {mode_id_.emplace(name, mode_names_.size())};
470 if (inserted) {
471 mode_names_.push_back(name);
472 }
473 return it->second;
474 }
475
479 void add_to_mode(std::size_t m,
480 std::size_t idx)
481 {
482 const real::regex& pattern {rules_[idx].pattern};
483 dispatch& target {per_mode_[m]};
484 if (!pattern.has_first_byte_set()) {
485 target.general.push_back(idx);
486 }
487 else if (const std::optional<unsigned char> byte {pattern.unique_first_byte()}) {
488 target.first_byte_index[*byte].push_back(idx);
489 }
490 else {
491 for (int candidate {0}; candidate < 256; ++candidate) {
492 if (pattern.may_start_with(static_cast<unsigned char>(candidate))) {
493 target.first_byte_index[static_cast<unsigned char>(candidate)].push_back(idx);
494 }
495 }
496 }
497 }
498
500 [[nodiscard]] bool mode_is_empty(std::size_t m) const
501 {
502 const dispatch& d {per_mode_[m]};
503 return d.general.empty()
504 && std::all_of(d.first_byte_index.begin(), d.first_byte_index.end(),
505 [](const std::vector<std::size_t>& bucket) { return bucket.empty(); });
506 }
507
512 {
513 for (const rule& candidate : rules_) {
514 if (!candidate.action) {
515 continue;
516 }
517 if (candidate.pattern.pattern().empty()) {
518 throw std::invalid_argument("a transition rule must consume input (empty pattern)");
519 }
520 if (candidate.action->operation != mode_action::op::pop
521 && mode_is_empty(mode_id_.at(candidate.action->target))) {
522 throw std::invalid_argument("a transition targets the empty mode '"
523 + candidate.action->target + "' (no rule is active in it)");
524 }
525 }
526 }
527
537 {
538 intern_mode("default"); // mode 0, always present
539 for (const rule& candidate : rules_) {
540 for (const std::string& name : candidate.in_mode) {
541 intern_mode(name);
542 }
543 if (const std::optional<mode_action> action {candidate.action};
544 action.has_value() && action->operation != mode_action::op::pop) {
545 intern_mode(action->target);
546 }
547 }
548 per_mode_.resize(mode_names_.size());
549
550 for (std::size_t idx {0}; idx < rules_.size(); ++idx) {
551 if (rules_[idx].in_mode.empty()) {
552 add_to_mode(0, idx); // an undeclared rule is active in "default" only
553 }
554 else {
555 for (const std::string& name : rules_[idx].in_mode) {
556 add_to_mode(mode_id_.at(name), idx);
557 }
558 }
559 }
561
562 // Pre-resolve each transition's target mode id once, now that every mode is
563 // interned and validated, so the per-token apply_transition reads a field instead
564 // of a name→id map lookup. The target string stays for diagnostics; pop has none.
565 for (rule& candidate : rules_) {
566 if (candidate.action && candidate.action->operation != mode_action::op::pop) {
567 candidate.action->target_id = mode_id_.at(candidate.action->target);
568 }
569 }
570 }
571
576 void build_significance(const std::unordered_set<std::string>& insignificant_modes)
577 {
578 if (insignificant_modes.empty()) {
579 return;
580 }
581 mode_significant_.assign(mode_names_.size(), true);
582 for (const std::string& name : insignificant_modes) {
583 const auto found {mode_id_.find(name)};
584 if (found == mode_id_.end()) {
585 throw std::invalid_argument("insignificant_modes names an unknown mode: " + name);
586 }
587 mode_significant_[found->second] = false;
588 }
589 }
590
592 struct mode_dfa
593 {
594 real::dfa dfa;
595 std::vector<std::size_t> to_global;
596 };
597
601 {
602 bool have {false};
603 std::size_t idx {0};
604 std::size_t len {0};
605 };
606
611 munch_result munch_at(std::size_t mode,
612 std::string_view rest,
613 unsigned char lead) const
614 {
615 if (per_mode_dfa_[mode]) {
616 if (const std::optional<real::dfa_match> matched {per_mode_dfa_[mode]->dfa.match(rest)}) {
617 return munch_result {.have = true, .idx = per_mode_dfa_[mode]->to_global[matched->rule_index],
618 .len = matched->length};
619 }
620 return munch_result {};
621 }
622 return pike_munch_in_mode(mode, rest, lead);
623 }
624
633 bool may_start(std::size_t mode,
634 unsigned char byte) const
635 {
636 return !per_mode_[mode].first_byte_index[byte].empty();
637 }
638
643 bool starts_a_match(std::size_t mode,
644 std::string_view source,
645 std::size_t offset) const
646 {
647 const unsigned char lead {static_cast<unsigned char>(source[offset])};
648 if (!may_start(mode, lead)) {
649 return false;
650 }
651 return munch_at(mode, source.substr(offset), lead).have;
652 }
653
657 void advance(std::string_view source,
658 position& cursor,
659 std::size_t n) const
660 {
661 for (std::size_t i {0}; i < n; ++i) {
662 if (source[cursor.offset] == '\n') {
663 ++cursor.line;
664 cursor.column = 1;
665 }
666 else {
667 cursor.column += column_step(source, cursor.offset, columns_);
668 }
669 ++cursor.offset;
670 }
671 }
672
676 static std::size_t valid_utf8_len(std::string_view s,
677 std::size_t off)
678 {
679 const unsigned char b0 {static_cast<unsigned char>(s[off])};
680 std::size_t len {0};
681 unsigned int cp {0};
682 if (b0 < 0x80U) {
683 return 1; // ASCII
684 }
685 if ((b0 & 0xE0U) == 0xC0U) {
686 len = 2;
687 cp = b0 & 0x1FU;
688 }
689 else if ((b0 & 0xF0U) == 0xE0U) {
690 len = 3;
691 cp = b0 & 0x0FU;
692 }
693 else if ((b0 & 0xF8U) == 0xF0U) {
694 len = 4;
695 cp = b0 & 0x07U;
696 }
697 else {
698 return 0; // a continuation byte (0x80–0xBF) or an invalid lead (0xF8–0xFF)
699 }
700 if (off + len > s.size()) {
701 return 0; // truncated
702 }
703 for (std::size_t i {1}; i < len; ++i) {
704 const unsigned char bi {static_cast<unsigned char>(s[off + i])};
705 if ((bi & 0xC0U) != 0x80U) {
706 return 0; // a missing continuation
707 }
708 cp = (cp << 6U) | (bi & 0x3FU);
709 }
710 static constexpr unsigned int min_for_len[5] {0, 0, 0x80U, 0x800U, 0x10000U};
711 if (cp < min_for_len[len] || (cp >= 0xD800U && cp <= 0xDFFFU) || cp > 0x10FFFFU) {
712 return 0; // overlong, a UTF-16 surrogate, or beyond U+10FFFF
713 }
714 return len;
715 }
716
723 static std::size_t column_step(std::string_view source,
724 std::size_t off,
726 {
727 if (unit == scilex::column_unit::bytes) {
728 return 1;
729 }
730 const unsigned char byte {static_cast<unsigned char>(source[off])};
731 if ((byte & 0xC0U) == 0x80U) { // a continuation byte
732 // Score 0 only if it belongs to a valid codepoint whose lead is 1–3 bytes back; an orphan
733 // continuation is malformed and scores 1. (A codepoint never spans a newline, so this
734 // fixed look-back cannot cross a line boundary in a way that matters.)
735 for (std::size_t back {1}; back <= 3 && back <= off; ++back) {
736 if (valid_utf8_len(source, off - back) > back) {
737 return 0;
738 }
739 }
740 return 1;
741 }
742 if (unit == scilex::column_unit::utf16) {
743 return valid_utf8_len(source, off) == 4 ? 2 : 1; // an astral codepoint is a surrogate pair
744 }
745 return 1; // codepoints: an ASCII byte or a lead (its continuations already scored 0)
746 }
747
754 std::string_view rest,
755 unsigned char lead) const
756 {
757 std::size_t best_len {0};
758 std::size_t best_idx {0};
759 bool have {false};
760 const auto consider {[&](std::size_t idx) {
761 // idx comes from this mode's first-byte dispatch, populated
762 // in build_dispatch() from rules_ indices, so it is always in
763 // range. The analyzer cannot prove that cross-vector invariant
764 // once this munch is a standalone shared method; a bounds guard
765 // would be an unreachable branch the 100%-4D gate rejects, so the
766 // proven false positive is suppressed here (see REPORT note).
767 // NOLINTNEXTLINE(clang-analyzer-core.NonNullParamChecker)
768 const auto matched {rules_[idx].pattern.match(rest)};
769 // A zero-length match participates (it can only win when no rule
770 // matches >0 here); the shared guard in scan_next turns that win
771 // into a lexical error rather than a stalled scan. Maximal munch
772 // still prefers any longer non-empty match.
773 if (matched
774 && (!have || matched.end() > best_len
775 || (matched.end() == best_len && idx < best_idx))) {
776 best_len = matched.end();
777 best_idx = idx;
778 have = true;
779 }
780 }};
781 const dispatch& active {per_mode_[mode]};
782 for (const std::size_t idx : active.first_byte_index[lead]) {
783 consider(idx);
784 }
785 for (const std::size_t idx : active.general) {
786 consider(idx);
787 }
788 return {.have = have, .idx = best_idx, .len = best_len};
789 }
790
793 [[nodiscard]] bool rule_active_in_mode(std::size_t idx,
794 std::size_t mode) const
795 {
796 const std::vector<std::string>& modes {rules_[idx].in_mode};
797 if (modes.empty()) {
798 return mode == 0;
799 }
800 for (const std::string& name : modes) {
801 if (mode_id_.at(name) == mode) {
802 return true;
803 }
804 }
805 return false;
806 }
807
812 std::vector<std::string> audit_probes(const std::vector<std::size_t>& to_global) const
813 {
814 std::array<bool, 256> seen {};
815 std::vector<unsigned char> alpha;
816 const auto add {[&](unsigned char b) {
817 if (!seen[b]) {
818 seen[b] = true;
819 alpha.push_back(b);
820 }
821 }};
822 for (const std::size_t g : to_global) {
823 for (int b {0}; b < 256; ++b) {
824 if (rules_[g].pattern.may_start_with(static_cast<unsigned char>(b))) {
825 add(static_cast<unsigned char>(b));
826 }
827 }
828 }
829 for (const char structural : std::string_view {" \t\n\"'/*-+=<>()[]{};.:,aAz09_"}) {
830 add(static_cast<unsigned char>(structural));
831 }
832
833 // alpha is always non-empty (the structural bytes above are unconditional), so
834 // the probe count is O(alphabet) + a fixed random batch — deterministic, bounded,
835 // and free of cap branches. Singletons + short repeats expose lazy delimiters and
836 // quantifier boundaries (the hard cases); the random batch broadens coverage.
837 std::vector<std::string> probes;
838 for (const unsigned char b : alpha) {
839 for (const std::size_t n : std::array<std::size_t, 5> {1, 2, 3, 6, 8}) {
840 probes.emplace_back(n, static_cast<char>(b));
841 }
842 }
843 // Fixed seed by design: this RNG only generates local probe strings for the
844 // build-time DFA equivalence audit, which must be reproducible. No security
845 // role (no tokens, crypto or identifiers) — a constant seed is correct here.
846 // NOLINTNEXTLINE(bugprone-random-generator-seed,cert-msc32-c,cert-msc51-cpp)
847 std::mt19937 rng {0x5C11EFU}; // fixed seed: the audit is reproducible
848 std::uniform_int_distribution<std::size_t> len_d {1, 48};
849 std::uniform_int_distribution<std::size_t> sym_d {0, alpha.size() - 1};
850 for (int batch {0}; batch < 256; ++batch) {
851 std::string input;
852 const std::size_t len {len_d(rng)};
853 for (std::size_t i {0}; i < len; ++i) {
854 input.push_back(static_cast<char>(alpha[sym_d(rng)]));
855 }
856 probes.push_back(std::move(input));
857 }
858 return probes;
859 }
860
864 [[nodiscard]] bool audit_passes(const real::dfa& candidate,
865 const std::vector<std::size_t>& to_global,
866 std::size_t mode) const
867 {
868 const std::vector<std::string> probes {audit_probes(to_global)};
869 for (const std::string& probe : probes) {
870 const std::string_view rest {probe}; // probes always have length >= 1
871 const std::optional<real::dfa_match> hit {candidate.match(rest)};
872 const munch_result pike {pike_munch_in_mode(mode, rest, static_cast<unsigned char>(rest[0]))};
873 std::size_t dfa_idx {0};
874 std::size_t dfa_len {0};
875 if (hit.has_value()) {
876 dfa_idx = to_global[hit->rule_index];
877 dfa_len = hit->length;
878 }
879 // One comparison — the tuple's element-wise short-circuit lives in <tuple>, not
880 // here — so any divergence (chiefly a lazy rule's shortest-vs-longest) rejects.
881 if (std::tuple {hit.has_value(), dfa_idx, dfa_len} != std::tuple {pike.have, pike.idx, pike.len}) {
882 return false;
883 }
884 }
885 return true;
886 }
887
897 std::optional<mode_dfa> try_build_mode_dfa(std::vector<std::size_t> to_global,
898 std::size_t mode)
899 {
900 std::vector<real::detail::program_view> programs;
901 programs.reserve(to_global.size());
902 for (const std::size_t g : to_global) {
903 programs.push_back(rules_[g].pattern.raw_program());
904 }
905 try {
906 real::dfa candidate {std::span<const real::detail::program_view>(programs)};
907 if (!audit_passes(candidate, to_global, mode)) {
908 return std::nullopt; // a divergence (e.g. a lazy rule) → keep this mode on Pike
909 }
910 return mode_dfa {.dfa = std::move(candidate), .to_global = std::move(to_global)};
911 }
912 catch (const real::dfa_error&) {
913 return std::nullopt; // un-DFA-able assertion ($, \b, multiline ^/$): keep on Pike
914 }
915 }
916
925 void build_dfa_modes(const std::unordered_set<std::string>& dfa_modes)
926 {
927 per_mode_dfa_.assign(mode_names_.size(), nullptr);
928 for (const std::string& name : dfa_modes) {
929 const auto found {mode_id_.find(name)};
930 if (found == mode_id_.end()) {
931 throw std::invalid_argument("dfa_modes names an unknown mode: " + name);
932 }
933 const std::size_t mode {found->second};
934 std::vector<std::size_t> to_global;
935 for (std::size_t idx {0}; idx < rules_.size(); ++idx) {
936 if (rule_active_in_mode(idx, mode)) {
937 to_global.push_back(idx);
938 }
939 }
940 if (auto built {try_build_mode_dfa(std::move(to_global), mode)}) {
941 per_mode_dfa_[mode] = std::make_shared<const mode_dfa>(std::move(*built));
942 }
943 }
944 }
945
947 struct dispatch
948 {
949 std::array<std::vector<std::size_t>, 256> first_byte_index;
950 std::vector<std::size_t> general;
951 };
952
953 std::vector<rule> rules_;
956 std::vector<std::string> mode_names_;
957 std::map<std::string, std::size_t> mode_id_;
958 std::vector<dispatch> per_mode_;
959 std::vector<std::shared_ptr<const mode_dfa>> per_mode_dfa_;
960 std::vector<bool> mode_significant_;
961 };
962
971 {
972 public:
973
974 using iterator_category = std::input_iterator_tag;
976 using difference_type = std::ptrdiff_t;
977 using pointer = const token*;
978 using reference = const token&;
979
981 token_iterator() = default;
982
989 token_iterator(const lexer& owner,
990 std::string_view source,
991 eof_policy policy)
992 : owner_(&owner),
993 source_(source),
994 policy_(policy),
995 done_(false)
996 {
997 advance();
998 }
999
1002 {
1003 return current_;
1004 }
1005
1008 {
1009 return &current_;
1010 }
1011
1014 {
1015 advance();
1016 return *this;
1017 }
1018
1020 void operator++(int)
1021 {
1022 advance();
1023 }
1024
1030 [[nodiscard]] bool operator==(const token_iterator& other) const
1031 {
1032 return done_ == other.done_ && (done_ || cursor_.offset == other.cursor_.offset);
1033 }
1034
1040 [[nodiscard]] bool operator!=(const token_iterator& other) const
1041 {
1042 return !(*this == other);
1043 }
1044
1045 private:
1046
1047 const lexer* owner_ {nullptr};
1048 std::string_view source_;
1049 position cursor_ {0, 1, 1};
1050 std::vector<frame> stack_ {frame {.mode_id = 0, .entry_pos = position {0, 1, 1}}};
1053 bool eof_done_ {false};
1054 bool done_ {true};
1055
1057 void advance()
1058 {
1059 if (done_) {
1060 return;
1061 }
1063 return;
1064 }
1065 // Input exhausted: yield one end-of-input token if requested, else stop.
1068 eof_done_ = true;
1069 return;
1070 }
1071 done_ = true;
1072 }
1073 };
1074
1081 {
1082 public:
1083
1090 token_range(const lexer& owner,
1091 std::string_view source,
1092 eof_policy policy)
1093 : owner_(&owner),
1094 source_(source),
1095 policy_(policy)
1096 {}
1097
1099 [[nodiscard]] token_iterator begin() const
1100 {
1102 }
1103
1105 [[nodiscard]] token_iterator end() const
1106 {
1107 return token_iterator();
1108 }
1109
1110 private:
1111
1112 const lexer* owner_ {nullptr};
1113 std::string_view source_;
1115 };
1116
1117 inline token_range lexer::scan(std::string_view source,
1118 eof_policy policy) const&
1119 {
1120 return token_range(*this, source, policy);
1121 }
1122} // namespace scilex
1123
1124#endif // SCILEX_LEXER_HPP
Thrown when no rule matches at a position (a lexical error).
Definition lexer.hpp:124
lex_error(const std::string &message, position where)
Builds the error.
Definition lexer.hpp:132
position where() const noexcept
Returns the position of the unmatched byte.
Definition lexer.hpp:141
position where_
Where tokenization failed.
Definition lexer.hpp:148
A lexer built from an ordered list of rules.
Definition lexer.hpp:236
void validate_transitions() const
Fail-fast transition checks: a transition rule must consume input, and a push/set target must be a de...
Definition lexer.hpp:511
void add_to_mode(std::size_t m, std::size_t idx)
Adds rule idx to mode m's dispatch via REAL's exact first-byte API — the same 3-way split (nullable →...
Definition lexer.hpp:479
std::vector< dispatch > per_mode_
Dispatch index, one per mode (by id).
Definition lexer.hpp:958
column_unit columns() const noexcept
The unit this lexer counts position::column in (positions do not carry it, so a consumer that needs t...
Definition lexer.hpp:279
void build_dispatch()
Builds the per-mode first-byte dispatch from rules_ (once, at construction). "default" is mode 0; eve...
Definition lexer.hpp:536
scilex::column_unit columns_
The unit position::column is counted in.
Definition lexer.hpp:955
std::vector< std::string > audit_probes(const std::vector< std::size_t > &to_global) const
The bounded, deterministic probe inputs for the audit: every active rule's possible first bytes ∪ str...
Definition lexer.hpp:812
std::vector< rule > rules_
The ordered token rules.
Definition lexer.hpp:953
std::vector< bool > mode_significant_
Layout policy (empty = all significant).
Definition lexer.hpp:960
std::optional< mode_dfa > try_build_mode_dfa(std::vector< std::size_t > to_global, std::size_t mode)
Builds the mode_dfa for one mode, or std::nullopt if the mode cannot take the DFA fast path....
Definition lexer.hpp:897
static std::size_t valid_utf8_len(std::string_view s, std::size_t off)
The length (1–4) of a valid UTF-8 codepoint starting at off in s, or 0 when the byte there is not a v...
Definition lexer.hpp:676
bool mode_is_empty(std::size_t m) const
Whether mode m has no active rule (so nothing can match in it).
Definition lexer.hpp:500
bool starts_a_match(std::size_t mode, std::string_view source, std::size_t offset) const
Does a rule in mode match at offset in source? The error-recovery loop's stop test — the smallest suc...
Definition lexer.hpp:643
munch_result munch_at(std::size_t mode, std::string_view rest, unsigned char lead) const
The winning munch in mode at the start of rest (lead is rest's first byte), dispatching to the mode's...
Definition lexer.hpp:611
const std::vector< bool > & mode_significant() const noexcept
The per-mode-id layout-significance policy (see scilex::layout). Index by a token's mode_id; false ma...
Definition lexer.hpp:339
const std::string & mode_name(std::size_t id) const noexcept
The name of mode id (0 is "default"), for labelling tokens.
Definition lexer.hpp:345
static std::string position_label(position where)
Formats a position as "line:column" for diagnostics.
Definition lexer.hpp:373
bool audit_passes(const real::dfa &candidate, const std::vector< std::size_t > &to_global, std::size_t mode) const
The candidate DFA must reproduce the Pike munch on every probe: catches divergences the bytecode cann...
Definition lexer.hpp:864
void build_significance(const std::unordered_set< std::string > &insignificant_modes)
Builds the layout-significance policy from the insignificant-mode names (validated against the intern...
Definition lexer.hpp:576
std::size_t intern_mode(const std::string &name)
Interns a mode name to its id, assigning the next id on first sight.
Definition lexer.hpp:467
bool rule_active_in_mode(std::size_t idx, std::size_t mode) const
Whether rule idx is active in mode mode (mirrors build_dispatch, an empty in_mode is the default mode...
Definition lexer.hpp:793
bool scan_next(std::string_view source, position &cursor, std::vector< frame > &stack, token &out) const
Advances cursor to and past the next non-skipped token in the active mode, applying the winning rule'...
Definition lexer.hpp:397
std::map< std::string, std::size_t > mode_id_
Mode name -> id.
Definition lexer.hpp:957
std::vector< token > tokenize(std::string_view source, eof_policy policy=eof_policy::omit) const
Tokenizes source into the sequence of non-skipped tokens.
Definition lexer.hpp:299
token_range scan(std::string_view source, eof_policy policy=eof_policy::omit) const &&=delete
Deleted: the range would point into a temporary lexer.
munch_result pike_munch_in_mode(std::size_t mode, std::string_view rest, unsigned char lead) const
The per-rule Pike + first-byte-dispatch munch in mode at the start of rest (lead is rest's first byte...
Definition lexer.hpp:753
static std::size_t column_step(std::string_view source, std::size_t off, scilex::column_unit unit)
How much the column advances when the byte at off in source is consumed, under unit....
Definition lexer.hpp:723
void advance(std::string_view source, position &cursor, std::size_t n) const
Advances cursor by n bytes of source, maintaining the 1-based line/column tracker (a newline resets t...
Definition lexer.hpp:657
std::vector< std::shared_ptr< const mode_dfa > > per_mode_dfa_
Per-mode DFA fast path (nullptr = Pike).
Definition lexer.hpp:959
std::vector< std::string > dfa_modes_active() const
The modes actually accelerated by a DFA fast path.
Definition lexer.hpp:357
lexer(std::vector< rule > rules, std::unordered_set< std::string > insignificant_modes={}, std::unordered_set< std::string > dfa_modes={}, error_policy errors=error_policy::raise, column_unit columns=column_unit::bytes)
Builds a lexer from rules (taken by value, then moved in).
Definition lexer.hpp:263
error_policy errors_
What to do at an unmatched byte.
Definition lexer.hpp:954
void build_dfa_modes(const std::unordered_set< std::string > &dfa_modes)
Opts the named dfa_modes into the DFA fast path (called once, after build_dispatch)....
Definition lexer.hpp:925
token_range scan(std::string_view source, eof_policy policy=eof_policy::omit) const &
Returns a lazy range over the non-skipped tokens of source.
Definition lexer.hpp:1117
std::vector< std::string > mode_names_
Mode id -> name ("default" is id 0).
Definition lexer.hpp:956
bool may_start(std::size_t mode, unsigned char byte) const
O(1) pre-filter for error recovery: can a fixed-lead rule in mode begin with byte?...
Definition lexer.hpp:633
Forward (single-pass) iterator yielding one token at a time.
Definition lexer.hpp:971
token_iterator(const lexer &owner, std::string_view source, eof_policy policy)
Constructs a begin iterator over source for owner.
Definition lexer.hpp:989
std::ptrdiff_t difference_type
Required typedef.
Definition lexer.hpp:976
void operator++(int)
Post-increment (single-pass: no useful copy is returned).
Definition lexer.hpp:1020
std::vector< frame > stack_
Mode stack (top = active).
Definition lexer.hpp:1050
reference operator*() const
Returns the current token.
Definition lexer.hpp:1001
token current_
The current token.
Definition lexer.hpp:1051
bool operator==(const token_iterator &other) const
Equality: both exhausted, or both at the same offset.
Definition lexer.hpp:1030
pointer operator->() const
Member access to the current token.
Definition lexer.hpp:1007
position cursor_
Current scan position.
Definition lexer.hpp:1049
bool done_
True once exhausted (end sentinel).
Definition lexer.hpp:1054
const lexer * owner_
Rules provider (not owned).
Definition lexer.hpp:1047
std::string_view source_
Text being scanned.
Definition lexer.hpp:1048
token_iterator & operator++()
Advances to the next token.
Definition lexer.hpp:1013
token_iterator()=default
Constructs the end sentinel.
void advance()
Produces the next token, or marks the iterator exhausted.
Definition lexer.hpp:1057
bool operator!=(const token_iterator &other) const
Inequality.
Definition lexer.hpp:1040
std::input_iterator_tag iterator_category
Single-pass.
Definition lexer.hpp:974
bool eof_done_
End-of-input token already yielded.
Definition lexer.hpp:1053
eof_policy policy_
End-of-input policy.
Definition lexer.hpp:1052
A lazy range of tokens, returned by lexer::scan.
Definition lexer.hpp:1081
token_iterator end() const
End sentinel.
Definition lexer.hpp:1105
std::string_view source_
Text being scanned.
Definition lexer.hpp:1113
token_iterator begin() const
Begin iterator (produces the first token).
Definition lexer.hpp:1099
const lexer * owner_
Rules provider (not owned).
Definition lexer.hpp:1112
token_range(const lexer &owner, std::string_view source, eof_policy policy)
Builds the range.
Definition lexer.hpp:1090
eof_policy policy_
End-of-input policy.
Definition lexer.hpp:1114
The SciLex public API (scilex::lexer, scilex::rule, scilex::token).
Definition layout.hpp:47
void apply_transition(const rule &r, position start, std::vector< frame > &stack)
Applies rule r's mode transition (if any) to stack — the per-scan mode-stack mutation,...
Definition lexer.hpp:174
constexpr int error
Reserved token kind for a lexical-error run under scilex::error_policy::token.
Definition token.hpp:37
column_unit
The unit a token's position::column is counted in.
Definition lexer.hpp:222
@ codepoints
One column per Unicode scalar value (a valid UTF-8 codepoint).
@ bytes
One column per byte (the default; column == byte offset within the line + 1).
@ utf16
One column per UTF-16 code unit (BMP = 1, astral = 2) — the LSP unit.
eof_policy
Whether tokenization appends a synthetic end-of-input token.
Definition lexer.hpp:54
@ append
Append one end_of_input token at the end position.
@ omit
Stop at the last real token (default).
constexpr int end_of_input
Reserved token kind for the synthetic end-of-input token.
Definition token.hpp:26
error_policy
What a lexer does when it reaches a byte that no rule in the active mode can begin.
Definition lexer.hpp:201
One entry on the per-scan mode stack: the active mode and where it was entered (the entry position fe...
Definition lexer.hpp:156
std::size_t mode_id
Id of the active mode.
Definition lexer.hpp:157
position entry_pos
Where this mode was entered.
Definition lexer.hpp:158
Per-mode dispatch index: the first-byte buckets scoped to one mode.
Definition lexer.hpp:948
std::vector< std::size_t > general
Nullable rules (tried everywhere).
Definition lexer.hpp:950
std::array< std::vector< std::size_t >, 256 > first_byte_index
Rule indices by leading byte.
Definition lexer.hpp:949
An adopted per-mode DFA: the automaton plus its local→global rule map.
Definition lexer.hpp:593
std::vector< std::size_t > to_global
DFA local rule index -> global rules_ index.
Definition lexer.hpp:595
real::dfa dfa
Recognizes the mode's rules in one pass.
Definition lexer.hpp:594
A munch decision: whether a rule matched, which (global index), how many bytes — the small value scan...
Definition lexer.hpp:601
A mode transition, fired when its rule wins, acting on the scan's mode stack: enter a nested mode,...
Definition lexer.hpp:64
std::string target
The mode push/set enters; ignored (and omittable) for pop.
Definition lexer.hpp:74
op operation
Which transition to perform.
Definition lexer.hpp:73
std::size_t target_id
The interned id of target, resolved once when the lexer is built (see scilex::lexer::build_dispatch) ...
Definition lexer.hpp:80
op
The kind of transition.
Definition lexer.hpp:67
@ push
Enter target, remembering the mode below it (a nested context).
@ pop
Leave the current mode, returning to the one beneath it.
@ set
Replace the current mode with target (stack depth unchanged).
A location in the source text.
Definition token.hpp:48
std::size_t offset
0-based byte offset from the start of the source.
Definition token.hpp:49
std::size_t column
1-based byte column within the line.
Definition token.hpp:51
std::size_t line
1-based line number.
Definition token.hpp:50
A token rule: a kind, the pattern that recognizes it, whether matches are discarded (whitespace,...
Definition lexer.hpp:110
int kind
Kind assigned to tokens this rule produces.
Definition lexer.hpp:111
std::optional< mode_action > action
Mode transition fired when this rule wins.
Definition lexer.hpp:115
bool skip
If true, matches are consumed but not emitted.
Definition lexer.hpp:113
real::regex pattern
The recognizer (a linear-time REAL regex; its flags are the author's — see above).
Definition lexer.hpp:112
std::vector< std::string > in_mode
Modes this rule is active in; empty ⇒ {"default"}.
Definition lexer.hpp:114
One lexical token: a typed slice of the source.
Definition token.hpp:58
int kind
Caller-defined token kind (e.g. an enum value).
Definition token.hpp:59
The token produced by the lexer and its source position.