scilex/lexer_8hpp_source.html

#ifndef SCILEX_LEXER_HPP

#define SCILEX_LEXER_HPP


#include <algorithm>

#include <array>

#include <iterator>

#include <map>

#include <memory>

#include <optional>

#include <random>

#include <span>

#include <stdexcept>

#include <string>

#include <string_view>

#include <tuple>

#include <unordered_set>

#include <utility>

#include <vector>


#include <real/dfa.hpp>

#include <real/real.hpp>


#include "token.hpp"


namespace scilex {


  class token_iterator;

  class token_range;


  /// What a scan does once the input is exhausted.
  ///  - omit:   stop after the last emitted token.
  ///  - append: emit one extra end_of_input token at the final cursor position.
  enum class eof_policy { omit, append };


  /// A mode-stack transition a rule may carry; applied after the rule's match is consumed.
  struct mode_action
  {
    /// The stack operation:
    ///  - push: enter the target mode on top of the stack;
    ///  - pop:  leave the current mode (takes no target);
    ///  - set:  replace the current mode in place (stack depth unchanged).
    enum class op { push, pop, set };

    /// Which operation to perform.
    op          operation;
    /// Name of the target mode for push/set; kept for diagnostics. Unused by pop.
    std::string target {};
    /// Id of the target mode; filled in when the lexer builds its dispatch tables.
    std::size_t target_id = 0;
  };


  /// One lexer rule: a pattern plus what to do when it wins the munch.
  struct rule

  {

    /// Kind value handed to the token built from a match of this rule.
    int                        kind;

    /// The pattern matched at the cursor.
    real::regex                pattern;

    /// When true the match is consumed (and its transition applied) but no token is emitted.
    bool                       skip    {false};

    /// Names of the modes the rule is active in; empty means the "default" mode only.
    std::vector<std::string>   in_mode {};

    /// Optional mode-stack transition applied after the match is consumed.
    std::optional<mode_action> action  {};

  };


  /// A lexical error: a std::runtime_error that also records a source position.
  class lex_error : public std::runtime_error
  {
  public:

    /// Builds the error from its diagnostic @p message and the position @p where it applies.
    lex_error(const std::string& message, position where)
      : std::runtime_error(message), where_(where)
    {}

    /// The position this error was reported at.
    [[nodiscard]] position where() const noexcept { return where_; }

  private:

    /// Position passed at construction.
    position where_;
  };


  /// One entry of the mode stack.
  struct frame

  {

    /// Id of the mode this frame represents (a `set` transition rewrites it in place).
    std::size_t mode_id;

    /// Start position of the match that pushed this frame; quoted in diagnostics as "entered at".
    position    entry_pos;

  };


  inline void apply_transition(const rule&         r,

                               position            start,

                               std::vector<frame>& stack)

  {

    if (!r.action) {

      return;

    }

    if (r.action->operation == mode_action::op::push) {

      stack.push_back(frame {.mode_id = r.action->target_id, .entry_pos = start});

    }

    else if (r.action->operation == mode_action::op::pop) {

      if (stack.size() == 1) {

        throw lex_error("cannot pop the mode stack: already at the root mode", start);

      }

      stack.pop_back();

    }

    else { // set: replace the active mode in place (depth unchanged)

      stack.back().mode_id = r.action->target_id;

    }

  }


  /// How the lexer reports input it cannot tokenize.
  ///  - raise: throw a lex_error.
  ///  - token: emit an error-kind token and keep scanning.
  enum class error_policy { raise, token };


  /// The unit a position's column is counted in.
  ///  - bytes:      every byte advances the column by one.
  ///  - codepoints: a continuation byte of a valid UTF-8 sequence adds nothing; any other
  ///                byte adds one.
  ///  - utf16:      as codepoints, except that the lead byte of a valid 4-byte sequence adds two.
  enum class column_unit { bytes, codepoints, utf16 };


  class lexer

  {

  public:


    /// Builds a lexer from an ordered rule list.
    ///
    /// @param rules               the rules; taken by value and moved into the lexer
    /// @param insignificant_modes names of modes to mark as not layout-significant
    /// @param dfa_modes           names of modes to try to run through a DFA
    /// @param errors              how unmatched input is reported
    /// @param columns             the unit columns are counted in
    /// @throws std::invalid_argument if a transition rule is rejected by validation, or if
    ///         insignificant_modes / dfa_modes names a mode no rule mentions
    explicit lexer(std::vector<rule>               rules,

                   std::unordered_set<std::string> insignificant_modes = {},

                   std::unordered_set<std::string> dfa_modes           = {},

                   error_policy                    errors              = error_policy::raise,

                   column_unit                     columns             = column_unit::bytes)

      : rules_(std::move(rules)),

        errors_(errors),

        columns_(columns)

    {

      // Order matters: the dispatch build interns every mode name, and the two later
      // steps validate their mode names against that table.
      build_dispatch();

      build_significance(insignificant_modes);

      build_dfa_modes(dfa_modes);

    }


    /// Returns the unit this lexer counts columns in (as given at construction).
    [[nodiscard]] column_unit columns() const noexcept

    {

      return columns_;

    }


    [[nodiscard]] std::vector<token> tokenize(std::string_view source,

                                              eof_policy       policy = eof_policy::omit) const

    {

      std::vector<token> out;

      position           cursor {0, 1, 1};

      std::vector<frame> stack {frame {.mode_id = 0, .entry_pos = cursor}}; // start in "default"

      token              next   {};

      while (scan_next(source, cursor, stack, next)) {

        out.push_back(next);

      }

      if (policy == eof_policy::append) {

        // The cursor now sits at the end position (past any trailing trivia).

        out.push_back(token {end_of_input, source.substr(cursor.offset), cursor});

      }

      return out;

    }


    /// Returns a lazy range over the tokens of @p source; tokens are lexed as the range is
    /// iterated. The range keeps a pointer to this lexer and the @p source view, so both
    /// must outlive it.
    [[nodiscard]] token_range scan(std::string_view source,

                                   eof_policy       policy = eof_policy::omit) const&;


    /// Deleted for rvalue lexers: the returned range would point into a temporary.
    token_range scan(std::string_view source,

                     eof_policy       policy = eof_policy::omit) const&& = delete;


    /// Returns the per-mode-id significance flags. The vector is empty when no
    /// insignificant modes were given at construction; otherwise it has one entry per
    /// mode, false for each mode named insignificant.
    [[nodiscard]] const std::vector<bool>& mode_significant() const noexcept

    {

      return mode_significant_;

    }


    /// Returns the name of mode @p id (id 0 is "default").
    /// No bounds check is performed: @p id must be a valid mode id.
    [[nodiscard]] const std::string& mode_name(std::size_t id) const noexcept

    {

      return mode_names_[id];

    }


    /// Returns the names of the modes that actually hold a DFA, in mode-id order.
    /// A requested mode whose DFA could not be built or failed its audit is absent.
    [[nodiscard]] std::vector<std::string> dfa_modes_active() const
    {
      std::vector<std::string> names;
      std::size_t              id {0};
      for (const auto& slot : per_mode_dfa_) {
        if (slot != nullptr) {
          names.push_back(mode_names_[id]);
        }
        ++id;
      }
      return names;
    }


  private:


    friend class token_iterator;


    /// Formats @p where as "line:column" for use in diagnostics.
    static std::string position_label(position where)
    {
      std::string label {std::to_string(where.line)};
      label += ':';
      label += std::to_string(where.column);
      return label;
    }


    /// Scans forward from @p cursor and stores the next emitted token in @p out.
    ///
    /// While input remains, the winning munch for the active mode (the top of @p stack) is
    /// taken, the cursor is advanced over it, the rule's mode transition is applied, and the
    /// token is returned unless the rule is a skip rule (then scanning continues).
    ///
    /// Failure handling:
    ///  - no rule matches: error_policy::raise throws; error_policy::token emits one
    ///    scilex::error token covering the unmatched run;
    ///  - the winning match is zero-length: always throws;
    ///  - input ends with more than the root frame on the stack: raise throws; token emits
    ///    one zero-width error token and resets the stack to its root frame.
    ///
    /// @param source the full source text
    /// @param cursor current position; advanced past everything consumed
    /// @param stack  the mode stack; modified by transitions and by end-of-input recovery
    /// @param out    receives the token when the function returns true
    /// @return true if a token was stored in @p out, false once the input is exhausted
    /// @throws lex_error as described above, and on a pop at the root mode
    bool scan_next(std::string_view    source,

                   position&           cursor,

                   std::vector<frame>& stack,

                   token&              out) const

    {

      while (cursor.offset < source.size()) {

        const std::string_view rest {source.substr(cursor.offset)};

        const std::size_t      mode {stack.back().mode_id};

        const munch_result     m    {munch_at(mode, rest, static_cast<unsigned char>(source[cursor.offset]))};


        if (!m.have) {

          if (errors_ == error_policy::raise) {

            throw lex_error("no rule matches in mode '" + mode_names_[mode] + "' (entered at "

                            + position_label(stack.back().entry_pos) + ")", cursor); // #1

          }

          // Recovery (error_policy::token): accumulate the maximal run of bytes that no rule in this

          // mode can begin into ONE reserved-kind error token, then resume — no throw, no transition

          // (the run stays in its mode). The run ends at the first position where a rule matches (>0);

          // may_start is an O(1) first-byte pre-filter that skips the bulk of the noise without a full

          // match attempt. The lexeme is the exact offending bytes.

          const position err_start {cursor};

          advance(source, cursor, 1); // the byte at err_start is unmatched by definition

          while (cursor.offset < source.size()

                 && !starts_a_match(mode, source, cursor.offset)) {

            advance(source, cursor, 1);

          }

          out = token {scilex::error, source.substr(err_start.offset, cursor.offset - err_start.offset),

                       err_start, mode};

          return true;

        }

        if (m.len == 0) {

          // A rule won with a zero-length match (a nullable rule and no longer match at this

          // position). Advancing by 0 would spin forever, so report it as a lexical error — fatal

          // under either policy (recovery cannot make progress here). The shared advance point, so

          // both the Pike and DFA paths are covered.

          throw lex_error("zero-length match in mode '" + mode_names_[mode]

                          + "' (rule never advances)", cursor); // #4

        }


        const std::size_t best_idx {m.idx};

        const std::size_t best_len {m.len};

        const position    start    {cursor};

        advance(source, cursor, best_len);


        // Note: the cursor has already moved past the match when a bad pop throws here;
        // the error itself is reported at `start`.
        apply_transition(rules_[best_idx], start, stack); // advances, then transitions (#2 on a bad pop)


        if (!rules_[best_idx].skip) {

          // Tag the token with the mode it was lexed in (captured before the

          // transition above) — Layout Awareness reads it; the scan is untouched.

          out = token {rules_[best_idx].kind, source.substr(start.offset, best_len), start, mode};

          return true;

        }

        // Skip rule: keep scanning for the next emitted token (possibly in a new mode).

      }

      if (stack.size() > 1) {

        if (errors_ == error_policy::raise) {

          throw lex_error("unterminated mode '" + mode_names_[stack.back().mode_id] + "' (entered at "

                          + position_label(stack.back().entry_pos) + ")", stack.back().entry_pos); // #3

        }

        // Recovery (error_policy::token): a mode was still pushed at end of input. Emit one zero-width

        // error token positioned at the EOF (the partial tokens already emitted stay), then unwind to

        // the root so the next call reports a clean end of input.

        out = token {scilex::error, source.substr(cursor.offset, 0), cursor, stack.back().mode_id};

        stack.resize(1);

        return true;

      }

      return false;

    }


    /// Returns the id of mode @p name, assigning the next free id the first time a name is seen.
    /// Ids are dense: a new mode's id is the number of modes interned before it.
    std::size_t intern_mode(const std::string& name)
    {
      if (const auto known {mode_id_.find(name)}; known != mode_id_.end()) {
        return known->second;
      }
      const std::size_t fresh {mode_names_.size()};
      mode_id_.emplace(name, fresh);
      mode_names_.push_back(name);
      return fresh;
    }


    void add_to_mode(std::size_t m,

                     std::size_t idx)

    {

      const real::regex& pattern {rules_[idx].pattern};

      dispatch&          target  {per_mode_[m]};

      if (!pattern.has_first_byte_set()) {

        target.general.push_back(idx);

      }

      else if (const std::optional<unsigned char> byte {pattern.unique_first_byte()}) {

        target.first_byte_index[*byte].push_back(idx);

      }

      else {

        for (int candidate {0}; candidate < 256; ++candidate) {

          if (pattern.may_start_with(static_cast<unsigned char>(candidate))) {

            target.first_byte_index[static_cast<unsigned char>(candidate)].push_back(idx);

          }

        }

      }

    }


    [[nodiscard]] bool mode_is_empty(std::size_t m) const

    {

      const dispatch& d {per_mode_[m]};

      return d.general.empty()

             && std::all_of(d.first_byte_index.begin(), d.first_byte_index.end(),

                            [](const std::vector<std::size_t>& bucket) { return bucket.empty(); });

    }


    /// Rejects rule sets whose transitions are wrong by construction.
    ///
    /// For every rule that carries an action:
    ///  - its pattern text must be non-empty;
    ///  - a push/set target must be a mode with at least one rule filed in it.
    ///
    /// @throws std::invalid_argument on the first rule that violates either check
    void validate_transitions() const
    {
      for (const rule& r : rules_) {
        if (!r.action.has_value()) {
          continue; // only transition rules are checked
        }
        const mode_action& act {*r.action};
        if (r.pattern.pattern().empty()) {
          throw std::invalid_argument("a transition rule must consume input (empty pattern)");
        }
        const bool has_target {act.operation != mode_action::op::pop};
        if (has_target && mode_is_empty(mode_id_.at(act.target))) {
          throw std::invalid_argument("a transition targets the empty mode '"
                                      + act.target + "' (no rule is active in it)");
        }
      }
    }


    void build_dispatch()

    {

      intern_mode("default"); // mode 0, always present

      for (const rule& candidate : rules_) {

        for (const std::string& name : candidate.in_mode) {

          intern_mode(name);

        }

        if (const std::optional<mode_action> action {candidate.action};

            action.has_value() && action->operation != mode_action::op::pop) {

          intern_mode(action->target);

        }

      }

      per_mode_.resize(mode_names_.size());


      for (std::size_t idx {0}; idx < rules_.size(); ++idx) {

        if (rules_[idx].in_mode.empty()) {

          add_to_mode(0, idx); // an undeclared rule is active in "default" only

        }

        else {

          for (const std::string& name : rules_[idx].in_mode) {

            add_to_mode(mode_id_.at(name), idx);

          }

        }

      }

      validate_transitions();


      // Pre-resolve each transition's target mode id once, now that every mode is

      // interned and validated, so the per-token apply_transition reads a field instead

      // of a name→id map lookup. The target string stays for diagnostics; pop has none.

      for (rule& candidate : rules_) {

        if (candidate.action && candidate.action->operation != mode_action::op::pop) {

          candidate.action->target_id = mode_id_.at(candidate.action->target);

        }

      }

    }


    /// Builds the per-mode significance flags from the names of the insignificant modes.
    ///
    /// With an empty set nothing is built and mode_significant_ stays empty. Otherwise
    /// every mode starts as significant and each named mode is flagged false.
    ///
    /// @throws std::invalid_argument if a name is not an interned mode
    void build_significance(const std::unordered_set<std::string>& insignificant_modes)
    {
      if (insignificant_modes.empty()) {
        return;
      }
      mode_significant_.assign(mode_names_.size(), true);
      for (const std::string& name : insignificant_modes) {
        const auto entry {mode_id_.find(name)};
        if (entry != mode_id_.end()) {
          mode_significant_[entry->second] = false;
          continue;
        }
        throw std::invalid_argument("insignificant_modes names an unknown mode: " + name);
      }
    }


    /// The DFA built for one mode, together with its rule-index translation.
    struct mode_dfa

    {

      /// The automaton built from the programs of the rules active in the mode.
      real::dfa                dfa;

      /// Maps a rule index reported by the DFA to that rule's index in rules_.
      std::vector<std::size_t> to_global;

    };


    /// The outcome of one munch attempt at a position.
    struct munch_result

    {

      /// Whether any rule matched (a zero-length match counts).
      bool        have {false};

      /// Index in rules_ of the winning rule; meaningful only when have is true.
      std::size_t idx  {0};

      /// Length of the winning match, in bytes; meaningful only when have is true.
      std::size_t len  {0};

    };


    /// Returns the winning munch in @p mode at the start of @p rest.
    ///
    /// Uses the mode's DFA when one was built, translating the DFA's rule index back to
    /// an index into rules_; otherwise falls back to the per-rule matcher.
    ///
    /// @param mode the active mode id
    /// @param rest the unconsumed input
    /// @param lead the first byte of @p rest (used only by the fallback's dispatch)
    munch_result munch_at(std::size_t      mode,
                          std::string_view rest,
                          unsigned char    lead) const
    {
      const std::shared_ptr<const mode_dfa>& table {per_mode_dfa_[mode]};
      if (table == nullptr) {
        return pike_munch_in_mode(mode, rest, lead);
      }
      const std::optional<real::dfa_match> hit {table->dfa.match(rest)};
      if (!hit.has_value()) {
        return munch_result {};
      }
      return munch_result {.have = true,
                           .idx  = table->to_global[hit->rule_index],
                           .len  = hit->length};
    }


    bool may_start(std::size_t   mode,

                   unsigned char byte) const

    {

      return !per_mode_[mode].first_byte_index[byte].empty();

    }


    /// Returns true when some rule of @p mode matches at @p offset in @p source.
    /// The cheap first-byte pre-filter runs first; the full munch is only attempted when
    /// the pre-filter does not rule the position out.
    bool starts_a_match(std::size_t      mode,
                        std::string_view source,
                        std::size_t      offset) const
    {
      const auto lead {static_cast<unsigned char>(source[offset])};
      return may_start(mode, lead) && munch_at(mode, source.substr(offset), lead).have;
    }


    /// Moves @p cursor forward by @p n bytes of @p source, keeping line and column in step.
    /// A '\n' byte starts a new line at column 1; any other byte adds its column step in
    /// this lexer's column unit. The caller must ensure @p n bytes remain.
    void advance(std::string_view source,
                 position&        cursor,
                 std::size_t      n) const
    {
      for (std::size_t remaining {n}; remaining != 0; --remaining) {
        const bool newline {source[cursor.offset] == '\n'};
        if (newline) {
          ++cursor.line;
          cursor.column = 1;
        }
        else {
          cursor.column += column_step(source, cursor.offset, columns_);
        }
        ++cursor.offset;
      }
    }


    /// Returns the byte length (1–4) of the valid UTF-8 codepoint that starts at @p off in
    /// @p s, or 0 when no valid codepoint starts there.
    ///
    /// Rejected as invalid: a continuation byte or an impossible lead byte, a sequence cut
    /// short by the end of @p s, a missing continuation byte, an overlong encoding, a
    /// UTF-16 surrogate (U+D800–U+DFFF) and a value above U+10FFFF.
    ///
    /// @param s   the text
    /// @param off offset of the candidate lead byte; must be less than s.size()
    static std::size_t valid_utf8_len(std::string_view s,
                                      std::size_t      off)
    {
      const auto lead {static_cast<unsigned char>(s[off])};
      if (lead < 0x80U) {
        return 1; // ASCII
      }

      std::size_t  width    {0}; // total bytes the lead announces
      unsigned int value    {0}; // codepoint accumulated so far
      unsigned int smallest {0}; // least codepoint that genuinely needs `width` bytes
      if ((lead >> 5) == 0x06) {        // 110xxxxx
        width    = 2;
        value    = lead & 0x1FU;
        smallest = 0x80U;
      }
      else if ((lead >> 4) == 0x0E) {   // 1110xxxx
        width    = 3;
        value    = lead & 0x0FU;
        smallest = 0x800U;
      }
      else if ((lead >> 3) == 0x1E) {   // 11110xxx
        width    = 4;
        value    = lead & 0x07U;
        smallest = 0x10000U;
      }
      else {
        return 0; // a continuation byte (0x80–0xBF) or an invalid lead (0xF8–0xFF)
      }

      if (off + width > s.size()) {
        return 0; // truncated
      }
      for (std::size_t at {off + 1}; at < off + width; ++at) {
        const auto tail {static_cast<unsigned char>(s[at])};
        if ((tail >> 6) != 0x02) {      // not 10xxxxxx
          return 0; // a missing continuation
        }
        value = (value << 6U) | (tail & 0x3FU);
      }

      const bool overlong  {value < smallest};
      const bool surrogate {value >= 0xD800U && value <= 0xDFFFU};
      const bool too_large {value > 0x10FFFFU};
      if (overlong || surrogate || too_large) {
        return 0;
      }
      return width;
    }


    /// Returns how many columns the byte at @p off in @p source adds, counted in @p unit.
    ///
    ///  - bytes: always 1.
    ///  - codepoints / utf16, continuation byte: 0 when it belongs to a valid codepoint
    ///    whose lead sits 1–3 bytes earlier, else 1 (an orphan continuation is malformed).
    ///  - codepoints, any other byte: 1.
    ///  - utf16, any other byte: 2 when it leads a valid 4-byte sequence (a surrogate pair
    ///    in UTF-16), else 1.
    static std::size_t column_step(std::string_view        source,
                                   std::size_t             off,
                                   scilex::column_unit     unit)
    {
      if (unit == scilex::column_unit::bytes) {
        return 1;
      }

      const auto here         {static_cast<unsigned char>(source[off])};
      const bool continuation {(here & 0xC0U) == 0x80U};
      if (!continuation) {
        // An ASCII byte or a lead byte; only utf16 distinguishes astral codepoints.
        const bool astral {unit == scilex::column_unit::utf16 && valid_utf8_len(source, off) == 4};
        return astral ? 2 : 1;
      }

      // Look back at most three bytes (and never before the start of the text) for a
      // valid lead whose sequence is long enough to include this byte.
      const std::size_t reach {std::min<std::size_t>(3, off)};
      for (std::size_t back {1}; back <= reach; ++back) {
        if (valid_utf8_len(source, off - back) > back) {
          return 0;
        }
      }
      return 1;
    }


    /// Returns the winning munch in @p mode at the start of @p rest using the per-rule matcher.
    ///
    /// Candidates are the rules filed under @p lead in the mode's first-byte table, then
    /// the mode's `general` rules. The longest match wins; on equal length the rule with
    /// the lower index wins. A zero-length match can win only when nothing longer matches.
    ///
    /// @param mode the active mode id
    /// @param rest the unconsumed input
    /// @param lead the first byte of @p rest
    munch_result pike_munch_in_mode(std::size_t      mode,
                                    std::string_view rest,
                                    unsigned char    lead) const
    {
      munch_result best  {};
      const auto   offer {[&](std::size_t idx) {
        // idx comes from this mode's first-byte dispatch, populated
        // in build_dispatch() from rules_ indices, so it is always in
        // range. The analyzer cannot prove that cross-vector invariant
        // once this munch is a standalone shared method; a bounds guard
        // would be an unreachable branch the 100%-4D gate rejects, so the
        // proven false positive is suppressed here (see REPORT note).
        // NOLINTNEXTLINE(clang-analyzer-core.NonNullParamChecker)
        const auto matched {rules_[idx].pattern.match(rest)};
        if (!matched) {
          return;
        }
        // A zero-length match participates (it can only win when no rule
        // matches >0 here); the shared guard in scan_next turns that win
        // into a lexical error rather than a stalled scan. Maximal munch
        // still prefers any longer non-empty match.
        const std::size_t len    {matched.end()};
        const bool        better {!best.have || len > best.len
                                  || (len == best.len && idx < best.idx)};
        if (better) {
          best = munch_result {.have = true, .idx = idx, .len = len};
        }
      }};

      const dispatch& table {per_mode_[mode]};
      for (const std::size_t idx : table.first_byte_index[lead]) {
        offer(idx);
      }
      for (const std::size_t idx : table.general) {
        offer(idx);
      }
      return best;
    }


    /// Returns true when rule @p idx is active in mode @p mode.
    /// A rule with an empty in_mode list is active in mode 0 ("default") only; otherwise
    /// it is active in exactly the modes its in_mode list names.
    [[nodiscard]] bool rule_active_in_mode(std::size_t idx,
                                           std::size_t mode) const
    {
      const std::vector<std::string>& declared {rules_[idx].in_mode};
      if (declared.empty()) {
        return mode == 0;
      }
      return std::any_of(declared.begin(), declared.end(),
                         [&](const std::string& name) { return mode_id_.at(name) == mode; });
    }


    std::vector<std::string> audit_probes(const std::vector<std::size_t>& to_global) const

    {

      std::array<bool, 256>      seen {};

      std::vector<unsigned char> alpha;

      const auto                 add {[&](unsigned char b) {

                                        if (!seen[b]) {

                                          seen[b] = true;

                                          alpha.push_back(b);

                                        }

                                      }};

      for (const std::size_t g : to_global) {

        for (int b {0}; b < 256; ++b) {

          if (rules_[g].pattern.may_start_with(static_cast<unsigned char>(b))) {

            add(static_cast<unsigned char>(b));

          }

        }

      }

      for (const char structural : std::string_view {" \t\n\"'/*-+=<>()[]{};.:,aAz09_"}) {

        add(static_cast<unsigned char>(structural));

      }


      // alpha is always non-empty (the structural bytes above are unconditional), so

      // the probe count is O(alphabet) + a fixed random batch — deterministic, bounded,

      // and free of cap branches. Singletons + short repeats expose lazy delimiters and

      // quantifier boundaries (the hard cases); the random batch broadens coverage.

      std::vector<std::string> probes;

      for (const unsigned char b : alpha) {

        for (const std::size_t n : std::array<std::size_t, 5> {1, 2, 3, 6, 8}) {

          probes.emplace_back(n, static_cast<char>(b));

        }

      }

      // Fixed seed by design: this RNG only generates local probe strings for the

      // build-time DFA equivalence audit, which must be reproducible. No security

      // role (no tokens, crypto or identifiers) — a constant seed is correct here.

      // NOLINTNEXTLINE(bugprone-random-generator-seed,cert-msc32-c,cert-msc51-cpp)

      std::mt19937                               rng   {0x5C11EFU}; // fixed seed: the audit is reproducible

      std::uniform_int_distribution<std::size_t> len_d {1, 48};

      std::uniform_int_distribution<std::size_t> sym_d {0, alpha.size() - 1};

      for (int batch {0}; batch < 256; ++batch) {

        std::string       input;

        const std::size_t len {len_d(rng)};

        for (std::size_t i {0}; i < len; ++i) {

          input.push_back(static_cast<char>(alpha[sym_d(rng)]));

        }

        probes.push_back(std::move(input));

      }

      return probes;

    }


    /// Returns true when @p candidate gives the same result as the per-rule matcher on
    /// every audit probe: same hit/miss, same winning rule, same match length.
    ///
    /// @param candidate the DFA under audit
    /// @param to_global maps the DFA's rule indices to indices into rules_
    /// @param mode      the mode the DFA was built for
    [[nodiscard]] bool audit_passes(const real::dfa&                candidate,
                                    const std::vector<std::size_t>& to_global,
                                    std::size_t                     mode) const
    {
      for (const std::string& probe : audit_probes(to_global)) {
        const std::string_view text {probe}; // probes always have length >= 1
        const auto             lead {static_cast<unsigned char>(text.front())};

        const std::optional<real::dfa_match> hit       {candidate.match(text)};
        const munch_result                   reference {pike_munch_in_mode(mode, text, lead)};

        munch_result via_dfa {};
        if (hit.has_value()) {
          via_dfa = munch_result {.have = true, .idx = to_global[hit->rule_index], .len = hit->length};
        }

        // One comparison — the element-wise short-circuit lives in <tuple>, not here —
        // so any divergence (chiefly a lazy rule's shortest-vs-longest) rejects.
        if (std::tie(via_dfa.have, via_dfa.idx, via_dfa.len)
            != std::tie(reference.have, reference.idx, reference.len)) {
          return false;
        }
      }
      return true;
    }


    std::optional<mode_dfa> try_build_mode_dfa(std::vector<std::size_t> to_global,

                                               std::size_t              mode)

    {

      std::vector<real::detail::program_view> programs;

      programs.reserve(to_global.size());

      for (const std::size_t g : to_global) {

        programs.push_back(rules_[g].pattern.raw_program());

      }

      try {

        real::dfa candidate {std::span<const real::detail::program_view>(programs)};

        if (!audit_passes(candidate, to_global, mode)) {

          return std::nullopt; // a divergence (e.g. a lazy rule) → keep this mode on Pike

        }

        return mode_dfa {.dfa = std::move(candidate), .to_global = std::move(to_global)};

      }

      catch (const real::dfa_error&) {

        return std::nullopt; // un-DFA-able assertion ($, \b, multiline ^/$): keep on Pike

      }

    }


    void build_dfa_modes(const std::unordered_set<std::string>& dfa_modes)

    {

      per_mode_dfa_.assign(mode_names_.size(), nullptr);

      for (const std::string& name : dfa_modes) {

        const auto found {mode_id_.find(name)};

        if (found == mode_id_.end()) {

          throw std::invalid_argument("dfa_modes names an unknown mode: " + name);

        }

        const std::size_t        mode {found->second};

        std::vector<std::size_t> to_global;

        for (std::size_t idx {0}; idx < rules_.size(); ++idx) {

          if (rule_active_in_mode(idx, mode)) {

            to_global.push_back(idx);

          }

        }

        if (auto built {try_build_mode_dfa(std::move(to_global), mode)}) {

          per_mode_dfa_[mode] = std::make_shared<const mode_dfa>(std::move(*built));

        }

      }

    }


    /// The rule dispatch table of one mode.
    struct dispatch

    {

      /// For each possible lead byte, the indices (into rules_) of the rules filed under it.
      std::array<std::vector<std::size_t>, 256> first_byte_index;

      /// Indices of the rules that have no first-byte set; tried for every lead byte.
      std::vector<std::size_t>                  general;

    };


    /// The rules, in the order given at construction (index = tie-break priority).
    std::vector<rule>                            rules_;

    /// How unmatched input is reported.
    error_policy                                 errors_;

    /// The unit columns are counted in.
    scilex::column_unit                          columns_;

    /// Mode names, indexed by mode id (id 0 is "default").
    std::vector<std::string>                     mode_names_;

    /// Mode name -> mode id.
    std::map<std::string, std::size_t>           mode_id_;

    /// One dispatch table per mode, indexed by mode id.
    std::vector<dispatch>                        per_mode_;

    /// One optional DFA per mode, indexed by mode id; null when the mode has none.
    std::vector<std::shared_ptr<const mode_dfa>> per_mode_dfa_;

    /// Per-mode significance flags; empty when no insignificant modes were given.
    std::vector<bool>                            mode_significant_;

  };


  /// Single-pass iterator that lexes one token per increment.
  ///
  /// A default-constructed iterator is the end sentinel. An iterator bound to a lexer
  /// lexes its first token on construction, so construction and increment can throw
  /// whatever the lexer's scan throws. The iterator stores a pointer to the lexer and a
  /// view of the source, so both must outlive it.
  class token_iterator
  {
  public:

    using iterator_category = std::input_iterator_tag;
    using value_type        = token;
    using difference_type   = std::ptrdiff_t;
    using pointer           = const token*;
    using reference         = const token&;

    /// Builds the end sentinel.
    token_iterator() = default;

    /// Binds to @p owner and @p source and lexes the first token.
    token_iterator(const lexer&     owner,
                   std::string_view source,
                   eof_policy       policy)
      : owner_(&owner),
        source_(source),
        policy_(policy),
        done_(false)
    {
      advance();
    }

    /// The current token.
    reference operator*() const { return current_; }

    /// Member access to the current token.
    pointer operator->() const { return &current_; }

    /// Lexes the next token.
    token_iterator& operator++()
    {
      advance();
      return *this;
    }

    /// Lexes the next token; returns nothing (single-pass iterator).
    void operator++(int) { advance(); }

    /// Two finished iterators are equal; two live ones are equal when at the same offset.
    [[nodiscard]] bool operator==(const token_iterator& other) const
    {
      if (done_ != other.done_) {
        return false;
      }
      return done_ || cursor_.offset == other.cursor_.offset;
    }

    [[nodiscard]] bool operator!=(const token_iterator& other) const
    {
      return !(*this == other);
    }

  private:

    const lexer*       owner_    {nullptr};                                                 // lexer that scans
    std::string_view   source_;                                                             // text being lexed
    position           cursor_   {0, 1, 1};                                                 // scan position
    std::vector<frame> stack_    {frame {.mode_id = 0, .entry_pos = position {0, 1, 1}}};   // mode stack
    token              current_  {};                                                        // last token lexed
    eof_policy         policy_   {eof_policy::omit};                                        // end-of-input policy
    bool               eof_done_ {false};                                                   // end_of_input emitted
    bool               done_     {true};                                                    // iteration finished

    /// Lexes the next token into current_, or marks the iterator as finished.
    void advance()
    {
      if (done_) {
        return;
      }
      const bool produced {owner_->scan_next(source_, cursor_, stack_, current_)};
      if (produced) {
        return;
      }
      // Input exhausted: yield one end-of-input token if requested, else stop.
      const bool owes_eof {policy_ == eof_policy::append && !eof_done_};
      if (!owes_eof) {
        done_ = true;
        return;
      }
      current_  = token {end_of_input, source_.substr(cursor_.offset), cursor_};
      eof_done_ = true;
    }

  };


  /// A lazy range over the tokens of one source text, usable in a range-based for loop.
  /// It stores a pointer to the lexer and a view of the source; both must outlive it.
  class token_range
  {
  public:

    /// Binds the range to @p owner, @p source and the end-of-input @p policy.
    token_range(const lexer&     owner,
                std::string_view source,
                eof_policy       policy)
      : owner_(&owner),
        source_(source),
        policy_(policy)
    {}

    /// A fresh iterator at the first token; each call restarts lexing from the beginning.
    [[nodiscard]] token_iterator begin() const
    {
      return token_iterator {*owner_, source_, policy_};
    }

    /// The end sentinel.
    [[nodiscard]] token_iterator end() const
    {
      return token_iterator {};
    }

  private:

    const lexer*     owner_  {nullptr};          // lexer used by the iterators
    std::string_view source_;                    // text to lex
    eof_policy       policy_ {eof_policy::omit}; // end-of-input policy handed to iterators
  };


  /// Returns a lazy token range over @p source bound to this lexer.
  /// Defined here because token_range must be complete at the point of definition.
  inline token_range lexer::scan(std::string_view source,
                                 eof_policy       policy) const&
  {
    return token_range {*this, source, policy};
  }


} // namespace scilex


#endif // SCILEX_LEXER_HPP

scilex::lex_error
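Thrown on a lexical error: no rule matches at a position, a rule wins with a zero-length match, a pop is attempted at the root mode, or a mode is still open at end of input.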
Definition lexer.hpp:124

scilex::lex_error::lex_error
lex_error(const std::string &message, position where)
Builds the error.
Definition lexer.hpp:132

scilex::lex_error::where
position where() const noexcept
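Returns the position the error refers to (the unmatched byte, the start of the offending match, or the entry position of an unterminated mode).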
Definition lexer.hpp:141

scilex::lex_error::where_
position where_
Where tokenization failed.
Definition lexer.hpp:148

scilex::lexer
A lexer built from an ordered list of rules.
Definition lexer.hpp:236

scilex::lexer::validate_transitions
void validate_transitions() const
Fail-fast transition checks: a transition rule must consume input, and a push/set target must be a de...
Definition lexer.hpp:511

scilex::lexer::add_to_mode
void add_to_mode(std::size_t m, std::size_t idx)
Adds rule idx to mode m's dispatch via REAL's exact first-byte API — the same 3-way split (nullable →...
Definition lexer.hpp:479

scilex::lexer::per_mode_
std::vector< dispatch > per_mode_
Dispatch index, one per mode (by id).
Definition lexer.hpp:958

scilex::lexer::columns
column_unit columns() const noexcept
The unit this lexer counts position::column in (positions do not carry it, so a consumer that needs t...
Definition lexer.hpp:279

scilex::lexer::build_dispatch
void build_dispatch()
Builds the per-mode first-byte dispatch from rules_ (once, at construction). "default" is mode 0; eve...
Definition lexer.hpp:536

scilex::lexer::columns_
scilex::column_unit columns_
The unit position::column is counted in.
Definition lexer.hpp:955

scilex::lexer::audit_probes
std::vector< std::string > audit_probes(const std::vector< std::size_t > &to_global) const
The bounded, deterministic probe inputs for the audit: every active rule's possible first bytes ∪ str...
Definition lexer.hpp:812

scilex::lexer::rules_
std::vector< rule > rules_
The ordered token rules.
Definition lexer.hpp:953

scilex::lexer::mode_significant_
std::vector< bool > mode_significant_
Layout policy (empty = all significant).
Definition lexer.hpp:960

scilex::lexer::try_build_mode_dfa
std::optional< mode_dfa > try_build_mode_dfa(std::vector< std::size_t > to_global, std::size_t mode)
Builds the mode_dfa for one mode, or std::nullopt if the mode cannot take the DFA fast path....
Definition lexer.hpp:897

scilex::lexer::valid_utf8_len
static std::size_t valid_utf8_len(std::string_view s, std::size_t off)
The length (1–4) of a valid UTF-8 codepoint starting at off in s, or 0 when the byte there is not a v...
Definition lexer.hpp:676

scilex::lexer::mode_is_empty
bool mode_is_empty(std::size_t m) const
Whether mode m has no active rule (so nothing can match in it).
Definition lexer.hpp:500

scilex::lexer::starts_a_match
bool starts_a_match(std::size_t mode, std::string_view source, std::size_t offset) const
Does a rule in mode match at offset in source? The error-recovery loop's stop test — the smallest suc...
Definition lexer.hpp:643

scilex::lexer::munch_at
munch_result munch_at(std::size_t mode, std::string_view rest, unsigned char lead) const
The winning munch in mode at the start of rest (lead is rest's first byte), dispatching to the mode's...
Definition lexer.hpp:611

scilex::lexer::mode_significant
const std::vector< bool > & mode_significant() const noexcept
The per-mode-id layout-significance policy (see scilex::layout). Index by a token's mode_id; false ma...
Definition lexer.hpp:339

scilex::lexer::mode_name
const std::string & mode_name(std::size_t id) const noexcept
The name of mode id (0 is "default"), for labelling tokens.
Definition lexer.hpp:345

scilex::lexer::position_label
static std::string position_label(position where)
Formats a position as "line:column" for diagnostics.
Definition lexer.hpp:373

scilex::lexer::audit_passes
bool audit_passes(const real::dfa &candidate, const std::vector< std::size_t > &to_global, std::size_t mode) const
The candidate DFA must reproduce the Pike munch on every probe: catches divergences the bytecode cann...
Definition lexer.hpp:864

scilex::lexer::build_significance
void build_significance(const std::unordered_set< std::string > &insignificant_modes)
Builds the layout-significance policy from the insignificant-mode names (validated against the intern...
Definition lexer.hpp:576

scilex::lexer::intern_mode
std::size_t intern_mode(const std::string &name)
Interns a mode name to its id, assigning the next id on first sight.
Definition lexer.hpp:467

scilex::lexer::rule_active_in_mode
bool rule_active_in_mode(std::size_t idx, std::size_t mode) const
Whether rule idx is active in mode mode (mirrors build_dispatch, an empty in_mode is the default mode...
Definition lexer.hpp:793

scilex::lexer::scan_next
bool scan_next(std::string_view source, position &cursor, std::vector< frame > &stack, token &out) const
Advances cursor to and past the next non-skipped token in the active mode, applying the winning rule'...
Definition lexer.hpp:397

scilex::lexer::mode_id_
std::map< std::string, std::size_t > mode_id_
Mode name -> id.
Definition lexer.hpp:957

scilex::lexer::tokenize
std::vector< token > tokenize(std::string_view source, eof_policy policy=eof_policy::omit) const
Tokenizes source into the sequence of non-skipped tokens.
Definition lexer.hpp:299

scilex::lexer::scan
token_range scan(std::string_view source, eof_policy policy=eof_policy::omit) const &&=delete
Deleted: the range would point into a temporary lexer.

scilex::lexer::pike_munch_in_mode
munch_result pike_munch_in_mode(std::size_t mode, std::string_view rest, unsigned char lead) const
The per-rule Pike + first-byte-dispatch munch in mode at the start of rest (lead is rest's first byte...
Definition lexer.hpp:753

scilex::lexer::column_step
static std::size_t column_step(std::string_view source, std::size_t off, scilex::column_unit unit)
How much the column advances when the byte at off in source is consumed, under unit....
Definition lexer.hpp:723

scilex::lexer::advance
void advance(std::string_view source, position &cursor, std::size_t n) const
Advances cursor by n bytes of source, maintaining the 1-based line/column tracker (a newline resets t...
Definition lexer.hpp:657

scilex::lexer::per_mode_dfa_
std::vector< std::shared_ptr< const mode_dfa > > per_mode_dfa_
Per-mode DFA fast path (nullptr = Pike).
Definition lexer.hpp:959

scilex::lexer::dfa_modes_active
std::vector< std::string > dfa_modes_active() const
The modes actually accelerated by a DFA fast path.
Definition lexer.hpp:357

scilex::lexer::lexer
lexer(std::vector< rule > rules, std::unordered_set< std::string > insignificant_modes={}, std::unordered_set< std::string > dfa_modes={}, error_policy errors=error_policy::raise, column_unit columns=column_unit::bytes)
Builds a lexer from rules (taken by value, then moved in).
Definition lexer.hpp:263

scilex::lexer::errors_
error_policy errors_
What to do at an unmatched byte.
Definition lexer.hpp:954

scilex::lexer::build_dfa_modes
void build_dfa_modes(const std::unordered_set< std::string > &dfa_modes)
Opts the named dfa_modes into the DFA fast path (called once, after build_dispatch)....
Definition lexer.hpp:925

scilex::lexer::scan
token_range scan(std::string_view source, eof_policy policy=eof_policy::omit) const &
Returns a lazy range over the non-skipped tokens of source.
Definition lexer.hpp:1117

scilex::lexer::mode_names_
std::vector< std::string > mode_names_
Mode id -> name ("default" is id 0).
Definition lexer.hpp:956

scilex::lexer::may_start
bool may_start(std::size_t mode, unsigned char byte) const
O(1) pre-filter for error recovery: can a fixed-lead rule in mode begin with byte?...
Definition lexer.hpp:633

scilex::token_iterator
Forward (single-pass) iterator yielding one token at a time.
Definition lexer.hpp:971

scilex::token_iterator::token_iterator
token_iterator(const lexer &owner, std::string_view source, eof_policy policy)
Constructs a begin iterator over source for owner.
Definition lexer.hpp:989

scilex::token_iterator::difference_type
std::ptrdiff_t difference_type
Required typedef.
Definition lexer.hpp:976

scilex::token_iterator::operator++
void operator++(int)
Post-increment (single-pass: no useful copy is returned).
Definition lexer.hpp:1020

scilex::token_iterator::stack_
std::vector< frame > stack_
Mode stack (top = active).
Definition lexer.hpp:1050

scilex::token_iterator::operator*
reference operator*() const
Returns the current token.
Definition lexer.hpp:1001

scilex::token_iterator::current_
token current_
The current token.
Definition lexer.hpp:1051

scilex::token_iterator::operator==
bool operator==(const token_iterator &other) const
Equality: both exhausted, or both at the same offset.
Definition lexer.hpp:1030

scilex::token_iterator::operator->
pointer operator->() const
Member access to the current token.
Definition lexer.hpp:1007

scilex::token_iterator::cursor_
position cursor_
Current scan position.
Definition lexer.hpp:1049

scilex::token_iterator::done_
bool done_
True once exhausted (end sentinel).
Definition lexer.hpp:1054

scilex::token_iterator::owner_
const lexer * owner_
Rules provider (not owned).
Definition lexer.hpp:1047

scilex::token_iterator::source_
std::string_view source_
Text being scanned.
Definition lexer.hpp:1048

scilex::token_iterator::operator++
token_iterator & operator++()
Advances to the next token.
Definition lexer.hpp:1013

scilex::token_iterator::token_iterator
token_iterator()=default
Constructs the end sentinel.

scilex::token_iterator::advance
void advance()
Produces the next token, or marks the iterator exhausted.
Definition lexer.hpp:1057

scilex::token_iterator::operator!=
bool operator!=(const token_iterator &other) const
Inequality.
Definition lexer.hpp:1040

scilex::token_iterator::iterator_category
std::input_iterator_tag iterator_category
Single-pass.
Definition lexer.hpp:974

scilex::token_iterator::eof_done_
bool eof_done_
End-of-input token already yielded.
Definition lexer.hpp:1053

scilex::token_iterator::policy_
eof_policy policy_
End-of-input policy.
Definition lexer.hpp:1052

scilex::token_range
A lazy range of tokens, returned by lexer::scan.
Definition lexer.hpp:1081

scilex::token_range::end
token_iterator end() const
End sentinel.
Definition lexer.hpp:1105

scilex::token_range::source_
std::string_view source_
Text being scanned.
Definition lexer.hpp:1113

scilex::token_range::begin
token_iterator begin() const
Begin iterator (produces the first token).
Definition lexer.hpp:1099

scilex::token_range::owner_
const lexer * owner_
Rules provider (not owned).
Definition lexer.hpp:1112

scilex::token_range::token_range
token_range(const lexer &owner, std::string_view source, eof_policy policy)
Builds the range.
Definition lexer.hpp:1090

scilex::token_range::policy_
eof_policy policy_
End-of-input policy.
Definition lexer.hpp:1114

scilex
The SciLex public API (scilex::lexer, scilex::rule, scilex::token).
Definition layout.hpp:47

scilex::apply_transition
void apply_transition(const rule &r, position start, std::vector< frame > &stack)
Applies rule r's mode transition (if any) to stack — the per-scan mode-stack mutation,...
Definition lexer.hpp:174

scilex::error
constexpr int error
Reserved token kind for a lexical-error run under scilex::error_policy::token.
Definition token.hpp:37

scilex::column_unit
column_unit
The unit a token's position::column is counted in.
Definition lexer.hpp:222

scilex::column_unit::codepoints
@ codepoints
One column per Unicode scalar value (a valid UTF-8 codepoint).

scilex::column_unit::bytes
@ bytes
One column per byte (the default; column == byte offset within the line + 1).

scilex::column_unit::utf16
@ utf16
One column per UTF-16 code unit (BMP = 1, astral = 2) — the LSP unit.

scilex::eof_policy
eof_policy
Whether tokenization appends a synthetic end-of-input token.
Definition lexer.hpp:54

scilex::eof_policy::append
@ append
Append one end_of_input token at the end position.

scilex::eof_policy::omit
@ omit
Stop at the last real token (default).

scilex::end_of_input
constexpr int end_of_input
Reserved token kind for the synthetic end-of-input token.
Definition token.hpp:26

scilex::error_policy
error_policy
What a lexer does when it reaches a byte that no rule in the active mode can begin.
Definition lexer.hpp:201

scilex::error_policy::raise
@ raise

scilex::frame
One entry on the per-scan mode stack: the active mode and where it was entered (the entry position fe...
Definition lexer.hpp:156

scilex::frame::mode_id
std::size_t mode_id
Id of the active mode.
Definition lexer.hpp:157

scilex::frame::entry_pos
position entry_pos
Where this mode was entered.
Definition lexer.hpp:158

scilex::lexer::dispatch
Per-mode dispatch index: the first-byte buckets scoped to one mode.
Definition lexer.hpp:948

scilex::lexer::dispatch::general
std::vector< std::size_t > general
Nullable rules (tried everywhere).
Definition lexer.hpp:950

scilex::lexer::dispatch::first_byte_index
std::array< std::vector< std::size_t >, 256 > first_byte_index
Rule indices by leading byte.
Definition lexer.hpp:949

scilex::lexer::mode_dfa
An adopted per-mode DFA: the automaton plus its local→global rule map.
Definition lexer.hpp:593

scilex::lexer::mode_dfa::to_global
std::vector< std::size_t > to_global
DFA local rule index -> global rules_ index.
Definition lexer.hpp:595

scilex::lexer::mode_dfa::dfa
real::dfa dfa
Recognizes the mode's rules in one pass.
Definition lexer.hpp:594

scilex::lexer::munch_result
A munch decision: whether a rule matched, which (global index), how many bytes — the small value scan...
Definition lexer.hpp:601

scilex::lexer::munch_result::idx
std::size_t idx
Definition lexer.hpp:603

scilex::lexer::munch_result::len
std::size_t len
Definition lexer.hpp:604

scilex::lexer::munch_result::have
bool have
Definition lexer.hpp:602

scilex::mode_action
A mode transition, fired when its rule wins, acting on the scan's mode stack: enter a nested mode,...
Definition lexer.hpp:64

scilex::mode_action::target
std::string target
The mode push/set enters; ignored (and omittable) for pop.
Definition lexer.hpp:74

scilex::mode_action::operation
op operation
Which transition to perform.
Definition lexer.hpp:73

scilex::mode_action::target_id
std::size_t target_id
The interned id of target, resolved once when the lexer is built (see scilex::lexer::build_dispatch) ...
Definition lexer.hpp:80

scilex::mode_action::op
op
The kind of transition.
Definition lexer.hpp:67

scilex::mode_action::op::push
@ push
Enter target, remembering the mode below it (a nested context).

scilex::mode_action::op::pop
@ pop
Leave the current mode, returning to the one beneath it.

scilex::mode_action::op::set
@ set
Replace the current mode with target (stack depth unchanged).

scilex::position
A location in the source text.
Definition token.hpp:48

scilex::position::offset
std::size_t offset
0-based byte offset from the start of the source.
Definition token.hpp:49

scilex::position::column
std::size_t column
1-based column within the line, counted in the lexer's scilex::column_unit (bytes by default).
Definition token.hpp:51

scilex::position::line
std::size_t line
1-based line number.
Definition token.hpp:50

scilex::rule
A token rule: a kind, the pattern that recognizes it, whether matches are discarded (whitespace,...
Definition lexer.hpp:110

scilex::rule::kind
int kind
Kind assigned to tokens this rule produces.
Definition lexer.hpp:111

scilex::rule::action
std::optional< mode_action > action
Mode transition fired when this rule wins.
Definition lexer.hpp:115

scilex::rule::skip
bool skip
If true, matches are consumed but not emitted.
Definition lexer.hpp:113

scilex::rule::pattern
real::regex pattern
The recognizer (a linear-time REAL regex; its flags are the author's — see above).
Definition lexer.hpp:112

scilex::rule::in_mode
std::vector< std::string > in_mode
Modes this rule is active in; empty ⇒ {"default"}.
Definition lexer.hpp:114

scilex::token
One lexical token: a typed slice of the source.
Definition token.hpp:58

scilex::token::kind
int kind
Caller-defined token kind (e.g. an enum value).
Definition token.hpp:59

token.hpp
The token produced by the lexer and its source position.