Flash/selfhost/lexer.flash

Flash 853 lines
// Flash lexer — source bytes to a token stream for Flash.
//
// A single forward pass, zero allocation: tokens carry byte spans into the
// caller's source buffer rather than copied text (see token.flash). The
// grammar is small enough that a table-driven scanner would be more machinery
// than the language earns today; this stays a readable switch the parser can
// trust.
//
// Trivia handling: only spaces, tabs, CR and LF are skipped between tokens.
// Newlines bump the line counter, which the parser reads both for diagnostics
// and for the semicolon-free statement boundary (a binary/postfix operator
// opening a new line begins a new statement). Comments are NOT trivia: a `///`
// doc comment is a content-bearing `doc_comment` token, and every other `//…`
// shape is a `line_comment` token (the parser filters those out of the descent
// stream into `p.comments` for the formatter). The one other line-bound token
// is a `\\…` multiline-string line — its span ends at the newline, and the
// parser folds a run of consecutive lines into a single string value.

use "token"
use "support" as sup

const Kind = token.Kind
const Token = token.Token

// Hand-written, single-pass lexer over a caller-owned source buffer. Each
// call to next() skips whitespace and returns exactly one token whose
// `start`/`end` are byte offsets into `src`; nothing is copied or allocated.
//
// Invariant: every '\n' in the source is consumed by skipTrivia and by
// nothing else, so `line` always agrees with the number of newlines before
// `pos`. Line-bound tokens (strings, char literals, comments, multiline
// string lines) stop *before* the newline for exactly that reason.
pub const Lexer = struct {
    // The source bytes being lexed; tokens carry offsets into this buffer.
    src []u8,
    // Byte offset of the next unread byte.
    pos u32 = 0,
    // Line of the byte at `pos`; starts at 1 and is bumped once per '\n'.
    line u32 = 1,

    // A lexer positioned at the first byte of `src`, on line 1.
    pub fn init(src []u8) Lexer {
        return .{ .src = src }
    }

    // The byte at `pos`, or 0 once `pos` is at or past the end of `src`.
    fn at(self *Lexer) u8 {
        return if (self.pos < self.src.len) self.src[self.pos] else 0
    }

    // The byte `n` positions after `pos`, or 0 when that offset is at or past
    // the end of `src`.
    fn peek(self *Lexer, n u32) u8 {
        i := self.pos + n
        return if (i < self.src.len) self.src[i] else 0
    }

    // `///` (exactly three slashes) marks a doc comment — a content-bearing
    // token. A bare `//`, four-or-more slashes (`////…`), and the top-level
    // `//!` are all ordinary line comments (also tokens — see lexLineComment).
    // next() uses this to route the `///` shape to lexDocComment and every
    // other `//…` to lexLineComment, so the doc-vs-comment rule lives in one
    // place.
    fn atDocComment(self *Lexer) bool {
        return self.at() == '/' && self.peek(1) == '/' && self.peek(2) == '/' && self.peek(3) != '/'
    }

    // Advance past whitespace only — spaces, tabs, carriage returns and
    // newlines. Comments are not trivia: both the `///` doc comment and
    // every other `//…` line comment surface as tokens, lexed by next().
    fn skipTrivia(self *mut Lexer) {
        while self.pos < self.src.len {
            switch self.src[self.pos] {
                ' ', '\t', '\r' => {
                    self.pos += 1
                },
                '\n' => {
                    self.line += 1
                    self.pos += 1
                },
                else => return,
            }
        }
    }

    // Skip whitespace, then lex and return the next token. At end of input
    // this returns an `.eof` token (and keeps returning it on further calls,
    // since `pos` no longer moves). A byte that starts no known token is
    // consumed on its own and returned as `.invalid`.
    pub fn next(self *mut Lexer) Token {
        self.skipTrivia()
        start := self.pos
        line := self.line
        if self.pos >= self.src.len {
            return self.make(.eof, start, line)
        }

        c := self.src[self.pos]
        if isIdentStart(c) {
            return self.lexIdent(start, line)
        }
        if isDigit(c) {
            return self.lexNumber(start, line)
        }
        if c == '"' {
            return self.lexString(start, line)
        }
        if c == '\'' {
            return self.lexChar(start, line)
        }
        if c == '#' {
            return self.lexBuiltin(start, line)
        }
        if c == '\\' && self.peek(1) == '\\' {
            return self.lexMultilineStr(start, line)
        }
        // The doc-comment check must come before the generic `//` check: a
        // `///` line also starts with two slashes.
        if self.atDocComment() {
            return self.lexDocComment(start, line)
        }
        if c == '/' && self.peek(1) == '/' {
            return self.lexLineComment(start, line)
        }

        // single- and two-character punctuation / operators
        self.pos += 1
        const k Kind = switch c {
            '(' => .l_paren,
            ')' => .r_paren,
            '{' => .l_brace,
            '}' => .r_brace,
            '[' => .l_bracket,
            ']' => .r_bracket,
            ',' => .comma,
            '+' => self.lexPlus(),
            '*' => self.lexStar(),
            '/' => self.pick('=', .slash_eq, .slash),
            '%' => self.pick('=', .percent_eq, .percent),
            '?' => .question,
            ':' => self.pick('=', .colon_equal, .colon),
            '.' => self.lexDot(),
            '=' => self.lexEq(),
            '!' => self.pick('=', .bang_eq, .bang),
            '<' => self.lexLt(),
            '>' => self.lexGt(),
            '&' => self.lexAmp(),
            '|' => self.lexPipe(),
            '^' => self.pick('=', .caret_eq, .caret),
            '~' => .tilde, // unary bitwise-NOT; Zig has no `~=`, so it is one-way
            '-' => self.lexMinus(),
            else => .invalid,
        }
        return self.make(k, start, line)
    }

    // If the byte after the just-consumed one is `second`, consume it and
    // return `both` (the two-char token); otherwise return `one`.
    fn pick(self *mut Lexer, second u8, both Kind, one Kind) Kind {
        if self.at() == second {
            self.pos += 1
            return both
        }
        return one
    }

    // '+' is five-way and so cannot use the two-way `pick`: "++" is
    // concatenation, "+%=" is wrapping add-assign, "+%" is wrapping addition,
    // "+=" is add-assign, and a lone "+" is addition.
    fn lexPlus(self *mut Lexer) Kind {
        switch self.at() {
            '+' => {
                self.pos += 1
                return .plus_plus
            },
            '%' => {
                self.pos += 1
                return self.pick('=', .plus_percent_eq, .plus_percent)
            },
            '=' => {
                self.pos += 1
                return .plus_eq
            },
            else => return .plus,
        }
    }

    // '-' is five-way and so cannot use the two-way `pick`: "->" is the
    // return arrow, "-%=" is wrapping subtract-assign, "-%" is wrapping
    // subtraction, "-=" is subtract-assign, and a lone "-" is negation /
    // subtraction.
    fn lexMinus(self *mut Lexer) Kind {
        switch self.at() {
            '>' => {
                self.pos += 1
                return .arrow
            },
            '%' => {
                self.pos += 1
                return self.pick('=', .minus_percent_eq, .minus_percent)
            },
            '=' => {
                self.pos += 1
                return .minus_eq
            },
            else => return .minus,
        }
    }

    // '*' is five-way (like lexPlus): "**" is array repetition, "*%=" is
    // wrapping multiply-assign, "*%" is wrapping multiplication, "*=" is
    // multiply-assign, and a lone "*" is multiplication (or a pointer sigil —
    // the parser tells those apart by position). It cannot use the two-way
    // `pick`.
    fn lexStar(self *mut Lexer) Kind {
        switch self.at() {
            '*' => {
                self.pos += 1
                return .star_star
            },
            '%' => {
                self.pos += 1
                return self.pick('=', .star_percent_eq, .star_percent)
            },
            '=' => {
                self.pos += 1
                return .star_eq
            },
            else => return .star,
        }
    }

    // '=' is three-way and so cannot use the two-way `pick`: "=>" is the
    // switch prong arrow, "==" is equality, and a lone "=" is assignment / a
    // binding.
    fn lexEq(self *mut Lexer) Kind {
        switch self.at() {
            '>' => {
                self.pos += 1
                return .fat_arrow
            },
            '=' => {
                self.pos += 1
                return .eq_eq
            },
            else => return .equal,
        }
    }

    // '.' is three-way: "..." is an inclusive switch range, ".." a slice / range
    // bound, and a lone "." is member access or a literal prefix.
    fn lexDot(self *mut Lexer) Kind {
        if self.at() == '.' {
            self.pos += 1
            if self.at() == '.' {
                self.pos += 1
                return .ellipsis3
            }
            return .dot_dot
        }
        return .dot
    }

    // '&' and '|' are three-way (like lexEq): "&&"/"||" are the logical
    // operators, "&="/"|=" the bitwise compound assignments, and a lone "&"/"|"
    // is bitwise-and/or (a prefix "&" is address-of, told apart by the parser).
    fn lexAmp(self *mut Lexer) Kind {
        switch self.at() {
            '&' => {
                self.pos += 1
                return .amp_amp
            },
            '=' => {
                self.pos += 1
                return .amp_eq
            },
            else => return .amp,
        }
    }

    // The '|' counterpart of lexAmp: "||", "|=", or a lone "|".
    fn lexPipe(self *mut Lexer) Kind {
        switch self.at() {
            '|' => {
                self.pos += 1
                return .pipe_pipe
            },
            '=' => {
                self.pos += 1
                return .pipe_eq
            },
            else => return .pipe,
        }
    }

    // '<' and '>' are four-way: "<<="/">>=" are the shift compound assignments,
    // "<<"/">>" the shifts, "<="/">=" the comparisons, and a lone "<"/">"
    // less/greater. The shift case consumes its second angle then reuses `pick`
    // for the trailing '='. Maximal munch means a spaced `< <` stays two
    // compares and `<< =` a shift then a store.
    fn lexLt(self *mut Lexer) Kind {
        switch self.at() {
            '<' => {
                self.pos += 1
                return self.pick('=', .lt_lt_eq, .lt_lt)
            },
            '=' => {
                self.pos += 1
                return .lt_eq
            },
            else => return .lt,
        }
    }

    // The '>' counterpart of lexLt: ">>=", ">>", ">=", or a lone ">".
    fn lexGt(self *mut Lexer) Kind {
        switch self.at() {
            '>' => {
                self.pos += 1
                return self.pick('=', .gt_gt_eq, .gt_gt)
            },
            '=' => {
                self.pos += 1
                return .gt_eq
            },
            else => return .gt,
        }
    }

    // An identifier or keyword: consume the maximal run of identifier bytes,
    // then classify it. A lone `_` is its own `.underscore` token; any other
    // spelling is looked up in the keyword map and falls back to `.ident`.
    fn lexIdent(self *mut Lexer, start u32, line u32) Token {
        while self.pos < self.src.len && isIdentCont(self.src[self.pos]) {
            self.pos += 1
        }
        text := self.src[start..self.pos]
        if text.len == 1 && text[0] == '_' {
            return self.make(.underscore, start, line)
        }
        const k Kind = token.keywords.get(text) orelse Kind.ident
        return self.make(k, start, line)
    }

    // Integer and float literals. Handles decimal, hexadecimal (0x…), octal
    // (0o…), and binary (0b…) integers with optional `_` digit separators, and
    // decimal float literals (`3.14`, `1.0e-10`). The value passes through to
    // the emitted Zig verbatim; the exact digit content is validated downstream.
    // A letter or digit immediately adjacent (no whitespace) after the literal
    // is a lexer error — the guard that kills the old silent-mis-lex hazard
    // (`0o755` once split as two tokens, `1_000` as `1` + `_000`).
    fn lexNumber(self *mut Lexer, start u32, line u32) Token {
        if self.at() == '0' {
            switch self.peek(1) {
                'x', 'X' => {
                    self.pos += 2
                    while self.pos < self.src.len && (isHexDigit(self.src[self.pos]) || self.src[self.pos] == '_') {
                        self.pos += 1
                    }
                    return self.lexNumEnd(start, line, .int)
                },
                'b', 'B' => {
                    self.pos += 2
                    while self.pos < self.src.len && (isBinaryDigit(self.src[self.pos]) || self.src[self.pos] == '_') {
                        self.pos += 1
                    }
                    return self.lexNumEnd(start, line, .int)
                },
                'o', 'O' => {
                    self.pos += 2
                    while self.pos < self.src.len && (isOctalDigit(self.src[self.pos]) || self.src[self.pos] == '_') {
                        self.pos += 1
                    }
                    return self.lexNumEnd(start, line, .int)
                },
                else => {},
            }
        }
        // Decimal: consume the integer part (digits and `_` separators).
        while self.pos < self.src.len && (isDigit(self.src[self.pos]) || self.src[self.pos] == '_') {
            self.pos += 1
        }
        // A `.` followed by a digit starts the fractional part of a float literal.
        // A lone `.` or `..` stays a punctuation token (member access / range).
        if self.at() == '.' && isDigit(self.peek(1)) {
            self.pos += 1 // '.'
            while self.pos < self.src.len && (isDigit(self.src[self.pos]) || self.src[self.pos] == '_') {
                self.pos += 1
            }
            // Optional exponent: `e`/`E`, an optional sign, then digits. An
            // exponent is only recognised after a fractional part.
            if self.at() == 'e' || self.at() == 'E' {
                self.pos += 1
                if self.at() == '+' || self.at() == '-' {
                    self.pos += 1
                }
                while self.pos < self.src.len && (isDigit(self.src[self.pos]) || self.src[self.pos] == '_') {
                    self.pos += 1
                }
            }
            return self.lexNumEnd(start, line, .float)
        }
        return self.lexNumEnd(start, line, .int)
    }

    // Emit `k` or `.invalid` when an identifier-start or digit immediately
    // follows the literal (no whitespace). On error the bad suffix is consumed
    // so the error span is informative for diagnostics.
    fn lexNumEnd(self *mut Lexer, start u32, line u32, k Kind) Token {
        // isIdentCont is exactly "identifier start or digit".
        if self.pos < self.src.len && isIdentCont(self.src[self.pos]) {
            while self.pos < self.src.len && isIdentCont(self.src[self.pos]) {
                self.pos += 1
            }
            return self.make(.invalid, start, line)
        }
        return self.make(k, start, line)
    }

    // Char literals: `'c'`, `'\n'`, `'\xNN'`, `'\u{NNNNNN}'`. The lexer checks
    // termination and skips over the escape's shape (up to two hex digits
    // after `\x`; an optional `{hex…}` group after `\u`); digit counts and the
    // byte value are not checked here and pass through to the emitted Zig
    // verbatim (Zig uses the same spellings). A literal without its closing
    // quote emits `.invalid`. A newline is never consumed as part of the
    // literal, so the line counter stays in step with the source.
    fn lexChar(self *mut Lexer, start u32, line u32) Token {
        self.pos += 1 // opening quote
        if self.at() == '\\' {
            self.pos += 1 // backslash
            switch self.at() {
                'x' => {
                    self.pos += 1 // 'x'
                    var i usize = 0
                    while i < 2 && self.pos < self.src.len && isHexDigit(self.src[self.pos]) {
                        self.pos += 1
                        i += 1
                    }
                },
                'u' => {
                    self.pos += 1 // 'u'
                    if self.at() == '{' {
                        self.pos += 1 // '{'
                        while self.pos < self.src.len && isHexDigit(self.src[self.pos]) {
                            self.pos += 1
                        }
                        if self.at() == '}' {
                            self.pos += 1
                        }
                    }
                },
                else => {
                    // single-char escape: \n \r \t \0 \\ \' \"
                    // A backslash at end of line is not an escape: leave the
                    // newline for skipTrivia so the literal ends as `.invalid`.
                    if self.pos < self.src.len && self.src[self.pos] != '\n' {
                        self.pos += 1
                    }
                },
            }
        } else if self.pos < self.src.len && self.src[self.pos] != '\n' {
            self.pos += 1 // the single byte
        }
        if self.at() == '\'' {
            self.pos += 1
            return self.make(.char, start, line)
        }
        return self.make(.invalid, start, line)
    }

    // #name builtins (#intCast, #as, …). Lexed as one token carrying the
    // leading '#'; the parser strips the sigil and lowering re-sigils to Zig's
    // '@' (Tier-0 passthrough of the intrinsic).
    fn lexBuiltin(self *mut Lexer, start u32, line u32) Token {
        self.pos += 1 // '#'
        while self.pos < self.src.len && isIdentCont(self.src[self.pos]) {
            self.pos += 1
        }
        return self.make(.builtin, start, line)
    }

    // Single-line strings with escape sequences: `\n \r \t \0 \\ \" \xNN
    // \u{NNNNNN}`. The lexer checks termination and skips over each escape's
    // shape; escape *values* pass through to the emitted Zig verbatim (Zig
    // uses the same set). A string that reaches a newline or the end of input
    // before its closing quote emits `.invalid`, and the newline is left
    // unconsumed so the line counter stays in step with the source.
    fn lexString(self *mut Lexer, start u32, line u32) Token {
        self.pos += 1 // opening quote
        while self.pos < self.src.len {
            c := self.src[self.pos]
            if c == '\\' {
                self.pos += 1 // backslash
                if self.pos >= self.src.len {
                    break
                }
                switch self.src[self.pos] {
                    'x' => {
                        self.pos += 1 // 'x'
                        var i usize = 0
                        while i < 2 && self.pos < self.src.len && isHexDigit(self.src[self.pos]) {
                            self.pos += 1
                            i += 1
                        }
                    },
                    'u' => {
                        self.pos += 1 // 'u'
                        if self.pos < self.src.len && self.src[self.pos] == '{' {
                            self.pos += 1 // '{'
                            while self.pos < self.src.len && isHexDigit(self.src[self.pos]) {
                                self.pos += 1
                            }
                            if self.pos < self.src.len && self.src[self.pos] == '}' {
                                self.pos += 1
                            }
                        }
                    },
                    else => {
                        // single-char escape or unknown (validated downstream).
                        // A backslash at end of line must not swallow the
                        // newline: leave it for the unterminated check below.
                        if self.src[self.pos] != '\n' {
                            self.pos += 1
                        }
                    },
                }
                continue
            }
            if c == '"' {
                self.pos += 1
                return self.make(.string, start, line)
            }
            if c == '\n' {
                // unterminated on this line
                break
            }
            self.pos += 1
        }
        return self.make(.invalid, start, line)
    }

    // A Zig-style multiline / raw string line: `\\` followed by the rest of the
    // physical line, with no escape processing. The token spans `\\` through the
    // last byte before the newline (the newline itself stays trivia). Each line
    // is one token; the parser folds a run of consecutive lines into one string
    // value, so a single multiline literal is several of these tokens.
    // NOTE(review): unlike the comment tokens, a trailing `\r` on CRLF input
    // stays inside this span — confirm the parser strips it.
    fn lexMultilineStr(self *mut Lexer, start u32, line u32) Token {
        self.pos += 2 // the two backslashes
        while self.pos < self.src.len && self.src[self.pos] != '\n' {
            self.pos += 1
        }
        return self.make(.multiline_str, start, line)
    }

    // A `///` doc-comment line: the three slashes plus the rest of the physical
    // line (the newline stays trivia). As in lexLineComment, a trailing `\r`
    // is trimmed from the span so re-emission on a CRLF input embeds no
    // carriage return mid-line. One token per line; the parser folds a run
    // of consecutive lines into one doc block and re-emits it before the
    // declaration. Regular `//`, `////…` and `//!` never reach here — next()
    // routes them to lexLineComment (see atDocComment).
    fn lexDocComment(self *mut Lexer, start u32, line u32) Token {
        self.pos += 3 // the three slashes
        while self.pos < self.src.len && self.src[self.pos] != '\n' {
            self.pos += 1
        }
        var end u32 = self.pos
        if end > start && self.src[end - 1] == '\r' {
            end -= 1
        }
        return .{ .kind = .doc_comment, .start = start, .end = end, .line = line }
    }

    // A `//…` line comment in any non-doc shape — a bare `//`, the four-or-more
    // slash `////…`, or the module-head `//!`. The token spans the slashes
    // through the last byte before the newline (the newline stays trivia). A
    // trailing `\r` is trimmed from the span so re-emission on a CRLF input
    // embeds no carriage return mid-line. One token per line; the parser filters
    // these out of the descent stream into `p.comments` (the formatter's input),
    // so the grammar never sees them. The exactly-three-slash `///` is the
    // separate doc_comment token (see atDocComment), routed before this in next().
    fn lexLineComment(self *mut Lexer, start u32, line u32) Token {
        self.pos += 2 // the two leading slashes
        while self.pos < self.src.len && self.src[self.pos] != '\n' {
            self.pos += 1
        }
        var end u32 = self.pos
        if end > start && self.src[end - 1] == '\r' {
            end -= 1
        }
        return .{ .kind = .line_comment, .start = start, .end = end, .line = line }
    }

    // Build a token of kind `k` spanning `start` up to (not including) the
    // current `pos`, stamped with the line the token began on.
    fn make(self *Lexer, k Kind, start u32, line u32) Token {
        return .{ .kind = k, .start = start, .end = self.pos, .line = line }
    }
}

// True for the ASCII decimal digits '0' through '9'.
fn isDigit(c u8) bool {
    if c < '0' {
        return false
    }
    return c <= '9'
}
// True for a hexadecimal digit: 0-9, a-f, or A-F.
fn isHexDigit(c u8) bool {
    if isDigit(c) {
        return true
    }
    if c >= 'a' && c <= 'f' {
        return true
    }
    return c >= 'A' && c <= 'F'
}
// True for the octal digits '0' through '7'.
fn isOctalDigit(c u8) bool {
    if c < '0' {
        return false
    }
    return c <= '7'
}
// True only for the two binary digits, '0' and '1'.
fn isBinaryDigit(c u8) bool {
    if c == '0' {
        return true
    }
    return c == '1'
}
// True for a byte that may begin an identifier: an ASCII letter or `_`.
fn isIdentStart(c u8) bool {
    if c == '_' {
        return true
    }
    if c >= 'a' && c <= 'z' {
        return true
    }
    return c >= 'A' && c <= 'Z'
}
// True for a byte that may continue an identifier: anything that may start
// one, plus the decimal digits.
fn isIdentCont(c u8) bool {
    if isIdentStart(c) {
        return true
    }
    return isDigit(c)
}

// --- tests ---------------------------------------------------------------

// Test helper: lex `src` to completion, writing each token's kind into `out`
// in order (the terminating `.eof` included), and return the filled prefix
// of `out`. `out` must have room for every token plus the `.eof`.
fn collect(src []u8, out []mut Kind) []Kind {
    var lexer = Lexer.init(src)
    var count usize = 0
    while true {
        kind := lexer.next().kind
        out[count] = kind
        count += 1
        if kind == .eof {
            break
        }
    }
    return out[0..count]
}

test "keywords and identifiers" {
    var out [32]Kind = undefined
    // Reserved words lex to their kw_* kinds; the other names stay idents.
    got := collect("use flibc fn main", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .kw_use, .ident, .kw_fn, .ident, .eof }, got)
}

test "underscore is its own token, not an identifier" {
    var out [8]Kind = undefined
    // Only the lone `_` is special; `_x` is an ordinary identifier.
    got := collect("_ _x", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .underscore, .ident, .eof }, got)
}

test "colon-equal is one token; a lone colon is its own" {
    var out [8]Kind = undefined
    got := collect(":= :", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .colon_equal, .colon, .eof }, got)
}

test "line comments surface as tokens between real tokens" {
    var out [16]Kind = undefined
    // A leading comment and a trailing comment each come back as one token.
    try sup.expectEqualSlices(Kind, &[_]Kind{ .line_comment, .kw_fn, .ident, .line_comment, .l_brace, .r_brace, .eof }, collect("// header\nfn main // trailing\n{ }", &out))
}

test "string literal with an escape" {
    var out [8]Kind = undefined
    got := collect("\"hello\\n\"", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .string, .eof }, got)
}

test "operators: one- and two-character forms" {
    var out [32]Kind = undefined
    got := collect("+ - * / % == != < <= > >= && || ! & ? .. -> := ~", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .plus, .minus, .star, .slash, .percent, .eq_eq, .bang_eq, .lt, .lt_eq, .gt, .gt_eq, .amp_amp, .pipe_pipe, .bang, .amp, .question, .dot_dot, .arrow, .colon_equal, .tilde, .eof }, got)
}

test "bitwise and shift operators, and shift vs spaced compares" {
    var out [16]Kind = undefined
    got := collect("& | ^ << >>", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .amp, .pipe, .caret, .lt_lt, .gt_gt, .eof }, got)
    // Maximal munch: an unspaced `<<` is a single shift token, whereas the
    // spaced `< <` remains two separate compares.
    var out2 [8]Kind = undefined
    got2 := collect("<< < <", &out2)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .lt_lt, .lt, .lt, .eof }, got2)
}

test "char literal, hex int, and #builtin" {
    var out [8]Kind = undefined
    got := collect("'0' 0xFF #intCast", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .char, .int, .builtin, .eof }, got)
}

test "control-flow keywords lex to their kinds" {
    var out [16]Kind = undefined
    got := collect("if else while for in break continue return", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .kw_if, .kw_else, .kw_while, .kw_for, .kw_in, .kw_break, .kw_continue, .kw_return, .eof }, got)
}

test "error-union keywords lex to their kinds" {
    var out [8]Kind = undefined
    got := collect("try catch defer errdefer", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .kw_try, .kw_catch, .kw_defer, .kw_errdefer, .eof }, got)
}

test "type-definition keywords lex to their kinds" {
    var out [8]Kind = undefined
    got := collect("struct enum union", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .kw_struct, .kw_enum, .kw_union, .eof }, got)
}

test "the test keyword lexes to kw_test" {
    var out [8]Kind = undefined
    got := collect("test \"adds\" {}", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .kw_test, .string, .l_brace, .r_brace, .eof }, got)
}

test "error is a keyword heading a set or an origination" {
    var out [16]Kind = undefined
    // `error` has a kind of its own, so it can open both an `error{…}` set and
    // an `error.Name` origination; it is recognised via the keyword map.
    got := collect("error{ Bad } error.Bad", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .kw_error, .l_brace, .ident, .r_brace, .kw_error, .dot, .ident, .eof }, got)
}

test "asm is a keyword; volatile stays a contextual identifier" {
    var out [8]Kind = undefined
    // `asm` gets its own kind so it can open an expression. `volatile` is
    // only recognised by position in the parser, so here it is a plain ident.
    got := collect("asm volatile", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .kw_asm, .ident, .eof }, got)
}

test "a tagged-union variant list lexes name-then-type" {
    var out [16]Kind = undefined
    // `union` has its own keyword and `(enum)` reuses kw_enum in the parens.
    // A bare variant is one ident; a typed variant is a name then a type.
    got := collect("union(enum) { empty, single usize }", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .kw_union, .l_paren, .kw_enum, .r_paren, .l_brace, .ident, .comma, .ident, .ident, .r_brace, .eof }, got)
}

test "visibility and declaration-modifier keywords lex to their kinds" {
    var out [8]Kind = undefined
    got := collect("pub export inline fn", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .kw_pub, .kw_export, .kw_inline, .kw_fn, .eof }, got)
}

test "each multiline-string line is its own token" {
    var out [8]Kind = undefined
    // Two `\\` lines in a row come back as two separate tokens.
    got := collect("const usage =\n    \\\\line one\n    \\\\line two", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .kw_const, .ident, .equal, .multiline_str, .multiline_str, .eof }, got)
}

test "a multiline-string token's content is the bytes after the backslashes" {
    var lexer = Lexer.init("    \\\\  hi there")
    tok := lexer.next()
    try sup.expectEqual(Kind.multiline_str, tok.kind)
    // The leading `\\` is still in the lexeme; stripping it to reach the
    // content is the parser's job.
    try sup.expectEqualStrings("\\\\  hi there", tok.lexeme(lexer.src))
}

test "compound-assign operators and the three-way minus" {
    var buf [16]Kind = undefined
    // Covers the five plain compound assigns plus three of the `-` forms:
    // "->" arrow, "-=" minus_eq, and a lone "-" minus. (lexMinus is five-way
    // overall; the wrapping "-%" / "-%=" forms are exercised in the wrapping
    // compound-assignment test.)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .plus_eq, .minus_eq, .star_eq, .slash_eq, .percent_eq, .arrow, .minus_eq, .minus, .eof }, collect("+= -= *= /= %= -> -= -", &buf))
}

test "wrapping compound-assignment operators munch maximally" {
    var out [16]Kind = undefined
    got := collect("+%= -%= *%=", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .plus_percent_eq, .minus_percent_eq, .star_percent_eq, .eof }, got)
    // Maximal munch around the three-byte forms: the spaced `+% =` is a
    // wrapping add followed by a store, and a lone `+%` is still the wrapping
    // binop, so a wrapping compound assign never forms across whitespace.
    var out2 [8]Kind = undefined
    got2 := collect("+% = -% = *%", &out2)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .plus_percent, .equal, .minus_percent, .equal, .star_percent, .eof }, got2)
}

test "star family: repetition, wrapping forms, assign, and the lone star" {
    var out [16]Kind = undefined
    got := collect("** *% *%= *= *", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .star_star, .star_percent, .star_percent_eq, .star_eq, .star, .eof }, got)
    // The spaced `* *` is two stars (deref-then-multiply territory): `**`
    // never forms across whitespace.
    var out2 [8]Kind = undefined
    got2 := collect("* *", &out2)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .star, .star, .eof }, got2)
}

test "bitwise and shift compound-assignment operators" {
    var out [16]Kind = undefined
    got := collect("&= |= ^= <<= >>=", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .amp_eq, .pipe_eq, .caret_eq, .lt_lt_eq, .gt_gt_eq, .eof }, got)
    // Maximal munch around the three-byte shift-assign: the spaced `<< =` is
    // a shift followed by a store, and a lone `<<` is still a bare shift, so
    // a shift-assign never forms across whitespace.
    var out2 [8]Kind = undefined
    got2 := collect("<< = <<", &out2)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .lt_lt, .equal, .lt_lt, .eof }, got2)
}

test "clear.flash lexes without an invalid token" {
    var lexer = Lexer.init("use flibc\nfn sink(bytes []u8) {\n    _ = flibc.sys.write_fd(1, bytes.ptr, bytes.len)\n}")
    // Drain the lexer; no token before the end of input may be `.invalid`.
    var kind Kind = lexer.next().kind
    while kind != .eof {
        try sup.expect(kind != .invalid)
        kind = lexer.next().kind
    }
}

test "/// is a doc_comment; //, ////, //! are line_comments" {
    var out [16]Kind = undefined
    // Only the exactly-three-slash line is the content-bearing doc_comment.
    // The bare `//`, the four-slash `////` and the top-level `//!` are each a
    // line_comment. All four shapes are tokens; none is trivia.
    got := collect("// regular\n//! module\n//// not a doc\n/// a doc\npub fn f", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .line_comment, .line_comment, .line_comment, .doc_comment, .kw_pub, .kw_fn, .ident, .eof }, got)
}

test "a doc-comment token's content is the bytes after the three slashes" {
    var lexer = Lexer.init("/// hello")
    tok := lexer.next()
    try sup.expectEqual(Kind.doc_comment, tok.kind)
    // The leading `///` is still in the lexeme; stripping it to reach the
    // content is the parser's job.
    try sup.expectEqualStrings("/// hello", tok.lexeme(lexer.src))
}

test "line comments surface as tokens, lexeme verbatim across shapes" {
    // Each non-doc `//…` shape yields one line_comment token, and its lexeme
    // is the unedited text from the slashes to the end of the line.
    var plain = Lexer.init("// x")
    plainTok := plain.next()
    try sup.expectEqual(Kind.line_comment, plainTok.kind)
    try sup.expectEqualStrings("// x", plainTok.lexeme(plain.src))

    var module = Lexer.init("//! mod")
    moduleTok := module.next()
    try sup.expectEqual(Kind.line_comment, moduleTok.kind)
    try sup.expectEqualStrings("//! mod", moduleTok.lexeme(module.src))

    var rule = Lexer.init("//// rule")
    ruleTok := rule.next()
    try sup.expectEqual(Kind.line_comment, ruleTok.kind)
    try sup.expectEqualStrings("//// rule", ruleTok.lexeme(rule.src))
}

test "a line comment's trailing carriage return is trimmed from its span" {
    // On CRLF input the `\r` ahead of the `\n` stays out of the comment
    // lexeme, so re-emitting it puts no carriage return mid-line.
    var lexer = Lexer.init("// x\r\nfn")
    tok := lexer.next()
    try sup.expectEqual(Kind.line_comment, tok.kind)
    try sup.expectEqualStrings("// x", tok.lexeme(lexer.src))
}

test "binary and octal integer literals lex to .int" {
    var out [4]Kind = undefined
    // Both the lower- and upper-case spelling of each base prefix is tried.
    got1 := collect("0b101", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .int, .eof }, got1)
    got2 := collect("0B110", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .int, .eof }, got2)
    got3 := collect("0o755", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .int, .eof }, got3)
    got4 := collect("0O644", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .int, .eof }, got4)
}

test "digit-separator _ in integer literals" {
    var out [4]Kind = undefined
    // One case per base: decimal, hex, binary, octal.
    got1 := collect("1_000_000", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .int, .eof }, got1)
    got2 := collect("0xFF_AA", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .int, .eof }, got2)
    got3 := collect("0b1010_1010", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .int, .eof }, got3)
    got4 := collect("0o7_7_7", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .int, .eof }, got4)
}

test "float literals lex to .float" {
    var out [4]Kind = undefined
    // Plain fraction, unsigned and signed exponents, and a `_` separator.
    got1 := collect("3.14", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .float, .eof }, got1)
    got2 := collect("1.0e10", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .float, .eof }, got2)
    got3 := collect("1.5e-3", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .float, .eof }, got3)
    got4 := collect("1.5e+3", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .float, .eof }, got4)
    got5 := collect("1_000.5", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .float, .eof }, got5)
}

test "float literal lexeme is verbatim" {
    var lexer = Lexer.init("3.14")
    tok := lexer.next()
    try sup.expectEqual(Kind.float, tok.kind)
    try sup.expectEqualStrings("3.14", tok.lexeme(lexer.src))
}

test "3.method is not a float — dot floats only when followed by a digit" {
    var out [8]Kind = undefined
    got := collect("3.method", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .int, .dot, .ident, .eof }, got)
}

test "1..2 is a range, not a float" {
    var out [8]Kind = undefined
    got := collect("1..2", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .int, .dot_dot, .int, .eof }, got)
}

test "literal-adjacency guard: adjacent ident or wrong-base digit after a number is .invalid" {
    var out [4]Kind = undefined
    // Old silent mis-lex: `0o755` once split into int(0) + ident(o755). An
    // identifier directly after any number is now a lexer error.
    got1 := collect("123abc", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .invalid, .eof }, got1)
    // Wrong-base digits: `2` is no binary digit, and `8`/`9` are not octal.
    got2 := collect("0b102", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .invalid, .eof }, got2)
    got3 := collect("0o89", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .invalid, .eof }, got3)
}

test "char literal with hex escape \\xNN" {
    var out [4]Kind = undefined
    got1 := collect("'\\x1b'", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .char, .eof }, got1)
    got2 := collect("'\\x00'", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .char, .eof }, got2)
}

test "char literal with unicode escape \\u{N}" {
    var out [4]Kind = undefined
    got1 := collect("'\\u{41}'", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .char, .eof }, got1)
    got2 := collect("'\\u{1F600}'", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .char, .eof }, got2)
}

test "string literal with hex and unicode escapes" {
    var out [4]Kind = undefined
    got1 := collect("\"\\x1b[0m\"", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .string, .eof }, got1)
    got2 := collect("\"\\u{263A}\"", &out)
    try sup.expectEqualSlices(Kind, &[_]Kind{ .string, .eof }, got2)
}
raw view on GitHub →