Zig 807 lines
// Flash lexer — source bytes to a token stream for Flash.
//
// Hand-written, single forward pass, zero allocation: tokens carry byte
// spans into the caller's source buffer rather than copied text (see
// token.zig). The grammar is small enough that a
// table-driven scanner would be more machinery than the language earns
// today; this stays a readable switch the parser can trust.
//
// Trivia handling: only spaces, tabs, CR and LF are skipped between tokens.
// Newlines bump the line counter, which the parser reads both for diagnostics
// and for the semicolon-free statement boundary (a binary/postfix operator
// opening a new line begins a new statement). Comments are NOT trivia: a `///`
// doc comment is a content-bearing `doc_comment` token, and every other `//…`
// shape is a `line_comment` token (the parser filters those out of the descent
// stream into `p.comments` for the formatter). The one other line-bound token
// is a `\\…` multiline-string line — its span ends at the newline, and the
// parser folds a run of consecutive lines into a single string value.
const std = @import("std");
const token = @import("token.zig");
const Kind = token.Kind;
const Token = token.Token;
pub const Lexer = struct {
src: []const u8,
pos: u32 = 0,
line: u32 = 1,
// Create a lexer over `src`. Only the source slice is set here; `pos` and
// `line` take their field defaults (0 and 1). The slice is stored as given,
// not copied.
pub fn init(src: []const u8) Lexer {
    return Lexer{ .src = src };
}
// The byte under the cursor, or 0 once the cursor has reached the end of the
// source. Never advances.
fn at(self: *Lexer) u8 {
    if (self.pos >= self.src.len) return 0;
    return self.src[self.pos];
}
// The byte `n` positions past the cursor, or 0 when that index lies beyond the
// end of the source. Never advances.
fn peek(self: *Lexer, n: u32) u8 {
    const idx = self.pos + n;
    if (idx >= self.src.len) return 0;
    return self.src[idx];
}
// True when the cursor sits on exactly three slashes: `///` followed by
// anything other than a fourth `/` (end of input counts, since peek yields 0
// there). That shape is the doc comment. A bare `//`, the module-head `//!`
// and four-or-more slashes all answer false, and next() sends them to
// lexLineComment instead — so the doc-vs-comment rule is decided here only.
fn atDocComment(self: *Lexer) bool {
    if (self.at() != '/') return false;
    if (self.peek(1) != '/') return false;
    if (self.peek(2) != '/') return false;
    return self.peek(3) != '/';
}
// Move the cursor past whitespace: spaces, tabs, carriage returns and
// newlines. Each newline also bumps the line counter. Stops at the first
// other byte (comments included — they are lexed as tokens by next()) or at
// the end of the source.
fn skipTrivia(self: *Lexer) void {
    while (self.pos < self.src.len) : (self.pos += 1) {
        const b = self.src[self.pos];
        if (b == '\n') {
            self.line += 1;
        } else if (b != ' ' and b != '\t' and b != '\r') {
            return; // first non-whitespace byte: leave the cursor on it
        }
    }
}
// Produce the next token. Whitespace is skipped first; the token's start
// offset and line are captured after that skip. At end of input the result is
// an `.eof` token (and stays `.eof` on every later call, since the cursor no
// longer moves). Multi-byte token families are handed to their own lex*
// routine; everything else is punctuation decided from the first byte, with
// unrecognised bytes yielding a one-byte `.invalid` token.
pub fn next(self: *Lexer) Token {
    self.skipTrivia();
    const start = self.pos;
    const line = self.line;
    if (start >= self.src.len) return self.make(.eof, start, line);

    const c = self.src[start];
    switch (c) {
        '"' => return self.lexString(start, line),
        '\'' => return self.lexChar(start, line),
        '#' => return self.lexBuiltin(start, line),
        // only a doubled backslash opens a multiline-string line
        '\\' => if (self.peek(1) == '\\') return self.lexMultilineStr(start, line),
        // `//…` is a comment token; a lone `/` falls through to the operators
        '/' => if (self.peek(1) == '/') {
            if (self.atDocComment()) return self.lexDocComment(start, line);
            return self.lexLineComment(start, line);
        },
        else => {
            if (isIdentStart(c)) return self.lexIdent(start, line);
            if (isDigit(c)) return self.lexNumber(start, line);
        },
    }

    // Punctuation and operators. The first byte is consumed up front so the
    // helpers below inspect the byte that follows it.
    self.pos += 1;
    const kind: Kind = switch (c) {
        // always a single byte
        '(' => .l_paren,
        ')' => .r_paren,
        '{' => .l_brace,
        '}' => .r_brace,
        '[' => .l_bracket,
        ']' => .r_bracket,
        ',' => .comma,
        '?' => .question,
        '~' => .tilde, // unary bitwise-NOT; no `~=` form is recognised
        // one byte, or two when `=` follows
        '/' => self.pick('=', .slash_eq, .slash),
        '%' => self.pick('=', .percent_eq, .percent),
        ':' => self.pick('=', .colon_equal, .colon),
        '!' => self.pick('=', .bang_eq, .bang),
        '^' => self.pick('=', .caret_eq, .caret),
        // three or more spellings: each has a dedicated helper
        '+' => self.lexPlus(),
        '-' => self.lexMinus(),
        '*' => self.lexStar(),
        '.' => self.lexDot(),
        '=' => self.lexEq(),
        '<' => self.lexLt(),
        '>' => self.lexGt(),
        '&' => self.lexAmp(),
        '|' => self.lexPipe(),
        else => .invalid,
    };
    return self.make(kind, start, line);
}
// Two-way operator helper, called after the operator's first byte has been
// consumed. When the byte under the cursor equals `second` it is consumed and
// `both` (the two-character kind) is returned; otherwise the cursor stays put
// and `one` is returned.
fn pick(self: *Lexer, second: u8, both: Kind, one: Kind) Kind {
    if (self.at() != second) return one;
    self.pos += 1;
    return both;
}
// '+' has four spellings, too many for the two-way `pick`: "++"
// (concatenation), "+%" (wrapping addition), "+=" (add-assign) and a lone "+"
// (addition). Called with the first '+' already consumed; the second byte is
// consumed only when it completes one of the two-character forms.
fn lexPlus(self: *Lexer) Kind {
    const kind: Kind = switch (self.at()) {
        '+' => .plus_plus,
        '%' => .plus_percent,
        '=' => .plus_eq,
        else => return .plus, // lone '+': nothing more to consume
    };
    self.pos += 1;
    return kind;
}
// '-' has four spellings, too many for the two-way `pick`: "->" (the return
// arrow), "-%" (wrapping subtraction, the counterpart of "+%"), "-="
// (subtract-assign) and a lone "-" (negation / subtraction). Called with the
// '-' already consumed; the second byte is consumed only when it completes
// one of the two-character forms.
fn lexMinus(self: *Lexer) Kind {
    const kind: Kind = switch (self.at()) {
        '>' => .arrow,
        '%' => .minus_percent,
        '=' => .minus_eq,
        else => return .minus, // lone '-': nothing more to consume
    };
    self.pos += 1;
    return kind;
}
// '*' has three spellings, one more than the two-way `pick` handles: "*%"
// (wrapping multiplication), "*=" (multiply-assign) and a lone "*"
// (multiplication, or a pointer sigil — the parser tells those apart by
// position). Called with the '*' already consumed.
fn lexStar(self: *Lexer) Kind {
    const kind: Kind = switch (self.at()) {
        '%' => .star_percent,
        '=' => .star_eq,
        else => return .star, // lone '*': nothing more to consume
    };
    self.pos += 1;
    return kind;
}
// '=' has three spellings: "=>" (the switch prong arrow), "==" (equality) and
// a lone "=" (assignment / a binding). Called with the first '=' already
// consumed.
fn lexEq(self: *Lexer) Kind {
    const kind: Kind = switch (self.at()) {
        '>' => .fat_arrow,
        '=' => .eq_eq,
        else => return .equal, // lone '=': nothing more to consume
    };
    self.pos += 1;
    return kind;
}
// '.' has three spellings: "..." (an inclusive switch range), ".." (a slice /
// range bound) and a lone "." (member access or a literal prefix). Called
// with the first '.' already consumed; each further '.' is consumed as it is
// matched, longest form first.
fn lexDot(self: *Lexer) Kind {
    if (self.at() != '.') return .dot;
    self.pos += 1;
    if (self.at() != '.') return .dot_dot;
    self.pos += 1;
    return .ellipsis3;
}
// '&' has three spellings (and '|' mirrors it in lexPipe): "&&" is the
// logical operator, "&=" the bitwise compound assignment, and a lone "&" is
// bitwise-and (a prefix "&" is address-of, told apart by the parser). Called
// with the first '&' already consumed.
fn lexAmp(self: *Lexer) Kind {
    const kind: Kind = switch (self.at()) {
        '&' => .amp_amp,
        '=' => .amp_eq,
        else => return .amp, // lone '&': nothing more to consume
    };
    self.pos += 1;
    return kind;
}
// '|' has three spellings: "||" is the logical operator, "|=" the bitwise
// compound assignment, and a lone "|" is bitwise-or. Called with the first
// '|' already consumed.
fn lexPipe(self: *Lexer) Kind {
    const kind: Kind = switch (self.at()) {
        '|' => .pipe_pipe,
        '=' => .pipe_eq,
        else => return .pipe, // lone '|': nothing more to consume
    };
    self.pos += 1;
    return kind;
}
// '<' has four spellings (and '>' mirrors it in lexGt): "<<=" is the shift
// compound assignment, "<<" the shift, "<=" the comparison, and a lone "<"
// less-than. Called with the first '<' already consumed. Both the shifted and
// the unshifted path finish with `pick` on a trailing '='. Because matching
// only looks at adjacent bytes, a spaced `< <` stays two compares and `<< =`
// a shift followed by a store.
fn lexLt(self: *Lexer) Kind {
    if (self.at() == '<') {
        self.pos += 1; // second '<'
        return self.pick('=', .lt_lt_eq, .lt_lt);
    }
    return self.pick('=', .lt_eq, .lt);
}
// '>' has four spellings: ">>=" is the shift compound assignment, ">>" the
// shift, ">=" the comparison, and a lone ">" greater-than. Called with the
// first '>' already consumed; both paths finish with `pick` on a trailing '='.
fn lexGt(self: *Lexer) Kind {
    if (self.at() == '>') {
        self.pos += 1; // second '>'
        return self.pick('=', .gt_gt_eq, .gt_gt);
    }
    return self.pick('=', .gt_eq, .gt);
}
// Identifiers and keywords. Consumes the run of identifier-continue bytes
// from the cursor, then classifies the text from `start`: a lone `_` is the
// `.underscore` token, a word found in `token.keywords` takes that keyword's
// kind, and anything else is a plain `.ident`.
fn lexIdent(self: *Lexer, start: u32, line: u32) Token {
    while (self.pos < self.src.len) : (self.pos += 1) {
        if (!isIdentCont(self.src[self.pos])) break;
    }
    const word = self.src[start..self.pos];
    const lone_underscore = word.len == 1 and word[0] == '_';
    if (lone_underscore) return self.make(.underscore, start, line);
    if (token.keywords.get(word)) |keyword| return self.make(keyword, start, line);
    return self.make(Kind.ident, start, line);
}
// Integer and float literals. Recognises decimal, hexadecimal (0x…), octal
// (0o…) and binary (0b…) integers — the prefix letter in either case — with
// `_` accepted anywhere among the digits, plus decimal floats with a
// fractional part and an optional exponent (`3.14`, `1.0e-10`). Digit counts
// are not checked here (a bare `0x` or an exponent with no digits still takes
// the normal exit); the literal text is validated downstream. Every exit goes
// through lexNumEnd, which rejects a letter or digit glued to the end of the
// literal — the guard against silent mis-lexes such as `0o755` splitting in
// two or `1_000` becoming `1` + `_000`.
fn lexNumber(self: *Lexer, start: u32, line: u32) Token {
    if (self.at() == '0') {
        switch (self.peek(1)) {
            'x', 'X' => return self.lexRadixInt(start, line, isHexDigit),
            'b', 'B' => return self.lexRadixInt(start, line, isBinaryDigit),
            'o', 'O' => return self.lexRadixInt(start, line, isOctalDigit),
            else => {},
        }
    }

    // Decimal integer part.
    self.skipDigitRun(isDigit);

    // Only a `.` with a digit right behind it opens a fraction; a lone `.`
    // (member access) or `..` (range) is left for the punctuation lexer.
    const has_fraction = self.at() == '.' and isDigit(self.peek(1));
    if (!has_fraction) return self.lexNumEnd(start, line, .int);

    self.pos += 1; // '.'
    self.skipDigitRun(isDigit);
    switch (self.at()) {
        'e', 'E' => {
            self.pos += 1; // exponent marker
            switch (self.at()) {
                '+', '-' => self.pos += 1, // optional exponent sign
                else => {},
            }
            self.skipDigitRun(isDigit);
        },
        else => {},
    }
    return self.lexNumEnd(start, line, .float);
}
// Prefixed integer: step over the two-byte `0x` / `0b` / `0o` prefix, consume
// the digits `accepts` allows (and `_`), then finish as an `.int`.
fn lexRadixInt(self: *Lexer, start: u32, line: u32, comptime accepts: fn (u8) bool) Token {
    self.pos += 2;
    self.skipDigitRun(accepts);
    return self.lexNumEnd(start, line, .int);
}
// Advance the cursor while the byte under it is `_` or satisfies `accepts`.
fn skipDigitRun(self: *Lexer, comptime accepts: fn (u8) bool) void {
    while (self.pos < self.src.len) : (self.pos += 1) {
        const b = self.src[self.pos];
        if (b != '_' and !accepts(b)) break;
    }
}
// Finish a numeric literal. If a letter, `_` or digit sits directly after the
// literal (no whitespace between), the whole glued-on run is consumed and the
// token is `.invalid`, so the error span covers the offending suffix.
// Otherwise the token is emitted with kind `k`.
fn lexNumEnd(self: *Lexer, start: u32, line: u32, k: Kind) Token {
    // "identifier-start or digit" is exactly the identifier-continue class.
    const ends_cleanly = self.pos >= self.src.len or !isIdentCont(self.src[self.pos]);
    if (ends_cleanly) return self.make(k, start, line);

    while (self.pos < self.src.len) : (self.pos += 1) {
        if (!isIdentCont(self.src[self.pos])) break;
    }
    return self.make(.invalid, start, line);
}
// Char literals: `'c'`, `'\n'`, `'\xNN'`, `'\u{NNNNNN}'`. The lexer checks
// only termination and escape structure; the literal's text passes through to
// the emitted Zig verbatim (Zig uses the same spellings). An unterminated or
// malformed literal emits `.invalid`.
//
// A raw newline is never consumed as part of the literal, neither as the
// single byte nor as the byte after a backslash. skipTrivia is the only place
// that bumps the line counter, so swallowing a newline here would leave every
// later token's line number one too low. A stray `'` at the end of a line
// therefore yields an `.invalid` token that stops before the newline.
//
// NOTE(review): exactly one byte is accepted between the quotes, so a
// multi-byte UTF-8 character lexes as `.invalid` — confirm that is intended.
fn lexChar(self: *Lexer, start: u32, line: u32) Token {
    self.pos += 1; // opening quote
    if (self.at() == '\\') {
        self.pos += 1; // backslash
        switch (self.at()) {
            'x' => {
                self.pos += 1; // 'x'
                // up to two hex digits; fewer is left for the closing-quote check
                var i: usize = 0;
                while (i < 2 and self.pos < self.src.len and isHexDigit(self.src[self.pos])) {
                    self.pos += 1;
                    i += 1;
                }
            },
            'u' => {
                self.pos += 1; // 'u'
                if (self.at() == '{') {
                    self.pos += 1; // '{'
                    while (self.pos < self.src.len and isHexDigit(self.src[self.pos])) self.pos += 1;
                    if (self.at() == '}') self.pos += 1;
                }
            },
            // a newline cannot be escaped: leave it for skipTrivia to count
            '\n' => {},
            else => if (self.pos < self.src.len) {
                self.pos += 1; // single-char escape: \n \r \t \0 \\ \' \"
            },
        }
    } else if (self.pos < self.src.len and self.src[self.pos] != '\n') {
        self.pos += 1; // the single byte
    }
    if (self.at() == '\'') {
        self.pos += 1; // closing quote
        return self.make(.char, start, line);
    }
    return self.make(.invalid, start, line);
}
// #name builtins (#intCast, #as, …). The token is the '#' plus the run of
// identifier-continue bytes after it, emitted as one `.builtin`; the parser
// strips the sigil and lowering re-sigils to Zig's '@' (Tier-0 passthrough of
// the intrinsic). The name is not checked here: a '#' with no identifier
// bytes after it still produces a `.builtin` whose lexeme is just "#".
fn lexBuiltin(self: *Lexer, start: u32, line: u32) Token {
    self.pos += 1; // '#'
    while (self.pos < self.src.len) : (self.pos += 1) {
        if (!isIdentCont(self.src[self.pos])) break;
    }
    return self.make(.builtin, start, line);
}
// Single-line strings with escape sequences: `\n \r \t \0 \\ \" \xNN
// \u{NNNNNN}`. The lexer checks termination and escape structure; escape
// *values* pass through to the emitted Zig verbatim (Zig uses the same set).
//
// A string never crosses a newline. Reaching a raw newline — whether directly
// or right after a backslash — ends the scan with an `.invalid` token that
// stops before the newline. The newline is left for skipTrivia, the only place
// that bumps the line counter, so line numbers after a broken string stay
// correct. Running out of input before the closing quote is `.invalid` too.
fn lexString(self: *Lexer, start: u32, line: u32) Token {
    self.pos += 1; // opening quote
    while (self.pos < self.src.len) {
        const c = self.src[self.pos];
        if (c == '\\') {
            self.pos += 1; // backslash
            if (self.pos >= self.src.len) break;
            switch (self.src[self.pos]) {
                'x' => {
                    self.pos += 1; // 'x'
                    // up to two hex digits; a short escape is validated downstream
                    var i: usize = 0;
                    while (i < 2 and self.pos < self.src.len and isHexDigit(self.src[self.pos])) {
                        self.pos += 1;
                        i += 1;
                    }
                },
                'u' => {
                    self.pos += 1; // 'u'
                    if (self.pos < self.src.len and self.src[self.pos] == '{') {
                        self.pos += 1; // '{'
                        while (self.pos < self.src.len and isHexDigit(self.src[self.pos])) self.pos += 1;
                        if (self.pos < self.src.len and self.src[self.pos] == '}') self.pos += 1;
                    }
                },
                // a backslash at end of line does not continue the string:
                // unterminated, and the newline stays unconsumed
                '\n' => break,
                else => self.pos += 1, // single-char escape or unknown (validated downstream)
            }
            continue;
        }
        if (c == '"') {
            self.pos += 1; // closing quote
            return self.make(.string, start, line);
        }
        if (c == '\n') break; // unterminated on this line
        self.pos += 1;
    }
    return self.make(.invalid, start, line);
}
// One line of a Zig-style multiline / raw string: `\\` and then everything up
// to the end of the physical line, with no escape processing. The token runs
// from the `\\` through the last byte before the newline; the newline itself
// is not consumed and stays trivia. Each line is its own token — the parser
// folds a run of consecutive lines into one string value.
fn lexMultilineStr(self: *Lexer, start: u32, line: u32) Token {
    self.pos += 2; // the two backslashes
    while (self.pos < self.src.len) : (self.pos += 1) {
        if (self.src[self.pos] == '\n') break;
    }
    return self.make(.multiline_str, start, line);
}
// A `///` doc-comment line: the three slashes plus the rest of the physical
// line (the newline is not consumed and stays trivia). A trailing `\r` is
// trimmed from the span — the same rule lexLineComment applies — so a doc
// line taken from a CRLF input carries no carriage return when it is
// re-emitted. One token per line; the parser folds a run of consecutive lines
// into one doc block and re-emits it before the declaration. Only the
// exactly-three-slash shape reaches here: next() routes a bare `//`, `////…`
// and `//!` to lexLineComment (see atDocComment).
fn lexDocComment(self: *Lexer, start: u32, line: u32) Token {
    self.pos += 3; // the three slashes
    while (self.pos < self.src.len and self.src[self.pos] != '\n') self.pos += 1;
    // The cursor stays where the scan stopped; only the token's end backs up.
    // `end` is at least start + 3 here, so `end - 1` is always a valid index.
    var end = self.pos;
    if (self.src[end - 1] == '\r') end -= 1;
    return .{ .kind = .doc_comment, .start = start, .end = end, .line = line };
}
// A `//…` line comment in any non-doc shape: a bare `//`, four-or-more
// slashes (`////…`), or the module-head `//!`. The exactly-three-slash `///`
// is the separate doc_comment token and is routed ahead of this in next()
// (see atDocComment). The token runs from the slashes through the last byte
// before the newline; the newline is not consumed and stays trivia. A single
// trailing `\r` is dropped from the span so that re-emitting the comment from
// a CRLF input embeds no carriage return mid-line. One token per line; the
// parser filters these out of the descent stream into `p.comments` (the
// formatter's input), so the grammar never sees them.
fn lexLineComment(self: *Lexer, start: u32, line: u32) Token {
    self.pos += 2; // the two leading slashes
    while (self.pos < self.src.len) : (self.pos += 1) {
        if (self.src[self.pos] == '\n') break;
    }
    // The cursor stays where the scan stopped; only the token's end backs up.
    const ends_with_cr = self.pos > start and self.src[self.pos - 1] == '\r';
    const end = if (ends_with_cr) self.pos - 1 else self.pos;
    return .{ .kind = .line_comment, .start = start, .end = end, .line = line };
}
// Build a token of kind `k` spanning from `start` to the current cursor
// position, tagged with the line on which it began.
fn make(self: *Lexer, k: Kind, start: u32, line: u32) Token {
    return Token{
        .kind = k,
        .start = start,
        .end = self.pos,
        .line = line,
    };
}
};
// True for the ASCII decimal digits '0' through '9'.
fn isDigit(c: u8) bool {
    return switch (c) {
        '0'...'9' => true,
        else => false,
    };
}
// True for a hexadecimal digit: a decimal digit, or a letter 'a'–'f' in
// either case.
fn isHexDigit(c: u8) bool {
    return switch (c) {
        'a'...'f', 'A'...'F' => true,
        else => isDigit(c),
    };
}
// True for the octal digits '0' through '7'.
fn isOctalDigit(c: u8) bool {
    return switch (c) {
        '0'...'7' => true,
        else => false,
    };
}
// True for the two binary digits, '0' and '1'.
fn isBinaryDigit(c: u8) bool {
    return switch (c) {
        '0', '1' => true,
        else => false,
    };
}
// True for a byte that may begin an identifier: an ASCII letter in either
// case, or an underscore.
fn isIdentStart(c: u8) bool {
    return switch (c) {
        'a'...'z', 'A'...'Z', '_' => true,
        else => false,
    };
}
// True for a byte that may continue an identifier: anything that may start
// one, or a decimal digit.
fn isIdentCont(c: u8) bool {
    if (isDigit(c)) return true;
    return isIdentStart(c);
}
// --- tests ---------------------------------------------------------------
const testing = std.testing;
// Test helper: lex `src` to the end and record each token's kind in `out`, in
// order, with the final `.eof` included. Returns the filled prefix of `out`.
// `out` must have room for every token plus the `.eof`; there is no bounds
// handling beyond the slice's own index check.
fn collect(src: []const u8, out: []Kind) []Kind {
    var lexer = Lexer.init(src);
    var n: usize = 0;
    while (true) : (n += 1) {
        const kind = lexer.next().kind;
        out[n] = kind;
        if (kind == .eof) return out[0 .. n + 1];
    }
}
test "keywords and identifiers" {
    // `use` and `fn` come back as keyword kinds; the other words are idents.
    const expected = [_]Kind{ .kw_use, .ident, .kw_fn, .ident, .eof };
    var kinds: [32]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect("use flibc fn main", &kinds));
}
test "underscore is its own token, not an identifier" {
    // A lone `_` is `.underscore`; `_x` is an ordinary identifier.
    const expected = [_]Kind{ .underscore, .ident, .eof };
    var kinds: [8]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect("_ _x", &kinds));
}
test "colon-equal is one token; a lone colon is its own" {
    // `:=` fuses into one token; a `:` with no `=` behind it stands alone.
    const expected = [_]Kind{ .colon_equal, .colon, .eof };
    var kinds: [8]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect(":= :", &kinds));
}
test "line comments surface as tokens between real tokens" {
    // A leading comment line and a trailing same-line comment each produce a
    // line_comment token in source order.
    const source =
        \\// header
        \\fn main // trailing
        \\{ }
    ;
    const expected = [_]Kind{ .line_comment, .kw_fn, .ident, .line_comment, .l_brace, .r_brace, .eof };
    var kinds: [16]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect(source, &kinds));
}
test "string literal with an escape" {
    // The source text is a quoted string containing a backslash-n escape
    // (two bytes), not a real newline.
    const expected = [_]Kind{ .string, .eof };
    var kinds: [8]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect("\"hello\\n\"", &kinds));
}
test "operators: one- and two-character forms" {
    // One space-separated sample per operator; expected kinds follow the
    // source order.
    const source = "+ - * / % == != < <= > >= && || ! & ? .. -> := ~";
    const expected = [_]Kind{
        .plus,     .minus,   .star,      .slash,       .percent,
        .eq_eq,    .bang_eq, .lt,        .lt_eq,       .gt,
        .gt_eq,    .amp_amp, .pipe_pipe, .bang,        .amp,
        .question, .dot_dot, .arrow,     .colon_equal, .tilde,
        .eof,
    };
    var kinds: [32]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect(source, &kinds));
}
test "bitwise and shift operators, and shift vs spaced compares" {
    const expected_ops = [_]Kind{ .amp, .pipe, .caret, .lt_lt, .gt_gt, .eof };
    var ops: [16]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected_ops, collect("& | ^ << >>", &ops));

    // Adjacent angles fuse into one shift token; angles separated by a space
    // stay two separate compares, so a comparison never absorbs a shift.
    const expected_munch = [_]Kind{ .lt_lt, .lt, .lt, .eof };
    var munch: [8]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected_munch, collect("<< < <", &munch));
}
test "char literal, hex int, and #builtin" {
    // One sample of each: a plain char literal, a 0x integer, a #name builtin.
    const expected = [_]Kind{ .char, .int, .builtin, .eof };
    var kinds: [8]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect("'0' 0xFF #intCast", &kinds));
}
test "control-flow keywords lex to their kinds" {
    // Each control-flow word maps to its own kw_* kind.
    const source = "if else while for in break continue return";
    const expected = [_]Kind{
        .kw_if,    .kw_else,     .kw_while,  .kw_for, .kw_in,
        .kw_break, .kw_continue, .kw_return, .eof,
    };
    var kinds: [16]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect(source, &kinds));
}
test "error-union keywords lex to their kinds" {
    // `try`, `catch`, `defer` and `errdefer` each map to their own kw_* kind.
    const expected = [_]Kind{ .kw_try, .kw_catch, .kw_defer, .kw_errdefer, .eof };
    var kinds: [8]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect("try catch defer errdefer", &kinds));
}
test "type-definition keywords lex to their kinds" {
    // `struct`, `enum` and `union` each map to their own kw_* kind.
    const expected = [_]Kind{ .kw_struct, .kw_enum, .kw_union, .eof };
    var kinds: [8]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect("struct enum union", &kinds));
}
test "the test keyword lexes to kw_test" {
    // A whole test header: keyword, quoted name, then an empty brace pair
    // written with no space between the braces.
    const expected = [_]Kind{ .kw_test, .string, .l_brace, .r_brace, .eof };
    var kinds: [8]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect("test \"adds\" {}", &kinds));
}
test "error is a keyword heading a set or an origination" {
    // `error` has its own kind in both positions checked here: ahead of a
    // braced `error{…}` set and ahead of a dotted `error.Name` origination.
    const expected = [_]Kind{
        .kw_error, .l_brace, .ident, .r_brace,
        .kw_error, .dot,     .ident, .eof,
    };
    var kinds: [16]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect("error{ Bad } error.Bad", &kinds));
}
test "asm is a keyword; volatile stays a contextual identifier" {
    // `asm` lexes to its own keyword kind; `volatile` lexes as a plain ident
    // (it is recognised positionally by the parser, not here).
    const expected = [_]Kind{ .kw_asm, .ident, .eof };
    var kinds: [8]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect("asm volatile", &kinds));
}
test "a tagged-union variant list lexes name-then-type" {
    // `union` has its own keyword; the `enum` inside the parens is the usual
    // kw_enum. A bare variant is one ident; a typed variant is two idents in a
    // row, name then type.
    const source = "union(enum) { empty, single usize }";
    const expected = [_]Kind{
        .kw_union, .l_paren, .kw_enum, .r_paren, .l_brace,
        .ident,    .comma,   .ident,   .ident,   .r_brace,
        .eof,
    };
    var kinds: [16]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect(source, &kinds));
}
test "visibility and declaration-modifier keywords lex to their kinds" {
    // `pub`, `export` and `inline` each have their own kind ahead of `fn`.
    const expected = [_]Kind{ .kw_pub, .kw_export, .kw_inline, .kw_fn, .eof };
    var kinds: [8]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect("pub export inline fn", &kinds));
}
test "each multiline-string line is its own token" {
    // Two consecutive `\\` lines after the `=` give two multiline_str tokens;
    // the lexer does not merge them.
    const source =
        \\const usage =
        \\ \\line one
        \\ \\line two
    ;
    const expected = [_]Kind{ .kw_const, .ident, .equal, .multiline_str, .multiline_str, .eof };
    var kinds: [8]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect(source, &kinds));
}
test "a multiline-string token's content is the bytes after the backslashes" {
    // Source bytes: a space, two backslashes, then " hi there". The leading
    // space is skipped as whitespace before the token starts.
    var lexer = Lexer.init(" \\\\ hi there");
    const tok = lexer.next();
    try testing.expectEqual(Kind.multiline_str, tok.kind);
    // The lexeme still starts with the two backslashes; the parser strips
    // them to get the content.
    try testing.expectEqualStrings("\\\\ hi there", tok.lexeme(lexer.src));
}
test "compound-assign operators and the three-way minus" {
    var buf: [16]Kind = undefined;
    // The five arithmetic compound assignments, then every spelling of `-`
    // that lexMinus recognises: "->" arrow, "-%" minus_percent, "-=" minus_eq
    // and a lone "-" minus. (The test name predates "-%"; all four forms are
    // pinned here.)
    try testing.expectEqualSlices(Kind, &.{
        .plus_eq, .minus_eq,      .star_eq,  .slash_eq, .percent_eq,
        .arrow,   .minus_percent, .minus_eq, .minus,    .eof,
    }, collect("+= -= *= /= %= -> -% -= -", &buf));
}
test "bitwise and shift compound-assignment operators" {
    const expected_assign = [_]Kind{ .amp_eq, .pipe_eq, .caret_eq, .lt_lt_eq, .gt_gt_eq, .eof };
    var assign: [16]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected_assign, collect("&= |= ^= <<= >>=", &assign));

    // The three-byte shift-assign only forms from adjacent bytes: a spaced
    // `<< =` is a shift followed by a store, and a `<<` with nothing after it
    // stays a bare shift.
    const expected_spaced = [_]Kind{ .lt_lt, .equal, .lt_lt, .eof };
    var spaced: [8]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected_spaced, collect("<< = <<", &spaced));
}
test "clear.flash lexes without an invalid token" {
    // A small program lexed end to end: no token before `.eof` may be
    // `.invalid`.
    var lexer = Lexer.init(
        \\use flibc
        \\fn sink(bytes []u8) {
        \\ _ = flibc.sys.write_fd(1, bytes.ptr, bytes.len)
        \\}
    );
    var tok = lexer.next();
    while (tok.kind != .eof) : (tok = lexer.next()) {
        try testing.expect(tok.kind != .invalid);
    }
}
test "/// is a doc_comment; //, ////, //! are line_comments" {
    // Only the exactly-three-slash line is a doc_comment. The bare `//`, the
    // module-head `//!` and the four-slash `////` lines are each a
    // line_comment. All four lines produce tokens.
    const source =
        \\// regular
        \\//! module
        \\//// not a doc
        \\/// a doc
        \\pub fn f
    ;
    const expected = [_]Kind{
        .line_comment, .line_comment, .line_comment, .doc_comment,
        .kw_pub,       .kw_fn,        .ident,        .eof,
    };
    var kinds: [16]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect(source, &kinds));
}
test "a doc-comment token's content is the bytes after the three slashes" {
    const source = "/// hello";
    var lexer = Lexer.init(source);
    const tok = lexer.next();
    try testing.expectEqual(Kind.doc_comment, tok.kind);
    // The lexeme still starts with the three slashes; the parser strips them
    // to get the content.
    try testing.expectEqualStrings(source, tok.lexeme(lexer.src));
}
test "line comments surface as tokens, lexeme verbatim across shapes" {
    // Each non-doc `//…` shape is a single line_comment token whose lexeme is
    // the whole text, slashes included, with nothing edited.
    const shapes = [_][]const u8{ "// x", "//! mod", "//// rule" };
    for (shapes) |text| {
        var lexer = Lexer.init(text);
        const tok = lexer.next();
        try testing.expectEqual(Kind.line_comment, tok.kind);
        try testing.expectEqualStrings(text, tok.lexeme(lexer.src));
    }
}
test "a line comment's trailing carriage return is trimmed from its span" {
    // CRLF input: the `\r` ahead of the `\n` is left out of the comment's
    // lexeme, so re-emitting it embeds no carriage return mid-line.
    var lexer = Lexer.init("// x\r\nfn");
    const tok = lexer.next();
    try testing.expectEqual(Kind.line_comment, tok.kind);
    try testing.expectEqualStrings("// x", tok.lexeme(lexer.src));
}
test "binary and octal integer literals lex to .int" {
    // Both prefix cases (`0b`/`0B`, `0o`/`0O`) give a single `.int`.
    const literals = [_][]const u8{ "0b101", "0B110", "0o755", "0O644" };
    var kinds: [4]Kind = undefined;
    for (literals) |text| {
        try testing.expectEqualSlices(Kind, &.{ .int, .eof }, collect(text, &kinds));
    }
}
test "digit-separator _ in integer literals" {
    // `_` between digits keeps the literal one `.int`, in every base.
    const literals = [_][]const u8{ "1_000_000", "0xFF_AA", "0b1010_1010", "0o7_7_7" };
    var kinds: [4]Kind = undefined;
    for (literals) |text| {
        try testing.expectEqualSlices(Kind, &.{ .int, .eof }, collect(text, &kinds));
    }
}
test "float literals lex to .float" {
    // Plain fraction, exponent with and without a sign, and a `_` separator
    // in the integer part: each is a single `.float`.
    const literals = [_][]const u8{ "3.14", "1.0e10", "1.5e-3", "1.5e+3", "1_000.5" };
    var kinds: [4]Kind = undefined;
    for (literals) |text| {
        try testing.expectEqualSlices(Kind, &.{ .float, .eof }, collect(text, &kinds));
    }
}
test "float literal lexeme is verbatim" {
    // The token's span covers the whole literal, unchanged.
    const source = "3.14";
    var lexer = Lexer.init(source);
    const tok = lexer.next();
    try testing.expectEqual(Kind.float, tok.kind);
    try testing.expectEqualStrings(source, tok.lexeme(lexer.src));
}
test "3.method is not a float — dot floats only when followed by a digit" {
    // A `.` with a letter behind it stays punctuation: int, dot, ident.
    const expected = [_]Kind{ .int, .dot, .ident, .eof };
    var kinds: [8]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect("3.method", &kinds));
}
test "1..2 is a range, not a float" {
    // The `.` after `1` is followed by another `.`, not a digit, so no
    // fraction starts: int, dot_dot, int.
    const expected = [_]Kind{ .int, .dot_dot, .int, .eof };
    var kinds: [8]Kind = undefined;
    try testing.expectEqualSlices(Kind, &expected, collect("1..2", &kinds));
}
test "literal-adjacency guard: adjacent ident or wrong-base digit after a number is .invalid" {
    // Anything glued onto a number makes the whole run a single `.invalid`
    // rather than a silent split into two tokens (the old hazard: `0o755`
    // once lexed as int(0) + ident(o755)).
    //   123abc — letters directly after a decimal literal
    //   0b102  — `2` is not a binary digit
    //   0o89   — `8` and `9` are not octal digits
    const glued = [_][]const u8{ "123abc", "0b102", "0o89" };
    var kinds: [4]Kind = undefined;
    for (glued) |text| {
        try testing.expectEqualSlices(Kind, &.{ .invalid, .eof }, collect(text, &kinds));
    }
}
test "char literal with hex escape \\xNN" {
    // A two-digit `\x` escape between quotes is one `.char`.
    const literals = [_][]const u8{ "'\\x1b'", "'\\x00'" };
    var kinds: [4]Kind = undefined;
    for (literals) |text| {
        try testing.expectEqualSlices(Kind, &.{ .char, .eof }, collect(text, &kinds));
    }
}
test "char literal with unicode escape \\u{N}" {
    // A braced `\u{…}` escape of either length between quotes is one `.char`.
    const literals = [_][]const u8{ "'\\u{41}'", "'\\u{1F600}'" };
    var kinds: [4]Kind = undefined;
    for (literals) |text| {
        try testing.expectEqualSlices(Kind, &.{ .char, .eof }, collect(text, &kinds));
    }
}
test "string literal with hex and unicode escapes" {
    // A `\x` escape followed by ordinary text, and a braced `\u{…}` escape:
    // each string is one `.string`.
    const literals = [_][]const u8{ "\"\\x1b[0m\"", "\"\\u{263A}\"" };
    var kinds: [4]Kind = undefined;
    for (literals) |text| {
        try testing.expectEqualSlices(Kind, &.{ .string, .eof }, collect(text, &kinds));
    }
}