Flash 235 lines
// fsh command tokenizer — whitespace splitter with an optional single
// `|` split. Pure: no syscalls, no allocator. The
// driver (fsh.flash) feeds in a submitted line and a caller-owned argv
// array + scratch buffer (rule 1 — fixed-size, no realloc); this fills
// the argv pointers and reports how the line decomposes. Host-tested in
// isolation (see the `test` blocks at the end), the same layout
// readline.flash / execvp.flash use for their pure cores.
//
// Decomposition:
// * tokens are maximal runs of non-whitespace, non-`|` bytes;
// * the first `|` (if any) splits the line into a left and a right
// command — fsh supports exactly one pipe stage;
// * each token is copied NUL-terminated into `buf`, and its argv slot
// points at that copy;
// * the pipe boundary and the line end are marked by a `null` argv
// slot, so `argv[0..]` is already an execve-ready NULL-terminated
// vector for the left command, and `argv[left_argc + 1 ..]` is one
// for the right command.
//
// Overflow truncates (rule 1): once the argv array or `buf` is full the
// rest of the line is dropped — matching readline's truncate-on-overflow
// rather than erroring. A second `|`, or a `|` with an empty side, is a
// hard error (the shells fsh imitates reject `a | | b` and `| b`).
/// Number of slots in the caller's argv array. The count includes the
/// interleaved `null` separators (the pipe boundary and the trailing
/// terminator), so a pipe-less line can place at most `MAX_ARGS - 1`
/// tokens and a piped line at most `MAX_ARGS - 2` across both sides.
/// 16 covers a command plus a generous argument list for demoware;
/// longer lines truncate.
pub const MAX_ARGS usize = 16
/// Reasons `tokenize` rejects a line that contains `|`.
pub const Err = enum {
/// A second `|` was encountered; only one pipe stage is supported.
too_many_pipes,
/// `tokenize` placed no tokens on at least one side of the `|`.
empty_side,
}
/// A single-pipe decomposition. The right command's argv begins at
/// `argv[left_argc + 1]` (the `+ 1` skips the `null` the tokenizer wrote
/// at the pipe boundary); both vectors are NULL-terminated in place.
pub const Piped = struct {
/// Number of tokens before the `|`; they occupy `argv[0..left_argc]`.
left_argc usize,
/// Number of tokens after the `|`; they start at `argv[left_argc + 1]`.
right_argc usize,
}
/// How a line decomposed. Returned by `tokenize`; the counts carried in
/// the payloads index into the `argv` array the caller passed in.
pub const Result = union(enum) {
/// Blank or whitespace-only line — fsh redraws the prompt.
empty,
/// One command. The payload is its token count `argc`:
/// `argv[0..argc]` valid, `argv[argc] == null`.
single usize,
/// One pipe stage; see `Piped`.
piped Piped,
/// Malformed pipe usage; see `Err` for the specific reason.
err Err,
}
/// Report whether `c` is one of the four bytes the tokenizer treats as a
/// token separator: space, tab, carriage return or line feed.
inline fn is_space(c u8) bool {
    // Blanks inside a line first, then the line-ending bytes.
    if c == ' ' || c == '\t' {
        return true
    }
    return c == '\r' || c == '\n'
}
/// Split `line` into NUL-terminated tokens and fill `argv` with pointers
/// to them. See the module header for the decomposition rules.
///
/// Parameters:
/// * `line` — the submitted line; only read.
/// * `argv` — caller-owned pointer array. Every token gets one slot; the
///   `|` boundary and the end of the line are each written as `null`.
/// * `buf` — caller-owned scratch that receives the token bytes, each
///   followed by a NUL.
///
/// Returns:
/// * `.empty` — no token was placed and no `|` was seen;
/// * `.single` — token count of a pipe-less line;
/// * `.piped` — token counts on both sides of the single `|`;
/// * `.err` — `.too_many_pipes` when a second `|` is reached,
///   `.empty_side` when the line has nothing on one side of the `|`.
///
/// Overflow truncates: once `argv` has only its terminator slot left, or
/// the next token plus its NUL does not fit in `buf`, the rest of the
/// line is dropped. If that cut falls directly after the `|`, the pipe is
/// dropped along with the rest and the left command is returned as
/// `.single` — an overflow is never reported as `.empty_side`.
///
/// `argv` and `buf` are reused per line; the returned pointers are valid
/// until the next call that reuses them.
pub fn tokenize(line []u8, argv *mut [MAX_ARGS]?[*:0]mut u8, buf []mut u8) Result {
    var argc usize = 0
    var buf_pos usize = 0
    var pipe_at ?usize = null
    var pipes usize = 0
    // Set when input was dropped because `argv` or `buf` ran out of room.
    var truncated bool = false
    var i usize = 0
    while i < line.len {
        while i < line.len && is_space(line[i]) {
            i += 1
        }
        if i >= line.len {
            break
        }
        // Reserve the final slot for the trailing `null` terminator.
        if argc >= MAX_ARGS - 1 {
            truncated = true
            break
        }
        if line[i] == '|' {
            pipes += 1
            if pipes > 1 {
                return .{ .err = .too_many_pipes }
            }
            // The boundary slot doubles as the left vector's terminator.
            pipe_at = argc
            argv[argc] = null
            argc += 1
            i += 1
            continue
        }
        start := i
        while i < line.len && !is_space(line[i]) && line[i] != '|' {
            i += 1
        }
        tok := line[start..i]
        // Need room for the bytes + a NUL; otherwise truncate the line.
        if buf_pos + tok.len + 1 > buf.len {
            truncated = true
            break
        }
        #memcpy(buf[buf_pos..][0..tok.len], tok)
        buf[buf_pos + tok.len] = 0
        argv[argc] = buf[buf_pos .. buf_pos + tok.len :0].ptr
        argc += 1
        buf_pos += tok.len + 1
    }
    if argc < MAX_ARGS {
        argv[argc] = null
    }
    if pipe_at |p| {
        left_argc := p
        right_argc := argc - p - 1
        if left_argc == 0 {
            return .{ .err = .empty_side }
        }
        if right_argc == 0 {
            if truncated {
                // The right side exists in `line` but was cut by overflow.
                // `argv[left_argc]` is already `null`, so the left command
                // is a complete NULL-terminated vector on its own.
                return .{ .single = left_argc }
            }
            return .{ .err = .empty_side }
        }
        return .{ .piped = .{ .left_argc = left_argc, .right_argc = right_argc } }
    }
    if argc == 0 {
        return .empty
    }
    return .{ .single = argc }
}
// ---- Host tests ----
const std = #import("std")
const testing = std.testing
/// Test helper: view the token stored in slot `idx` of `argv` as a slice.
/// The slot must be non-null.
fn argAt(argv *[MAX_ARGS]?[*:0]mut u8, idx usize) []u8 {
    slot := argv[idx]
    return std.mem.span(slot.?)
}
test "tokenize: empty line" {
    // A zero-length line decomposes to `.empty`.
    var slots [MAX_ARGS]?[*:0]mut u8 = undefined
    var scratch [64]u8 = undefined
    const got = tokenize("", &slots, &scratch)
    try testing.expectEqual(Result.empty, got)
}
test "tokenize: whitespace-only line is empty" {
    // Separators alone place no tokens, so the line counts as blank.
    var slots [MAX_ARGS]?[*:0]mut u8 = undefined
    var scratch [64]u8 = undefined
    const got = tokenize(" \t ", &slots, &scratch)
    try testing.expectEqual(Result.empty, got)
}
test "tokenize: single token" {
    var slots [MAX_ARGS]?[*:0]mut u8 = undefined
    var scratch [64]u8 = undefined
    const count = tokenize("exit", &slots, &scratch).single
    try testing.expectEqual(#as(usize, 1), count)
    // The lone token sits in slot 0, followed by the terminator.
    try testing.expectEqualStrings("exit", argAt(&slots, 0))
    try testing.expectEqual(#as(?[*:0]mut u8, null), slots[1])
}
test "tokenize: multi-arg command, surrounding + collapsed whitespace" {
    var slots [MAX_ARGS]?[*:0]mut u8 = undefined
    var scratch [64]u8 = undefined
    const count = tokenize(" cd /test ", &slots, &scratch).single
    try testing.expectEqual(#as(usize, 2), count)
    // Leading and trailing separators contribute no tokens.
    try testing.expectEqualStrings("cd", argAt(&slots, 0))
    try testing.expectEqualStrings("/test", argAt(&slots, 1))
    try testing.expectEqual(#as(?[*:0]mut u8, null), slots[2])
}
test "tokenize: one pipe splits left/right NULL-terminated vectors" {
    var slots [MAX_ARGS]?[*:0]mut u8 = undefined
    var scratch [64]u8 = undefined
    const split = tokenize("echo hi | cat", &slots, &scratch).piped
    try testing.expectEqual(#as(usize, 2), split.left_argc)
    try testing.expectEqual(#as(usize, 1), split.right_argc)
    // Left command: slots [0, left_argc), closed by the boundary `null`.
    try testing.expectEqualStrings("echo", argAt(&slots, 0))
    try testing.expectEqualStrings("hi", argAt(&slots, 1))
    try testing.expectEqual(#as(?[*:0]mut u8, null), slots[split.left_argc])
    // Right command: begins one slot past the boundary `null`.
    const right_start = split.left_argc + 1
    try testing.expectEqualStrings("cat", argAt(&slots, right_start))
    try testing.expectEqual(#as(?[*:0]mut u8, null), slots[right_start + split.right_argc])
}
test "tokenize: pipe with no surrounding spaces still splits" {
    var slots [MAX_ARGS]?[*:0]mut u8 = undefined
    var scratch [64]u8 = undefined
    const split = tokenize("echo|cat", &slots, &scratch).piped
    // `|` ends a token by itself; no separator byte is needed around it.
    try testing.expectEqual(#as(usize, 1), split.left_argc)
    try testing.expectEqual(#as(usize, 1), split.right_argc)
    try testing.expectEqualStrings("echo", argAt(&slots, 0))
    try testing.expectEqualStrings("cat", argAt(&slots, 2))
}
test "tokenize: pipe at start is an empty side" {
    // Nothing precedes the `|`, so the left command is missing.
    var slots [MAX_ARGS]?[*:0]mut u8 = undefined
    var scratch [64]u8 = undefined
    const got = tokenize("| cat", &slots, &scratch)
    try testing.expectEqual(Err.empty_side, got.err)
}
test "tokenize: pipe at end is an empty side" {
    // Nothing follows the `|`, so the right command is missing.
    var slots [MAX_ARGS]?[*:0]mut u8 = undefined
    var scratch [64]u8 = undefined
    const got = tokenize("echo hi |", &slots, &scratch)
    try testing.expectEqual(Err.empty_side, got.err)
}
test "tokenize: two pipes rejected" {
    // Only one pipe stage is supported; the second `|` is an error.
    var slots [MAX_ARGS]?[*:0]mut u8 = undefined
    var scratch [64]u8 = undefined
    const got = tokenize("a | b | c", &slots, &scratch)
    try testing.expectEqual(Err.too_many_pipes, got.err)
}
test "tokenize: argv overflow truncates the line" {
    var slots [MAX_ARGS]?[*:0]mut u8 = undefined
    var scratch [256]u8 = undefined
    // The line carries 20 one-byte tokens. Only MAX_ARGS - 1 = 15 are
    // placed, because the last slot is kept for the trailing null.
    const count = tokenize("a b c d e f g h i j k l m n o p q r s t", &slots, &scratch).single
    try testing.expectEqual(#as(usize, MAX_ARGS - 1), count)
    try testing.expectEqual(#as(?[*:0]mut u8, null), slots[MAX_ARGS - 1])
}
test "tokenize: buf overflow truncates without corrupting placed tokens" {
    var slots [MAX_ARGS]?[*:0]mut u8 = undefined
    // 8 bytes hold "abc\0" + "de\0" (7 bytes); "fgh" plus its NUL no
    // longer fits and is dropped.
    var scratch [8]u8 = undefined
    const count = tokenize("abc de fgh", &slots, &scratch).single
    try testing.expectEqual(#as(usize, 2), count)
    try testing.expectEqualStrings("abc", argAt(&slots, 0))
    try testing.expectEqualStrings("de", argAt(&slots, 1))
}