FlashOS/user_space/fsh/tokenize.flash

Flash 235 lines
// fsh command tokenizer — whitespace splitter with an optional single
// `|` split. Pure: no syscalls, no allocator. The
// driver (fsh.flash) feeds in a submitted line and a caller-owned argv
// array + scratch buffer (rule 1 — fixed-size, no realloc); this fills
// the argv pointers and reports how the line decomposes. Host-tested in
// isolation (see the `test` blocks at the end), the same layout
// readline.flash / execvp.flash use for their pure cores.
//
// Decomposition:
//   * tokens are maximal runs of non-whitespace, non-`|` bytes;
//   * the first `|` (if any) splits the line into a left and a right
//     command — fsh supports exactly one pipe stage. Each token
//     is copied NUL-terminated into `buf`; its argv slot points there.
//   * the pipe boundary and the line end are marked by a `null` argv
//     slot, so `argv[0..]` is already an execve-ready NULL-terminated
//     vector for the left command, and `argv[left_argc + 1 ..]` is one
//     for the right command.
//
// Overflow truncates (rule 1): once the argv array or `buf` is full the
// rest of the line is dropped — matching readline's truncate-on-overflow
// rather than erroring. A second `|`, or a `|` with an empty side, is a
// hard error (the shells fsh imitates reject `a | | b` and `| b`).

/// argv capacity, including the interleaved `null` separators (the pipe
/// boundary and the trailing terminator). 16 covers a command plus a
/// generous argument list for demoware; longer lines truncate.
pub const MAX_ARGS usize = 16

/// Why the two sides of a `|` cannot both be commands, or why a second
/// `|` appeared.
pub const Err = enum {
    too_many_pipes,
    empty_side,
}

/// A single-pipe decomposition. The right command's argv begins at
/// `argv[left_argc + 1]` (the `+ 1` skips the `null` the tokenizer wrote
/// at the pipe boundary); both vectors are NULL-terminated in place.
pub const Piped = struct {
    left_argc usize,
    right_argc usize,
}

/// How a line decomposed.
pub const Result = union(enum) {
    /// Blank or whitespace-only line — fsh redraws the prompt.
    empty,
    /// One command; `argv[0..argc]` valid, `argv[argc] == null`.
    single usize,
    /// One pipe stage; see `Piped`.
    piped Piped,
    /// Malformed pipe usage.
    err Err,
}

inline fn is_space(c u8) bool {
    return c == ' ' || c == '\t' || c == '\r' || c == '\n'
}

/// Split `line` into `argv` (pointers into `buf`). See the module header
/// for the decomposition rules. `argv` and `buf` are caller-owned and
/// reused per line; the returned pointers are valid until the next call
/// that reuses them.
pub fn tokenize(line []u8, argv *mut [MAX_ARGS]?[*:0]mut u8, buf []mut u8) Result {
    var argc usize = 0
    var buf_pos usize = 0
    var pipe_at ?usize = null
    var pipes usize = 0

    var i usize = 0
    while i < line.len {
        while i < line.len && is_space(line[i]) {
            i += 1
        }
        if i >= line.len {
            break
        }

        // Reserve the final slot for the trailing `null` terminator.
        if argc >= MAX_ARGS - 1 {
            break
        }

        if line[i] == '|' {
            pipes += 1
            if pipes > 1 {
                return .{ .err = .too_many_pipes }
            }
            pipe_at = argc
            argv[argc] = null
            argc += 1
            i += 1
            continue
        }

        start := i
        while i < line.len && !is_space(line[i]) && line[i] != '|' {
            i += 1
        }
        tok := line[start..i]

        // Need room for the bytes + a NUL; otherwise truncate the line.
        if buf_pos + tok.len + 1 > buf.len {
            break
        }
        #memcpy(buf[buf_pos..][0..tok.len], tok)
        buf[buf_pos + tok.len] = 0
        argv[argc] = buf[buf_pos .. buf_pos + tok.len :0].ptr
        argc += 1
        buf_pos += tok.len + 1
    }

    if argc < MAX_ARGS {
        argv[argc] = null
    }

    if pipe_at |p| {
        left_argc := p
        right_argc := argc - p - 1
        if left_argc == 0 || right_argc == 0 {
            return .{ .err = .empty_side }
        }
        return .{ .piped = .{ .left_argc = left_argc, .right_argc = right_argc } }
    }

    if argc == 0 {
        return .empty
    }
    return .{ .single = argc }
}

// ---- Host tests ----

const std = #import("std")
const testing = std.testing

fn argAt(argv *[MAX_ARGS]?[*:0]mut u8, idx usize) []u8 {
    return std.mem.span(argv[idx].?)
}

test "tokenize: empty line" {
    var argv [MAX_ARGS]?[*:0]mut u8 = undefined
    var buf [64]u8 = undefined
    try testing.expectEqual(Result.empty, tokenize("", &argv, &buf))
}

test "tokenize: whitespace-only line is empty" {
    var argv [MAX_ARGS]?[*:0]mut u8 = undefined
    var buf [64]u8 = undefined
    try testing.expectEqual(Result.empty, tokenize("  \t  ", &argv, &buf))
}

test "tokenize: single token" {
    var argv [MAX_ARGS]?[*:0]mut u8 = undefined
    var buf [64]u8 = undefined
    const r = tokenize("exit", &argv, &buf)
    try testing.expectEqual(#as(usize, 1), r.single)
    try testing.expectEqualStrings("exit", argAt(&argv, 0))
    try testing.expectEqual(#as(?[*:0]mut u8, null), argv[1])
}

test "tokenize: multi-arg command, surrounding + collapsed whitespace" {
    var argv [MAX_ARGS]?[*:0]mut u8 = undefined
    var buf [64]u8 = undefined
    const r = tokenize("  cd   /test  ", &argv, &buf)
    try testing.expectEqual(#as(usize, 2), r.single)
    try testing.expectEqualStrings("cd", argAt(&argv, 0))
    try testing.expectEqualStrings("/test", argAt(&argv, 1))
    try testing.expectEqual(#as(?[*:0]mut u8, null), argv[2])
}

test "tokenize: one pipe splits left/right NULL-terminated vectors" {
    var argv [MAX_ARGS]?[*:0]mut u8 = undefined
    var buf [64]u8 = undefined
    const r = tokenize("echo hi | cat", &argv, &buf)
    const p = r.piped
    try testing.expectEqual(#as(usize, 2), p.left_argc)
    try testing.expectEqual(#as(usize, 1), p.right_argc)
    // left vector: argv[0..left_argc], terminated by the pipe `null`.
    try testing.expectEqualStrings("echo", argAt(&argv, 0))
    try testing.expectEqualStrings("hi", argAt(&argv, 1))
    try testing.expectEqual(#as(?[*:0]mut u8, null), argv[p.left_argc])
    // right vector starts past the boundary `null`.
    try testing.expectEqualStrings("cat", argAt(&argv, p.left_argc + 1))
    try testing.expectEqual(#as(?[*:0]mut u8, null), argv[p.left_argc + 1 + p.right_argc])
}

test "tokenize: pipe with no surrounding spaces still splits" {
    var argv [MAX_ARGS]?[*:0]mut u8 = undefined
    var buf [64]u8 = undefined
    const r = tokenize("echo|cat", &argv, &buf)
    const p = r.piped
    try testing.expectEqual(#as(usize, 1), p.left_argc)
    try testing.expectEqual(#as(usize, 1), p.right_argc)
    try testing.expectEqualStrings("echo", argAt(&argv, 0))
    try testing.expectEqualStrings("cat", argAt(&argv, 2))
}

test "tokenize: pipe at start is an empty side" {
    var argv [MAX_ARGS]?[*:0]mut u8 = undefined
    var buf [64]u8 = undefined
    try testing.expectEqual(Err.empty_side, tokenize("| cat", &argv, &buf).err)
}

test "tokenize: pipe at end is an empty side" {
    var argv [MAX_ARGS]?[*:0]mut u8 = undefined
    var buf [64]u8 = undefined
    try testing.expectEqual(Err.empty_side, tokenize("echo hi |", &argv, &buf).err)
}

test "tokenize: two pipes rejected" {
    var argv [MAX_ARGS]?[*:0]mut u8 = undefined
    var buf [64]u8 = undefined
    try testing.expectEqual(Err.too_many_pipes, tokenize("a | b | c", &argv, &buf).err)
}

test "tokenize: argv overflow truncates the line" {
    var argv [MAX_ARGS]?[*:0]mut u8 = undefined
    var buf [256]u8 = undefined
    // 20 single-char tokens; MAX_ARGS - 1 = 15 fit, the 16th slot is the
    // trailing null.
    const r = tokenize("a b c d e f g h i j k l m n o p q r s t", &argv, &buf)
    try testing.expectEqual(#as(usize, MAX_ARGS - 1), r.single)
    try testing.expectEqual(#as(?[*:0]mut u8, null), argv[MAX_ARGS - 1])
}

test "tokenize: buf overflow truncates without corrupting placed tokens" {
    var argv [MAX_ARGS]?[*:0]mut u8 = undefined
    var buf [8]u8 = undefined // fits "abc\0" + "de\0" = 7 bytes; "fgh" drops
    const r = tokenize("abc de fgh", &argv, &buf)
    try testing.expectEqual(#as(usize, 2), r.single)
    try testing.expectEqualStrings("abc", argAt(&argv, 0))
    try testing.expectEqualStrings("de", argAt(&argv, 1))
}
raw view on GitHub →