FlashOS/src/sys.flash

Flash 1415 lines
// sys: syscall dispatch table and handlers.
// Layouts (TaskStruct etc.) come from src/task_layout.zig — the single
// source of truth shared with sched.zig / fork.zig / mm_user.zig.
// Syscall IDs come from lib/syscall_defs.zig — the single source of
// truth shared with user_space/kernel_tests.zig.

const std = #import("std")
const layout = #import("task_layout")
const defs = #import("syscall_defs")
const user_layout = #import("user_layout")
const pipe_mod = #import("pipe")
const console = #import("console")
const sched = #import("sched")
const vfs = #import("vfs")
const file_mod = #import("file")
const fdtable = #import("fdtable")
const path_mod = #import("path")
const klog_ring = #import("klog_ring")
const sha256 = #import("sha256")
const shadow = #import("shadow")
const perm = #import("perm")
const pwfile = #import("pwfile")
// Kernel entropy source (salt minting for sys_passwd). Named module —
// hwrng was promoted to a named module when it moved to Flash (the
// generated .zig lives in the build cache, so a path import would not
// resolve); start.zig force-includes the same module for the exported
// hwrng_init, same pattern as sched/execve/utilc.
const hwrng = #import("hwrng")
// Local aliases for definitions owned by the shared task_layout module.
const TaskStruct = layout.TaskStruct
// Passed by sys_fork as the clone_flags argument of copy_process.
const UTHREAD = layout.UTHREAD
const MAX_PAGE_COUNT = layout.MAX_PAGE_COUNT

// Interface selector handed to main_output for the Mini-UART path.
const MU i32 = 0
// Length of the extern task[] table that sys_kill scans.
const NR_TASKS usize = 64
// Page size in bytes (1 << 12 = 4096); sys_brk rounds the break up to it.
const PAGE_SIZE u64 = 1 << 12

extern var current ?*mut TaskStruct
extern var task [NR_TASKS]?*mut TaskStruct
extern fn preempt_disable() void
extern fn preempt_enable() void

extern fn main_output(interface i32, str [*:0]u8) void
extern fn copy_process(clone_flags u64, fn_ptr u64, arg u64) i32
extern fn exit_process() void
extern fn do_wait() i32
extern fn dump_free_count() u64
// Body lives in src/execve.zig; sys_execve below is the dispatch-table
// wrapper. Activates the real path-resolve → stream-PT_LOAD →
// encode-argv flow.
extern fn execve_impl(path_ptr u64, argv_ptr u64) i32
extern fn unmap_user_range(t *mut TaskStruct, start_uva u64, end_uva u64) void
extern fn set_pgd(pgd u64) void
extern fn check_and_prefault_user_range(uva u64, len u64) i32
extern fn copy_from_user(kbuf [*]mut u8, uva u64, len u64) i32
extern fn copy_to_user(uva u64, kbuf [*]u8, len u64) i32

// Board driver bag, reached through C-ABI trampolines in src/start.zig.
// The board bag is a named module imported by the kernel root module, so a
// leaf Flash module — whose generated .zig lives in the build cache — cannot
// @import it. start.zig (which is in the root and imports board) exports
// these thin wrappers, the same boundary fork.zig bridges with
// move_to_user_elf_argv. The usb pair backs console_tx; board_power_reboot
// backs sys_reboot.
extern fn board_usb_enumerated() bool
extern fn board_usb_cdc_tx(ptr [*]u8, len u64) void
extern fn board_power_reboot() noreturn
// Mailbox-backed hardware monitors, reached through the same start.zig
// trampolines (board.mailbox is rpi4b-real, virt-stubbed to 0). Both
// return 0 = unknown on a board without the firmware.
extern fn board_mailbox_temperature() u32
extern fn board_mailbox_cpu_clock() u32

// Allocatable pool size (SYS_MEMTOTAL) and seconds-since-boot
// (SYS_UPTIME). Cross-module kernel reads, exported by page_alloc /
// generic_timer.
extern fn mem_total_count() u64
extern fn uptime_seconds() u64

const builtin = #import("builtin")

// Syscalls run at EL1h with TTBR0 holding the *user* pgd (set by
// prepare_move_to_user_elf). Each function pointer is ORed with
// LINEAR_MAP_BASE so the `blr` in el0_svc lands in the kernel's
// high-mem mapping. Replaces the earlier broken `cur + &_start`
// formula, which doubled the address into .bss.
const LINEAR_MAP_BASE u64 = if (builtin.target.os.tag == .freestanding) 0xFFFF000000000000 else 0

// Console echo flags. Default off preserves the historical
// split — the kernel never echoes, userland readline owns echo (so fsh is
// unaffected). SYS_SET_CONSOLE_MODE flips them; when echo is on,
// readConsoleBytes echoes drained printable bytes, and when mask is on it
// echoes a '*' per printable byte instead (password masking). /bin/login
// turns echo on for the username prompt and mask on for the password, then
// leaves both off before exec'ing the shell.
var console_echo bool = false
var console_mask bool = false

// SYS CALL PROCESS CONTROL
// Fork entry: calls copy_process with the UTHREAD clone flag and zero for
// both the entry-function and argument slots, returning its result as-is.
export fn sys_fork() i32 {
    const rc i32 = copy_process(UTHREAD, 0, 0)
    return rc
}
// Path-resolved ELF loader. Thin wrapper over execve_impl
// in src/execve.zig — keeps the dispatch-table binding adjacent to
// the other process-control syscalls. x0 = path_ptr (NUL-terminated
// absolute UVA), x1 = argv_ptr (UVA of NULL-terminated argv array).
// Returns 0 (does-not-return on success — eret jumps to e_entry),
// -1 on resolve / parse / alloc / argv-fault failure.
export fn sys_execve(path_ptr u64, argv_ptr u64) i32 {
    // Pure forwarder: both user addresses reach execve_impl untouched and
    // its status becomes the syscall's status.
    const status i32 = execve_impl(path_ptr, argv_ptr)
    return status
}
// Wait entry: the whole operation lives in do_wait; its result is handed
// back to the caller unchanged.
export fn sys_wait() i32 {
    const rc i32 = do_wait()
    return rc
}
// Exit entry: takes no arguments and delegates entirely to exit_process.
export fn sys_exit() void {
    exit_process()
}
// SYS_REBOOT — reset the board. board.power.reboot() is the per-board
// reset (PSCI SYSTEM_RESET on virt, the BCM2711 watchdog on rpi4b) and
// never returns, so neither does this handler: el0_svc never reaches the
// eret back to the caller. EL0 cannot do this itself (privileged SMC /
// MMIO), which is why it is a syscall. No privilege gate yet.
export fn sys_reboot() noreturn {
    // The extern is declared noreturn, so control never comes back here
    // and no return statement is needed.
    board_power_reboot()
}
// Walk task[] under preempt_disable for a matching .pid. On hit: flip to
// TASK_ZOMBIE and wake any TASK_INTERRUPTIBLE parent (mirrors exit_process
// in sched.zig). The slot stays occupied; the parent's existing do_wait
// reaps it (frees user/kernel pages + the kernel page itself). Returns 0
// on hit, -1 on miss. Self-kill is rejected — the running task is its own
// kernel page; sys_exit is the safe self-cancel path.
export fn sys_kill(pid i32) i32 {
    // Self-kill is refused up front, before preemption is touched.
    if current |self_task| {
        if self_task.pid == pid { return -1 }
    }

    // The table scan runs with preemption off; the defer re-enables it on
    // both the hit and the miss exit.
    preempt_disable()
    defer preempt_enable()

    var slot usize = 0
    while slot < NR_TASKS {
        if task[slot] |victim| {
            if victim.pid == pid {
                // First matching slot wins; the scan stops here.
                sched.zombify_and_wake_parent(victim)
                return 0
            }
        }
        slot += 1
    }

    // No occupied slot carries this pid.
    return -1
}
// Reports the value of dump_free_count() to the caller unchanged.
export fn sys_dump_free() u64 {
    const free_count u64 = dump_free_count()
    return free_count
}

// SYS_MEMTOTAL — allocatable pool size in pages, frozen at boot. A tool
// derives "used" as this minus SYS_DUMP_FREE; "total bytes" as pages << 12.
export fn sys_mem_total() u64 {
    // Straight read of the page_alloc-exported total; no local state.
    const total_pages u64 = mem_total_count()
    return total_pages
}

// SYS_UPTIME — seconds since boot, from the architectural counter.
export fn sys_uptime() u64 {
    // Straight read of the timer-exported value; no local state.
    const seconds u64 = uptime_seconds()
    return seconds
}

// SYS_CPU_TEMP — SoC temperature in milli-degrees Celsius (0 = unknown).
// SYS_CPU_FREQ — ARM clock in Hz (0 = unknown). Both run a mailbox
// transaction over the shared prop_buf; preempt_disable serialises that
// single-core-shared static against a task switch landing mid-transaction.
export fn sys_cpu_temp() u64 {
    // Preemption stays off for the duration of the mailbox call; the defer
    // re-enables it after the u32 reading has been produced. The reading
    // is widened to u64 on return.
    preempt_disable()
    defer preempt_enable()
    return board_mailbox_temperature()
}

export fn sys_cpu_freq() u64 {
    // Same shape as sys_cpu_temp: mailbox call bracketed by
    // preempt_disable / preempt_enable, u32 result widened to u64.
    preempt_disable()
    defer preempt_enable()
    return board_mailbox_cpu_clock()
}

// SYS CALL FILE SYSTEM
//
// File access dispatches through the VFS layer: sys_openFile
// resolves the path via vfs.vfs_open and stashes the backing superblock
// in File.sb; seek and the unified read/write/close re-cast that opaque
// pointer (vfsSb) and call through the backend vtable. The per-backend
// arithmetic (initramfs's pointer walk, FAT32's cluster chains) lives
// in the backend modules — these handlers are thin dispatchers.
//
// User pointers (path / buf) reach the kernel through copy_from_user /
// copy_to_user. A wild UVA returns -1 to the caller via the soft path
// in mm_user.check_and_prefault_user_range; the task does NOT zombify.

// Re-type File.sb (an `?*anyopaque`, opaque to break the vfs<->file
// import cycle) back to `*vfs.SuperBlock` for vtable dispatch.
inline fn vfsSb(f *mut file_mod.File) ?*mut vfs.SuperBlock {
    // A handle with no stored superblock has nothing to dispatch through.
    if f.sb |opaque_sb| {
        // Restore the concrete pointer type (and its alignment) that was
        // erased when the pointer was stored on the File.
        return #ptrCast(#alignCast(opaque_sb))
    }
    return null
}

// sys_openFile + joinResolve form the deepest kernel-stack chain on the
// syscall path. The two path scratch buffers live as preempt-guarded
// module statics rather than ~1.3 KiB of stack locals: the kernel stack
// grows down toward the TaskStruct credential tail in the same page, so a
// stack-heavy open could descend into uid/gid/euid/egid and a timer IRQ
// taken in that window would save its register frame straight over the
// credentials. Keeping the buffers off the stack bounds the frame well
// clear of the creds. preempt_disable serialises the shared statics across
// the whole resolve + open; the defer covers every early-return error path.
var open_path_buf [1024]u8 = undefined
var open_join_buf [layout.CWD_SIZE]u8 = undefined

export fn sys_openFile(path_ptr u64) i32 {
    const c = current orelse return -1

    // open_path_buf / open_join_buf are shared module statics: keep
    // preemption off across the whole resolve + open. The defer covers
    // every early-return path below.
    preempt_disable()
    defer preempt_enable()

    // Copy the user path one byte at a time, up to and including its NUL.
    // The bound is the full buffer, so a 1023-byte path plus terminator
    // still fits. A path with no NUL inside the buffer is rejected rather
    // than silently truncated into a different (possibly existing) name —
    // the same rule sys_chdir applies to its argument.
    var i usize = 0
    var nul_found bool = false
    while i < open_path_buf.len {
        var b u8 = 0
        if copy_from_user(#ptrCast(&b), path_ptr + i, 1) < 0 { return -1 }
        open_path_buf[i] = b
        if b == 0 {
            nul_found = true
            break
        }
        i += 1
    }
    if !nul_found { return -1 }
    const raw_path = std.mem.span(#as([*:0]u8, #ptrCast(&open_path_buf)))

    // Relative paths (no leading '/') are joined against current.cwd
    // and `.` / `..` collapsed into a kernel scratch buffer; absolute
    // paths pass straight through. The post-join slice is what vfs
    // (still absolute-only) sees. The pure helper is host-tested. Join
    // buffer is sized to one CWD_SIZE — over-long resolved paths
    // (cwd 256B + rel 256B before collapse) return -1.
    var resolved []u8 = undefined
    if raw_path.len > 0 && raw_path[0] == '/' {
        resolved = raw_path
    } else {
        const cwd_slice = std.mem.sliceTo(#as([*:0]u8, #ptrCast(&c.cwd)), 0)
        resolved = path_mod.joinResolve(cwd_slice, raw_path, &open_join_buf) orelse return -1
    }

    var open_result vfs.OpenResult = .{}
    const sb = vfs.vfs_open(resolved, &open_result)

    // A null superblock is the lookup-miss result.
    if sb == null { return -1 }

    // Permission gate: open is read-intent (this ABI has no
    // open flags — write permission is re-checked per write). A denied
    // read returns -EACCES, distinguishable from the -1 miss above, and
    // costs no File page since the check runs before the alloc.
    if !perm.checkAccess(
        open_result.mode,
        open_result.uid,
        open_result.gid,
        c.euid,
        c.egid,
        .read
    ) { return -defs.EACCES }

    const f = file_mod.alloc() orelse return -1
    f.refs = 1
    f.private = open_result.private
    f.size = open_result.size
    f.offset = 0
    f.sb = sb
    // Carry the backend's permission metadata on the handle so the
    // per-write check in sys_write needs no fresh VFS lookup.
    f.mode = open_result.mode
    f.uid = open_result.uid
    f.gid = open_result.gid
    // Directory-entry location: FAT32 write() rewrites the entry's
    // first-cluster / size through it. Only writable handles (this path)
    // need it; the read-only open sites below leave the alloc-zeroed
    // default, and non-FAT backends never set it.
    f.dirent_lba = open_result.dirent_lba
    f.dirent_off = open_result.dirent_off

    // On a failed install the freshly allocated File is released again
    // so the handle does not leak.
    const fd = fdtable.install(c, .file, f)
    if fd < 0 {
        file_mod.unref(f)
        return -1
    }
    return fd
}

// Shared copy-path-from-user + cwd-resolve — the off-stack form
// sys_create / sys_unlink / sys_rename use (the same logic sys_openFile
// inlines). Copies the NUL-terminated user path into `raw_buf`, then
// resolves it against the caller's cwd into `join_buf`: an absolute path
// passes straight through, a relative one is `.`/`..`-collapsed by the
// host-tested joinResolve.
//
// Returns the resolved slice, or null on a copy fault, on a user path
// with no NUL within `raw_buf` (longer than 1023 bytes), or on an
// over-long resolved path. An unterminated path is refused rather than
// truncated: a truncated name is a different name, and these callers
// create, delete and rename by it.
//
// Caller holds preempt_disable (the buffers are shared statics). Every
// buffer here is off-stack, keeping these handlers' frames well clear of
// the TaskStruct credential tail.
fn copyResolvePath(c *mut TaskStruct, path_ptr u64, raw_buf *mut [1024]u8, join_buf *mut [layout.CWD_SIZE]u8) ?[]u8 {
    // Byte-wise copy up to and including the terminator. The bound is the
    // whole buffer so a 1023-byte path plus its NUL still fits.
    var i usize = 0
    var nul_found bool = false
    while i < raw_buf.len {
        var b u8 = 0
        if copy_from_user(#ptrCast(&b), path_ptr + i, 1) < 0 { return null }
        raw_buf[i] = b
        if b == 0 {
            nul_found = true
            break
        }
        i += 1
    }
    if !nul_found { return null }

    const raw_path = std.mem.span(#as([*:0]u8, #ptrCast(raw_buf)))
    // Absolute path: hand back the raw copy without touching join_buf.
    if raw_path.len > 0 && raw_path[0] == '/' { return raw_path }
    const cwd_slice = std.mem.sliceTo(#as([*:0]u8, #ptrCast(&c.cwd)), 0)
    return path_mod.joinResolve(cwd_slice, raw_path, join_buf)
}

// Second path scratch for sys_rename — its two paths must be resolved and
// live simultaneously, so the new-path copy/join cannot reuse the old-path
// buffers (open_path_buf / open_join_buf). Off-stack for the same stack-tail reason.
var rename_new_buf [1024]u8 = undefined
var rename_new_join [layout.CWD_SIZE]u8 = undefined

// sys_create — creat(): make a new empty file and return a writable fd. The
// deepest-stack twin of sys_openFile (shared off-stack path scratch):
// resolve the path, dispatch vfs_create, then install a File the same way.
// The new file is caller-owned (uid/gid = the caller's effective ids); the
// backend supplies the 0644 mode baseline. /mnt is the only writable mount,
// so a create elsewhere (initramfs) fails closed via its EROFS vtable stub.
export fn sys_create(path_ptr u64) i32 {
    const caller = current orelse return -1

    // The shared static path scratch is in use from here on; the defer
    // re-enables preemption on every exit.
    preempt_disable()
    defer preempt_enable()

    const abs_path = copyResolvePath(caller, path_ptr, &open_path_buf, &open_join_buf) orelse return -1

    var made vfs.OpenResult = .{}
    const sb = vfs.vfs_create(abs_path, &made)
    if sb == null { return -1 }

    const handle = file_mod.alloc() orelse return -1
    handle.refs = 1
    handle.sb = sb
    handle.offset = 0
    handle.size = made.size
    handle.private = made.private
    handle.dirent_lba = made.dirent_lba
    handle.dirent_off = made.dirent_off
    // Mode is taken from the backend result; ownership is stamped with the
    // caller's effective ids instead of the result's uid/gid, so the
    // per-write check in sys_write sees the creator as the owner.
    handle.mode = made.mode
    handle.uid = caller.euid
    handle.gid = caller.egid

    const fd = fdtable.install(caller, .file, handle)
    if fd >= 0 { return fd }

    // No free descriptor: drop the File that was just allocated.
    file_mod.unref(handle)
    return -1
}

// sys_unlink — remove the file at `path`. Resolve + dispatch; the backend
// tombstones the entry and frees its chain. Returns 0 on success, -1 on a
// missing file, a read-only mount, or a fault.
export fn sys_unlink(path_ptr u64) i32 {
    const caller = current orelse return -1

    // Shared path scratch is live until the dispatch returns.
    preempt_disable()
    defer preempt_enable()

    const target = copyResolvePath(caller, path_ptr, &open_path_buf, &open_join_buf) orelse return -1
    // The VFS status is narrowed to the syscall's i32 return width.
    const rc = vfs.vfs_unlink(target)
    return #intCast(rc)
}

// sys_rename — rename `old` to `new` within the same directory. Both paths
// are copied + resolved into separate off-stack buffers (both must be live
// for the dispatch) and handed to vfs_rename, which rejects a cross-mount
// pair before the backend sees it. Returns 0 on success, -1 on a missing
// source, a cross-directory/-mount move, a bad name, or a fault.
export fn sys_rename(old_ptr u64, new_ptr u64) i32 {
    const caller = current orelse return -1

    // Both scratch pairs are shared statics; preemption stays off until
    // the dispatch has consumed the two resolved slices.
    preempt_disable()
    defer preempt_enable()

    // Source path uses the open_* buffers, destination the rename_* pair,
    // so neither slice is overwritten by resolving the other.
    const from_path = copyResolvePath(caller, old_ptr, &open_path_buf, &open_join_buf) orelse return -1
    const to_path = copyResolvePath(caller, new_ptr, &rename_new_buf, &rename_new_join) orelse return -1

    const rc = vfs.vfs_rename(from_path, to_path)
    return #intCast(rc)
}

// Post-lookup body for file reads. The VFS vtable walks
// chunks of <=512 bytes and copies them to the caller's UVA. Returns
// total bytes copied, -1 on copy_to_user fault with no progress so
// far. Reached through the unified sys_read dispatcher.
fn readFileBacked(f *mut file_mod.File, sb *mut vfs.SuperBlock, buf_uva u64, len u64) i64 {
    // One bounce buffer reused by every chunk; each pass overwrites it
    // before anything reads from it.
    var kbuf [512]u8 = undefined
    var total_copied u64 = 0
    while total_copied < len {
        const take = #min(len - total_copied, #as(u64, #intCast(kbuf.len)))
        // Only the backend call runs with preemption off; the user copy
        // below happens with it back on.
        preempt_disable()
        const n = vfs.vfs_read(sb, f, &kbuf, take)
        preempt_enable()
        // Backend error: report what was already delivered, or -1 if
        // nothing was.
        if n < 0 { return if (total_copied > 0) #intCast(total_copied) else -1 }
        if n == 0 { break }
        // User-copy fault: same rule as the backend-error case. Bytes
        // from earlier chunks are already in the caller's buffer, so
        // their count is returned; -1 is reserved for a fault with no
        // progress at all.
        if copy_to_user(buf_uva + total_copied, &kbuf, #intCast(n)) < 0 {
            return if (total_copied > 0) #intCast(total_copied) else -1
        }
        total_copied += #intCast(n)
        // Short backend read: stop instead of asking again.
        if n < take { break }
    }
    return #intCast(total_copied)
}

// Post-lookup body for file writes. Pulls up to 512 bytes
// per iteration through copy_from_user and pushes them via the
// backend's vfs_write vtable. Initramfs returns -1 (EROFS); FAT32
// honours the write via writeBack. Reached through the unified
// sys_write dispatcher.
fn writeFileBacked(f *mut file_mod.File, sb *mut vfs.SuperBlock, buf_uva u64, len u64) i64 {
    // One bounce buffer reused by every chunk; copy_from_user refills it
    // before the backend sees it.
    var kbuf [512]u8 = undefined
    var total_pushed u64 = 0
    while total_pushed < len {
        const take = #min(len - total_pushed, #as(u64, #intCast(kbuf.len)))
        // User-copy fault: chunks pushed on earlier passes have already
        // gone to the backend, so report their count instead of hiding
        // them behind -1. Only a fault on the first chunk yields -1. This
        // matches the backend-error branch below and writeConsoleBytes.
        if copy_from_user(&kbuf, buf_uva + total_pushed, take) < 0 {
            return if (total_pushed > 0) #intCast(total_pushed) else -1
        }
        // Only the backend call runs with preemption off.
        preempt_disable()
        const n = vfs.vfs_write(sb, f, &kbuf, take)
        preempt_enable()
        if n < 0 { return if (total_pushed > 0) #intCast(total_pushed) else -1 }
        if n == 0 { break }
        total_pushed += #intCast(n)
        // Short backend write: stop instead of retrying the remainder.
        if n < take { break }
    }
    return #intCast(total_pushed)
}

// Seek entry: resolves `fd` to a File and its superblock, then lets the
// backend apply (`off`, `whence`). Returns the backend's result, or -1 when
// there is no current task, `fd` is not a file, or the File has no
// superblock.
export fn sys_seek(fd i32, off i64, whence i32) i64 {
    const c = current orelse return -1
    const f = fdtable.getFile(c, fd) orelse return -1
    const sb = vfsSb(f) orelse return -1

    // Backend call runs with preemption off; the defer re-enables it once
    // the result has been produced.
    preempt_disable()
    defer preempt_enable()
    return vfs.vfs_seek(sb, f, off, whence)
}

// MEMORY MANAGEMENT

// Set the heap break to `addr` (rounded up to the next page boundary).
// Returns the new break, or the current break if `addr == 0`. Returns
// -1 on out-of-range requests (below HEAP_BASE, or above
// STACK_TOP - STACK_BUDGET — the latter is the stack-budget upper
// bound shared with mm_user.zig's do_data_abort guard logic).
//
// No pages are eagerly allocated on grow — touching a page in the new
// range faults through do_data_abort and demand-allocates. On shrink
// the released pages MUST be freed here (the per-process do_wait reap
// loop only runs at process exit, so a long-lived process that grows
// then shrinks would leak otherwise); unmap_user_range walks
// `mm.user_pages` for entries in [new_brk, old_brk) and clears the
// PTE + frees the PA + zeros the slot. set_pgd at the tail flushes the
// TLB so a re-grow re-faults cleanly.
export fn sys_brk(addr u64) i64 {
    const c = current orelse return -1
    // addr == 0 is the query form: report the break without changing it.
    if addr == 0 { return #bitCast(c.mm.brk) }

    // Refuse anything above the ceiling BEFORE rounding. `addr` is raw user
    // input; for a value within PAGE_SIZE - 1 of the u64 maximum the
    // round-up addition below would overflow (a trap in safe builds,
    // undefined otherwise). Rounding only moves the value upward, so every
    // address refused here would also have failed the post-rounding check.
    const brk_ceiling u64 = user_layout.STACK_TOP - user_layout.STACK_BUDGET
    if addr > brk_ceiling { return -1 }

    // Round up to the next page boundary, then bounds-check the result.
    const new_brk u64 = (addr + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)
    if new_brk < user_layout.HEAP_BASE { return -1 }
    if new_brk > brk_ceiling { return -1 }

    const old_brk u64 = c.mm.brk
    if new_brk < old_brk {
        // Shrink: release [new_brk, old_brk) now rather than at exit.
        unmap_user_range(c, new_brk, old_brk)
        // Re-install the same pgd to drive the full-TLB-flush path in
        // set_pgd (sched.S). Targeted `tlbi vae1is` would be surgical;
        // the heap-shrink path is rare enough that a full flush is fine.
        set_pgd(c.mm.pgd)
    }
    // Grow (or no-op): only the recorded break moves; nothing is mapped here.
    c.mm.brk = new_brk
    return #bitCast(new_brk)
}

// Convenience wrapper: brk(current_break + delta), returns the previous
// break. Negative `delta` shrinks. The sys_brk path itself enforces
// bounds (HEAP_BASE / STACK_TOP - user_layout.STACK_BUDGET); sbrk only
// guards against signed-overflow on the addition.
export fn sys_sbrk(delta i64) i64 {
    const c = current orelse return -1
    const old_brk u64 = c.mm.brk

    // Signed add with explicit overflow detection: tuple slot 1 is the
    // overflow flag, slot 0 the sum.
    const sum = #addWithOverflow(#as(i64, #bitCast(old_brk)), delta)
    if sum[1] != 0 { return -1 }
    const wanted i64 = sum[0]
    if wanted < 0 { return -1 }

    // Range checks and the actual move are sys_brk's job.
    // NOTE(review): a sum of exactly 0 reaches sys_brk as its addr == 0
    // query form, which reports the break instead of moving it — confirm
    // whether that case should fail here.
    if sys_brk(#bitCast(wanted)) < 0 { return -1 }

    // Success reports the break as it was before the move.
    return #bitCast(old_brk)
}

// Placeholders: each body is empty, so these four entries take no
// arguments, perform no work and return nothing.
export fn sys_mmap() void {}
export fn sys_munmap() void {}
export fn sys_mlock() void {}
export fn sys_munlock() void {}

// Interprocess Communication
//
// Anonymous-pipe ABI. Slot map in lib/syscall_defs.zig.
// `sys_pipe` returns both fds in a single i64: low 32 bits = read fd,
// high 32 bits = write fd. Negative on out-of-fds / alloc-failure.
// Compact ABI keeps the user-side wrapper to one register and avoids
// a copy_to_user for the pair.
//
// `buf` reaches the kernel through copy_from_user / copy_to_user; a
// wild UVA returns -1 to the caller without zombifying the task.
export fn sys_pipe() i64 {
    const c = current orelse return -1
    const pipe = pipe_mod.alloc() orelse return -1
    // Both ends are counted up front, before either fd exists.
    pipe.refs = 2

    const read_fd = fdtable.install(c, .pipe, pipe)
    if read_fd < 0 {
        // Neither end was installed: give back both pre-counted refs.
        pipe_mod.unref(pipe)
        pipe_mod.unref(pipe)
        return -1
    }

    const write_fd = fdtable.install(c, .pipe, pipe)
    if write_fd < 0 {
        // The read end is released through close(); the ref reserved for
        // the write end that never got a slot is dropped by hand.
        _ = fdtable.close(c, read_fd)
        pipe_mod.unref(pipe)
        return -1
    }

    // Pack the pair: read fd in bits 0..31, write fd in bits 32..63.
    const low i64 = #as(i64, read_fd) & 0xFFFF_FFFF
    const high i64 = #as(i64, write_fd) << 32
    return high | low
}

// Post-lookup body for pipe reads. One 512-byte kbuf-bounded drain
// per call (POSIX short-read for pipes); the blocking is inside
// pipe_mod.read. Reached through the unified sys_read dispatcher.
fn readPipeBacked(p *mut pipe_mod.Pipe, buf_uva u64, len u64) i64 {
    var kbuf [512]u8 = undefined
    // Never ask the pipe for more than the bounce buffer holds.
    const want = #min(len, #as(u64, #intCast(kbuf.len)))
    const got = pipe_mod.read(p, &kbuf, want)

    // Zero or negative results go back untouched; there is nothing to copy.
    if got <= 0 { return got }

    if copy_to_user(buf_uva, &kbuf, #intCast(got)) < 0 { return -1 }
    return got
}

// Post-lookup body for pipe writes. Mirrors readPipeBacked:
// 512-byte kbuf, single push per call (no loop — caller iterates if
// it has more data than fits the ring). Reached through the unified
// sys_write dispatcher.
fn writePipeBacked(p *mut pipe_mod.Pipe, buf_uva u64, len u64) i64 {
    var kbuf [512]u8 = undefined
    // At most one bounce-buffer's worth is taken from the caller per call.
    const chunk = #min(len, #as(u64, #intCast(kbuf.len)))
    const fetch_rc = copy_from_user(&kbuf, buf_uva, chunk)
    if fetch_rc < 0 { return -1 }
    // The pipe's own result is the syscall result.
    return pipe_mod.write(p, &kbuf, chunk)
}

// Placeholders: each body is empty, so these four entries take no
// arguments, perform no work and return nothing.
export fn sys_socket() void {}
export fn sys_msgget() void {}
export fn sys_semget() void {}
export fn sys_shmget() void {}

// Device Management
//
// Console ABI. The unified (fd,buf,len) ABI at slots 32..35 routes
// console fds through the same tagged `fds` table as pipes and files;
// the post-lookup readConsoleBytes / writeConsoleBytes helpers below
// back the sys_read / sys_write dispatchers. fd 0/1/2 are pre-installed
// as console slots at PID-1 bring-up (src/kernel.zig:kernel_process),
// so user code reaches stdin/stdout/stderr without an explicit open.

// Post-lookup body for console reads. Console reads are short by
// design — see src/console.zig:console_read for the blocking and
// POSIX-TTY semantics. Reached through the unified sys_read
// dispatcher.
fn readConsoleBytes(buf_uva u64, len u64) i64 {
    var kbuf [256]u8 = undefined
    const want = #min(len, #as(u64, #intCast(kbuf.len)))
    const got = console.console_read(&kbuf, want)

    // Nothing drained (or an error): pass the result straight back.
    if got <= 0 { return got }

    // Deliver first; echoing only happens for bytes the caller received.
    if copy_to_user(buf_uva, &kbuf, #intCast(got)) < 0 { return -1 }

    // Both flags off (the default): no kernel-side echo at all.
    if !console_echo && !console_mask { return got }

    var idx i64 = 0
    while idx < got {
        const ch = kbuf[#intCast(idx)]
        // Only printable ASCII (0x20..0x7E) is ever echoed; control bytes
        // and anything at or above 0x7F are skipped.
        const printable = ch >= 0x20 && ch < 0x7F
        if printable {
            // Mask takes priority over echo: '*' stands in for the byte.
            const out u8 = if (console_mask) '*' else ch
            // One byte plus NUL, since console_tx wants a terminated string.
            var one [2]u8 = [2]u8{ out, 0 }
            console_tx(#ptrCast(&one), 1)
        }
        idx += 1
    }
    return got
}

// Console-output sink (USB-C gadget console). Only the *user*
// console-write path is muxed here: once the DWC2 CDC-ACM gadget is
// enumerated on the host, fsh / user output streams out the bulk-IN
// endpoint (board.usb.cdc_tx); otherwise it falls back to the Mini-UART
// (main_output(MU, …)). This is a "switch", not a tee — the device-side
// trace already gives a parallel debug channel on the MU.
//
// Kernel [Debug] traces keep calling main_output(MU, …) directly and are
// deliberately NOT routed here, so boot diagnostics stay on the UART
// regardless of USB state (they share main_output with the user path, so
// the mux must live here, not inside main_output's MU case).
//
// `s` must be NUL-terminated at s[len] — the MU fallback is a C-string
// walker; `len` carries the true byte count for the length-framed USB
// bulk path. On virt, enumerated() is always false → MU fallback, so CI
// over QEMU is unaffected.
fn console_tx(s [*:0]u8, len u64) void {
    // Exactly one sink receives the bytes, chosen per call.
    const usb_up = board_usb_enumerated()
    if !usb_up {
        // UART fallback walks the string up to its NUL; `len` is unused.
        main_output(MU, s)
    } else {
        // USB path is length-framed and uses `len`.
        board_usb_cdc_tx(s, len)
    }
}

// Post-lookup body for console writes. Pulls bytes from
// the user buffer in 255-byte chunks, NUL-terminates each chunk in
// the kernel scratch buffer, and hands it to console_tx via the
// existing C-string contract. Returns total bytes pushed. Reached
// through the unified sys_write dispatcher.
//
// Limitation: embedded NULs in the payload truncate the affected
// chunk because main_output dispatches to mini_uart_send_string /
// pl011_uart_send_string, both NUL-terminated walkers. The
// fd-redirect coverage is text-only; binary console output is future
// work alongside a length-aware UART send path.
fn writeConsoleBytes(buf_uva u64, len u64) i64 {
    var kbuf [256]u8 = undefined
    // One slot is reserved for the NUL that console_tx requires.
    const chunk_max u64 = #intCast(kbuf.len - 1)
    var sent u64 = 0
    while sent < len {
        const chunk = #min(len - sent, chunk_max)
        if copy_from_user(&kbuf, buf_uva + sent, chunk) < 0 {
            // Fault mid-stream: report what already went out, else -1.
            if sent > 0 { return #intCast(sent) }
            return -1
        }
        // Terminate the chunk in place, then emit it.
        kbuf[chunk] = 0
        console_tx(#ptrCast(&kbuf), chunk)
        sent += chunk
    }
    return #intCast(sent)
}

// SYS_SET_CONSOLE_MODE (slot 25) — sets the
// kernel console echo/mask flags. CONSOLE_MODE_ECHO on => readConsoleBytes
// echoes drained printable bytes (cooked-style); CONSOLE_MODE_MASK on =>
// it echoes a '*' per printable byte instead (password masking); neither
// (the default) keeps the historical split where the kernel never echoes and
// userland readline owns echo. /bin/login uses ECHO to show the typed
// username and MASK to acknowledge the password without revealing it. Full
// termios / line discipline is still future work. SYS_CLOSE_CONSOLE stays
// inert (the unified ABI absorbs the close side via SYS_CLOSE on a console fd).
export fn sys_setConsoleMode(mode u64) i64 {
    // Each flag is rewritten from its own bit on every call, so a bit that
    // is absent from `mode` switches that flag off. Other bits are ignored.
    const echo_bits = mode & defs.CONSOLE_MODE_ECHO
    const mask_bits = mode & defs.CONSOLE_MODE_MASK
    console_echo = echo_bits != 0
    console_mask = mask_bits != 0
    // Always succeeds.
    return 0
}
// Empty body: invoking this entry changes no state and returns nothing.
export fn sys_closeConsole() void {}

// Debug-only — not part of the stable ABI.
// Pushes one byte into the kernel RX ring as if it had arrived on
// the UART. Powers deterministic [TEST] console-echo coverage on
// QEMU where there is no external input driver. Document as debug-only
// in DOCUMENTATION.md §5 and remove once a real host-input driver lands.
export fn sys_console_inject(byte u64) void {
    // #truncate narrows the 64-bit syscall argument to the width
    // console_test_push takes, discarding the upper bits.
    console.console_test_push(#truncate(byte))
}

// Retired ABI slots. The numbers stay reserved forever — a stale binary
// invoking one gets a clean -1, never a silently different syscall.
export fn sys_retired() i64 {
    // Fixed failure code; no state is read or written.
    const retired_rc i64 = -1
    return retired_rc
}

// ---- Unified fd-table ABI ----
//
// SYS_READ / SYS_WRITE / SYS_CLOSE / SYS_DUP2 dispatch by the fd's
// kind tag in current.fds and route through the post-lookup backend
// helpers (readConsoleBytes / writeConsoleBytes / readPipeBacked /
// writePipeBacked / readFileBacked / writeFileBacked) — one code path
// per backend. This is the sole entry point for all console / pipe /
// file I/O; the legacy per-kind shims that once shared these helpers
// were retired (see the retired-slots note at sys_retired).

export fn sys_read(fd i32, buf_uva u64, len u64) i64 {
    const c = current orelse return -1
    const slot = fdtable.get(c, fd) orelse return -1
    // Route on the slot's kind tag; each arm hands off to that backend's
    // post-lookup helper.
    return switch #as(fdtable.Kind, #enumFromInt(slot.kind)) {
        .none => -1,
        .console => readConsoleBytes(buf_uva, len),
        .pipe => pipe_arm: {
            const p *mut pipe_mod.Pipe = #ptrCast(#alignCast(slot.ptr.?))
            break :pipe_arm readPipeBacked(p, buf_uva, len)
        },
        .file => file_arm: {
            const f *mut file_mod.File = #ptrCast(#alignCast(slot.ptr.?))
            if vfsSb(f) |sb| {
                break :file_arm readFileBacked(f, sb, buf_uva, len)
            }
            // File handle without a superblock: nothing to read through.
            break :file_arm #as(i64, -1)
        },
    }
}

export fn sys_write(fd i32, buf_uva u64, len u64) i64 {
    const c = current orelse return -1
    const slot = fdtable.get(c, fd) orelse return -1
    // Route on the slot's kind tag; each arm hands off to that backend's
    // post-lookup helper.
    return switch #as(fdtable.Kind, #enumFromInt(slot.kind)) {
        .none => -1,
        .console => writeConsoleBytes(buf_uva, len),
        .pipe => pipe_arm: {
            const p *mut pipe_mod.Pipe = #ptrCast(#alignCast(slot.ptr.?))
            break :pipe_arm writePipeBacked(p, buf_uva, len)
        },
        .file => file_arm: {
            const f *mut file_mod.File = #ptrCast(#alignCast(slot.ptr.?))
            const sb = vfsSb(f) orelse break :file_arm #as(i64, -1)
            // Write-intent gate, evaluated on every write against the
            // mode/uid/gid stored on the File and the caller's effective
            // ids. A refusal is reported as -EACCES and never reaches the
            // backend.
            const may_write = perm.checkAccess(f.mode, f.uid, f.gid, c.euid, c.egid, .write)
            if !may_write {
                break :file_arm #as(i64, -defs.EACCES)
            }
            break :file_arm writeFileBacked(f, sb, buf_uva, len)
        },
    }
}

// Unified close. File fds need an extra step before the slot is
// cleared: vfs_close runs the backend's flush (FAT32 cluster /
// dir-entry / FSInfo writeback; initramfs no-op). Pipe and console
// slots route straight through fdtable.close — refcount handles the
// pipe-page free, console is refcount-exempt.
export fn sys_close(fd i32) i32 {
    const c = current orelse return -1

    // File-backed fds get a backend close first; anything else (or a File
    // without a superblock) skips directly to the table close.
    const maybe_file = fdtable.getFile(c, fd)
    if maybe_file |f| {
        if vfsSb(f) |sb| {
            // Preemption is off only for the backend call; the defer
            // fires at the end of this inner scope.
            preempt_disable()
            defer preempt_enable()
            vfs.vfs_close(sb, f)
        }
    }

    // The table close runs for every fd kind and supplies the result.
    return fdtable.close(c, fd)
}

// dup2 entry: with a current task, the request goes to fdtable.dup2 and
// its result is returned unchanged; without one the call fails with -1.
export fn sys_dup2(oldfd i32, newfd i32) i32 {
    const c = current orelse return -1
    const rc = fdtable.dup2(c, oldfd, newfd)
    return rc
}

// Working-directory ABI. Stores a NUL-terminated, `.` / `..`-collapsed
// absolute path into current.cwd. Relative arguments are joined against
// the existing cwd and then collapsed; absolute arguments are collapsed
// in place. No backend existence check is performed: the directory probe
// arrives with sys_readdir, and until then `chdir` is a pure store that
// the open/execve boundary trusts.
// Returns 0 on success, -1 on a wild user pointer / un-NUL-terminated
// input / oversize composition / oversize resolved path.
export fn sys_chdir(path_ptr u64) i32 {
    const caller = current orelse return -1

    // Pull the argument across one byte at a time so a fault on any byte
    // is reported as a plain -1. The last array slot is never written by
    // the loop, so at most CWD_SIZE - 1 bytes are examined.
    var raw [layout.CWD_SIZE]u8 = undefined
    var n usize = 0
    while n < raw.len - 1 {
        var byte u8 = 0
        if copy_from_user(#ptrCast(&byte), path_ptr + n, 1) < 0 { return -1 }
        raw[n] = byte
        if byte == 0 { break }
        n += 1
    }
    // The loop only runs to its bound when no terminator was seen.
    if n >= raw.len - 1 { return -1 }

    const requested = std.mem.span(#as([*:0]u8, #ptrCast(&raw)))
    const base = std.mem.sliceTo(#as([*:0]u8, #ptrCast(&caller.cwd)), 0)

    // Normalise into a private buffer; the task's cwd is only touched
    // once joinResolve has succeeded. The output window is one byte
    // short of CWD_SIZE so the terminator written below always fits.
    var scratch [layout.CWD_SIZE]u8 = undefined
    const resolved = path_mod.joinResolve(base, requested, scratch[0 .. layout.CWD_SIZE - 1]) orelse return -1

    #memcpy(caller.cwd[0..resolved.len], resolved)
    caller.cwd[resolved.len] = 0
    return 0
}

// Working-directory readback. Copies the calling task's
// NUL-terminated `cwd` into the user buffer (path plus terminator) and
// returns the path length excluding the NUL. The readback half of
// sys_chdir: `cwd` is a plain TaskStruct field, so this allocates
// nothing and the harness free-page baseline is untouched. Returns -1 on
// a wild buffer UVA or a `len` too small to hold the path plus its NUL —
// a short buffer gets nothing, never a truncated path.
export fn sys_getcwd(buf_uva u64, len u64) i64 {
    const caller = current orelse return -1
    const path = std.mem.sliceTo(#as([*:0]u8, #ptrCast(&caller.cwd)), 0)

    // The buffer must take the path and its terminator; a buffer that is
    // too small is rejected before anything is written.
    const needed = path.len + 1
    if needed > len { return -1 }

    // Path bytes first, then a single NUL immediately after them.
    if copy_to_user(buf_uva, path.ptr, path.len) < 0 { return -1 }
    const terminator = [_]u8{0}
    if copy_to_user(buf_uva + path.len, &terminator, 1) < 0 { return -1 }

    // The reported length excludes the terminator.
    return #intCast(path.len)
}

// Directory enumeration. Stateless index walk: fill
// the `index`-th entry of the directory at `path` into the caller's
// Dirent and return 0; return -1 at end-of-directory, a bad/unmounted
// path, or a wild user pointer. There is no fd cursor — see
// lib/syscall_defs.zig SYS_READDIR for the stateless-ABI rationale. The
// path reaches the kernel through the soft copy_from_user (a wild UVA
// returns -1 with no zombification); relative paths join against
// current.cwd exactly as sys_openFile does, since vfs.resolve is still
// absolute-only. Allocates nothing — a future OOM audit inherits no
// new site from readdir (the core reason the ABI is stateless).
export fn sys_readdir(path_ptr u64, index u64, dirent_uva u64) i32 {
    const caller = current orelse return -1

    // Byte-wise copy of the user path; a fault on any byte, or no NUL
    // within CWD_SIZE - 1 bytes, ends the call with -1.
    var raw [layout.CWD_SIZE]u8 = undefined
    var n usize = 0
    while n < raw.len - 1 {
        var byte u8 = 0
        if copy_from_user(#ptrCast(&byte), path_ptr + n, 1) < 0 { return -1 }
        raw[n] = byte
        if byte == 0 { break }
        n += 1
    }
    // Reaching the bound means the loop never saw a terminator.
    if n >= raw.len - 1 { return -1 }
    const given = std.mem.span(#as([*:0]u8, #ptrCast(&raw)))

    // Absolute paths are handed to the VFS exactly as given; anything
    // else (including the empty string) is joined against the caller's
    // cwd into join_buf.
    var join_buf [layout.CWD_SIZE]u8 = undefined
    const target []u8 = blk: {
        if given.len > 0 && given[0] == '/' { break :blk given }
        const base = std.mem.sliceTo(#as([*:0]u8, #ptrCast(&caller.cwd)), 0)
        const joined = path_mod.joinResolve(base, given, &join_buf) orelse return -1
        break :blk joined
    }

    // Fill a kernel-side Dirent with preemption held off around the VFS
    // call, then publish it to user space in a single copy.
    var entry defs.Dirent = .{}
    preempt_disable()
    const status = vfs.vfs_readdir(target, index, &entry)
    preempt_enable()
    if status < 0 { return -1 }

    const entry_bytes = std.mem.asBytes(&entry)
    if copy_to_user(dirent_uva, entry_bytes, #sizeOf(defs.Dirent)) < 0 { return -1 }
    return 0
}

// Kernel-log read. Snapshots the most-recent min(len, retained)
// bytes of the kernel byte-ring (src/klog_ring.zig) into the caller's
// buffer, oldest-first, and returns the count (0 on an empty ring). The
// window head/tail are read once up front so a concurrent main_output
// push cannot move `start` out from under the copy; the bytes are bounced
// through a 512-byte kernel buffer — the ring data wraps the modulo
// boundary, so it is not contiguous for a single copy_to_user — exactly
// like readFileBacked. Allocates nothing (the ring is static BSS), so the
// harness free-page baseline is untouched. A wild buffer UVA returns -1
// via the soft copy_to_user path; the task does not zombify.
export fn sys_klog_read(buf_uva u64, len u64) i64 {
    _ = current orelse return -1

    // Read both ring indices once, head first, and derive the window from
    // that pair alone. All index arithmetic wraps; byteAt is handed the
    // raw (unreduced) position.
    const newest = klog_ring.klog.head
    const oldest = klog_ring.klog.tail
    const want = #min(len, newest -% oldest)
    const first = newest -% want // start of the most recent `want` bytes

    // Stage the bytes through a fixed kernel buffer, one chunk of at most
    // 512 bytes per copy_to_user call.
    var bounce [512]u8 = undefined
    var done u64 = 0
    while done < want {
        const chunk = #min(want - done, #as(u64, #intCast(bounce.len)))
        const chunk_base = first +% done
        var j u64 = 0
        while j < chunk {
            bounce[#intCast(j)] = klog_ring.klog.byteAt(chunk_base +% j)
            j += 1
        }
        if copy_to_user(buf_uva + done, &bounce, chunk) < 0 {
            // A fault after at least one delivered chunk reports the
            // bytes already delivered; a fault on the first chunk is -1.
            if done > 0 { return #intCast(done) }
            return -1
        }
        done += chunk
    }
    return #intCast(done)
}

// ---- Process credentials ----
//
// The identity layer for the login/auth flow. Getters report the calling
// task's real / effective uid / gid (carried on TaskStruct, inherited by
// fork, preserved by execve). setuid / setgid apply a root-gated policy:
// an euid-0 (root) caller sets BOTH the real and effective id to any
// value; a dropped (non-root) caller may only reset to an id it already
// holds — so /bin/login (root) can drop to a user, but that user can
// never climb back. Failure returns -1 (EPERM-lite); the i64 return makes
// the sentinel representable. `current` is always set in EL0 syscall
// context — the orelse -1 is for the impossible null only.
export fn sys_getuid() i64 {
    // Real user id of the calling task, widened to the i64 return type.
    const caller = current orelse return -1
    const real_uid = caller.uid
    return #intCast(real_uid)
}
export fn sys_geteuid() i64 {
    // Effective user id of the calling task, widened to the i64 return type.
    const caller = current orelse return -1
    const effective_uid = caller.euid
    return #intCast(effective_uid)
}
export fn sys_getgid() i64 {
    // Real group id of the calling task, widened to the i64 return type.
    const caller = current orelse return -1
    const real_gid = caller.gid
    return #intCast(real_gid)
}
export fn sys_getegid() i64 {
    // Effective group id of the calling task, widened to the i64 return type.
    const caller = current orelse return -1
    const effective_gid = caller.egid
    return #intCast(effective_gid)
}
export fn sys_setuid(uid u32) i64 {
    const caller = current orelse return -1

    // Root (effective uid 0): both the real and the effective id move to
    // the requested value, with no restriction on what that value is.
    const is_root = caller.euid == 0
    if is_root {
        caller.uid = uid
        caller.euid = uid
        return 0
    }

    // Everyone else may only name an id the task already carries as its
    // real or effective uid, and only the effective id is rewritten.
    const already_held = (uid == caller.uid) || (uid == caller.euid)
    if !already_held { return -1 }
    caller.euid = uid
    return 0
}
export fn sys_setgid(gid u32) i64 {
    const caller = current orelse return -1

    // The privilege test is on the effective *user* id: a root caller
    // sets both the real and the effective group id to any value.
    const is_root = caller.euid == 0
    if is_root {
        caller.gid = gid
        caller.egid = gid
        return 0
    }

    // A non-root caller may only name a group id it already carries as
    // its real or effective gid, and only the effective id is rewritten.
    const already_held = (gid == caller.gid) || (gid == caller.egid)
    if !already_held { return -1 }
    caller.egid = gid
    return 0
}

// ---- Authentication ----

// The initramfs seed copy — read-only, baked into the kernel image, always
// present. The recovery anchor of the anti-brick design. sys_authenticate
// reads it as its second (fallback) source; nothing in this file writes it.
const SHADOW_PATH []u8 = "/etc/shadow"
// The writable FAT32 copy — what /bin/passwd rewrites. Consulted first so
// password changes take effect; absent on QEMU virt (no SD card) and on a
// freshly formatted card, in which case auth falls back to the seed.
// This is the first source sys_authenticate reads, and the only file
// sys_passwd reads and writes back.
const MNT_SHADOW_PATH []u8 = "/mnt/shadow"

// Auth working buffers — static, NOT stack. The per-task kernel stack
// shares its 4 KiB page with TaskStruct (~2.4 KiB usable above KeRegs),
// and the PBKDF2 / HMAC / SHA-256 call frames below already need a large
// share of that. Carrying another ~1.4 KiB of credential / file / digest
// buffers in sys_authenticate's own frame overflows the page and smashes
// the TaskStruct tail (fds table → wild vtable dispatch on the next
// sys_write). Statics sidestep that, exactly like execve.zig's exec_buf /
// argv_scratch. Same serialization argument too: single core, and the only
// callers are PID-1's [TEST] scenarios, /bin/login, and /bin/passwd — never
// concurrent. The password copy is overwritten by the next call; nothing
// here persists secrets beyond the syscall that wrote them.
const AuthScratch = struct {
    // Username copied in from user space by sys_authenticate (at most 64 bytes).
    user [64]u8,
    // Plaintext password copy (at most 128 bytes); zeroed by the defer in
    // sys_authenticate on every exit path.
    pass [128]u8,
    // Whole-file read target for the shadow database. sys_authenticate reads
    // into it; sys_passwd reads into it, edits it in place and writes it back.
    // A file longer than 1024 bytes is read only up to this size.
    fbuf [1024]u8,
    // hexDecode output for the matched record's salt (verifyAgainst).
    salt [64]u8,
    // hexDecode output for the matched record's stored hash (verifyAgainst);
    // decoded lengths of 0 or more than 32 bytes are treated as corruption.
    stored [64]u8,
    // PBKDF2 output. Zeroed by the defer in both sys_authenticate and
    // sys_passwd.
    derived [32]u8,
}
// The one shared instance, used by sys_authenticate, verifyAgainst and
// sys_passwd. Declared `undefined`: no field has a meaningful initial value.
var auth_scratch AuthScratch = undefined

// Failure modes of readWholeFile. OpenFailed: vfs_open returned no
// superblock for the path. ReadFailed: the open succeeded but a vfs_read
// call returned a negative value.
const ReadFileError = error{ OpenFailed, ReadFailed }

// In-kernel whole-file read through the privileged VFS door (the
// execve.zig stack-File recipe: no file_mod.alloc → no page → the harness
// free-page baseline is untouched; preempt-guarded per VFS call). Returns
// the filled prefix of `buf`. OpenFailed = path does not resolve (not
// mounted / absent); ReadFailed = it resolved but a backend read errored
// (the corruption signal the fallback chain reports loudly).
fn readWholeFile(path []u8, buf []mut u8) ReadFileError![]u8 {
    // Resolve the path. Every VFS call in this function is individually
    // bracketed by preempt_disable / preempt_enable.
    var opened vfs.OpenResult = .{}
    preempt_disable()
    const maybe_sb = vfs.vfs_open(path, &opened)
    preempt_enable()
    const sb = maybe_sb orelse return error.OpenFailed

    // A File living in this frame, seeded from the open result and
    // positioned at offset 0.
    var handle file_mod.File = .{}
    handle.private = opened.private
    handle.size = opened.size
    handle.offset = 0

    // Keep reading until the buffer is full, the backend reports 0 bytes,
    // or a read fails. Reading stops at buf.len; whether more data
    // remained in the file is not reported to the caller.
    var filled usize = 0
    var read_error bool = false
    while filled < buf.len {
        const room u64 = buf.len - filled
        preempt_disable()
        const got = vfs.vfs_read(sb, &handle, buf[filled..].ptr, room)
        preempt_enable()
        if got <= 0 {
            // Zero ends the read normally; a negative count is the failure.
            read_error = got < 0
            break
        }
        filled += #intCast(got)
    }

    // The close always runs, including after a failed read.
    preempt_disable()
    vfs.vfs_close(sb, &handle)
    preempt_enable()

    if read_error { return error.ReadFailed }
    return buf[0..filled]
}

// Outcome of checking one credential pair against one shadow database.
// The distinction between no_user and corrupt drives the fallback chain:
// a parseable file that simply lacks the user is an authoritative denial,
// while a file with nothing parseable in it (truncation, garbage, a
// half-finished rewrite) falls back to the initramfs seed.
// As produced by verifyAgainst:
//   match    — the user's record was found and the derived hash equals the stored one.
//   mismatch — the user's record was found but the hashes differ.
//   no_user  — at least one line parsed, but none carried the username.
//   corrupt  — no line parsed at all, or the user's record had undecodable
//              hex or a decoded hash length of 0 or above 32 bytes.
const VerifyResult = enum { match, mismatch, no_user, corrupt }

// Walk `content` line by line and verify `password` against the first
// line whose user field equals `username`. Uses auth_scratch.salt /
// .stored / .derived as decode + KDF scratch (single-caller discipline,
// see auth_scratch above).
fn verifyAgainst(content []u8, username []u8, password []u8) VerifyResult {
    var saw_valid_line bool = false

    // One iteration per '\n'-separated line. The final segment (the text
    // after the last newline, possibly empty) is visited as well, which
    // is why the loop still runs when cursor == content.len.
    var cursor usize = 0
    while cursor <= content.len {
        var eol usize = cursor
        while (eol < content.len) && (content[eol] != '\n') { eol += 1 }
        const line = content[cursor..eol]
        cursor = eol + 1

        if line.len != 0 {
            if shadow.parseLine(line) |entry| {
                saw_valid_line = true
                // Known, accepted limitation: the KDF only runs once a
                // username matches, so a miss returns faster than a hit —
                // a username-enumeration timing oracle. It is left as is
                // because the shipped account names are public at build
                // time. Should accounts become private, spend a dummy KDF
                // on the miss path so both outcomes cost the same.
                if std.mem.eql(u8, entry.user, username) {
                    // Undecodable hex on the matching record counts as
                    // corruption rather than as a denial.
                    const salt_len = shadow.hexDecode(entry.salt_hex, &auth_scratch.salt) orelse return .corrupt
                    const hash_len = shadow.hexDecode(entry.hash_hex, &auth_scratch.stored) orelse return .corrupt
                    if (hash_len == 0) || (hash_len > 32) { return .corrupt }

                    // Derive exactly as many bytes as the record stores,
                    // then compare through ctEql. The first record for
                    // the user decides the outcome; later lines are never
                    // examined.
                    const derived = auth_scratch.derived[0..hash_len]
                    sha256.pbkdf2HmacSha256(
                        password,
                        auth_scratch.salt[0..salt_len],
                        entry.iterations,
                        derived
                    )
                    return if (sha256.ctEql(derived, auth_scratch.stored[0..hash_len])) .match else .mismatch
                }
            }
        }
    }

    // No record for this user: an authoritative "no such user" only if
    // the file held at least one parseable line.
    if saw_valid_line { return .no_user }
    return .corrupt
}

// sys_authenticate — the kernel-owned credential verifier. /bin/login
// passes a username + plaintext password; the kernel reads the active
// shadow database, finds the matching line, runs PBKDF2-HMAC-SHA256 over
// the password with the stored salt + iteration count, and constant-time-
// compares the result to the stored verifier. Returns 0 on a match, -1 on
// anything else (no such user, malformed line, wild pointer, hash
// mismatch). Userland never sees a salt or hash — only pass/fail; the KDF
// lives here (the design intent committed in src/sha256.zig's header).
//
// Shadow source order: the writable FAT32 copy (/mnt/shadow) is
// authoritative when it is present and parseable — that is where
// sys_passwd writes. The initramfs seed (/etc/shadow) is the fallback for
// QEMU virt (no SD), a fresh card, or a corrupt FAT32 copy — the latter
// two announce themselves loudly (anti-brick: corruption never locks the
// operator out, it falls back to the baked-in seed credentials).
//
// The plaintext password crosses the user→kernel boundary exactly once,
// into a static scratch buffer that the next call overwrites.
export fn sys_authenticate(user_uva u64, user_len u64, pass_uva u64, pass_len u64) i64 {
    _ = current orelse return -1

    // Wipe the plaintext password and the derived verifier on every way
    // out of this function. Both live in static scratch, so without the
    // wipe the previous login's secret would sit in memory until a later
    // call happened to overwrite it. A plain #memset is used rather than
    // a volatile loop, on the reasoning that auth_scratch's address is
    // handed to copy_from_user below. The wipe runs after the result has
    // been computed, so it does not affect pass/fail timing.
    defer {
        #memset(&auth_scratch.pass, 0)
        #memset(&auth_scratch.derived, 0)
    }

    // Length caps first, then the copies. An empty username is refused;
    // an empty password is allowed and simply skips its copy. Every
    // failure here is a soft -1.
    if user_len == 0 { return -1 }
    if user_len > auth_scratch.user.len { return -1 }
    if pass_len > auth_scratch.pass.len { return -1 }
    if copy_from_user(&auth_scratch.user, user_uva, user_len) < 0 { return -1 }
    if pass_len > 0 {
        if copy_from_user(&auth_scratch.pass, pass_uva, pass_len) < 0 { return -1 }
    }
    const username = auth_scratch.user[0..user_len]
    const password = auth_scratch.pass[0..pass_len]

    // Source 1: the writable copy. A match, a mismatch, or a parseable
    // file without this user are all final answers; only a corrupt file
    // continues to the seed below.
    if readWholeFile(MNT_SHADOW_PATH, &auth_scratch.fbuf) |mnt_content| {
        switch verifyAgainst(mnt_content, username, password) {
            .match => return 0,
            .mismatch, .no_user => return -1,
            .corrupt => main_output(MU, "[Debug] /mnt/shadow corrupt - falling back to initramfs seed\n"),
        }
    } else |err| {
        // OpenFailed is the ordinary "no writable copy" case and stays
        // silent; ReadFailed means the file exists but could not be read
        // and is announced. The comparison is bound to a name first on
        // purpose: the else-block must hold two statements, because a
        // lone `if` inside an error-capture else lowers to a capture-less
        // `else if` in stage1 and loses the `|err|` binding.
        const unreadable = err == error.ReadFailed
        if unreadable {
            main_output(MU, "[Debug] /mnt/shadow unreadable - falling back to initramfs seed\n")
        }
    }

    // Source 2: the initramfs seed. Anything other than a match is a denial.
    const seed_content = readWholeFile(SHADOW_PATH, &auth_scratch.fbuf) catch return -1
    const verdict = verifyAgainst(seed_content, username, password)
    return if (verdict == .match) 0 else -1
}

// ---- Password change ----

// The /etc/passwd account database (initramfs, read-only). sys_passwd
// reads it to map the caller's uid back to a login name for the
// "non-root may only change its own record" rule. The account LIST is
// build-time-immutable; only passwords are mutable state.
// In this file it is read only by sys_passwd, into passwd_scratch.pwbuf,
// and searched with pwfile.lookupByUid.
const PASSWD_PATH []u8 = "/etc/passwd"

// sys_passwd working buffers — static for the same stack-budget and
// single-caller reasons as auth_scratch above (the PBKDF2 frames plus
// these would smash the 2.4 KiB kernel stack). The shadow file content
// and the KDF decode/derive scratch live in auth_scratch (fbuf / salt /
// stored / derived) — sys_passwd and sys_authenticate never run
// concurrently, so sharing those buffers is free.
const PasswdScratch = struct {
    // Target login name copied in from user space (at most 64 bytes).
    user [64]u8,
    // Caller-supplied old password (may be empty); zeroed by sys_passwd's defer.
    old_pass [128]u8,
    // Caller-supplied new password (must be non-empty); zeroed by sys_passwd's defer.
    new_pass [128]u8,
    // Read target for /etc/passwd on the non-root authorization path.
    pwbuf [512]u8,
    // Raw salt bytes filled by hwrng.fill.
    salt_raw [16]u8,
    // Hex encoding of salt_raw (hexEncode output buffer).
    salt_hex [32]u8,
    // Hex encoding of the 32-byte derived key (hexEncode output buffer).
    hash_hex [64]u8,
}
// The one shared instance, used only by sys_passwd. Declared `undefined`:
// no field has a meaningful initial value.
var passwd_scratch PasswdScratch = undefined

// In-kernel whole-file overwrite through the privileged VFS door. The
// caller guarantees content.len equals the file's current size (the
// same-length rewrite contract), so the write never grows the file and
// the FAT32 dir-entry resize branch is never taken.
fn writeWholeFile(path []u8, content []u8) bool {
    // Resolve the path; a path that does not resolve is reported as
    // failure without any write being attempted.
    var opened vfs.OpenResult = .{}
    preempt_disable()
    const maybe_sb = vfs.vfs_open(path, &opened)
    preempt_enable()
    const sb = maybe_sb orelse return false

    // A File living in this frame, seeded from the open result and
    // positioned at offset 0 so the write lands over the existing bytes.
    var handle file_mod.File = .{}
    handle.private = opened.private
    handle.size = opened.size
    handle.offset = 0

    // Push the content out in as many vfs_write calls as the backend
    // needs. A return of zero or less aborts the loop and marks the whole
    // operation as failed. Nothing here checks content.len against the
    // file's size — that is the caller's contract.
    var written usize = 0
    var failed bool = false
    while written < content.len {
        preempt_disable()
        const n = vfs.vfs_write(sb, &handle, content[written..].ptr, content.len - written)
        preempt_enable()
        if n > 0 {
            written += #intCast(n)
        } else {
            failed = true
            break
        }
    }

    // The close always runs, including after a failed write.
    preempt_disable()
    vfs.vfs_close(sb, &handle)
    preempt_enable()
    return !failed
}

// sys_passwd — kernel-owned password change (slot 46). Rewrites `user`'s
// record in the writable FAT32 shadow with a fresh kernel-minted salt and
// a PBKDF2 re-hash of the new password, in place and at the same byte
// length (the splice-safety contract — see shadow.rewriteLineInPlace).
//
// Authorization:
//   * root (euid 0) — any record, old password not required (this is the
//     recovery path: root resets a forgotten user password).
//   * everyone else — only the record whose login name maps to the
//     caller's own uid via /etc/passwd, and only with the correct old
//     password. Violations return -EACCES.
//
// Returns 0 on success; -EACCES on an authorization failure; -1 when
// there is no writable shadow (QEMU virt / fresh card — /mnt/shadow is
// the only rewrite target, the initramfs seed is immutable), the target
// user has no shadow record, the input is malformed, or the rewrite
// would change the record length.
//
// The salt source is the kernel entropy fallback (timer mix) — weak but
// fresh per change; the RNG200 hardware source is a named carve-out.
export fn sys_passwd(user_uva u64, user_len u64, old_uva u64, old_len u64, new_uva u64, new_len u64) i64 {
    const c = current orelse return -1

    // Scrub both plaintext passwords + the derived verifier on every exit
    // path (same rationale as sys_authenticate). The salt/hash hex are public
    // verifier material, not secret, so they need no scrub.
    defer {
        #memset(&passwd_scratch.old_pass, 0)
        #memset(&passwd_scratch.new_pass, 0)
        #memset(&auth_scratch.derived, 0)
    }

    // Copy all three strings under hard caps (sys_authenticate contract:
    // soft-fail on overflow or a wild UVA, no zombify).
    // The username and the new password must be non-empty; the old
    // password may be empty, in which case its copy is skipped.
    if (user_len == 0) || (user_len > passwd_scratch.user.len) { return -1 }
    if old_len > passwd_scratch.old_pass.len { return -1 }
    if (new_len == 0) || (new_len > passwd_scratch.new_pass.len) { return -1 }
    if copy_from_user(&passwd_scratch.user, user_uva, user_len) < 0 { return -1 }
    if (old_len > 0) && (copy_from_user(&passwd_scratch.old_pass, old_uva, old_len) < 0) { return -1 }
    if copy_from_user(&passwd_scratch.new_pass, new_uva, new_len) < 0 { return -1 }
    const username = passwd_scratch.user[0..user_len]
    const old_password = passwd_scratch.old_pass[0..old_len]
    const new_password = passwd_scratch.new_pass[0..new_len]

    // Authorization for non-root callers: own record only.
    // The root test is on the EFFECTIVE uid, while the /etc/passwd lookup
    // uses the REAL uid. An unreadable /etc/passwd is a plain -1; a uid
    // with no entry, or an entry whose name differs from `username`, is
    // -EACCES.
    // NOTE(review): confirm that the euid-gate / uid-lookup split is intended.
    if c.euid != 0 {
        const pw_content = readWholeFile(PASSWD_PATH, &passwd_scratch.pwbuf) catch return -1
        const own = pwfile.lookupByUid(pw_content, c.uid) orelse return -defs.EACCES
        if !std.mem.eql(u8, own.user, username) { return -defs.EACCES }
    }

    // The rewrite target must exist and be readable: /mnt/shadow only.
    // Its absence is the graceful no-writable-shadow case (QEMU virt).
    // `content` is a view into auth_scratch.fbuf, so it must stay untouched
    // until the in-place rewrite below.
    const content = readWholeFile(MNT_SHADOW_PATH, &auth_scratch.fbuf) catch return -1

    // The target record must exist and parse (we need its iteration count
    // — the rewrite keeps it, which is half of the same-length contract).
    // Only old_entry.iterations is used after this point.
    const span = shadow.findUserLine(content, username) orelse return -1
    const old_entry = shadow.parseLine(content[span.start..span.end]) orelse return -1

    // Non-root callers must prove knowledge of the old password against
    // the very record being replaced.
    // verifyAgainst writes auth_scratch.salt / .stored / .derived but not
    // fbuf, so `content` is still intact afterwards.
    if c.euid != 0 {
        switch verifyAgainst(content, username, old_password) {
            .match => {},
            .mismatch, .no_user => return -defs.EACCES,
            .corrupt => return -1,
        }
    }

    // Mint the new verifier: fresh salt, PBKDF2 over the new password with
    // the record's existing iteration count, both hex-encoded at the fixed
    // widths the same-length contract relies on.
    // NOTE(review): the result of hwrng.fill is discarded. If it can report
    // failure, salt_raw would keep whatever it held before (static scratch)
    // and the salt would not be fresh — confirm against hwrng's contract.
    _ = hwrng.fill(&passwd_scratch.salt_raw)
    _ = shadow.hexEncode(&passwd_scratch.salt_raw, &passwd_scratch.salt_hex) orelse return -1
    sha256.pbkdf2HmacSha256(
        new_password,
        &passwd_scratch.salt_raw,
        old_entry.iterations,
        auth_scratch.derived[0..32]
    )
    _ = shadow.hexEncode(auth_scratch.derived[0..32], &passwd_scratch.hash_hex) orelse return -1

    // Same-length in-place rewrite, then push the whole file back.
    // auth_scratch.fbuf still holds the file content; rewrite it there.
    // A false return from rewriteLineInPlace leaves the file on disk
    // unwritten, because writeWholeFile is never reached.
    const mut_content = auth_scratch.fbuf[0..content.len]
    if !shadow.rewriteLineInPlace(
        mut_content,
        username,
        &passwd_scratch.salt_hex,
        &passwd_scratch.hash_hex
    ) { return -1 }

    if !writeWholeFile(MNT_SHADOW_PATH, mut_content) { return -1 }
    return 0
}

/// Syscall dispatch table — referenced from entry.S (`adr x27, sys_call_table`).
/// Slot ↔ constant binding is compiler-enforced via the indexed
/// `t[defs.SYS_*]` writes below — a renumbering in lib/syscall_defs.zig
/// propagates here automatically and any duplicate id would overwrite
/// (and any gap would leave a null that still traps cleanly through the
/// unreachable kernel code path). The upper dispatch bound is
/// NR_SYSCALLS in arch/aarch64/asm_defs_common.inc (`b.hs` in entry.S); keep it
/// in lockstep with the highest user-facing id +1.
///
/// The unified ABI (slots 32..35) carries all console / pipe /
/// file I/O. The legacy per-kind shims at slots 0 / 5 / 8 / 9 / 11 /
/// 23 / 24 / 27..29 were retired: those slots route to sys_retired
/// (a clean -1) and their numbers are never reused.
export var sys_call_table = blk: {
    // Start with every slot null; each assignment below fills exactly one
    // slot, indexed by its defs.SYS_* constant.
    var t [defs.NR_SYSCALLS]?*anyopaque = [_]?*anyopaque{null} ** defs.NR_SYSCALLS

    t[defs.SYS_FORK] = #ptrCast(&sys_fork)
    t[defs.SYS_EXIT] = #ptrCast(&sys_exit)
    t[defs.SYS_WAIT] = #ptrCast(&sys_wait)
    t[defs.SYS_DUMP_FREE] = #ptrCast(&sys_dump_free)
    t[defs.SYS_KILL] = #ptrCast(&sys_kill)
    t[defs.SYS_EXECVE] = #ptrCast(&sys_execve)

    t[defs.SYS_OPEN_FILE] = #ptrCast(&sys_openFile)
    t[defs.SYS_SEEK] = #ptrCast(&sys_seek)

    t[defs.SYS_BRK] = #ptrCast(&sys_brk)
    t[defs.SYS_SBRK] = #ptrCast(&sys_sbrk)
    t[defs.SYS_MMAP] = #ptrCast(&sys_mmap)
    t[defs.SYS_MUNMAP] = #ptrCast(&sys_munmap)
    t[defs.SYS_MLOCK] = #ptrCast(&sys_mlock)
    t[defs.SYS_MUNLOCK] = #ptrCast(&sys_munlock)

    t[defs.SYS_PIPE] = #ptrCast(&sys_pipe)
    t[defs.SYS_SOCKET] = #ptrCast(&sys_socket)
    t[defs.SYS_MSGGET] = #ptrCast(&sys_msgget)
    t[defs.SYS_SEMGET] = #ptrCast(&sys_semget)
    t[defs.SYS_SHMGET] = #ptrCast(&sys_shmget)

    t[defs.SYS_SET_CONSOLE_MODE] = #ptrCast(&sys_setConsoleMode)
    t[defs.SYS_CLOSE_CONSOLE] = #ptrCast(&sys_closeConsole)

    t[defs.SYS_CONSOLE_INJECT] = #ptrCast(&sys_console_inject)

    // Descriptor-based I/O handlers (presumably the unified-ABI slots
    // 32..35 named in the doc comment — confirm against syscall_defs).
    t[defs.SYS_READ] = #ptrCast(&sys_read)
    t[defs.SYS_WRITE] = #ptrCast(&sys_write)
    t[defs.SYS_CLOSE] = #ptrCast(&sys_close)
    t[defs.SYS_DUP2] = #ptrCast(&sys_dup2)

    t[defs.SYS_CHDIR] = #ptrCast(&sys_chdir)
    t[defs.SYS_GETCWD] = #ptrCast(&sys_getcwd)
    t[defs.SYS_READDIR] = #ptrCast(&sys_readdir)

    t[defs.SYS_KLOG_READ] = #ptrCast(&sys_klog_read)

    t[defs.SYS_GETUID] = #ptrCast(&sys_getuid)
    t[defs.SYS_GETEUID] = #ptrCast(&sys_geteuid)
    t[defs.SYS_GETGID] = #ptrCast(&sys_getgid)
    t[defs.SYS_GETEGID] = #ptrCast(&sys_getegid)
    t[defs.SYS_SETUID] = #ptrCast(&sys_setuid)
    t[defs.SYS_SETGID] = #ptrCast(&sys_setgid)

    t[defs.SYS_AUTHENTICATE] = #ptrCast(&sys_authenticate)
    t[defs.SYS_PASSWD] = #ptrCast(&sys_passwd)
    t[defs.SYS_REBOOT] = #ptrCast(&sys_reboot)

    t[defs.SYS_MEMTOTAL] = #ptrCast(&sys_mem_total)
    t[defs.SYS_UPTIME] = #ptrCast(&sys_uptime)
    t[defs.SYS_CPU_TEMP] = #ptrCast(&sys_cpu_temp)
    t[defs.SYS_CPU_FREQ] = #ptrCast(&sys_cpu_freq)

    t[defs.SYS_CREATE] = #ptrCast(&sys_create)
    t[defs.SYS_UNLINK] = #ptrCast(&sys_unlink)
    t[defs.SYS_RENAME] = #ptrCast(&sys_rename)

    // Retired: legacy per-kind console / file / pipe / exec shims
    // (write_str, exec, readFile, writeFile, closeFile, openConsole,
    // readConsole, pipe_read, pipe_write, pipe_close). Slot numbers are
    // never reused; any caller gets -1.
    // This loop runs after every named assignment above, so if a SYS_*
    // constant ever collided with one of these literals the retired stub
    // would be the one left in the slot. The 46 named entries plus these
    // 10 literals add up to the 56 slots pinned by the comptime guard —
    // provided all ids are distinct, no slot is left null.
    for retired in ([_]usize{ 0, 5, 8, 9, 11, 23, 24, 27, 28, 29 }) {
        t[retired] = #ptrCast(&sys_retired)
    }

    break :blk t
}

// Build-time guard: arch/aarch64/asm_defs_common.inc must declare
// `#define NR_SYSCALLS 56` to match. If you bump the highest SYS_*
// constant in lib/syscall_defs.flash, also bump the asm-side literal,
// then update this comptime check.
comptime {
    // Evaluated at compile time: any NR_SYSCALLS value other than the
    // literal 56 stops the build with the message below. The literal is
    // a hand-maintained mirror of the asm-side define; nothing here reads
    // the .inc file itself.
    if defs.NR_SYSCALLS != 56 {
        #compileError("NR_SYSCALLS drifted from arch/aarch64/asm_defs_common.inc — keep both in lockstep")
    }
}

/// Map each syscall function pointer to its high-mem (TTBR1) alias so
/// el0_svc can `blr` through the table after the user pgd has been
/// installed in TTBR0.
///
/// Null (unassigned) slots are left null. A null entry has the integer
/// value 0, so OR-ing LINEAR_MAP_BASE into it would produce a non-null
/// pointer to LINEAR_MAP_BASE itself: a syscall number with no handler
/// would then branch to whatever is mapped at the base of the linear map
/// instead of finding the null the table's own documentation promises.
///
/// Calling this more than once is harmless: OR-ing the base into an
/// already-relocated entry leaves it unchanged.
export fn sys_call_table_relocate() void {
    var slot usize = 0
    while slot < defs.NR_SYSCALLS {
        const low_alias u64 = #intFromPtr(sys_call_table[slot])
        // 0 is the integer form of a null slot — keep it null.
        if low_alias != 0 {
            sys_call_table[slot] = #ptrFromInt(low_alias | LINEAR_MAP_BASE)
        }
        slot += 1
    }
}
raw view on GitHub →