Flash 484 lines
// execve: path-resolved ELF loader. Streams the whole ELF file from an
// open VFS file into a static kernel buffer (exec_buf), then hands off to
// the ELF loader in src/fork.zig, which copies each PT_LOAD segment out of
// that single blob. There is no per-image size cap beyond MAX_EXEC_BYTES
// and no double-copy. Argv strings + pointer array live in the
// eagerly-mapped top stack page; entry contract is x0 = argc, x1 = argv
// (AAPCS64).
//
// Wired into sys.zig via execve_impl + the SYS_EXECVE dispatch slot.
// execve_impl resolves the path through the VFS shim, streams the ELF
// into a static kernel buffer, encodes argv, and hands off to the
// argv-aware loader in src/fork.zig. The kernel body sits behind a
// comptime is_kernel guard so the host-test build compiles only the pure
// encodeArgvBlock (build.zig wires this file with no kernel imports).
const std = #import("std")
const builtin = #import("builtin")
// The real execve_impl body runs only on the freestanding kernel; the
// host-test build compiles encodeArgvBlock alone. A comptime-known guard
// keeps the kernel-only branch — and therefore the kernel-only imports
// and externs below — out of host analysis: Zig only analyses the taken
// branch of a comptime if, so execveKernel and its dependencies are never
// referenced (and never resolved) when is_kernel is false.
const is_kernel = builtin.target.os.tag == .freestanding
// Kernel-only imports. Referenced solely inside execveKernel, so on the
// host build they are never analysed and need not resolve.
const task_layout = #import("task_layout")
const vfs = #import("vfs")
const user_layout = #import("user_layout")
const path_mod = #import("path")
// Permission gate: exec-intent check + the shared EACCES
// constant. Same lazy-analysis posture as the imports above.
const perm = #import("perm")
const defs = #import("syscall_defs")
// Kernel-only externs (same lazy-analysis posture as the imports).
// The task execveKernel acts on. Optional: execveKernel returns -1 when it
// is null.
extern var current ?*mut task_layout.TaskStruct
// Releases one page. execveKernel's teardown passes it every non-zero
// user_pages[i].pa and kernel_pages[i] entry of the current task.
extern fn free_page(p u64) void
// Copies len bytes from user VA uva into the kernel buffer kbuf.
// execveKernel treats a negative return as a fault and soft-fails.
extern fn copy_from_user(kbuf [*]mut u8, uva u64, len u64) i32
// Preemption guards. execveKernel calls them in balanced pairs and nests
// them, relying on the counter semantics described in its own comments.
extern fn preempt_disable() void
extern fn preempt_enable() void
// C-ABI trampoline into the argv-aware ELF loader (src/fork.zig). A leaf
// module cannot import the root kernel_mod where prepare_move_to_user_elf_argv
// lives, so fork.zig exports this thin shim. argv_block_ptr is a kernel
// pointer to an ArgvBlock, or 0 for the no-argv path. execveKernel treats a
// negative return as a loader failure (OOM) past the point of no return.
extern fn move_to_user_elf_argv(blob_addr_kva u64, blob_size u64, argv_block_ptr u64) i32
// OOM-after-teardown diagnostics. A loader -1 past the point of no return
// cannot return to userland (the caller's pgd is gone), so it emits this
// marker and zombies the task, mirroring do_data_abort's fault-context OOM.
extern fn main_output(interface i32, str [*:0]u8) void
// NOTE(review): declared `void` here, while execveKernel's comments rely on
// it never returning — confirm against the definition.
extern fn exit_process() void
// Interface id handed to main_output as its first argument.
// NOTE(review): presumably selects the mini-UART console — confirm.
const MU i32 = 0
// Largest ELF the path-resolved loader will stream in: 0x10000 = 64 KiB.
// Sits well above PAGE_SIZE (the retired blob loader's cap) so multi-page
// programs load; argv_echo.elf is ~4.5 KiB, fsh will stay under 16 KiB. A
// larger file resolves to a clean -1 rather than a silent clamp.
// Baseline-neutral: exec_buf lives in kernel .bss (below MALLOC_START), not
// the page pool.
pub const MAX_EXEC_BYTES usize = 0x10000
// One exec at a time (uniprocessor; a future SMP release revisits, same posture as
// argv_scratch). exec_buf snapshots the whole ELF contiguously so the
// loader's per-PT_LOAD memcpy walks a single blob (get_free_page would
// hand back non-contiguous pages); arg_storage holds the copied-in argv
// strings before encodeArgvBlock serialises them.
var exec_buf [MAX_EXEC_BYTES]u8 = undefined
// Sized with MAX_ARGV_BYTES, which is declared further down in this file.
var arg_storage [MAX_ARGV_BYTES]u8 = undefined
// execveKernel frame relief. These were execveKernel stack
// locals; they moved up here — same one-exec-at-a-time posture as
// exec_buf / arg_storage — because the per-task kernel stack shares its
// 4 KiB page with TaskStruct (~2.4 KiB usable above KeRegs) and this
// ~1.8 KiB of path / join / argv-slice buffers pushed the frame past it.
// The overflow lands in the TaskStruct tail: it had been silently
// clipping the unused tail of `cwd[]` all along, and the appended
// credential fields (added after cwd) made it visible as garbage
// euid/gid right after an exec. Container-level analysis is lazy, so the
// host-test build (which never analyses execveKernel) never sees these.
// exec_join_buf is sized to task_layout.CWD_SIZE by a literal; the comptime
// check in execveKernel keeps that literal honest without naming
// task_layout in a container-level declaration the host build would have
// to analyse (the host build has no task_layout module; the import near
// the top of this file is only resolved when execveKernel is analysed).
var exec_kpath [1024]u8 = undefined
var exec_join_buf [256]u8 = undefined
// One kernel slice per argv string, each pointing into arg_storage.
var exec_argv_slices [MAX_ARGV][]u8 = undefined
// Maximum argv string count surfaced to userland. Bounded by the top
// stack page (one PAGE_SIZE for strings + pointer array).
pub const MAX_ARGV usize = 32
// Maximum total argv byte budget — strings + pointer array combined.
// Picked under PAGE_SIZE so the eagerly-mapped top stack page holds
// the whole block with headroom for the initial sp alignment.
pub const MAX_ARGV_BYTES usize = 3072
// Encoded argv-on-stack image. encodeArgvBlock fills `bytes` against a
// kernel-side scratch buffer; prepare_move_to_user_elf copies it into
// the top stack page's KVA alias and writes argc/argv/sp into the
// task's saved register frame before eret.
pub const ArgvBlock = struct {
// Initial user stack pointer. encodeArgvBlock sets it to the user VA of the
// block's lowest byte (top_stack_uva minus the 16-byte-aligned block size).
sp u64,
// User VA of argv[0]'s pointer slot. encodeArgvBlock sets it equal to sp
// because the pointer array is the lowest region of the block.
argv_uva u64,
// Number of argv strings encoded (the NULL terminator is not counted).
argc u64,
// Serialised image: pointer array, NULL terminator, packed strings, NULL
// guard and alignment pad. As filled by encodeArgvBlock this is a slice of
// the module-level argv_scratch buffer, so it is only valid until the next
// encodeArgvBlock call.
bytes []mut u8,
}
// Exported execve entry point (the SYS_EXECVE dispatch slot in sys.zig
// calls this). On the freestanding kernel it forwards to execveKernel; on
// the host-test build, where only encodeArgvBlock is exercised, it is a
// stub that returns -1.
export fn execve_impl(path_ptr u64, argv_ptr u64) i32 {
    // is_kernel is comptime-known, so only the selected arm of this `if`
    // expression is analysed: execveKernel (and its kernel-only imports)
    // stay out of the host build.
    return if (is_kernel) execveKernel(path_ptr, argv_ptr) else -1
}
// Real path-resolve → copy-argv → stream-file → set-regs flow behind
// execve_impl.
//
//   path_ptr  user VA of the NUL-terminated path to execute.
//   argv_ptr  user VA of a NULL-terminated array of user string pointers,
//             or 0 for an empty argv.
//
// Every user copy and validation happens BEFORE the address-space teardown
// ("point of no return"), so a wild path/argv UVA soft-fails to -1 with
// the caller intact — the same contract gate-4's [TEST] efault-syscall
// proves for sys_openFile. Pre-teardown failures return -1, or
// -defs.EACCES when the exec-permission check denies the file. On success
// the return value is argc (see the comment at the final return). A loader
// failure after the teardown emits the OOM marker and calls exit_process.
fn execveKernel(path_ptr u64, argv_ptr u64) i32 {
    const c = current orelse return -1
    // Serialise the WHOLE of execveKernel. It fills, then much later consumes,
    // a pile of shared kernel statics (exec_kpath / exec_join_buf /
    // exec_argv_slices, arg_storage, argv_scratch, exec_buf — the "one exec at
    // a time" posture at exec_buf's decl). The final consume is
    // move_to_user_elf_argv, which memcpys out of BOTH exec_buf and
    // argv_scratch — long after the fill — so a timer preempt anywhere from the
    // first static write down to that consume could schedule a second task
    // through execveKernel, clobber the buffers, and leave this task loading a
    // corrupted image. preempt_count is per-task and timer_tick honours
    // preempt_count > 0 (src/sched.zig), so this one disable defers
    // rescheduling across the entire body; the defer re-balances on every
    // return. (The OOM branch calls noreturn exit_process without running the
    // defer, but exit_process zombifies this task and voluntary _schedule
    // switches away regardless of preempt_count, so the leaked count is inert —
    // the next `current` carries its own.) The inner open/fill/close guards
    // below nest harmlessly under this one.
    preempt_disable()
    defer preempt_enable()
    // The static join buffer must stay in lockstep with the cwd budget
    // (see the container-scope comment at exec_join_buf). Both operands are
    // comptime-known, so the untaken arm — and #compileError with it — is
    // never analysed unless the lengths drift. (Flash has no in-function
    // comptime block, so this comptime-known discard carries the assert.)
    _ = if (exec_join_buf.len == task_layout.CWD_SIZE) 0 else #compileError("exec_join_buf must match task_layout.CWD_SIZE")
    // 1. Copy the path in (byte loop, soft-fail on a wild UVA — mirrors
    //    sys_openFile:195-204). No teardown yet → the caller survives a fault.
    //    The loop stops one byte short of the buffer, so a path that fills
    //    it without a NUL is rejected rather than truncated.
    const kpath = &exec_kpath
    var pi usize = 0
    var nul_found bool = false
    while pi < kpath.len - 1 {
        var b u8 = 0
        if copy_from_user(#ptrCast(&b), path_ptr + pi, 1) < 0 { return -1 }
        kpath[pi] = b
        if b == 0 {
            nul_found = true
            break
        }
        pi += 1
    }
    if !nul_found { return -1 } // not NUL-terminated within the buffer
    const raw_path = std.mem.span(#as([*:0]u8, #ptrCast(kpath)))
    // Relative paths (no leading '/') are joined against current.cwd
    // and `.` / `..` collapsed via the host-tested helper in
    // src/path.zig; absolute paths pass through. Still pre-teardown
    // (the VFS open below is the next failable step), so an oversize
    // join returns -1 with the caller intact.
    var path []u8 = undefined
    if raw_path.len > 0 && raw_path[0] == '/' {
        path = raw_path
    } else {
        const cwd_slice = std.mem.sliceTo(#as([*:0]u8, #ptrCast(&c.cwd)), 0)
        path = path_mod.joinResolve(cwd_slice, raw_path, &exec_join_buf) orelse return -1
    }
    // 2. Copy argv in: walk the NULL-terminated user pointer array, copy
    //    each NUL-terminated string into arg_storage, build kernel slices.
    //    Bounded by MAX_ARGV count and MAX_ARGV_BYTES total; any
    //    fault/overflow → -1 (still pre-teardown).
    const slices = &exec_argv_slices
    var argc usize = 0
    var store_off usize = 0
    if argv_ptr != 0 {
        while true {
            var p u64 = 0
            if copy_from_user(#ptrCast(&p), argv_ptr + argc * 8, 8) < 0 { return -1 }
            // Look for the NULL terminator BEFORE applying the count cap.
            // Checking the cap first rejected an argv of exactly MAX_ARGV
            // strings (the terminator slot was never read), although
            // exec_argv_slices and encodeArgvBlock both accept MAX_ARGV.
            if p == 0 { break }
            // A (MAX_ARGV + 1)-th string would overrun exec_argv_slices.
            if argc >= MAX_ARGV { return -1 }
            const start = store_off
            while true {
                var b u8 = 0
                if copy_from_user(#ptrCast(&b), p + (store_off - start), 1) < 0 { return -1 }
                if b == 0 { break }
                // The NUL is not stored (slices carry their length), so
                // only payload bytes count against arg_storage.
                if store_off >= MAX_ARGV_BYTES { return -1 }
                arg_storage[store_off] = b
                store_off += 1
            }
            slices[argc] = arg_storage[start..store_off]
            argc += 1
        }
    }
    // 3. Serialise the argv block (lands in argv_scratch, a static that
    //    survives the teardown below). Soft-fail → -1.
    const blk = encodeArgvBlock(user_layout.STACK_TOP, argc, slices) orelse return -1
    // 4. Resolve the path through the VFS shim (preempt-guarded like
    //    sys_openFile:208-210). Backend miss → -1.
    var open_result vfs.OpenResult = .{}
    preempt_disable()
    const sb_opt = vfs.vfs_open(path, &open_result)
    preempt_enable()
    const sb = sb_opt orelse return -1
    // The file is open from here on. Every outcome of steps 4b-5 —
    // permission denial, oversize image, read error, or a complete read —
    // is recorded in fail_rc and funnels through the single vfs_close
    // below, so a failed exec no longer skips the close. The File is a
    // local (no file_mod.alloc → no page → baseline-neutral).
    // NOTE(review): assumes vfs_close is safe on a File that was opened
    // but never read — confirm for non-initramfs backends.
    var f task_layout.File = .{}
    f.private = open_result.private
    f.size = open_result.size
    f.offset = 0
    var fail_rc i32 = 0
    // 4b. Permission gate: exec-intent check against the caller's
    //     effective ids. Still pre-teardown, so a denied exec soft-fails to
    //     -EACCES with the caller's address space intact — same contract as
    //     the path/argv faults above. (A check after the teardown would
    //     zombie the task instead of returning.)
    if !perm.checkAccess(
        open_result.mode,
        open_result.uid,
        open_result.gid,
        c.euid,
        c.egid,
        .exec
    ) { fail_rc = -defs.EACCES }
    // A file larger than exec_buf is a clean -1, never a silent clamp.
    if fail_rc == 0 && open_result.size > MAX_EXEC_BYTES { fail_rc = -1 }
    // 5. Stream the whole file into exec_buf; EOF (n == 0) ends the loop.
    //    Preemption is held disabled across the ENTIRE fill, not per chunk:
    //    exec_buf is a shared kernel static, so a timer preempt between
    //    chunks could schedule a second task into execveKernel that
    //    overwrites the same buffer mid-stream → corrupted image. preempt is
    //    a counter; the read-error path leaves the loop with `break`, so the
    //    enable after the loop re-balances exactly once on every path.
    var off usize = 0
    if fail_rc == 0 {
        preempt_disable()
        while off < MAX_EXEC_BYTES {
            const take u64 = MAX_EXEC_BYTES - off
            const n = vfs.vfs_read(sb, &f, exec_buf[off..].ptr, take)
            if n < 0 {
                fail_rc = -1
                break
            }
            if n == 0 { break }
            off += #intCast(n)
        }
        preempt_enable()
    }
    // vfs_close is inert for initramfs but call it for backend symmetry —
    // on the failure paths too, before any soft-fail return.
    preempt_disable()
    vfs.vfs_close(sb, &f)
    preempt_enable()
    if fail_rc != 0 { return fail_rc }
    const file_size u64 = off
    // ELF magic gate: reject a non-ELF file. Still pre-teardown, and the
    // file is already closed, so this return leaks nothing.
    const is_elf = file_size >= 4 &&
        exec_buf[0] == 0x7F && exec_buf[1] == 'E' &&
        exec_buf[2] == 'L' && exec_buf[3] == 'F'
    if !is_elf { return -1 }
    // 6. POINT OF NO RETURN — tear down the caller's address space.
    //    Nothing below can soft-fail.
    //    c.fds is deliberately NOT touched: POSIX execve preserves the
    //    fd table so a shell can hand a child its redirected stdio.
    //    c.uid/gid/euid/egid are likewise preserved (the same TaskStruct
    //    survives the image swap), so a privilege drop done in /bin/login
    //    before execve carries into the shell. Only mm pages + pgd go away.
    var i usize = 0
    while i < task_layout.MAX_PAGE_COUNT {
        const pa = c.mm.user_pages[i].pa
        if pa != 0 { free_page(pa) }
        c.mm.user_pages[i] = .{}
        i += 1
    }
    i = 0
    while i < task_layout.MAX_PAGE_COUNT {
        const kp = c.mm.kernel_pages[i]
        if kp != 0 { free_page(kp) }
        c.mm.kernel_pages[i] = 0
        i += 1
    }
    c.mm.pgd = 0
    // 7. Hand off to the argv-aware loader: PT_LOAD map + eager stack +
    //    argv memcpy + x0/x1/sp + set_pgd. Returns 0 (eret jumps to
    //    e_entry, so the caller's post-svc PC is unreachable) or -1. blk is
    //    a stack local — the trampoline derefs it by value immediately, and
    //    blk.bytes points into argv_scratch (static).
    const rc = move_to_user_elf_argv(#intFromPtr(&exec_buf), file_size, #intFromPtr(&blk))
    if rc < 0 {
        // Past the point of no return: the address space is already torn
        // down (pgd == 0), so the caller cannot resume. A loader -1 here is
        // OOM (allocate_user_page exhausted mid-PT_LOAD / stack). Emit the
        // marker and zombie the task. exit_process never returns.
        main_output(MU, "[KERN] OOM\n")
        exit_process()
    }
    // Success: the eret jumps to e_entry, so this "return value" is never
    // read by the (now-replaced) caller. Instead ret_from_syscall
    // (arch/aarch64/entry.S) does `str x0, [sp, 0]` AFTER the loader runs, storing
    // this value into the saved-x0 slot — which becomes the new program's
    // x0. The AAPCS64 entry contract is x0 = argc, so success MUST return
    // argc: the loader's `regs.regs[0] = argc` frame write is otherwise
    // clobbered by that str (x1 = argv survives — ret_from_syscall touches
    // only x0). argc <= MAX_ARGV (32), so the i32 cast cannot truncate.
    return #intCast(argc)
}
// Kernel-side scratch buffer the encoder serialises into. Single-
// threaded exec path + sequential host tests, so a module-level buffer
// is safe; prepare_move_to_user_elf copies the returned slice into the
// top stack page before any reuse.
var argv_scratch [MAX_ARGV_BYTES]u8 = undefined
// Lay out the argv block (pointer array + NUL-terminated strings) for a
// fresh user stack, high → low inside the top stack page:
//
//   top_stack_uva                    ← exclusive end of the mapped page
//   NULL guard + 16-byte alignment pad (>= 8 B, zero)
//   argv[argc-1] string … argv[0] string   (NUL-terminated, packed)
//   NULL terminator (8 B, == argv[argc])
//   argv[argc-1] ptr … argv[0] ptr         (8 B each, UVA into strings)
//                                    ← sp == argv_uva == &argv[0]
//
// Parameters:
//   top_stack_uva  user VA of the top of the stack page (STACK_TOP), not
//                  the kernel alias; pointers are computed against it.
//   argc           number of entries to read from kargv.
//   kargv          kernel slices of the argument strings (no NUL needed;
//                  the encoder appends one per string).
//
// The returned `bytes` are the serialised image whose lowest byte lands
// at top_stack_uva - bytes.len; prepare_move_to_user_elf memcpys it into
// the page's KVA alias at offset PAGE_SIZE - bytes.len. `bytes` aliases
// argv_scratch, so it is valid only until the next call. sp is 16-byte
// aligned per AAPCS64 (STACK_TOP is page-aligned, so aligning the total
// length to 16 suffices).
//
// Returns null on a soft fault: more than MAX_ARGV strings, a total image
// larger than MAX_ARGV_BYTES, or a top_stack_uva too small to hold the
// image (callers turn this into a clean -1 rather than a half-built stack).
pub fn encodeArgvBlock(
    top_stack_uva u64,
    argc usize,
    kargv [*][]u8
) ?ArgvBlock {
    if argc > MAX_ARGV { return null }
    // String bytes = each arg plus its NUL terminator. Each length is
    // vetted BEFORE it is added: str_bytes never exceeds MAX_ARGV_BYTES at
    // the top of an iteration and len is below MAX_ARGV_BYTES, so the
    // `len + 1` and the running sum cannot overflow usize even for a
    // pathological slice length.
    var str_bytes usize = 0
    var i usize = 0
    while i < argc {
        const len = kargv[i].len
        if len >= MAX_ARGV_BYTES { return null }
        str_bytes += len + 1
        if str_bytes > MAX_ARGV_BYTES { return null }
        i += 1
    }
    // Region sizes. The pointer array is argc entries; argv[argc] NULL
    // terminator and the top NULL guard are 8 B each.
    const ptr_bytes = argc * 8
    const core = ptr_bytes + 8 + str_bytes + 8
    const total = std.mem.alignForward(usize, core, 16)
    if total > MAX_ARGV_BYTES { return null }
    // A stack top below the image size would underflow the subtraction
    // below; treat it as a soft fault like the other budget checks.
    if top_stack_uva < total { return null }
    // scratch[0] is the lowest byte → final user VA top_stack_uva - total.
    const base_uva = top_stack_uva - total
    #memset(argv_scratch[0..total], 0)
    // Pointer array at [0, ptr_bytes); argv[argc] NULL at [ptr_bytes,
    // ptr_bytes+8) is left zero. Strings packed ascending from there,
    // argv[0] lowest. Each pointer is the user VA of its string.
    var str_off usize = ptr_bytes + 8
    i = 0
    while i < argc {
        const s = kargv[i]
        std.mem.writeInt(u64, argv_scratch[i * 8 ..][0..8], base_uva + str_off, .little)
        #memcpy(argv_scratch[str_off..][0..s.len], s)
        argv_scratch[str_off + s.len] = 0
        str_off += s.len + 1
        i += 1
    }
    // [str_off, total) is the NULL guard + 16-byte alignment pad, already
    // zeroed by the memset above.
    return .{
        .sp = base_uva,
        .argv_uva = base_uva,
        .argc = argc,
        .bytes = argv_scratch[0..total]
    }
}
// ---- Host Tests ----
const testing = std.testing
// Page-aligned top-of-stack user VA for layout assertions (the real
// call site passes user_layout.STACK_TOP, itself page-aligned).
const TEST_TOP u64 = 0x0000_0FFF_FFFF_F000
// Page size the tests assume (1 << 12 = 4096 bytes): the encoded block and
// every argv pointer must land in [TEST_TOP - TEST_PAGE, TEST_TOP).
const TEST_PAGE u64 = 1 << 12
// Test helper: return argv[i] as a string by decoding the encoded image.
// The i-th 8-byte little-endian slot holds the string's user VA; since
// blk.sp is the user VA of bytes[0], subtracting it yields the string's
// offset inside `bytes`. The result runs up to (not including) the NUL.
fn argAt(blk ArgvBlock, i usize) []u8 {
    const slot = blk.bytes[i * 8 ..][0..8]
    const str_uva = std.mem.readInt(u64, slot, .little)
    const rel usize = #intCast(str_uva - blk.sp)
    const first [*:0]u8 = #ptrCast(&blk.bytes[rel])
    return std.mem.sliceTo(first, 0)
}
test "execve: encodeArgvBlock lays out argc=3" {
    // Three short arguments, as passed for `argv_echo A B`.
    const args = [_][]u8{ "argv_echo", "A", "B" }
    const encoded = encodeArgvBlock(TEST_TOP, args.len, &args) orelse return error.UnexpectedNull
    const zero u64 = 0
    try testing.expectEqual(#as(u64, 3), encoded.argc)
    try testing.expectEqual(encoded.sp, encoded.argv_uva)
    try testing.expectEqual(zero, encoded.sp % 16)
    // Block sits entirely inside the top stack page and butts STACK_TOP.
    const block_end = encoded.sp + encoded.bytes.len
    try testing.expectEqual(TEST_TOP, block_end)
    try testing.expect(encoded.sp >= TEST_TOP - TEST_PAGE)
    // Every pointer slot resolves back to the string it was built from.
    var idx usize = 0
    while idx < args.len {
        try testing.expectEqualStrings(args[idx], argAt(encoded, idx))
        idx += 1
    }
    // argv[argc] is the NULL terminator.
    const terminator = std.mem.readInt(u64, encoded.bytes[3 * 8 ..][0..8], .little)
    try testing.expectEqual(zero, terminator)
}
test "execve: encodeArgvBlock empty argv is a lone NULL" {
    // No strings at all: the block is just the terminator, guard and pad.
    const none = [_][]u8{}
    const encoded = encodeArgvBlock(TEST_TOP, 0, &none) orelse return error.UnexpectedNull
    const zero u64 = 0
    try testing.expectEqual(zero, encoded.argc)
    try testing.expectEqual(encoded.sp, encoded.argv_uva)
    try testing.expectEqual(zero, encoded.sp % 16)
    // argv[0] is immediately NULL: argc=0 + a NULL-terminated empty array.
    const first_slot = std.mem.readInt(u64, encoded.bytes[0..8], .little)
    try testing.expectEqual(zero, first_slot)
}
test "execve: encodeArgvBlock rejects more than MAX_ARGV strings" {
    // One string past the cap; each entry is the one-byte literal "x".
    var too_many [MAX_ARGV + 1][]u8 = undefined
    var idx usize = 0
    while idx < too_many.len {
        too_many[idx] = "x"
        idx += 1
    }
    const expected ?ArgvBlock = null
    const result = encodeArgvBlock(TEST_TOP, too_many.len, &too_many)
    try testing.expectEqual(expected, result)
}
test "execve: encodeArgvBlock rejects oversize byte budget" {
    // A single string as long as the whole budget cannot fit once its NUL
    // and the pointer array are added. Its contents are never read: the
    // encoder bails on the length alone.
    var huge [MAX_ARGV_BYTES]u8 = undefined
    const args = [_][]u8{huge[0..]}
    const expected ?ArgvBlock = null
    const result = encodeArgvBlock(TEST_TOP, args.len, &args)
    try testing.expectEqual(expected, result)
}
test "execve: encodeArgvBlock keeps sp 16-aligned for odd lengths" {
    // Lengths chosen so the unaligned `core` size is not a multiple of 16.
    const args = [_][]u8{ "abc", "de" }
    const encoded = encodeArgvBlock(TEST_TOP, args.len, &args) orelse return error.UnexpectedNull
    const misalignment = encoded.sp % 16
    try testing.expectEqual(#as(u64, 0), misalignment)
    // The padded block still ends exactly at the stack top.
    const block_end = encoded.sp + encoded.bytes.len
    try testing.expectEqual(TEST_TOP, block_end)
    try testing.expectEqualStrings("abc", argAt(encoded, 0))
    try testing.expectEqualStrings("de", argAt(encoded, 1))
}
test "execve: encodeArgvBlock pointers stay inside the stack page" {
    const args = [_][]u8{ "one", "two", "three" }
    const encoded = encodeArgvBlock(TEST_TOP, args.len, &args) orelse return error.UnexpectedNull
    // Lowest user VA of the page that ends at TEST_TOP.
    const page_floor = TEST_TOP - TEST_PAGE
    var idx usize = 0
    while idx < encoded.argc {
        // Each 8-byte slot must hold a user VA in [page_floor, TEST_TOP).
        const str_uva = std.mem.readInt(u64, encoded.bytes[idx * 8 ..][0..8], .little)
        try testing.expect(str_uva >= page_floor)
        try testing.expect(str_uva < TEST_TOP)
        idx += 1
    }
}