// Flash, 458 lines
// fork: process creation, fork() and move-to-user setup.
// Layouts (TaskStruct, KeRegs, ...) come from src/task_layout.zig.
const layout = #import("task_layout")
const TaskStruct = layout.TaskStruct
const CoreContext = layout.CoreContext
const KeRegs = layout.KeRegs
const TASK_RUNNING = layout.TASK_RUNNING
const KTHREAD = layout.KTHREAD
const MAX_PAGE_COUNT = layout.MAX_PAGE_COUNT
const fdtable = #import("fdtable")
// User VA layout (STACK_TOP, HEAP_BASE) + page-permission flags. The ELF
// loader prepare_move_to_user_elf_argv picks per-region flags from these
// (text = RWX — the default bag is EL0 read/write + executable and no
// read-only (AP[2]) descriptor bit is defined, so W^X is not yet
// enforced; data/heap/stack add TD_USER_XN for RW-NX), and do_data_abort
// (src/mm_user.zig) reuses the same bag when demand-allocating heap/stack
// pages on a fault.
const user_layout = #import("user_layout")
// ELF parser — the named module "elf" (src/elf.flash), also the module
// the host tests cover. It moved from a sibling @import to a named
// module when it was ported to Flash: the generated .zig lives in the
// build cache, so a file-relative import can no longer resolve it.
// build.zig wires the same module into the kernel and host-test builds.
const elf = #import("elf")
// Argv-on-stack block type, encoded by execve.encodeArgvBlock and
// written into the top stack page by the argv-aware loader below. Named
// module (not a sibling @import) because src/execve.zig is the "execve"
// module in the kernel build; the fork host test wires the same module
// in via build.zig.
const execve = #import("execve")
// Length of the global task[] table; copy_process_impl scans this many
// slots when looking for a free one.
const NR_TASKS usize = 64
// Page size in bytes (1 << 12 = 4096). Used for segment alignment checks
// and the per-page copy loop in the ELF loader.
const PAGE_SIZE u64 = 1 << 12
// Size of a task's kernel-stack region: exactly one page. task_ke_regs
// places KeRegs at base + THREAD_SIZE - #sizeOf(KeRegs).
const THREAD_SIZE u64 = PAGE_SIZE
// Value the ELF loader writes to the saved pstate of a task entering user
// mode. NOTE(review): the name indicates the SPSR_EL1 "EL0t" mode encoding —
// confirm against the architecture manual.
const SPSR_EL1_MODE_EL0t u64 = 0
// Interface id passed as the first argument of the main_output* helpers by
// the verbose fork trace. NOTE(review): presumably the mini-UART — confirm.
const MU i32 = 0
const builtin = #import("builtin")
// Opt-in fork tracing (default off). See build.zig `-Dverbose-fork`.
const build_options = #import("build_options")
// Kernel-thread PCs must run via TTBR1 (high-mem linear map). Otherwise
// the moment a process does set_pgd() to a user pgd, TTBR0 stops mapping
// the kernel's low-VA copy and the next ret/blr to a kernel function
// faults. ORing instead of adding is idempotent if the address is
// already high.
// On any target whose OS tag is not .freestanding the base is 0, so the OR
// in copy_process_impl leaves addresses unchanged there.
const LINEAR_MAP_BASE u64 = if (builtin.target.os.tag == .freestanding) 0xFFFF000000000000 else 0
// ---- Symbols defined outside this file ----
// Kernel page allocator. get_kernel_page returns the page address as an
// integer; copy_process_impl treats 0 as "no page available".
extern fn get_kernel_page() u64
extern fn free_kernel_page(kp u64) void
// User address-space helpers. release_user_mm is called on the fork failure
// paths to drop whatever the child mm holds. allocate_user_page maps a page
// at `uva` for `tsk` and returns an address the loader writes through
// (0 means failure). A non-zero copy_virt_memory result means failure.
extern fn release_user_mm(t *mut TaskStruct) void
extern fn allocate_user_page(tsk *mut TaskStruct, uva u64, flags u64) u64
extern fn copy_virt_memory(dst *mut TaskStruct) i32
// Raw memory helpers; memzero takes its start address as an integer.
extern fn memzero(start u64, size u64) void
extern fn memcpy(dst *mut anyopaque, src *anyopaque, bytes u64) *mut anyopaque
// Copies one saved-register frame (`from`) into another (`to`).
extern fn copy_ke_regs(to *mut KeRegs, from *mut KeRegs) void
// Installs `pgd`; the ELF loader calls it last, with current.mm.pgd.
extern fn set_pgd(pgd u64) void
// Preemption bracket held for the whole of copy_process_impl.
extern fn preempt_disable() void
extern fn preempt_enable() void
// Its address is stored into a new task's core_context.lr.
extern fn ret_from_fork() void
// Output helpers used by the verbose fork trace; the first argument is the
// interface id (MU in this file).
extern fn main_output(interface i32, str [*:0]u8) void
extern fn main_output_u64(interface i32, inw u64) void
extern fn main_output_char(interface i32, ch u8) void
// Scheduler state: `current` is the running task, `task` the slot table.
// nr_tasks is kept as a high-water mark of used slots and next_pid is the
// next pid to hand out (both updated by copy_process_impl).
extern var current ?*mut TaskStruct
extern var task [NR_TASKS]?*mut TaskStruct
extern var nr_tasks i32
extern var next_pid i32
// Returns the task's saved-register frame (KeRegs), which occupies the last
// #sizeOf(KeRegs) bytes of the task's kernel-stack page.
export fn task_ke_regs(tsk *mut TaskStruct) *mut KeRegs {
    // A task without a dedicated stack page (kstack == 0, i.e. init_task's
    // boot context) keeps the frame at the top of its own TaskStruct page.
    if tsk.kstack == 0 {
        const own_page u64 = #intFromPtr(tsk)
        return #ptrFromInt(own_page + THREAD_SIZE - #sizeOf(KeRegs))
    }
    // Tasks made by copy_process carry their stack page in `kstack`.
    const stack_page u64 = tsk.kstack
    return #ptrFromInt(stack_page + THREAD_SIZE - #sizeOf(KeRegs))
}
// Creates a new task: a kernel thread when clone_flags has KTHREAD set,
// otherwise a fork of `current`.
//
// clone_flags: stored in the child's `flags`; the KTHREAD bit picks the path.
// fn_addr/arg: kernel-thread entry point and argument, parked in x19/x20 of
//   the child's core_context (KTHREAD path only; unused for a fork).
// Returns the child's pid, or -1 when no task[] slot is free, a kernel page
// cannot be allocated, or copy_virt_memory fails. Preemption is disabled on
// entry and re-enabled on every return path.
export fn copy_process_impl(clone_flags u64, fn_addr u64, arg u64) i32 {
    preempt_disable()
    // Reserve a task[] slot BEFORE acquiring anything. The scan used to run
    // after fdtable.dupAll, so running out of slots freed the child without
    // dropping the fd references it had just taken. Failing here instead
    // means there is nothing to unwind. Preemption stays disabled until the
    // slot is filled below, and nothing in this function writes task[] in
    // between.
    // First-null-slot scan instead of monotonic nr_tasks bump so that slots
    // freed by do_wait get reused; otherwise long fork-stress runs hit
    // NR_TASKS=64 well before allocator pressure. nr_tasks is kept as a
    // high-water mark only.
    var slot i32 = -1
    var i usize = 0
    while i < NR_TASKS {
        if task[i] == null {
            slot = #intCast(i)
            break
        }
        i += 1
    }
    if slot < 0 {
        preempt_enable()
        return -1
    }
    // OOM: no kernel page for the child TaskStruct. Bail before any
    // dereference of the (null) pointer; preempt was disabled above.
    const kp = get_kernel_page()
    if kp == 0 {
        preempt_enable()
        return -1
    }
    const p *mut TaskStruct = #ptrFromInt(kp)
    // Dedicated kernel-stack page: the child's kernel stack lives
    // in its own page, decoupled from the TaskStruct page, so a deep
    // syscall plus a nested timer-IRQ frame-save can never overflow into
    // the credential tail (the recurring stack-into-creds class). Freed
    // alongside the TaskStruct page on every exit path; task_ke_regs(p)
    // resolves KeRegs against it.
    const ksp = get_kernel_page()
    if ksp == 0 {
        free_kernel_page(kp)
        preempt_enable()
        return -1
    }
    p.kstack = ksp
    const childregs = task_ke_regs(p)
    memzero(#intFromPtr(childregs), #sizeOf(KeRegs))
    memzero(#intFromPtr(&p.core_context), #sizeOf(CoreContext))
    if (clone_flags & KTHREAD) != 0 {
        // ORing with LINEAR_MAP_BASE keeps the entry PC on the high-half
        // alias so it still resolves after a set_pgd to a user pgd.
        p.core_context.x19 = fn_addr | LINEAR_MAP_BASE
        p.core_context.x20 = arg
    } else {
        const cur_regs = task_ke_regs(current.?)
        // copy_ke_regs is used for the frame copy so that no memcpy call is
        // emitted here.
        copy_ke_regs(childregs, cur_regs)
        // child returns 0 from fork
        childregs.regs[0] = 0
        if copy_virt_memory(p) != 0 {
            // copy_virt_memory may have mapped part of the child mm before
            // failing (OOM mid-copy, or the child's page cap). Release those
            // pages so this path is baseline-neutral, then the kernel-stack
            // and TaskStruct pages. preempt was disabled at entry and must
            // be re-enabled.
            release_user_mm(p)
            free_kernel_page(p.kstack)
            free_kernel_page(#intFromPtr(p))
            preempt_enable()
            return -1
        }
        // Dup the parent's fd table: each installed slot is a shared
        // reference to the same kernel-resident Pipe, and the refcount
        // bumps once per inherited slot. POSIX-equivalent without
        // CLOEXEC for now (future work wires CLOEXEC + close-on-exec).
        // KTHREAD branch skips this — kernel threads cannot reach the
        // EL0 syscall path that fills fd_table. Nothing after this point
        // can fail, so these references are never left orphaned.
        fdtable.dupAll(current.?, p)
        // Inherit the parent's working directory. cwd lives
        // on the child task's kernel page (zeroed by get_kernel_page),
        // so without this copy the child would come up with cwd = ""
        // and the next relative-path open would fall back to root with
        // a stray leading byte. KTHREADs skip the copy along with fds.
        #memcpy(&p.cwd, &current.?.cwd)
        // Inherit process credentials: a forked child runs as
        // the same user as its parent until it (or an image it execs)
        // drops privilege via setuid/setgid. KTHREADs skip this along
        // with fds/cwd.
        p.uid = current.?.uid
        p.gid = current.?.gid
        p.euid = current.?.euid
        p.egid = current.?.egid
    }
    p.flags = clone_flags
    p.priority = current.?.priority
    p.state = TASK_RUNNING
    // Halved so a freshly forked child doesn't out-budget a parent that has
    // already burned ticks; gives the round-robin path a chance to interleave
    // parent/child during fork-stress instead of running parent in a tight
    // burst.
    p.counter = #divTrunc(p.priority, 2)
    p.preempt_count = 1
    p.parent = current
    p.core_context.lr = #intFromPtr(&ret_from_fork) | LINEAR_MAP_BASE
    p.core_context.sp = #intFromPtr(childregs)
    // Pid is monotonic (next_pid++), independent of the reusable slot index.
    p.pid = next_pid
    next_pid += 1
    task[#intCast(slot)] = p
    if slot + 1 > nr_tasks { nr_tasks = slot + 1 }
    if build_options.verbose_fork {
        main_output(MU, "created pid ")
        // Full decimal print. Pids are monotonic, so they pass 99 as soon as
        // slots get reused; the old two-digit form then emitted non-digit
        // bytes and could overflow the u8 cast. An i32 has at most 10 decimal
        // digits; they are produced least-significant first and emitted in
        // reverse.
        var digits [10]u8 = undefined
        var n usize = 0
        var rest i32 = p.pid
        while true {
            digits[n] = #intCast('0' + #mod(rest, 10))
            n += 1
            rest = #divTrunc(rest, 10)
            if rest == 0 { break }
        }
        while n > 0 {
            n -= 1
            main_output_char(MU, digits[n])
        }
        main_output(MU, " at ")
        main_output_u64(MU, #intFromPtr(p))
        main_output(MU, "\n")
    }
    // Read the pid while preemption is still off; the child becomes
    // schedulable once it is re-enabled.
    const child_pid = p.pid
    preempt_enable()
    return child_pid
}
// Loads an ELF image into the current task's address space. Callers
// (kernel boot for the PID 1 init image, and sys_execve via the argv
// trampoline below) snapshot the ELF bytes into a kernel-owned region at
// `blob_addr_kva`, free the old user pages, and zero `current.mm.pgd`
// before calling.
// Walks PT_LOAD segments via the "elf" module (src/elf.flash), allocates fresh user pages
// per segment with region-aware flags (text=RWX — writable, no
// read-only page bit; data/heap/stack=RW-NX),
// memcpys file-backed bytes from the blob, eagerly maps one stack page
// at the top of the user VA, then sets ELR=e_entry / SP=STACK_TOP and
// installs the new pgd. Returns 0 on success, -1 on parse failure /
// alloc failure / non-page-aligned p_vaddr / inconsistent memsz<filesz.
//
// Per-page memcpy uses the kernel-virtual alias of the freshly mapped
// page (returned by allocate_user_page) so the copy works while TTBR0
// still holds the old (now freed) pgd — set_pgd is the last thing
// before return.
//
// The exported 2-arg entry is the ABI kernel boot (PID 1 init) reaches
// via `extern fn`; it loads with no argv. The argv-aware worker takes an
// optional execve.ArgvBlock — when present, its serialised image is
// copied into the top stack page and argc/argv land in x0/x1 (AAPCS64)
// with sp parked at &argv[0] instead of STACK_TOP. sys_execve calls the
// worker directly via the move_to_user_elf_argv trampoline below.
// Exported no-argv entry point: forwards to the argv-aware loader with an
// absent argv block and returns its result unchanged (0 on success, -1 on
// failure).
export fn prepare_move_to_user_elf(blob_addr_kva u64, blob_size u64) i32 {
    const no_argv ?execve.ArgvBlock = null
    return prepare_move_to_user_elf_argv(blob_addr_kva, blob_size, no_argv)
}
// Argv-aware ELF loader for the current task.
//
// blob_addr_kva/blob_size: kernel-resident copy of the ELF image.
// argv_block: optional pre-encoded argv image; when present it is copied to
//   the end of the top stack page and x0/x1/sp are taken from it, otherwise
//   sp starts at STACK_TOP.
// Returns 0 on success. Returns -1 when:
//   - the ELF header or a program header fails to parse;
//   - a PT_LOAD segment is not page-aligned, has memsz < filesz, wraps the
//     address space, or has file bytes that fall outside the blob;
//   - the argv image is larger than the one stack page;
//   - a page allocation fails;
//   - e_entry is not inside an executable PT_LOAD segment.
// Pages mapped before a failure are not unmapped here.
// NOTE(review): cleanup after -1 is presumably the caller's job — confirm.
pub fn prepare_move_to_user_elf_argv(
    blob_addr_kva u64,
    blob_size u64,
    argv_block ?execve.ArgvBlock
) i32 {
    // The argv image is written into the single eagerly-mapped stack page at
    // offset PAGE_SIZE - len. An image larger than the page would underflow
    // that offset, so reject it before mapping anything.
    if argv_block |ab| {
        if ab.bytes.len > PAGE_SIZE { return -1 }
    }
    const blob []u8 = #as([*]u8, #ptrFromInt(blob_addr_kva))[0..blob_size]
    const ehdr = elf.parseEhdr(blob) catch return -1
    var entry_mapped = false
    var it = elf.iteratePhdrs(blob, ehdr)
    while true {
        const ph_opt = it.next() catch return -1
        const ph = ph_opt orelse break
        if ph.p_type != elf.PT_LOAD { continue }
        // Sanity: page-aligned vaddr and memsz >= filesz. Mis-aligned
        // segments would force partial-page memcpys that break the
        // page-grain free-page accounting; reject and document.
        if (ph.p_vaddr & (PAGE_SIZE - 1)) != 0 { return -1 }
        if ph.p_memsz < ph.p_filesz { return -1 }
        // Header fields come straight from the image, so none of the
        // arithmetic below may be allowed to overflow. Reject a segment
        // whose end (p_vaddr + p_memsz) would wrap the 64-bit address space.
        if ph.p_memsz > 0xFFFFFFFFFFFFFFFF - ph.p_vaddr { return -1 }
        // The file-backed bytes must lie wholly inside the blob, otherwise
        // the per-page memcpy below would read past its end.
        if ph.p_filesz != 0 {
            if ph.p_offset > blob_size { return -1 }
            if ph.p_filesz > blob_size - ph.p_offset { return -1 }
        }
        // The entry point must fall inside an executable segment. Written
        // as a subtraction so the range test itself cannot overflow.
        if ehdr.e_entry >= ph.p_vaddr {
            if ehdr.e_entry - ph.p_vaddr < ph.p_memsz {
                if (ph.p_flags & elf.PF_X) != 0 {
                    entry_mapped = true
                }
            }
        }
        if ph.p_memsz == 0 { continue }
        // Executable segments get the default flag bag; everything else
        // additionally gets TD_USER_XN (no-execute).
        const flags u64 = if ((ph.p_flags & elf.PF_X) != 0)
            user_layout.TD_USER_PAGE_FLAGS_DEFAULT
        else
            user_layout.TD_USER_PAGE_FLAGS_DEFAULT | user_layout.TD_USER_XN
        // Ceiling division; p_memsz is non-zero here, so this form cannot
        // overflow the way (p_memsz + PAGE_SIZE - 1) could.
        const num_pages u64 = (ph.p_memsz - 1) / PAGE_SIZE + 1
        var i u64 = 0
        while i < num_pages {
            const seg_off u64 = i * PAGE_SIZE
            const uva = ph.p_vaddr + seg_off
            const kva = allocate_user_page(current.?, uva, flags)
            if kva == 0 { return -1 }
            if seg_off < ph.p_filesz {
                const remaining u64 = ph.p_filesz - seg_off
                const copy_bytes u64 = if (remaining > PAGE_SIZE) PAGE_SIZE else remaining
                // Copy through the address allocate_user_page returned, not
                // through `uva`: the new pgd is only installed at the end.
                _ = memcpy(#ptrFromInt(kva), #ptrFromInt(blob_addr_kva + ph.p_offset + seg_off), copy_bytes)
            }
            // Trailing memsz-filesz BSS bytes are never written here; they
            // rely on freshly allocated pages already being zeroed.
            // NOTE(review): zeroing happens outside this file — confirm.
            i += 1
        }
    }
    if !entry_mapped { return -1 }
    // Eagerly map the top stack page so EL0 entry doesn't fault before
    // the first instruction. Lazy stack growth + guard-page handling
    // arrives in 2.5 / 2.6.
    const stack_uva u64 = user_layout.STACK_TOP - PAGE_SIZE
    const stack_kva = allocate_user_page(
        current.?,
        stack_uva,
        user_layout.TD_USER_PAGE_FLAGS_DEFAULT | user_layout.TD_USER_XN
    )
    if stack_kva == 0 { return -1 }
    const regs = task_ke_regs(current.?)
    memzero(#intFromPtr(regs), #sizeOf(KeRegs))
    regs.elr = ehdr.e_entry
    regs.pstate = SPSR_EL1_MODE_EL0t
    if argv_block |ab| {
        // Copy the encoded argv image into the eagerly-mapped top stack
        // page via its KVA alias (TTBR0 still holds the old pgd until
        // set_pgd below). encodeArgvBlock laid the block flush against
        // STACK_TOP, so it lands at PAGE_SIZE - len from the page base.
        // The length was bounded by PAGE_SIZE at the top of this function.
        const dst [*]mut u8 = #ptrFromInt(stack_kva + (PAGE_SIZE - ab.bytes.len))
        #memcpy(dst[0..ab.bytes.len], ab.bytes)
        // x1 = argv and sp = &argv[0] survive to the new program: kernel_exit
        // restores them from this frame and ret_from_syscall (arch/aarch64/entry.S)
        // does not touch them. x0 = argc is the AAPCS64 contract, but for the
        // sole caller (execve via the SVC path) ret_from_syscall overwrites
        // the saved-x0 slot with execveKernel's return value — so execveKernel
        // returns argc to satisfy it. This frame write keeps the register
        // setup complete for any future direct (non-syscall) caller.
        regs.regs[0] = ab.argc // x0 = argc (overwritten on the SVC path)
        regs.regs[1] = ab.argv_uva // x1 = argv
        regs.sp = ab.sp
    } else {
        regs.sp = user_layout.STACK_TOP
    }
    // Heap starts empty at HEAP_BASE — sys_brk grows / shrinks from
    // here, do_data_abort demand-allocates pages as the heap is touched.
    current.?.mm.brk = user_layout.HEAP_BASE
    set_pgd(current.?.mm.pgd)
    return 0
}
// C-ABI trampoline: src/execve.zig is a leaf module and cannot import the
// root kernel_mod where prepare_move_to_user_elf_argv lives, so it reaches
// the argv-aware loader through this exported symbol — the same pattern
// sys.zig uses to call prepare_move_to_user_elf via `extern fn`. A direct
// call between kernel functions in syscall context works (only the
// indirect dispatch table needs the | LINEAR_MAP_BASE alias). argv_block_ptr
// is a kernel pointer to an execve.ArgvBlock, or 0 for the no-argv path.
// Exported trampoline onto the argv-aware loader. `argv_block_ptr` is the
// integer address of an execve.ArgvBlock, or 0 to load without argv. The
// block is copied by value before the loader runs. Returns the loader's
// result unchanged.
export fn move_to_user_elf_argv(blob_addr_kva u64, blob_size u64, argv_block_ptr u64) i32 {
    // 0 selects the no-argv path.
    if argv_block_ptr == 0 {
        return prepare_move_to_user_elf_argv(blob_addr_kva, blob_size, null)
    }
    const block_ptr *execve.ArgvBlock = #ptrFromInt(argv_block_ptr)
    const block ?execve.ArgvBlock = block_ptr.*
    return prepare_move_to_user_elf_argv(blob_addr_kva, blob_size, block)
}
// ---- Host Tests ----
const std = #import("std")
const testing = std.testing
// Host-test hook, defined outside this file; every copy_process_impl test
// below calls it first. NOTE(review): presumably restores task[], nr_tasks
// and the stub page pool to a clean state — confirm against the stub.
extern fn reset_fork_test() void
test "fork: copy_process_impl creates a child" {
    reset_fork_test()
    // Zero-filled stand-in parent; only `priority` matters to the checks.
    var parent TaskStruct = undefined
    #memset(std.mem.asBytes(&parent), 0)
    parent.priority = 10
    current = &parent
    // clone_flags = 0 takes the fork (non-KTHREAD) path.
    const pid = copy_process_impl(0, 0, 0)
    try testing.expect(pid > 0)
    // The child is installed in the first free slot.
    try testing.expectEqual(#as(i32, 1), nr_tasks)
    try testing.expect(task[0] != null)
    const child = task[0].?
    try testing.expectEqual(pid, child.pid)
    try testing.expectEqual(parent.priority, child.priority)
    // counter starts at half the inherited priority (10 / 2).
    try testing.expectEqual(#as(i64, 5), child.counter)
}
test "fork: task_ke_regs returns correct pointer" {
    // KeRegs always sits this far above the base of whichever page holds it.
    const frame_off u64 = THREAD_SIZE - #sizeOf(KeRegs)
    var tsk TaskStruct = undefined
    #memset(std.mem.asBytes(&tsk), 0)
    // kstack == 0: the frame resolves against the task page itself — the
    // init_task / boot-context fallback.
    const own_frame = task_ke_regs(&tsk)
    try testing.expectEqual(frame_off, #intFromPtr(own_frame) - #intFromPtr(&tsk))
    // kstack set: the frame resolves against the dedicated kernel-stack
    // page instead of the TaskStruct page.
    var kstack_page [THREAD_SIZE]u8 align(16) = undefined
    tsk.kstack = #intFromPtr(&kstack_page)
    const stack_frame = task_ke_regs(&tsk)
    try testing.expectEqual(
        #intFromPtr(&kstack_page) + frame_off,
        #intFromPtr(stack_frame)
    )
}
// Host-test fault-injection hook, defined outside this file. The
// copy_virt_memory-failure test passes `true` before calling
// copy_process_impl. NOTE(review): presumably makes the stubbed
// copy_virt_memory report failure while set — confirm against the stub.
extern fn set_fail_copy_virt(v bool) void
test "fork: copy_process_impl returns -1 when the kernel page OOMs" {
    reset_fork_test()
    var parent TaskStruct = undefined
    #memset(std.mem.asBytes(&parent), 0)
    parent.priority = 10
    current = &parent
    // Exhaust the stub's page pool so the next get_kernel_page returns the
    // failure sentinel. If copy_process_impl did not check that result, it
    // would dereference a null TaskStruct pointer and crash this test.
    var drained usize = 0
    while drained < 256 {
        _ = get_kernel_page()
        drained += 1
    }
    const rc = copy_process_impl(0, 0, 0)
    try testing.expectEqual(#as(i32, -1), rc)
}
test "fork: copy_process_impl returns -1 when copy_virt_memory fails" {
    reset_fork_test()
    var p TaskStruct = undefined
    #memset(std.mem.asBytes(&p), 0)
    p.priority = 10
    current = &p
    // Arm the stub so the copy_virt_memory step inside copy_process_impl
    // reports failure.
    set_fail_copy_virt(true)
    const rc = copy_process_impl(0, 0, 0)
    // Disarm before asserting, so a failed expectation (which returns early)
    // cannot leave the fault armed for later tests. A later test could
    // otherwise get its -1 from this injected failure rather than from the
    // condition it means to exercise.
    set_fail_copy_virt(false)
    // The failure path releases the child mm and both kernel pages and
    // returns -1 without installing the child in any slot.
    try testing.expectEqual(#as(i32, -1), rc)
    try testing.expectEqual(#as(?*mut TaskStruct, null), task[0])
}
test "fork: copy_process_impl returns -1 when all task slots are full" {
    reset_fork_test()
    var p TaskStruct = undefined
    #memset(std.mem.asBytes(&p), 0)
    p.priority = 10
    current = &p
    // Occupy every task[] slot so the first-null-slot scan fails.
    var dummy TaskStruct = undefined
    #memset(std.mem.asBytes(&dummy), 0)
    var i usize = 0
    while i < NR_TASKS {
        task[i] = &dummy
        i += 1
    }
    const rc = copy_process_impl(0, 0, 0)
    // Empty the table again before asserting: every slot points at `dummy`,
    // a local of this test, and must not be left dangling in the global
    // table once this frame is gone.
    i = 0
    while i < NR_TASKS {
        task[i] = null
        i += 1
    }
    try testing.expectEqual(#as(i32, -1), rc)
}