// Source: ajhahn.de — FlashOS
// Flash, 458 lines
// fork: process creation, fork() and move-to-user setup.
// Layouts (TaskStruct, KeRegs, ...) come from src/task_layout.zig.

const layout = #import("task_layout")
const TaskStruct = layout.TaskStruct
const CoreContext = layout.CoreContext
const KeRegs = layout.KeRegs
const TASK_RUNNING = layout.TASK_RUNNING
const KTHREAD = layout.KTHREAD
const MAX_PAGE_COUNT = layout.MAX_PAGE_COUNT

const fdtable = #import("fdtable")

// User VA layout (STACK_TOP, HEAP_BASE) + page-permission flags. The ELF
// loader prepare_move_to_user_elf_argv picks per-region flags from these
// (text = RWX — the default bag is EL0 read/write + executable and no
// read-only (AP[2]) descriptor bit is defined, so W^X is not yet
// enforced; data/heap/stack add TD_USER_XN for RW-NX), and do_data_abort
// (src/mm_user.zig) reuses the same bag when demand-allocating heap/stack
// pages on a fault.
const user_layout = #import("user_layout")

// ELF parser — the named module "elf" (src/elf.flash), also the module
// the host tests cover. It moved from a sibling @import to a named
// module when it was ported to Flash: the generated .zig lives in the
// build cache, so a file-relative import can no longer resolve it.
// build.zig wires the same module into the kernel and host-test builds.
const elf = #import("elf")

// Argv-on-stack block type, encoded by execve.encodeArgvBlock and
// written into the top stack page by the argv-aware loader below. Named
// module (not a sibling @import) because src/execve.zig is the "execve"
// module in the kernel build; the fork host test wires the same module
// in via build.zig.
const execve = #import("execve")

// Capacity of the global task[] table declared below; copy_process_impl
// scans this many slots for a free entry.
const NR_TASKS usize = 64
// Page size in bytes (1 << 12 = 4096).
const PAGE_SIZE u64 = 1 << 12
// Size of the per-task kernel-stack region; task_ke_regs places the
// KeRegs frame at the very top of it.
const THREAD_SIZE u64 = PAGE_SIZE
// Value the ELF loader writes into KeRegs.pstate for a fresh user image.
// NOTE(review): presumably the SPSR_EL1 mode encoding for EL0t — confirm.
const SPSR_EL1_MODE_EL0t u64 = 0
// Interface id passed as the first argument of the main_output* helpers.
const MU i32 = 0

const builtin = #import("builtin")

// Opt-in fork tracing (default off). See build.zig `-Dverbose-fork`.
const build_options = #import("build_options")

// Kernel-thread PCs must run via TTBR1 (high-mem linear map). Otherwise
// the moment a process does set_pgd() to a user pgd, TTBR0 stops mapping
// the kernel's low-VA copy and the next ret/blr to a kernel function
// faults. ORing instead of adding is idempotent if the address is
// already high.
const LINEAR_MAP_BASE u64 = if (builtin.target.os.tag == .freestanding) 0xFFFF000000000000 else 0

// Symbols defined outside this file. The host tests at the bottom of the
// file rely on stub versions of these (e.g. a page pool that can be
// drained so get_kernel_page returns 0).

// Page allocation / address-space helpers. get_kernel_page and
// allocate_user_page return 0 on failure; copy_virt_memory returns
// non-zero on failure (see the checks at their call sites).
extern fn get_kernel_page() u64
extern fn free_kernel_page(kp u64) void
extern fn release_user_mm(t *mut TaskStruct) void
extern fn allocate_user_page(tsk *mut TaskStruct, uva u64, flags u64) u64
extern fn copy_virt_memory(dst *mut TaskStruct) i32
// Raw memory and register-frame helpers.
extern fn memzero(start u64, size u64) void
extern fn memcpy(dst *mut anyopaque, src *anyopaque, bytes u64) *mut anyopaque
extern fn copy_ke_regs(to *mut KeRegs, from *mut KeRegs) void
extern fn set_pgd(pgd u64) void
// Preemption bracket used around task creation.
extern fn preempt_disable() void
extern fn preempt_enable() void
// Address stored in a new task's core_context.lr by copy_process_impl.
extern fn ret_from_fork() void
// Console output helpers; the first argument is an interface id (MU here).
extern fn main_output(interface i32, str [*:0]u8) void
extern fn main_output_u64(interface i32, inw u64) void
extern fn main_output_char(interface i32, ch u8) void

// Scheduler state shared with the rest of the kernel: the running task,
// the task table, its high-water mark, and the next pid to hand out.
extern var current ?*mut TaskStruct
extern var task [NR_TASKS]?*mut TaskStruct
extern var nr_tasks i32
extern var next_pid i32

export fn task_ke_regs(tsk *mut TaskStruct) *mut KeRegs {
    // The saved-register frame occupies the last #sizeOf(KeRegs) bytes of
    // the task's kernel-stack page. A task with a dedicated stack page
    // records it in `kstack`; a task with kstack == 0 (init_task, the boot
    // context) uses the page holding the TaskStruct itself.
    var stack_page u64 = #intFromPtr(tsk)
    if tsk.kstack != 0 {
        stack_page = tsk.kstack
    }
    const stack_end u64 = stack_page + THREAD_SIZE
    return #ptrFromInt(stack_end - #sizeOf(KeRegs))
}

// Creates a new task and installs it in task[].
//
//   clone_flags  KTHREAD set: create a kernel thread that starts at
//                fn_addr with `arg`; otherwise fork `current` (register
//                frame, address space, fd table, cwd and credentials are
//                copied from the parent).
//   fn_addr/arg  kernel-thread entry point and argument (KTHREAD only).
//
// Returns the child's pid, or -1 when no task slot is free, a kernel page
// cannot be allocated, or copy_virt_memory fails. Every failure path
// leaves preemption re-enabled and frees whatever was allocated.
export fn copy_process_impl(clone_flags u64, fn_addr u64, arg u64) i32 {
    preempt_disable()

    // Reserve the task[] slot first. Doing the scan before any allocation
    // means the "table full" bail-out has nothing to undo: previously it
    // ran after fdtable.dupAll had already bumped the parent's pipe
    // refcounts, and those references were never dropped. Preemption stays
    // disabled until the slot is filled below, so the slot cannot be taken
    // in between (the same assumption the late scan relied on).
    //
    // First-null-slot scan instead of monotonic nr_tasks bump so that slots
    // freed by do_wait get reused; otherwise long fork-stress runs hit
    // NR_TASKS=64 well before allocator pressure. nr_tasks is kept as a
    // high-water mark only.
    var slot i32 = -1
    var i usize = 0
    while i < NR_TASKS {
        if task[i] == null {
            slot = #intCast(i)
            break
        }
        i += 1
    }
    if slot < 0 {
        preempt_enable()
        return -1
    }

    // OOM: no kernel page for the child TaskStruct. Bail before any
    // dereference of the (null) pointer; preempt was disabled above.
    const kp = get_kernel_page()
    if kp == 0 {
        preempt_enable()
        return -1
    }
    const p *mut TaskStruct = #ptrFromInt(kp)

    // Dedicated kernel-stack page: the child's kernel stack lives
    // in its own page, decoupled from the TaskStruct page, so a deep
    // syscall plus a nested timer-IRQ frame-save can never overflow into
    // the credential tail (the recurring stack-into-creds class). Freed
    // alongside the TaskStruct page on every exit path; task_ke_regs(p)
    // resolves KeRegs against it.
    const ksp = get_kernel_page()
    if ksp == 0 {
        free_kernel_page(kp)
        preempt_enable()
        return -1
    }
    p.kstack = ksp

    const childregs = task_ke_regs(p)
    memzero(#intFromPtr(childregs), #sizeOf(KeRegs))
    memzero(#intFromPtr(&p.core_context), #sizeOf(CoreContext))

    if (clone_flags & KTHREAD) != 0 {
        p.core_context.x19 = fn_addr | LINEAR_MAP_BASE
        p.core_context.x20 = arg
    } else {
        const cur_regs = task_ke_regs(current.?)
        // copy_ke_regs avoids gcc emitting a memcpy call
        copy_ke_regs(childregs, cur_regs)
        // child returns 0 from fork
        childregs.regs[0] = 0
        if copy_virt_memory(p) != 0 {
            // copy_virt_memory may have mapped part of the child mm before
            // failing (OOM mid-copy, or the child's page cap). Release those
            // pages so this path is baseline-neutral, then the kernel-stack
            // and TaskStruct pages. preempt was disabled at entry and must
            // be re-enabled.
            release_user_mm(p)
            free_kernel_page(p.kstack)
            free_kernel_page(#intFromPtr(p))
            preempt_enable()
            return -1
        }
        // Nothing below this point can fail, so it is safe to take
        // references on the parent's resources.
        //
        // Dup the parent's fd table: each installed slot is a shared
        // reference to the same kernel-resident Pipe, and the refcount
        // bumps once per inherited slot. POSIX-equivalent without
        // CLOEXEC for now (future work wires CLOEXEC + close-on-exec).
        // KTHREAD branch skips this — kernel threads cannot reach the
        // EL0 syscall path that fills fd_table.
        fdtable.dupAll(current.?, p)
        // Inherit the parent's working directory. cwd lives
        // on the child task's kernel page (zeroed by get_kernel_page),
        // so without this copy the child would come up with cwd = ""
        // and the next relative-path open would fall back to root with
        // a stray leading byte. KTHREADs skip the copy along with fds —
        // their default cwd = "/" from the TaskStruct field initialiser
        // is fine for sched-only code paths.
        #memcpy(&p.cwd, &current.?.cwd)
        // Inherit process credentials: a forked child runs as
        // the same user as its parent until it (or an image it execs)
        // drops privilege via setuid/setgid. KTHREADs skip this along
        // with fds/cwd — their 0/root default suits sched-only paths.
        p.uid = current.?.uid
        p.gid = current.?.gid
        p.euid = current.?.euid
        p.egid = current.?.egid
    }

    p.flags = clone_flags
    p.priority = current.?.priority
    p.state = TASK_RUNNING
    // Halved so a freshly forked child doesn't out-budget a parent that has
    // already burned ticks; gives the round-robin path a chance to interleave
    // parent/child during fork-stress instead of running parent in a tight
    // burst.
    p.counter = #divTrunc(p.priority, 2)
    p.preempt_count = 1
    p.parent = current

    p.core_context.lr = #intFromPtr(&ret_from_fork) | LINEAR_MAP_BASE
    p.core_context.sp = #intFromPtr(childregs)

    // Pid is monotonic (next_pid++), independent of the reusable slot index.
    // Keep a local copy: it is returned after preempt_enable(), when the
    // child is already schedulable.
    const pid = next_pid
    next_pid += 1
    p.pid = pid
    task[#intCast(slot)] = p
    if slot + 1 > nr_tasks { nr_tasks = slot + 1 }

    if build_options.verbose_fork {
        main_output(MU, "created pid ")
        // Decimal print for any pid. Pids grow without bound, so the old
        // one-or-two digit special case printed garbage from pid 100 on.
        // Ten digits cover the full non-negative i32 range.
        var digits [10]u8 = undefined
        var count usize = 0
        var rest i32 = pid
        while true {
            digits[count] = #intCast('0' + #mod(rest, 10))
            count += 1
            rest = #divTrunc(rest, 10)
            if rest == 0 { break }
        }
        // Digits were produced least-significant first; emit in reverse.
        while count > 0 {
            count -= 1
            main_output_char(MU, digits[count])
        }
        main_output(MU, " at ")
        main_output_u64(MU, #intFromPtr(p))
        main_output(MU, "\n")
    }

    preempt_enable()
    return pid
}

// Loads an ELF image into the current task's address space. Callers
// (kernel boot for the PID 1 init image, and sys_execve via the argv
// trampoline below) snapshot the ELF bytes into a kernel-owned region at
// `blob_addr_kva`, free the old user pages, and zero `current.mm.pgd`
// before calling.
// Walks PT_LOAD segments via the "elf" module (src/elf.flash), allocates fresh user pages
// per segment with region-aware flags (text=RWX — writable, no
// read-only page bit; data/heap/stack=RW-NX),
// memcpys file-backed bytes from the blob, eagerly maps one stack page
// at the top of the user VA, then sets ELR=e_entry / SP=STACK_TOP and
// installs the new pgd. Returns 0 on success, -1 on parse failure /
// alloc failure / non-page-aligned p_vaddr / inconsistent memsz<filesz.
//
// Per-page memcpy uses the kernel-virtual alias of the freshly mapped
// page (returned by allocate_user_page) so the copy works while TTBR0
// still holds the old (now freed) pgd — set_pgd is the last thing
// before return.
//
// The exported 2-arg entry is the ABI kernel boot (PID 1 init) reaches
// via `extern fn`; it loads with no argv. The argv-aware worker takes an
// optional execve.ArgvBlock — when present, its serialised image is
// copied into the top stack page and argc/argv land in x0/x1 (AAPCS64)
// with sp parked at &argv[0] instead of STACK_TOP. sys_execve calls the
// worker directly via the move_to_user_elf_argv trampoline below.
export fn prepare_move_to_user_elf(blob_addr_kva u64, blob_size u64) i32 {
    // No-argv entry point: forward to the argv-aware loader with an
    // absent argv block and hand its status straight back.
    const no_argv ?execve.ArgvBlock = null
    const status = prepare_move_to_user_elf_argv(blob_addr_kva, blob_size, no_argv)
    return status
}

// Argv-aware ELF loader for the current task.
//
//   blob_addr_kva  kernel address of the ELF image snapshot
//   blob_size      size of that snapshot in bytes
//   argv_block     optional encoded argv image; when present it is copied
//                  to the top of the stack page and x0/x1/sp are set from
//                  it, otherwise sp starts at STACK_TOP
//
// Returns 0 on success. Returns -1 when the header or a program header
// fails to parse, a PT_LOAD segment is not page-aligned, has
// memsz < filesz, wraps the address space, or has file bytes outside the
// blob, when no executable PT_LOAD segment contains e_entry, when the argv
// image is larger than one page, or when a page allocation fails.
pub fn prepare_move_to_user_elf_argv(
    blob_addr_kva u64,
    blob_size u64,
    argv_block ?execve.ArgvBlock
) i32 {
    const U64_MAX u64 = 0xFFFFFFFFFFFFFFFF

    // Only one stack page is mapped eagerly, so the argv image must fit in
    // it. Checked up front, before anything is allocated or the register
    // frame is cleared; a larger image would underflow the offset
    // computation at the copy below.
    if argv_block |ab| {
        if ab.bytes.len > PAGE_SIZE { return -1 }
    }

    const blob []u8 = #as([*]u8, #ptrFromInt(blob_addr_kva))[0..blob_size]
    const ehdr = elf.parseEhdr(blob) catch return -1

    var entry_mapped = false
    var it = elf.iteratePhdrs(blob, ehdr)
    while true {
        const ph_opt = it.next() catch return -1
        const ph = ph_opt orelse break
        if ph.p_type != elf.PT_LOAD { continue }

        // The image is untrusted: a segment whose end wraps past the top
        // of the address space would overflow the range arithmetic below.
        if ph.p_memsz > U64_MAX - ph.p_vaddr { return -1 }

        if ehdr.e_entry >= ph.p_vaddr && ehdr.e_entry < ph.p_vaddr + ph.p_memsz {
            if (ph.p_flags & elf.PF_X) != 0 {
                entry_mapped = true
            }
        }

        // Sanity: page-aligned vaddr and memsz >= filesz. Mis-aligned
        // segments would force partial-page memcpys that break the
        // page-grain free-page accounting; reject and document.
        if (ph.p_vaddr & (PAGE_SIZE - 1)) != 0 { return -1 }
        if ph.p_memsz < ph.p_filesz { return -1 }
        if ph.p_memsz == 0 { continue }

        // File-backed bytes must lie entirely inside the blob, otherwise
        // the memcpy below would copy kernel memory beyond the snapshot
        // into user pages. Written subtraction-first so it cannot overflow.
        if ph.p_filesz != 0 {
            if ph.p_offset > blob_size { return -1 }
            if ph.p_filesz > blob_size - ph.p_offset { return -1 }
        }

        const flags u64 = if ((ph.p_flags & elf.PF_X) != 0)
            user_layout.TD_USER_PAGE_FLAGS_DEFAULT
        else
            user_layout.TD_USER_PAGE_FLAGS_DEFAULT | user_layout.TD_USER_XN

        // Round up to whole pages without computing memsz + PAGE_SIZE - 1,
        // which overflows for a memsz near the top of the u64 range.
        const tail_page u64 = if ((ph.p_memsz & (PAGE_SIZE - 1)) != 0) 1 else 0
        const num_pages u64 = ph.p_memsz / PAGE_SIZE + tail_page
        var i u64 = 0
        while i < num_pages {
            const uva = ph.p_vaddr + i * PAGE_SIZE
            const kva = allocate_user_page(current.?, uva, flags)
            if kva == 0 { return -1 }

            const seg_off u64 = i * PAGE_SIZE
            if seg_off < ph.p_filesz {
                const remaining u64 = ph.p_filesz - seg_off
                const copy_bytes u64 = if (remaining > PAGE_SIZE) PAGE_SIZE else remaining
                _ = memcpy(#ptrFromInt(kva), #ptrFromInt(blob_addr_kva + ph.p_offset + seg_off), copy_bytes)
            }
            // Trailing memsz-filesz BSS bytes are left untouched here and
            // rely on allocate_user_page handing back zeroed pages.
            // NOTE(review): the zeroing is not visible in this file — confirm.
            i += 1
        }
    }

    if !entry_mapped { return -1 }

    // Eagerly map the top stack page so EL0 entry doesn't fault before
    // the first instruction. Lazy stack growth + guard-page handling
    // arrives in 2.5 / 2.6.
    const stack_uva u64 = user_layout.STACK_TOP - PAGE_SIZE
    const stack_kva = allocate_user_page(
        current.?,
        stack_uva,
        user_layout.TD_USER_PAGE_FLAGS_DEFAULT | user_layout.TD_USER_XN
    )
    if stack_kva == 0 { return -1 }

    const regs = task_ke_regs(current.?)
    memzero(#intFromPtr(regs), #sizeOf(KeRegs))
    regs.elr = ehdr.e_entry
    regs.pstate = SPSR_EL1_MODE_EL0t

    if argv_block |ab| {
        // Copy the encoded argv image into the eagerly-mapped top stack
        // page via its KVA alias (TTBR0 still holds the old pgd until
        // set_pgd below). encodeArgvBlock laid the block flush against
        // STACK_TOP, so it lands at PAGE_SIZE - len from the page base.
        // len <= PAGE_SIZE was verified at function entry.
        const dst [*]mut u8 = #ptrFromInt(stack_kva + (PAGE_SIZE - ab.bytes.len))
        #memcpy(dst[0..ab.bytes.len], ab.bytes)
        // x1 = argv and sp = &argv[0] survive to the new program: kernel_exit
        // restores them from this frame and ret_from_syscall (arch/aarch64/entry.S)
        // does not touch them. x0 = argc is the AAPCS64 contract, but for the
        // sole caller (execve via the SVC path) ret_from_syscall overwrites
        // the saved-x0 slot with execveKernel's return value — so execveKernel
        // returns argc to satisfy it. This frame write keeps the register
        // setup complete for any future direct (non-syscall) caller.
        regs.regs[0] = ab.argc // x0 = argc (see note above)
        regs.regs[1] = ab.argv_uva // x1 = argv
        regs.sp = ab.sp
    } else {
        regs.sp = user_layout.STACK_TOP
    }

    // Heap starts empty at HEAP_BASE — sys_brk grows / shrinks from
    // here, do_data_abort demand-allocates pages as the heap is touched.
    current.?.mm.brk = user_layout.HEAP_BASE

    set_pgd(current.?.mm.pgd)
    return 0
}

// C-ABI trampoline: src/execve.zig is a leaf module and cannot import the
// root kernel_mod where prepare_move_to_user_elf_argv lives, so it reaches
// the argv-aware loader through this exported symbol — the same pattern
// sys.zig uses to call prepare_move_to_user_elf via `extern fn`. A direct
// call between kernel functions in syscall context works (only the
// indirect dispatch table needs the | LINEAR_MAP_BASE alias). argv_block_ptr
// is a kernel pointer to an execve.ArgvBlock, or 0 for the no-argv path.
export fn move_to_user_elf_argv(blob_addr_kva u64, blob_size u64, argv_block_ptr u64) i32 {
    // A zero pointer selects the no-argv load.
    if argv_block_ptr == 0 {
        return prepare_move_to_user_elf_argv(blob_addr_kva, blob_size, null)
    }
    // Otherwise the integer is the address of an execve.ArgvBlock; pass a
    // copy of it by value to the loader.
    const block_ptr *execve.ArgvBlock = #ptrFromInt(argv_block_ptr)
    return prepare_move_to_user_elf_argv(blob_addr_kva, blob_size, block_ptr.*)
}

// ---- Host Tests ----
const std = #import("std")
const testing = std.testing

extern fn reset_fork_test() void

test "fork: copy_process_impl creates a child" {
    reset_fork_test()

    // Zero-filled stand-in parent with a known priority, installed as
    // the running task.
    var parent TaskStruct = undefined
    #memset(std.mem.asBytes(&parent), 0)
    parent.priority = 10
    current = &parent

    const pid = copy_process_impl(0, 0, 0)
    try testing.expect(pid > 0)

    // The child lands in slot 0 and the high-water mark moves to 1.
    try testing.expectEqual(#as(i32, 1), nr_tasks)
    try testing.expect(task[0] != null)
    const child = task[0].?
    try testing.expectEqual(pid, child.pid)
    try testing.expectEqual(parent.priority, child.priority)
    // The starting counter is half of the inherited priority.
    try testing.expectEqual(#as(i64, 5), child.counter)
}

test "fork: task_ke_regs returns correct pointer" {
    var tsk TaskStruct = undefined
    #memset(std.mem.asBytes(&tsk), 0)
    const frame_offset u64 = THREAD_SIZE - #sizeOf(KeRegs)

    // Case 1 — kstack == 0: the frame is located relative to the
    // TaskStruct's own address (the init_task / boot-context fallback).
    const own_page_frame = task_ke_regs(&tsk)
    const distance = #intFromPtr(own_page_frame) - #intFromPtr(&tsk)
    try testing.expectEqual(#as(u64, frame_offset), distance)

    // Case 2 — kstack set: the frame is located relative to the separate
    // kernel-stack page instead of the TaskStruct, which is what keeps a
    // deep syscall + nested IRQ frame away from the credential tail.
    var kstack_buf [THREAD_SIZE]u8 align(16) = undefined
    tsk.kstack = #intFromPtr(&kstack_buf)
    const dedicated_frame = task_ke_regs(&tsk)
    try testing.expectEqual(
        #intFromPtr(&kstack_buf) + THREAD_SIZE - #sizeOf(KeRegs),
        #intFromPtr(dedicated_frame)
    )
}

extern fn set_fail_copy_virt(v bool) void

test "fork: copy_process_impl returns -1 when the kernel page OOMs" {
    reset_fork_test()
    var parent TaskStruct = undefined
    #memset(std.mem.asBytes(&parent), 0)
    parent.priority = 10
    current = &parent

    // Exhaust the stub allocator: after 256 grabs get_kernel_page hands
    // back the failure sentinel. copy_process_impl must notice that
    // before it touches the child TaskStruct — dereferencing the null
    // pointer would crash this test.
    var drained usize = 0
    while drained < 256 {
        _ = get_kernel_page()
        drained += 1
    }

    const result = copy_process_impl(0, 0, 0)
    try testing.expectEqual(#as(i32, -1), result)
}

test "fork: copy_process_impl returns -1 when copy_virt_memory fails" {
    reset_fork_test()
    var parent TaskStruct = undefined
    #memset(std.mem.asBytes(&parent), 0)
    parent.priority = 10
    current = &parent

    // Make the stubbed copy_virt_memory report failure.
    set_fail_copy_virt(true)

    // The fork must back out with -1 and must not publish the child:
    // slot 0 stays empty.
    const result = copy_process_impl(0, 0, 0)
    try testing.expectEqual(#as(i32, -1), result)
    try testing.expectEqual(#as(?*mut TaskStruct, null), task[0])
}

test "fork: copy_process_impl returns -1 when all task slots are full" {
    reset_fork_test()
    var parent TaskStruct = undefined
    #memset(std.mem.asBytes(&parent), 0)
    parent.priority = 10
    current = &parent

    // Point every task[] entry at one zeroed placeholder so the search
    // for an empty slot comes up with nothing.
    var placeholder TaskStruct = undefined
    #memset(std.mem.asBytes(&placeholder), 0)
    var slot usize = 0
    while slot < NR_TASKS {
        task[slot] = &placeholder
        slot += 1
    }

    const result = copy_process_impl(0, 0, 0)
    try testing.expectEqual(#as(i32, -1), result)
}