/* Source: ajhahn.de — FlashOS (Assembly, 287 lines) */
#include "asm_defs.inc"

/*
 * create_table_entry — install one table descriptor in a translation table.
 *
 *   tbl         register: base address of the table being filled
 *   ntbl        register: address of the next-level table
 *   va          register: virtual address being mapped
 *   shift       constant: bit position of this level's index field in va
 *   flags       constant: descriptor bits ORed into ntbl (used as an
 *               immediate operand of orr)
 *   tmp1, tmp2  scratch registers
 *
 * Stores (ntbl | flags) at tbl[(va >> shift) & (ENTRIES_PER_TABLE - 1)];
 * each entry is 8 bytes wide.
 */
/* changes tmp1, tmp2 only */
.macro create_table_entry, tbl, ntbl, va, shift, flags, tmp1, tmp2
    /* get entry index in tmp1 */
    lsr \tmp1, \va, #\shift
    and \tmp1, \tmp1, #ENTRIES_PER_TABLE - 1
    /* tmp2 = entry value */
    mov \tmp2, \ntbl
    orr \tmp2, \tmp2, #\flags
    /* install entry: index is scaled by 8 (lsl #3) to a byte offset */
    str \tmp2, [\tbl, \tmp1, lsl #3]
.endm

/*
 * create_block_map — fill a run of consecutive block entries in one table.
 *
 *   pmd          register: base address of the table being filled
 *   vstart       register: first virtual address to map (converted in
 *                place to a table index)
 *   vend         register: end virtual address (converted in place to the
 *                index of the last entry to write)
 *   pa           register: physical address backing vstart; its bits below
 *                SECTION_SHIFT are cleared, then the flags are ORed in
 *   flags_label  label of the literal holding the block flags, loaded with
 *                a PC-relative ldr
 *   tmp1         scratch register, receives the flags
 *
 * The loop is do-while shaped: the entry at vstart's index is always
 * written, and writing continues while the running index is <= vend's.
 */
/* changes vstart, vend, pa, tmp1 */
/* vstart and vend must differ by at least one block. */
/* `flags_label` is a PC-relative label in .text.boot.literals — the
 * literal table is kept out of .text.boot.late so that section stays
 * 4-byte aligned and the linker does not pad before it. */
.macro create_block_map, pmd, vstart, vend, pa, flags_label, tmp1
    /* turn vstart, vend into indices */
    lsr \vstart, \vstart, #SECTION_SHIFT
    and \vstart, \vstart, #ENTRIES_PER_TABLE - 1
    lsr \vend, \vend, #SECTION_SHIFT
    /* minus one to handle the last entry: vend goes from an exclusive
     * end to the inclusive index of the final entry. The subtraction is
     * done before masking, so an end whose index field is 0 wraps to
     * ENTRIES_PER_TABLE - 1. */
    sub \vend, \vend, #1
    and \vend, \vend, #ENTRIES_PER_TABLE - 1
    /* loop init, pa = pa | flags */
    /* shift right then left to clear the bits below SECTION_SHIFT */
    lsr \pa, \pa, #SECTION_SHIFT
    lsl \pa, \pa, #SECTION_SHIFT
    ldr \tmp1, \flags_label
    orr \pa, \pa, \tmp1
    /* loop */
    /* pmd[vstart] = pa */
2:
    str \pa, [\pmd, \vstart, lsl #3]
    /* pa += section size */
    add \pa, \pa, #SECTION_SIZE
    /* vstart += 1 */
    add \vstart, \vstart, #1
    cmp \vstart, \vend
    /* b.le is a signed compare; both operands are masked indices (plus
     * at most one increment), so signedness makes no difference here. */
    b.le 2b
.endm

.section ".text.boot"

/* Image entry point. `_start` and `_start_real` are two exported names
 * for the same instruction, which branches straight to `master`. */
.globl _start
.globl _start_real
_start:
_start_real:
    /* only core 0 starts here */
    /* On virt the Linux arm64 image header (board/virt/image_header.S)
     * lives in .text.boot.header at the image base and branches here
     * via `_start_real`; on Pi the firmware enters at offset 0, which
     * is `_start` itself. Both labels alias the same instruction so
     * Pi kernel8.img stays byte-identical. */
    b master
    /* unreachable: `b` is unconditional and does not link, so control
     * never comes back to this instruction */
    b proc_hang

master: /* entry point of the primary core */
    /* Boot sequence for core 0, in order: stash the DTB pointer, drop
     * to EL1, set up the stack, zero .bss, build the identity and high
     * page tables, call wake_up_cores, record the load address of
     * _start, program TTBR0/TTBR1, MAIR, TCR and VBAR, enable the MMU,
     * then call kernel_main through its LINEAR_MAP_BASE alias with
     * x0 = 0. */
    /* Save the DTB physical address that UEFI / QEMU `-kernel` hand
     * off in x0 (Linux arm64 boot protocol) — must happen before any
     * `bl` that might clobber x0. The macro is board-specific:
     * virt stores into the `.bss` global `dtb_pa`; Pi expands to
     * nothing because no firmware on Pi 4 hands off a DTB pointer. */
    save_dtb_pa x0
    bl drop_to_el1
    /* Board-specific FP/SIMD enable at EL1 — virt sets
     * CPACR_EL1.FPEN, Pi inlines to nothing (armstub already did
     * it at EL3 and Pi's Zig binary contains no NEON). Must run
     * before any Zig code, so before the jump into kernel_main. */
    enable_fp_simd_el1
    /* Board-specific stack init — Pi expands to `mov sp, #LOW_MEMORY`
     * (single 4-byte instruction, baseline-identical); virt expands
     * to `ldr x9, =LOW_MEMORY; mov sp, x9` because its LOW_MEMORY
     * does not fit the immediate field. */
    mov_sp_low_memory x9
    /* Compute the BSS range as load (physical) addresses — the MMU is
     * still off here (map_identity/map_high run below). adr's
     * ADR_PREL_LO21 reach is only ±1 MiB; once kernel .bss grew past
     * that (large statics, e.g. execve's exec_buf) bss_end fell out of
     * range, so use the adrp/add ±4 GiB pair. */
    adrp x0, bss_begin
    add x0, x0, :lo12:bss_begin
    adrp x1, bss_end
    add x1, x1, :lo12:bss_end
    /* x1 = bss_end - bss_begin, i.e. the byte count passed to memzero */
    sub x1, x1, x0
    /* clear out the bss section */
    bl memzero
    bl map_identity
    bl map_high
    bl wake_up_cores
    /* save kernel pa base: store the PC-relative (load) address of
     * _start into KERNEL_PA_BASE */
    /* NOTE(review): `adr x1, KERNEL_PA_BASE` has the same ±1 MiB reach
     * discussed above for bss. If KERNEL_PA_BASE is ever placed further
     * away the link fails with an out-of-range relocation — confirm
     * where it lives. */
    adr x0, _start
    adr x1, KERNEL_PA_BASE
    str x0, [x1]
    /* set ttbr's */
    /* adrp alone yields the 4 KiB page containing the symbol, so both
     * page directories are expected to be page-aligned. */
    adrp x0, id_pg_dir
    msr ttbr0_el1, x0
    adrp x0, high_pg_dir
    msr ttbr1_el1, x0
    /* MAIR/TCR/SCTLR rewrite: always-safe (idempotent on HW); required on
     * QEMU's -kernel shim which skips the EL3-side init armstub does on
     * real Pi 4. Without this the first translation walk after MMU enable
     * faults under QEMU. */
    /* Most literals (MAIR/LINEAR_MAP_BASE/HCR/SPSR) get GAS-optimised to
     * inline `movz` so they emit no pool entry. The two values that do
     * end up in the pool — TCR_EL1_VAL and SCTLR_EL1_VAL_MMU_ENABLED —
     * are routed through explicit labels in `.text.boot.literals`, which
     * keeps boot.S `.text.boot` literal-pool-free and stops GAS from
     * dumping a pool between el1_entry and the board's boot_quirks. */
    ldr x0, =MAIR_EL1_VAL
    msr mair_el1, x0
    ldr x0, .Ltcr_el1_val
    msr tcr_el1, x0
    /* adr is PC-relative, so with the MMU still off this is the load
     * address of `vectors`.
     * NOTE(review): VBAR_EL1 is not rewritten with the LINEAR_MAP_BASE
     * alias in this file; presumably that happens later in the kernel
     * — confirm. */
    adr x0, vectors
    msr vbar_el1, x0
    isb
    /* turn on the mmu */
    /* NOTE(review): no dsb or TLB invalidate is issued in this file
     * between the page-table stores in map_identity/map_high and this
     * write — confirm whether memzero, the board macros or firmware
     * provide it. */
    ldr x0, .Lsctlr_mmu_enabled
    msr sctlr_el1, x0
    isb
    /* prepare jumping to high mem */
    /* x2 = LINEAR_MAP_BASE; it is added to both sp and the PC-relative
     * address of kernel_main to form their high-memory aliases. */
    ldr x2, =LINEAR_MAP_BASE
    add sp, sp, x2
    adr x1, kernel_main
    add x1, x1, x2
    /* core 0 */
    mov x0, #0
    /* jump to high mem */
    blr x1
    /* unreachable unless kernel_main returns; park the core if it does */
    b proc_hang

/* Secondary-core entry: drop to EL1, give the core a stack derived from
 * the low 8 bits of MPIDR_EL1, then call kernel_main with that same
 * value still in x0. */
.globl app
app: /* entry point of the secondary cores */
    bl drop_to_el1
    /* setup stack */
    /* x0 = MPIDR_EL1 & 0xFF, used as the per-core index */
    mrs x0, mpidr_el1
    and x0, x0, #0xFF
    /* x1 = index * SECTION_SIZE, so each core gets a distinct offset */
    mov x1, #SECTION_SIZE
    mul x1, x1, x0
    /* Board-specific add of LOW_MEMORY (see master:'s comment). */
    add_low_memory x1, x1, x9
    mov sp, x1
    /* x0 is not written by the instructions visible here after the `and`
     * above (the board macro is given x1 and x9 only), so kernel_main
     * presumably receives the core index as its argument, mirroring
     * master's `mov x0, #0`. */
    /* NOTE(review): no TTBR/TCR/SCTLR setup is visible on this path and
     * kernel_main is called at its PC-relative address rather than its
     * LINEAR_MAP_BASE alias — presumably handled elsewhere; confirm. */
    /* NOTE(review): unlike master there is no `b proc_hang` after this
     * call; if kernel_main ever returns, execution falls through into
     * drop_to_el1. Adding the branch grows .text.boot by 4 bytes and
     * shifts the layout the literal table's placement depends on, so
     * confirm before changing. */
    bl kernel_main

/* drop_to_el1 — bring the calling core to EL1.
 * Called with `bl`. Clobbers x0. On the EL2 and EL3 paths control comes
 * back to the caller through el1_entry's `ret`, using the x30 set by
 * the caller's `bl` (neither path visible here writes x30). */
drop_to_el1:
    /* Three valid entry paths reach this routine:
     *   * armstub8.bin runs first (real Pi 4 hardware): EL3 with
     *     SPSR_EL3 pre-loaded for the EL1h drop.
     *   * QEMU `-M raspi4b -kernel` shim hands off at EL2 with no
     *     SPSR setup.
     *   * QEMU `-M virt -kernel` (and UEFI/GRUB chain) hands off
     *     directly at EL1.
     * CurrentEL discriminates: the EL1 case is matched by the
     * board macro, EL2 by the b.eq below, EL3 falls through. */
    mrs x0, CurrentEL
    check_el1_already x0
    /* the comparison value is the exception level shifted left by 2,
     * so (2 << 2) matches EL2 */
    cmp x0, #(2 << 2)
    b.eq drop_from_el2

    /* EL3 path: armstub already wrote SPSR_EL3 / HCR_EL2 / SCR_EL3
     * etc. Eret to el1_entry. */
    adr x0, el1_entry
    msr ELR_EL3, x0
    eret

/* drop_from_el2 — EL2 leg of drop_to_el1: program HCR_EL2, SPSR_EL2 and
 * ELR_EL2, then eret to el1_entry. Clobbers x0. */
drop_from_el2:
    /* EL2 path: replicate the bits of armstub's setup that matter for
     * dropping to EL1. Both values fit in a single movz, so GAS keeps
     * them inline — no pool entry, no boundary shift. */
    ldr x0, =HCR_EL2_VAL
    msr HCR_EL2, x0
    /* the constant is named for SPSR_EL3 but is written to SPSR_EL2
     * here, since this path returns from EL2 */
    ldr x0, =SPSR_EL3_VAL
    msr SPSR_EL2, x0
    /* exception return address = el1_entry */
    adr x0, el1_entry
    msr ELR_EL2, x0
    eret

/* Landing pad for the `eret` in drop_to_el1 / drop_from_el2. `ret`
 * branches to x30, which the drop paths in this file leave as set by
 * the caller's `bl drop_to_el1`. */
el1_entry:
    ret

/* Board-specific wake_up_cores lives in
 * src/board/<board>/boot_quirks.S; the linker concatenates its
 * ".text.boot" between this file's ".text.boot" and
 * ".text.boot.late" below, preserving the original layout. */

.section ".text.boot.late"

/* map_identity — zero the identity-map page tables at id_pg_dir, then
 * let the board macro fill them in.
 * No stack is used: the return address is parked in x29 across the
 * `bl memzero`, so x29 is overwritten and this relies on memzero (and
 * the board macro) leaving x29 intact. Other clobbers depend on
 * memzero and map_identity_regions, which are defined elsewhere. */
map_identity:
    /* save return address */
    mov x29, x30
    /* adrp alone yields the 4 KiB page containing id_pg_dir, so the
     * table is expected to be page-aligned */
    adrp x0, id_pg_dir
    mov x1, #ID_MAP_TABLE_SIZE
    /* clear id page tables */
    bl memzero
    /* reload x0 after the call rather than assuming memzero kept it */
    adrp x0, id_pg_dir
    /* x1 = address of id map pud */
    add x1, x0, #PAGE_SIZE

    /* Board-specific PUD/PMD setup. `.macro map_identity_regions` is
     * defined in src/board/<board>/board_asm_defs.inc — Pi maps PA
     * 0..0x1000000 via PUD index 0; virt maps PA
     * 0x40000000..0x41000000 via PUD index 1 so the kernel image at
     * PA 0x40080000 stays addressable across the MMU-enable point.
     * The macro expands inline; rpi4b output is byte-identical. */
    map_identity_regions

    /* restore return address */
    mov x30, x29
    ret

/* map_high — zero the high-half page tables at high_pg_dir, install the
 * PGD entry for LINEAR_MAP_BASE, then let the board macro build the
 * lower levels.
 * No stack is used: the return address is parked in x29 across the
 * `bl memzero`, so x29 is overwritten and this relies on memzero (and
 * the board macro) leaving x29 intact. x0-x4 are written here; further
 * clobbers depend on memzero and map_high_regions. */
map_high:
    /* save return address */
    mov x29, x30
    /* adrp alone yields the 4 KiB page containing high_pg_dir, so the
     * table is expected to be page-aligned */
    adrp x0, high_pg_dir
    mov x1, #HIGH_MAP_TABLE_SIZE
    /* clear high page tables */
    bl memzero
    /* reload x0 after the call rather than assuming memzero kept it */
    adrp x0, high_pg_dir
    /* x1 = address of high map pud */
    add x1, x0, #PAGE_SIZE
    /* x4 = address of the mapped va (pgd) */
    ldr x4, =LINEAR_MAP_BASE
    /* install PGD entry (x2, x3 are the macro's scratch registers) */
    create_table_entry x0, x1, x4, PGD_SHIFT, TD_KERNEL_TABLE_FLAGS, x2, x3
    /* goto next level: x0 now points at the PUD and x1 one page past
     * it (presumably the first PMD — see the board macro) */
    add x0, x0, #PAGE_SIZE
    add x1, x1, #PAGE_SIZE

    /* Board-specific PUD/PMD setup + create_block_map calls.
     * `.macro map_high_regions` is defined in
     * src/board/<board>/board_asm_defs.inc — Pi 4 fans out to four
     * 1 GiB PUD slots with a 64 MiB device window in the last PMD;
     * other boards adapt freely.  The macro expands inline, so the
     * emitted bytes for rpi4b stay byte-identical to the previous
     * unrolled form. */
    map_high_regions

    /* restore return address */
    mov x30, x29
    ret

/* proc_hang — park the core forever: wait for an event, then loop back
 * and wait again. Never returns. */
proc_hang:
    /* wait for event */
    wfe
    b proc_hang

/* Explicit literal table — kept out of .text.boot.late so that section
 * stays 4-byte aligned (no 8-byte data inside ⇒ no padding before it
 * when ld concatenates it after .text.boot). Order matches the GAS
 * literal-pool encounter order pre-split, so the table is layout-
 * identical to the single-section baseline. Only values that GAS
 * could not movz-fold into a single inline instruction live here:
 * 32+-bit non-single-chunk values (TCR, SCTLR, HIGH_MAP_*_END,
 * HIGH_MAP_SECOND_START, TD_*_BLOCK_FLAGS, HIGH_MAP_DEVICE_END).
 * linker.ld places .text.boot.literals right after .text.boot.late
 * inside the output .text.boot section.
 *
 * The table is explicitly 8-byte aligned. Every entry is a .quad that
 * is fetched with a 64-bit PC-relative `ldr`, and those loads run
 * before the MMU is enabled (the TCR/SCTLR loads in master, the flag
 * loads in create_block_map). With the MMU off, data accesses are
 * treated as Device memory, where a 64-bit load from an address that
 * is only 4-byte aligned takes an alignment fault. Without the
 * directive the table's alignment would depend on the combined size
 * of all preceding code being a multiple of 8. When it already is,
 * the directive inserts no padding and the image is unchanged. */
.section ".text.boot.literals"
.balign 8

.Ltcr_el1_val:
    .quad TCR_EL1_VAL
.Lsctlr_mmu_enabled:
    .quad SCTLR_EL1_VAL_MMU_ENABLED
.Ltd_kernel_block_flags:
    .quad TD_KERNEL_BLOCK_FLAGS
.Lhigh_map_first_end:
    .quad HIGH_MAP_FIRST_END
.Lhigh_map_second_start:
    .quad HIGH_MAP_SECOND_START
.Lhigh_map_second_end:
    /* same value as HIGH_MAP_THIRD_START; one entry, two consumers */
    .quad HIGH_MAP_SECOND_END
.Lhigh_map_third_end:
    /* same value as HIGH_MAP_FOURTH_START */
    .quad HIGH_MAP_THIRD_END
.Lhigh_map_fourth_end:
    /* same value as HIGH_MAP_DEVICE_START */
    .quad HIGH_MAP_FOURTH_END
.Lhigh_map_device_end:
    .quad HIGH_MAP_DEVICE_END
.Ltd_device_block_flags:
    .quad TD_DEVICE_BLOCK_FLAGS