// Listing source: ajhahn.de — FlashOS (Flash, 685 lines)
// BCM2711 EMMC2 SDHCI driver — PIO block I/O.
//
// MMIO at 0xFE340000 + LINEAR_MAP_BASE; reachable from EL1 via the
// TTBR1 device-typed mapping boot.S sets up for the GIC / UART / timer.
// Single-block read/write only; multi-block (CMD18 / CMD25) + DMA are
// future optimisations.
//
// Init sequence (matches the SD Physical Layer Simplified Spec):
//   1. Software reset (SRST_HC), internal clock @ ~400 kHz, bus power on
//   2. CMD0  — GO_IDLE_STATE
//   3. CMD8  — SEND_IF_COND, check pattern 0xAA (rejects pre-v2 cards)
//   4. ACMD41 loop — SD_SEND_OP_COND, HCS bit set, until card ready
//   5. CMD2  — ALL_SEND_CID
//   6. CMD3  — SEND_REL_ADDR, capture RCA
//   7. CMD9  — SEND_CSD, decode v2 capacity
//   8. CMD7  — SELECT_CARD (transfer state)
//   9. Switch DIV → ~25 MHz
//
// All waits are polled busy loops; IRQ-driven completion is a future
// perf pass. send_cmd / read_block / write_block return i32 with -1
// on any failure path; the caller (kernel.zig) logs `[Debug] EMMC2
// init FAILED` and continues — graceful degradation.
//
// STATUS — Pi-hardware EMMC2 VERIFIED on real microSD across the full
// stack. init() + write_block(LBA 2064) + read_block +
// byte-compare green against a 64 GB SDXC card formatted FAT32 (MBR,
// name "BOOT") booting FlashOS off EMMC2 with the Toshiba USB
// removed. `[PASS] fs-roundtrip` two-boot acceptance on the same
// card — write 1-byte ROUNDTR.MAG + 4-KiB ROUNDTR.DAT on boot 1,
// power-cycle, read back + verify on boot 2 (16/16 tally, 0 ERROR).
// SDHCI single-block PIO: poll BUFFER_*_RDY once per block, burst all
// 128 words through DATAPORT, then poll DATA_DONE once. The BCM2711
// Arasan controller fires BUFFER_*_RDY per block (not per word), so
// per-word polling drops bytes; the once-per-block pattern matches
// Linux sdhci.c and Circle. `log_io_fail` runs on every failure
// return — zero hot-path overhead and one log line per wedged op.

const std = #import("std")
const sdhci = #import("sdhci_cmd")
const block_dev = #import("block_dev")
const mailbox = #import("mailbox") //      pure: clock-id constants
const mbox = #import("rpi4b_mailbox") //   board: VideoCore MMIO doorbell

// Per-step debug-print: needed to know which SDHCI init step fails on
// real hardware. main_output is the same UART sink kernel.zig uses;
// declaring it extern here keeps emmc2.zig out of the host-test
// build (the module is rpi4b-only, gated by board.zig).
// UART output hooks defined outside this module: one takes a
// NUL-terminated string, the other a u64, both on the given interface.
// NOTE(review): the number format of main_output_u64 is not visible
// here; every call site prints a "0x" prefix first, so it presumably
// emits hex — confirm.
extern fn main_output(interface i32, str [*:0]u8) void
extern fn main_output_u64(interface i32, n u64) void
// Interface selector passed to every main_output* call in this file.
// NOTE(review): presumably the mini-UART — confirm against kernel.zig.
const MU i32 = 0

const DIAG bool = false // per-step SDHCI init trace; flip to true to see which step fails on a bad card

// Offset added to a device address to reach it through the linear map
// (the TTBR1 mapping mentioned in the file header).
const LINEAR_MAP_BASE u64 = 0xFFFF000000000000
// Base the EMMC2 block offset (0x340000) is added to.
const DEVICE_BASE u64 = 0xFE000000
// Address of the EMMC2 register file: 0xFE340000 + LINEAR_MAP_BASE.
const EMMC2_BASE u64 = DEVICE_BASE + 0x340000 + LINEAR_MAP_BASE

// SDHCI register layout (BCM2711 ARM Peripherals §5, simplified to
// the registers the driver touches). Offsets match the SD spec 3.00
// Standard Host Controller register file.
// NOTE(review): the EMMC register chapter is §5 of the BCM2835 ARM
// Peripherals document — confirm which document this citation means.
//
// Sixteen consecutive u32 fields in an `extern struct`, so each field
// sits at the byte offset given in its trailing comment (0x00..0x3C).
// Field order is the hardware layout; do not reorder.
const EmmcRegs = extern struct {
    arg2 u32, // 0x00
    blksizecnt u32, // 0x04 — BLKSIZE (low 12) | BLKCNT (16..31)
    arg1 u32, // 0x08
    cmdtm u32, // 0x0C — CMD + TRANSFER_MODE (sdhci_cmd encodes)
    resp0 u32, // 0x10
    resp1 u32, // 0x14
    resp2 u32, // 0x18
    resp3 u32, // 0x1C
    data u32, // 0x20 — buffer port (PIO drain/fill)
    status u32, // 0x24
    control0 u32, // 0x28
    control1 u32, // 0x2C
    interrupt u32, // 0x30 — write-1-to-clear on real hardware
    irpt_mask u32, // 0x34
    irpt_en u32, // 0x38
    control2 u32 // 0x3C
}

// Typed volatile view of the EMMC2 register file at EMMC2_BASE.
inline fn regs() *mut volatile EmmcRegs {
    const block *mut volatile EmmcRegs = #ptrFromInt(EMMC2_BASE)
    return block
}

// Pointer to a single EMMC2 register that is not part of EmmcRegs.
// `offset` is the byte offset from EMMC2_BASE. The callers in this
// file use it for CAPABILITIES (0x40 / 0x44) and SLOTISR_VER (0xFC),
// which are read for diagnostics only; keeping them out of EmmcRegs
// keeps that struct limited to the registers the driver programs.
inline fn reg_at(comptime offset u32) *mut volatile u32 {
    const addr u64 = EMMC2_BASE + offset
    return #ptrFromInt(addr)
}

// STATUS register flags (offset 0x24).
// CMD_INHIBIT / DAT_INHIBIT are polled before a command is issued.
// SPACE_AVAIL / DATA_AVAIL are declared but not referenced by the
// code visible in this file.
const STATUS_CMD_INHIBIT u32 = 1 << 0
const STATUS_DAT_INHIBIT u32 = 1 << 1
const STATUS_SPACE_AVAIL u32 = 1 << 10
const STATUS_DATA_AVAIL u32 = 1 << 11

// INTERRUPT register flags (offset 0x30). Write-1-to-clear.
// ERR_MASK = 0x017F8000 selects bit 15 plus bits 16..22 and bit 24.
const INTERRUPT_CMD_DONE u32 = 1 << 0
const INTERRUPT_DATA_DONE u32 = 1 << 1
const INTERRUPT_WRITE_RDY u32 = 1 << 4
const INTERRUPT_READ_RDY u32 = 1 << 5
const INTERRUPT_ERR_MASK u32 = 0x017F8000

// CONTROL1 register flags (offset 0x2C).
// SRST_ALL is the union of the three reset bits; init writes it and
// then polls until all three read back as zero.
const CTRL1_CLK_INTLEN u32 = 1 << 0
const CTRL1_CLK_STABLE u32 = 1 << 1
const CTRL1_CLK_EN u32 = 1 << 2
const CTRL1_SRST_HC u32 = 1 << 24
const CTRL1_SRST_CMD u32 = 1 << 25
const CTRL1_SRST_DAT u32 = 1 << 26
const CTRL1_SRST_ALL u32 = CTRL1_SRST_HC | CTRL1_SRST_CMD | CTRL1_SRST_DAT

// Polled-wait spin counts. Big enough to absorb sub-MHz SD cards on
// real hardware (~700 µs at 1.5 GHz) and trivial on QEMU. Don't lower
// to "tune for QEMU" — real cards are slower.
// NOTE(review): the ~700 µs figure corresponds to roughly one loop
// iteration per CPU cycle; every iteration performs a volatile MMIO
// read, so the real upper bound is presumably longer — confirm.
const SPIN_CMD u32 = 1_000_000
const SPIN_DATA u32 = 1_000_000

// Module state filled in by init():
//   rca             — card address, kept in bits 31:16 exactly as it
//                     is passed in the argument of CMD9 / CMD7
//   capacity_blocks — block count taken from the parsed CSD
//   base_clock_hz   — EMMC2 base clock reported by the mailbox
var rca u32 = 0
var capacity_blocks u64 = 0
var base_clock_hz u32 = 0

// Arasan SDHCI core inside the BCM2711 EMMC2 has a clock-domain-crossing
// bugette (Linux drivers/mmc/host/sdhci-iproc.c §"writel" + the bugette
// comment): successive register writes spaced closer than ~2 SD-card
// clock cycles can be silently dropped. At the ~390 kHz identification
// clock that is ~5 µs; back-to-back CPU writes at 1.5 GHz land
// nanoseconds apart, so ARG1 was being lost between BLKSIZECNT and
// CMDTM — every command with a non-zero argument (CMD8, ACMD41, CMD9,
// CMD17, …) fired with ARG=0 and timed out, while CMD0 looked fine
// because its argument is 0 either way. Linux mitigates by inserting a
// 4-SD-clock delay after every writel while host->clock ≤ 400 kHz; this
// driver does the same via `emmc_write`. The flag flips to `false` in
// init step 10 once the bus moves to ~25 MHz, after which the inter-write
// gap is no longer an issue.
var low_clock bool = true

// 4 SD-clock cycles at the ~390 kHz identification clock ≈ 10.3 µs,
// rounded up. Linux uses the same 4-clock delay in
// drivers/mmc/host/sdhci-iproc.c while host->clock ≤ 400 kHz.
const IDENT_CLOCK_DOMAIN_CROSSING_DELAY_US u32 = 11

// Store `val` into an EMMC2 register. While `low_clock` is set the
// store is followed by IDENT_CLOCK_DOMAIN_CROSSING_DELAY_US of delay so
// the next register write cannot land inside the controller's
// clock-domain-crossing window; once the flag is cleared the store is
// all that happens.
inline fn emmc_write(reg *mut volatile u32, val u32) void {
    reg.* = val
    if (!low_clock) {
        return
    }
    delay_us(IDENT_CLOCK_DOMAIN_CROSSING_DELAY_US)
}

// Bring the EMMC2 controller and the inserted SD card from power-on to
// transfer state, then publish read_block / write_block through
// block_dev.sd_dev.
//
// Returns 0 on success and -1 as soon as any step fails: a mailbox
// call, a controller reset / clock-stable timeout, a failed command,
// a CMD8 echo mismatch, ACMD41 never reporting ready, or a CSD that
// parseCsdV2 rejects. On failure block_dev.sd_dev is not written by
// this call.
//
// Side effects: sets the module state `rca`, `capacity_blocks`,
// `base_clock_hz` and `low_clock`.
pub fn init() i32 {
    const r = regs()

    // Every init pass starts at the identification clock, so re-arm the
    // per-write delay before the first register write. Without this a
    // second init() after a successful one would inherit
    // `low_clock = false` and run the whole ID-mode sequence with
    // back-to-back register writes.
    low_clock = true

    // Diagnostic dump before any controller poke. Proves the MMIO
    // address is right (SLOTISR_VER reads a sane vendor/version, not
    // 0xFFFFFFFF) and records the controller's pre-init state.
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 diag SLOTISR_VER=0x")
        main_output_u64(MU, reg_at(0xFC).*)
        main_output(MU, " CAPS_LO=0x")
        main_output_u64(MU, reg_at(0x40).*)
        main_output(MU, " CAPS_HI=0x")
        main_output_u64(MU, reg_at(0x44).*)
        main_output(MU, "\n")
        main_output(MU, "[Debug] EMMC2 diag entry ctrl0=0x")
        main_output_u64(MU, r.control0)
        main_output(MU, " ctrl1=0x")
        main_output_u64(MU, r.control1)
        main_output(MU, " ctrl2=0x")
        main_output_u64(MU, r.control2)
        main_output(MU, " status=0x")
        main_output_u64(MU, r.status)
        main_output(MU, " intr=0x")
        main_output_u64(MU, r.interrupt)
        main_output(MU, "\n")
    }

    // 0. Ensure the SD-card power rail is on. Circle's CardInit calls
    //    PROPTAG_SET_POWER_STATE(SD_CARD, ON|WAIT) before any controller
    //    reset on Pi 4. The Pi 4 boot firmware loaded the kernel from
    //    this slot so VDD is normally already on, but matching Circle
    //    defensively rules out a half-powered state where commands
    //    transmit on the wire but the card can't answer.
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 step 0 sd_power_on\n")
    }
    if (!mbox.setPowerState(mailbox.DEVICE_ID_SD_CARD, mailbox.POWER_STATE_ON | mailbox.POWER_STATE_WAIT)) {
        if (DIAG) {
            main_output(MU, "[Debug] EMMC2 sd_power_on FAILED\n")
        }
        return -1
    }
    delay_us(2_000)

    // 0a. Select the 3.3 V SD I/O rail (expander line 4 = 0; per
    //     bcm2711-rpi-4-b.dts VDD_SD_IO_SEL: 0 = 3.3 V, 1 = 1.8 V),
    //     matching the controller's 3.3 V drive — the conventional
    //     bring-up assumption. Pi-HW init has been verified end-to-end
    //     from this 3.3 V default; 1.8 V UHS-I
    //     switching stays a future perf concern.
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 step 0a sd_io_3v3\n")
    }
    if (!mbox.setGpioState(mailbox.EXP_GPIO_SD_1V8, 0)) {
        if (DIAG) {
            main_output(MU, "[Debug] EMMC2 sd_io_3v3 FAILED\n")
        }
        return -1
    }
    delay_us(5_000)

    // 1. Software reset of the host controller. SRST_HC alone leaves
    //    the CMD/DAT sub-state machines in limbo — cmdtm writes have
    //    no effect on real hardware after SRST_HC alone. Triple-reset
    //    (SRST_HC | SRST_CMD | SRST_DAT) matches Linux's
    //    drivers/mmc/host/sdhci.c sdhci_reset(host, SDHCI_RESET_ALL).
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 step 1 SRST_ALL\n")
    }
    emmc_write(&r.control1, r.control1 | CTRL1_SRST_ALL)
    if (!busy_wait_clear(&r.control1, CTRL1_SRST_ALL, 100_000)) {
        return -1
    }

    // 1a. Bring the SD bus up before the clock. Circle's Pi 4 EMMC
    //     reset path powers VDD and clears CONTROL2 before configuring
    //     SDCLK; SRST_HC zeroes both. POWER_ON = bit 8, BUS_VOLTAGE =
    //     bits 11:9 (0b111 = 3.3 V). Let the rail settle before the
    //     clock is brought up.
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 step 1a bus_power\n")
    }
    emmc_write(&r.control2, 0)
    emmc_write(&r.control0, (#as(u32, 1) << 8) | (#as(u32, 0b111) << 9))
    // SD spec PLSS §6.4.1: ≥1 ms after VDD reaches stable level before
    // first command. Pi 4 firmware can leave BUS_POWER cleared (entry
    // ctrl0=0x00800000 has bit 8 = 0), so this write may be the actual
    // VDD power-on edge for the card — be generous to cover both
    // power-cycle (cold rise) and pure-controller-toggle paths.
    delay_us(10_000)

    // 1b. Resolve the EMMC2 base clock from the VideoCore firmware.
    //     The SDHCI divider is derived from this; the CAP register's
    //     base-clock field is unreliable on the BCM2711, so the
    //     firmware value is the only sound source.
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 step 1b base_clock\n")
    }
    base_clock_hz = mbox.getClockRate(mailbox.CLOCK_ID_EMMC2)
    if (base_clock_hz == 0) {
        if (DIAG) {
            main_output(MU, "[Debug] EMMC2 mailbox clock query FAILED\n")
        }
        return -1
    }
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 base clock=0x")
        main_output_u64(MU, base_clock_hz)
        main_output(MU, "\n")
    }

    // 2. Internal clock + identification-mode divider (~400 kHz). The
    //    divisor is a power of two derived from the firmware base
    //    clock (the BCM2711 EMMC2 only accepts power-of-two dividers).
    //    The delays around CLK_EN mirror Circle's reset path — real
    //    hardware wants the internal clock to settle before the card
    //    clock is gated on, and again before the first command.
    //    TOUNIT = 0xC matches Circle's Pi 4 data-timeout choice.
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 step 2 CLK_STABLE\n")
    }
    const id_div = sdhci.clockDivisor(base_clock_hz, 400_000)
    emmc_write(&r.control1, CTRL1_CLK_INTLEN | sdhci.control1ClockBits(id_div) | (#as(u32, 0xC) << 16))
    if (!busy_wait_set(&r.control1, CTRL1_CLK_STABLE, 100_000)) {
        return -1
    }
    delay_us(2_000)
    emmc_write(&r.control1, r.control1 | CTRL1_CLK_EN)
    delay_us(2_000)

    // 2a. Enable interrupt-status latching. SRST zeroes IRPT_MASK
    //     (0x34, the SDHCI Normal+Error Interrupt Status Enable
    //     register); while it reads 0 the INTERRUPT register (0x30)
    //     never latches a single event, so the polled send_cmd loop
    //     spins out every command. IRPT_MASK gates 0x30 latching;
    //     IRPT_EN (0x38) is the physical-IRQ signal enable and stays
    //     clear — send_cmd is polled and no EMMC line is wired into
    //     the GIC. The explicit IRPT_EN=0 write matches Circle's
    //     CardReset (defensive against firmware that left it non-zero).
    emmc_write(&r.irpt_en, 0)
    emmc_write(&r.interrupt, 0xFFFF_FFFF)
    emmc_write(&r.irpt_mask, 0xFFFF_FFFF)
    delay_us(2_000)

    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 pre-CMD0 status=0x")
        main_output_u64(MU, r.status)
        main_output(MU, " ctrl0=0x")
        main_output_u64(MU, r.control0)
        main_output(MU, " ctrl1=0x")
        main_output_u64(MU, r.control1)
        main_output(MU, " ctrl2=0x")
        main_output_u64(MU, r.control2)
        main_output(MU, " mask=0x")
        main_output_u64(MU, r.irpt_mask)
        main_output(MU, "\n")
    }

    // 3. CMD0 — GO_IDLE_STATE. No response; the card transitions to idle.
    //    Triple-issue with 5 ms gaps. Pi 4 firmware can hand off with
    //    the card in Stand-by or Transfer state (RCA assigned, last
    //    block read complete) rather than the cold-POR Idle state every
    //    other bare-metal driver assumes. A single CMD0 with no inter-
    //    command settle is not guaranteed to traverse the state machine
    //    back to Idle when the card was warm-handed-off. Three sends
    //    with 5 ms gaps gives the card-side state machine time to
    //    transition, per SD PLSS §4.4 NCC + post-reset settle.
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 step 3 CMD0 (x3)\n")
    }
    var cmd0_try u32 = 0
    while (cmd0_try < 3) {
        if (send_cmd(sdhci.CMD0_GO_IDLE, 0, BLKSIZECNT_NONE) < 0) {
            return -1
        }
        delay_us(5_000)
        cmd0_try += 1
    }

    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 post-CMD0 status=0x")
        main_output_u64(MU, r.status)
        main_output(MU, " intr=0x")
        main_output_u64(MU, r.interrupt)
        main_output(MU, "\n")
    }

    // Extra settle after CMD0 burst, before CMD8 — covers post-state-
    // transition NCC plus internal card-clock domain crossing.
    delay_us(5_000)

    // 4. CMD8 — SEND_IF_COND. Echo the 0xAA check pattern back in R7;
    //    mismatch means pre-v2.0 card or out-of-range voltage rail.
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 step 4 CMD8\n")
    }
    if (send_cmd(sdhci.CMD8_SEND_IF_COND, sdhci.CMD8_ARG_VHS_27_36_CHECK_AA, BLKSIZECNT_NONE) < 0) {
        // CMD8 timeout = no card present or unreadable card. Fail
        // cleanly; kernel.zig logs `EMMC2 init FAILED` and degrades
        // to the initramfs path.
        return -1
    }
    if ((r.resp0 & 0xFF) != 0xAA) {
        if (DIAG) {
            main_output(MU, "[Debug] EMMC2 step 4 CMD8 echo mismatch\n")
        }
        return -1
    }

    // 5. ACMD41 — SD_SEND_OP_COND with HCS. Repeated until bit 31 of
    //    OCR (resp0) is set, indicating card power-up complete. Each
    //    ACMD requires a preceding CMD55 (APP_CMD); failures inside
    //    the loop are tolerated because the next pass re-issues both.
    //    A failed pass must not be mistaken for "ready", though: resp0
    //    is only read as an OCR when ACMD41 itself completed, because
    //    after a failed command the register may still hold an earlier
    //    response.
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 step 5 ACMD41\n")
    }
    var tries u32 = 0
    while (tries < 100) {
        _ = send_cmd(sdhci.CMD55_APP_CMD, 0, BLKSIZECNT_NONE)
        const acmd41_rc = send_cmd(sdhci.ACMD41_SD_SEND_OP_COND, sdhci.ACMD41_ARG_HCS_AND_VOLT, BLKSIZECNT_NONE)
        if (acmd41_rc == 0) {
            if ((r.resp0 & (#as(u32, 1) << 31)) != 0) {
                break
            }
        }
        delay_us(10_000)
        tries += 1
    }
    // `break` leaves tries < 100, so reaching 100 means the card never
    // reported power-up complete.
    if (tries == 100) {
        return -1
    }

    // 6. CMD2 — ALL_SEND_CID. R2 lands in resp0..resp3; the CID is
    //    not consumed past init, but the card must transition through
    //    this state to accept CMD3.
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 step 6 CMD2\n")
    }
    if (send_cmd(sdhci.CMD2_ALL_SEND_CID, 0, BLKSIZECNT_NONE) < 0) {
        return -1
    }

    // 7. CMD3 — SEND_REL_ADDR. R6: RCA in resp0[31:16]. Subsequent
    //    addressed commands (CMD7, CMD9) use this in arg[31:16].
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 step 7 CMD3\n")
    }
    if (send_cmd(sdhci.CMD3_SEND_REL_ADDR, 0, BLKSIZECNT_NONE) < 0) {
        return -1
    }
    rca = r.resp0 & 0xFFFF_0000

    // 8. CMD9 — SEND_CSD. R2 again; parseCsdV2 rejects pre-SDHC v1.0
    //    cards (CSD_STRUCTURE = 0) which this driver does not
    //    support.
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 step 8 CMD9\n")
    }
    if (send_cmd(sdhci.CMD9_SEND_CSD, rca, BLKSIZECNT_NONE) < 0) {
        return -1
    }
    const csd = sdhci.parseCsdV2(.{ r.resp0, r.resp1, r.resp2, r.resp3 }) catch {
        if (DIAG) {
            main_output(MU, "[Debug] EMMC2 step 8 CSD parse failed (v1 card?)\n")
        }
        return -1
    }
    capacity_blocks = csd.capacity_blocks

    // 9. CMD7 — SELECT_CARD. Moves the card into the transfer state so
    //    CMD17 / CMD24 are legal.
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 step 9 CMD7\n")
    }
    if (send_cmd(sdhci.CMD7_SELECT_CARD, rca, BLKSIZECNT_NONE) < 0) {
        return -1
    }

    // 10. Transfer-mode clock (~25 MHz). Divisor derived from the same
    //     firmware base clock as the identification divider. The PIO
    //     polled-wait loop dominates throughput, so default-speed SD
    //     (25 MHz) is fine; future perf can pick high-speed via CAP1.
    //     The per-write delay exists only for the ≤400 kHz
    //     identification clock: at ~25 MHz two SD clocks last ~80 ns,
    //     and per the note on `low_clock` the inter-write gap is no
    //     longer an issue. The three CONTROL1 writes below still run
    //     with the delay; `low_clock` is cleared only after the card
    //     clock has been re-enabled at the new rate.
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 step 10 switch_clk\n")
    }
    const tx_div = sdhci.clockDivisor(base_clock_hz, 25_000_000)
    var c1 u32 = r.control1
    c1 &= ~CTRL1_CLK_EN
    emmc_write(&r.control1, c1)
    c1 &= ~#as(u32, 0xFFC0) //                clear SDCLK freq select [15:6]
    c1 |= sdhci.control1ClockBits(tx_div)
    emmc_write(&r.control1, c1)
    if (!busy_wait_set(&r.control1, CTRL1_CLK_STABLE, 100_000)) {
        return -1
    }
    emmc_write(&r.control1, r.control1 | CTRL1_CLK_EN)
    low_clock = false

    // Wire the BlockDev vtable now the controller is in transfer state.
    // The FAT32 backend reads + writes through block_dev.sd_dev;
    // Acceptance #7 checks the slot is populated post-init.
    block_dev.sd_dev = .{ .read_fn = read_block, .write_fn = write_block }
    return 0
}

// Programmed into BLKSIZECNT for non-data commands. Circle writes
// BLKSIZECNT before *every* command (m_block_size | (m_blocks_to_transfer
// << 16); both fields are 0 outside a data transfer); this driver
// follows defensively — some BCM2711 EMMC2 firmware revisions
// reportedly hang CMD8 when stale BLKSIZECNT bits leak in from a
// prior data op.
const BLKSIZECNT_NONE u32 = 0
// Single-block data transfer: BLKCNT = 1 in bits 16..31, BLKSIZE = 512
// in the low bits. Passed by read_block (CMD17) and write_block (CMD24).
const BLKSIZECNT_512x1 u32 = (#as(u32, 1) << 16) | 512

// Issue one SDHCI command and poll for its completion.
//
//   cmdtm      — value written to CMDTM (encoded by sdhci_cmd)
//   arg        — value written to ARG1
//   blksizecnt — value written to BLKSIZECNT (BLKSIZECNT_NONE outside
//                a data transfer)
//
// Returns 0 once CMD_DONE latches (and has been acknowledged), -1 if
// CMD_INHIBIT never clears, an error bit latches, or SPIN_CMD polls
// pass without either. The response registers are left for the caller
// to read. When an error bit and CMD_DONE are seen in the same poll,
// the error wins.
fn send_cmd(cmdtm u32, arg u32, blksizecnt u32) i32 {
    const hw = regs()

    const line_free = busy_wait_clear(&hw.status, STATUS_CMD_INHIBIT, SPIN_CMD)
    if (!line_free) {
        if (DIAG) {
            main_output(MU, "[Debug] send_cmd CMD_INHIBIT stuck\n")
        }
        return -1
    }

    // Acknowledge leftover CMD_DONE / error status, then program the
    // command. All four stores go through emmc_write: at the ID-mode
    // clock the Arasan clock-domain-crossing bug can silently drop a
    // register write that follows another one too closely, and the
    // status clear counts as such a write. CMDTM goes last because
    // writing it is what starts the command.
    emmc_write(&hw.interrupt, INTERRUPT_CMD_DONE | INTERRUPT_ERR_MASK)
    emmc_write(&hw.blksizecnt, blksizecnt)
    emmc_write(&hw.arg1, arg)
    emmc_write(&hw.cmdtm, cmdtm)

    var polls u32 = 0
    while (polls < SPIN_CMD) {
        const latched = hw.interrupt
        const failed = (latched & INTERRUPT_ERR_MASK) != 0
        const completed = (latched & INTERRUPT_CMD_DONE) != 0

        if (failed) {
            if (DIAG) {
                main_output(MU, "[Debug] send_cmd ERR_MASK irpt=0x")
                main_output_u64(MU, latched)
                main_output(MU, " status=0x")
                main_output_u64(MU, hw.status)
                main_output(MU, " resp0=0x")
                main_output_u64(MU, hw.resp0)
                main_output(MU, " resp1=0x")
                main_output_u64(MU, hw.resp1)
                main_output(MU, "\n")
            }
            // Acknowledge the error bits so they are not seen again.
            emmc_write(&hw.interrupt, INTERRUPT_ERR_MASK)
            if (DIAG) {
                main_output(MU, "[Debug] send_cmd post-clear intr=0x")
                main_output_u64(MU, hw.interrupt)
                main_output(MU, "\n")
            }
            return -1
        }

        if (completed) {
            emmc_write(&hw.interrupt, INTERRUPT_CMD_DONE)
            return 0
        }

        polls += 1
    }

    // Neither completion nor an error showed up within SPIN_CMD polls.
    if (DIAG) {
        main_output(MU, "[Debug] send_cmd CMD_DONE timeout status=0x")
        main_output_u64(MU, hw.status)
        main_output(MU, " irpt=0x")
        main_output_u64(MU, hw.interrupt)
        main_output(MU, "\n")
    }
    return -1
}

// Read one 512-byte block into `buf` with CMD17 and PIO.
//
//   lba — passed unchanged as the CMD17 argument
//   buf — receives the 512 bytes in data-port order
//
// Returns 0 on success, -1 on any timeout or controller error; every
// failure return goes through log_io_fail (a no-op unless DIAG is set).
// On failure the contents of `buf` are unspecified.
pub fn read_block(lba u32, buf *mut [512]u8) callconv(.c) i32 {
    const r = regs()
    if (!busy_wait_clear(&r.status, STATUS_CMD_INHIBIT | STATUS_DAT_INHIBIT, SPIN_DATA)) {
        log_io_fail("read pre-CMD17 inhibit-clear timeout", 0xFFFFFFFF)
        return -1
    }

    // Both inhibit bits are clear, so no transfer is in flight and any
    // data-phase status still latched belongs to an earlier (failed)
    // operation. send_cmd only acknowledges CMD_DONE and the error
    // bits, so drop the rest here; a stale READ_RDY would otherwise
    // let the drain loop below start before this block has arrived.
    emmc_write(&r.interrupt, INTERRUPT_DATA_DONE | INTERRUPT_WRITE_RDY | INTERRUPT_READ_RDY)

    // BLKSIZE = 512 (low 12 bits), BLKCNT = 1 (bits 16..31).
    if (send_cmd(sdhci.CMD17_READ_SINGLE, lba, BLKSIZECNT_512x1) < 0) {
        log_io_fail("read CMD17 failed", 0xFFFFFFFF)
        return -1
    }

    // SDHCI single-block PIO: READ_RDY fires once when the block buffer
    // has the full 512 bytes ready; the host then drains it word-by-word
    // without re-polling. Per-word polling is wrong — the interrupt only
    // re-fires for the next block (this driver issues one).
    if (!busy_wait_set(&r.interrupt, INTERRUPT_READ_RDY | INTERRUPT_ERR_MASK, SPIN_DATA)) {
        log_io_fail("read READ_RDY timeout", 0xFFFFFFFF)
        return -1
    }
    // The wait above also ends on an error bit; tell the two apart.
    if ((r.interrupt & INTERRUPT_ERR_MASK) != 0) {
        log_io_fail("read ERR before READ_RDY", 0xFFFFFFFF)
        emmc_write(&r.interrupt, INTERRUPT_ERR_MASK)
        return -1
    }
    emmc_write(&r.interrupt, INTERRUPT_READ_RDY)

    // 128 words x 4 bytes = 512 bytes.
    var i u32 = 0
    while (i < 128) {
        const w = r.data
        // SD bus is little-endian; the data port hands back the wire
        // order directly, so a verbatim byte copy preserves layout.
        const off = i * 4
        const wbytes = std.mem.asBytes(&w)
        buf[off + 0] = wbytes[0]
        buf[off + 1] = wbytes[1]
        buf[off + 2] = wbytes[2]
        buf[off + 3] = wbytes[3]
        i += 1
    }

    if (!busy_wait_set(&r.interrupt, INTERRUPT_DATA_DONE | INTERRUPT_ERR_MASK, SPIN_DATA)) {
        log_io_fail("read DATA_DONE timeout", 0xFFFFFFFF)
        return -1
    }
    if ((r.interrupt & INTERRUPT_ERR_MASK) != 0) {
        log_io_fail("read ERR before DATA_DONE", 0xFFFFFFFF)
        emmc_write(&r.interrupt, INTERRUPT_ERR_MASK)
        return -1
    }
    emmc_write(&r.interrupt, INTERRUPT_DATA_DONE)
    return 0
}

// Write one 512-byte block from `buf` with CMD24 and PIO.
//
//   lba — passed unchanged as the CMD24 argument
//   buf — the 512 bytes to send, pushed through the data port in order
//
// Returns 0 once DATA_DONE has latched, -1 on any timeout or
// controller error; every failure return goes through log_io_fail
// (a no-op unless DIAG is set).
pub fn write_block(lba u32, buf *[512]u8) callconv(.c) i32 {
    const r = regs()
    if (!busy_wait_clear(&r.status, STATUS_CMD_INHIBIT | STATUS_DAT_INHIBIT, SPIN_DATA)) {
        log_io_fail("write pre-CMD24 inhibit-clear timeout", 0xFFFFFFFF)
        return -1
    }

    // Both inhibit bits are clear, so no transfer is in flight and any
    // data-phase status still latched belongs to an earlier (failed)
    // operation. send_cmd only acknowledges CMD_DONE and the error
    // bits, so drop the rest here; a stale WRITE_RDY or DATA_DONE
    // would otherwise let this call push data or report completion
    // too early.
    emmc_write(&r.interrupt, INTERRUPT_DATA_DONE | INTERRUPT_WRITE_RDY | INTERRUPT_READ_RDY)

    // BLKSIZE = 512 (low 12 bits), BLKCNT = 1 (bits 16..31).
    if (send_cmd(sdhci.CMD24_WRITE_SINGLE, lba, BLKSIZECNT_512x1) < 0) {
        log_io_fail("write CMD24 failed", 0xFFFFFFFF)
        return -1
    }

    // SDHCI single-block PIO: WRITE_RDY fires once when the block buffer
    // is ready to accept 512 bytes; the host then pushes the full block
    // word-by-word without re-polling. Per-word polling is wrong — the
    // interrupt only re-fires for the next block (this driver issues one).
    if (!busy_wait_set(&r.interrupt, INTERRUPT_WRITE_RDY | INTERRUPT_ERR_MASK, SPIN_DATA)) {
        log_io_fail("write WRITE_RDY timeout", 0xFFFFFFFF)
        return -1
    }
    // The wait above also ends on an error bit; tell the two apart.
    if ((r.interrupt & INTERRUPT_ERR_MASK) != 0) {
        log_io_fail("write ERR before WRITE_RDY", 0xFFFFFFFF)
        emmc_write(&r.interrupt, INTERRUPT_ERR_MASK)
        return -1
    }
    emmc_write(&r.interrupt, INTERRUPT_WRITE_RDY)

    // 128 words x 4 bytes = 512 bytes. Each word is assembled from four
    // consecutive buffer bytes in memory order before it is stored to
    // the data port.
    var i u32 = 0
    while (i < 128) {
        const off = i * 4
        var w u32 = undefined
        const wbytes = std.mem.asBytes(&w)
        wbytes[0] = buf[off + 0]
        wbytes[1] = buf[off + 1]
        wbytes[2] = buf[off + 2]
        wbytes[3] = buf[off + 3]
        r.data = w
        i += 1
    }

    if (!busy_wait_set(&r.interrupt, INTERRUPT_DATA_DONE | INTERRUPT_ERR_MASK, SPIN_DATA)) {
        log_io_fail("write DATA_DONE timeout", 0xFFFFFFFF)
        return -1
    }
    if ((r.interrupt & INTERRUPT_ERR_MASK) != 0) {
        log_io_fail("write ERR before DATA_DONE", 0xFFFFFFFF)
        emmc_write(&r.interrupt, INTERRUPT_ERR_MASK)
        return -1
    }
    emmc_write(&r.interrupt, INTERRUPT_DATA_DONE)
    return 0
}

// Emit one diagnostic line for a failed block operation: the tag, an
// optional word index, then the current STATUS, INTERRUPT and RESP0
// values. `word_idx` == 0xFFFFFFFF means "no word index" and suppresses
// that field. Does nothing unless DIAG is set.
fn log_io_fail(tag [*:0]u8, word_idx u32) void {
    if (DIAG) {
        const hw = regs()
        const has_word = word_idx != 0xFFFFFFFF

        main_output(MU, "[Debug] EMMC2 ")
        main_output(MU, tag)
        if (has_word) {
            main_output(MU, " word=0x")
            main_output_u64(MU, word_idx)
        }

        main_output(MU, " status=0x")
        main_output_u64(MU, hw.status)

        main_output(MU, " intr=0x")
        main_output_u64(MU, hw.interrupt)

        main_output(MU, " resp0=0x")
        main_output_u64(MU, hw.resp0)

        main_output(MU, "\n")
    }
}

// Poll `reg` until at least one bit of `mask` reads as 1.
// Reads the register at most `max_spin` times; returns true on the
// first read that shows a masked bit set, false once the budget is
// used up (so `max_spin` == 0 returns false without reading). Callers
// turn a false result into their own -1 return.
fn busy_wait_set(reg *mut volatile u32, mask u32, max_spin u32) bool {
    var remaining u32 = max_spin
    while (remaining != 0) {
        const snapshot = reg.*
        if ((snapshot & mask) != 0) {
            return true
        }
        remaining -= 1
    }
    return false
}

// Poll `reg` until every bit of `mask` reads as 0.
// Reads the register at most `max_spin` times; returns true on the
// first read that shows all masked bits clear, false once the budget
// is used up (so `max_spin` == 0 returns false without reading).
fn busy_wait_clear(reg *mut volatile u32, mask u32, max_spin u32) bool {
    var remaining u32 = max_spin
    while (remaining != 0) {
        const snapshot = reg.*
        if ((snapshot & mask) == 0) {
            return true
        }
        remaining -= 1
    }
    return false
}

// Coarse busy-wait used for every settle delay in init() and for the
// per-write gap in emmc_write. Executes `us` x 100 `nop` iterations;
// no timer is consulted, so the wall-clock length depends on how fast
// the core runs the loop. The x100 factor is the author's
// back-of-envelope figure for a 1.5 GHz core with a single-`nop` body.
// NOTE(review): it has not been measured here — confirm against the
// generic timer before relying on the exact microsecond count. QEMU
// runs the loop faster, which only shortens init. Swapping in the
// generic timer's udelay would need a new named-module dependency at
// this layer and is left for a later pass.
fn delay_us(us u32) void {
    const total u64 = #as(u64, us) * 100
    var spun u64 = 0
    while (spun < total) {
        asm volatile ("nop")
        spun += 1
    }
}