Flash 685 lines
// BCM2711 EMMC2 SDHCI driver — PIO block I/O.
//
// MMIO at 0xFE340000 + LINEAR_MAP_BASE; reachable from EL1 via the
// TTBR1 device-typed mapping boot.S sets up for the GIC / UART / timer.
// Single-block read/write only; multi-block (CMD18 / CMD25) + DMA are
// future optimisations.
//
// Init sequence (matches the SD Physical Layer Simplified Spec; the
// step numbers are the ones printed by the DIAG trace in init()):
//  0. SD power rail on + 3.3 V I/O select (VideoCore mailbox)
//  1. Software reset (SRST_HC | SRST_CMD | SRST_DAT), bus power on,
//     base clock query
//  2. Internal clock @ ~400 kHz, interrupt-status latching enabled
//  3. CMD0 — GO_IDLE_STATE (issued three times)
//  4. CMD8 — SEND_IF_COND, check pattern 0xAA (rejects pre-v2 cards)
//  5. ACMD41 loop — SD_SEND_OP_COND, HCS bit set, until card ready
//  6. CMD2 — ALL_SEND_CID
//  7. CMD3 — SEND_REL_ADDR, capture RCA
//  8. CMD9 — SEND_CSD, decode v2 capacity
//  9. CMD7 — SELECT_CARD (transfer state)
// 10. Switch DIV → ~25 MHz
//
// All waits are polled busy loops; IRQ-driven completion is a future
// perf pass. send_cmd / read_block / write_block return i32 with -1
// on any failure path; the caller (kernel.zig) logs `[Debug] EMMC2
// init FAILED` and continues — graceful degradation.
//
// STATUS — Pi-hardware EMMC2 VERIFIED on real microSD across the full
// stack. init() + write_block(LBA 2064) + read_block +
// byte-compare green against a 64 GB SDXC card formatted FAT32 (MBR,
// name "BOOT") booting FlashOS off EMMC2 with the Toshiba USB
// removed. `[PASS] fs-roundtrip` two-boot acceptance on the same
// card — write 1-byte ROUNDTR.MAG + 4-KiB ROUNDTR.DAT on boot 1,
// power-cycle, read back + verify on boot 2 (16/16 tally, 0 ERROR).
// SDHCI single-block PIO: poll BUFFER_*_RDY once per block, burst all
// 128 words through DATAPORT, then poll DATA_DONE once. The BCM2711
// Arasan controller fires BUFFER_*_RDY per block (not per word), so
// per-word polling drops bytes; the once-per-block pattern matches
// Linux sdhci.c and Circle. `log_io_fail` runs only on failure
// returns — zero hot-path overhead and at most one log line per wedged op.
const std = #import("std")
const sdhci = #import("sdhci_cmd")
const block_dev = #import("block_dev")
const mailbox = #import("mailbox") // pure: clock-id constants
const mbox = #import("rpi4b_mailbox") // board: VideoCore MMIO doorbell
// Per-step debug-print: needed to know which SDHCI init step fails on
// real hardware. main_output is the same UART sink kernel.zig uses;
// declaring it extern here keeps emmc2.zig out of the host-test
// build (the module is rpi4b-only, gated by board.zig).
extern fn main_output(interface i32, str [*:0]u8) void
// Numeric companion to main_output. Every call site prints a "=0x"
// label first, so the value is presumably rendered as hex — confirm
// against the implementation.
extern fn main_output_u64(interface i32, n u64) void
// Interface id passed as the first argument of every main_output /
// main_output_u64 call in this file.
// NOTE(review): presumably selects the mini-UART — confirm.
const MU i32 = 0
const DIAG bool = false // per-step SDHCI init trace; flip to true to see which step fails on a bad card
// Offset added to a peripheral's physical address to form the address
// this driver dereferences (the TTBR1 device mapping set up by boot.S).
const LINEAR_MAP_BASE u64 = 0xFFFF000000000000
// Physical base of the peripheral window; EMMC2 sits 0x340000 above it.
const DEVICE_BASE u64 = 0xFE000000
// Address of EMMC2 register 0x00 as seen by this driver.
const EMMC2_BASE u64 = DEVICE_BASE + 0x340000 + LINEAR_MAP_BASE
// SDHCI register layout (BCM2711 ARM Peripherals §5, simplified to
// the registers the driver touches). Offsets match the SD spec 3.00
// Standard Host Controller register file. The sixteen consecutive u32
// fields cover byte offsets 0x00..0x3C; registers above that
// (CAPABILITIES, SLOTISR_VER) are reached through reg_at() instead.
const EmmcRegs = extern struct {
arg2 u32, // 0x00 — not written by this driver
blksizecnt u32, // 0x04 — BLKSIZE (low 12) | BLKCNT (16..31)
arg1 u32, // 0x08 — command argument
cmdtm u32, // 0x0C — CMD + TRANSFER_MODE (sdhci_cmd encodes)
resp0 u32, // 0x10 — response word 0 (R7 echo, OCR, RCA in [31:16])
resp1 u32, // 0x14
resp2 u32, // 0x18
resp3 u32, // 0x1C — resp0..resp3 together hold an R2 (CID / CSD)
data u32, // 0x20 — buffer port (PIO drain/fill)
status u32, // 0x24 — CMD / DAT inhibit and buffer-state flags
control0 u32, // 0x28 — bus power (bit 8) and bus voltage (bits 11:9)
control1 u32, // 0x2C — clock enable / divider, data timeout, soft reset
interrupt u32, // 0x30 — latched event / error status; write-1-to-clear
irpt_mask u32, // 0x34 — status enable: gates what latches into 0x30
irpt_en u32, // 0x38 — physical IRQ signal enable; kept 0 (polled driver)
control2 u32 // 0x3C — cleared to 0 during init
}
// Typed view of the EMMC2 register file at its fixed linear-map address.
// Every field access through the returned pointer is a volatile MMIO
// load or store.
inline fn regs() *mut volatile EmmcRegs {
    const base_addr u64 = EMMC2_BASE
    return #ptrFromInt(base_addr)
}
// Pointer to a single 32-bit EMMC2 register at byte `offset` from the
// controller base. Used for the diagnostic-only registers — CAPABILITIES
// (0x40 / 0x44) and SLOTISR_VER (0xFC) — which lie beyond the last field
// of EmmcRegs; addressing them directly means the hot-path struct does
// not have to be padded out to cover the whole 256-byte register window.
inline fn reg_at(comptime offset u32) *mut volatile u32 {
    const reg_addr u64 = EMMC2_BASE + offset
    return #ptrFromInt(reg_addr)
}
// STATUS register flags (offset 0x24). The two INHIBIT bits are polled
// until clear before a command is issued; the two *_AVAIL bits are
// declared for completeness and are not used by the block I/O paths,
// which poll the INTERRUPT register instead.
const STATUS_CMD_INHIBIT u32 = 1 << 0
const STATUS_DAT_INHIBIT u32 = 1 << 1
const STATUS_SPACE_AVAIL u32 = 1 << 10
const STATUS_DATA_AVAIL u32 = 1 << 11
// INTERRUPT register flags (offset 0x30). Write-1-to-clear.
const INTERRUPT_CMD_DONE u32 = 1 << 0
const INTERRUPT_DATA_DONE u32 = 1 << 1
const INTERRUPT_WRITE_RDY u32 = 1 << 4
const INTERRUPT_READ_RDY u32 = 1 << 5
// Bits 15..22 plus bit 24. Any of them set is treated as a failed
// command or transfer by send_cmd / read_block / write_block.
const INTERRUPT_ERR_MASK u32 = 0x017F8000
// CONTROL1 register flags (offset 0x2C).
const CTRL1_CLK_INTLEN u32 = 1 << 0 // internal clock enable
const CTRL1_CLK_STABLE u32 = 1 << 1 // polled after each divider change
const CTRL1_CLK_EN u32 = 1 << 2 // gates the clock out to the card
const CTRL1_SRST_HC u32 = 1 << 24
const CTRL1_SRST_CMD u32 = 1 << 25
const CTRL1_SRST_DAT u32 = 1 << 26
// All three reset bits together; init sets them at once and polls until
// all of them read back as zero.
const CTRL1_SRST_ALL u32 = CTRL1_SRST_HC | CTRL1_SRST_CMD | CTRL1_SRST_DAT
// Polled-wait spin counts, in loop iterations (not time). Big enough to
// absorb sub-MHz SD cards on real hardware — at least ~700 µs at 1.5 GHz
// even at one cycle per poll, and every poll is also a volatile register
// read — and trivial on QEMU. Don't lower to "tune for QEMU" — real
// cards are slower.
const SPIN_CMD u32 = 1_000_000
const SPIN_DATA u32 = 1_000_000
// Relative card address captured from CMD3. Kept in bits 31:16 (low half
// zero) so it can be passed unchanged as the CMD9 / CMD7 argument.
var rca u32 = 0
// Card capacity from the CSD parsed in init step 8; 0 until then.
var capacity_blocks u64 = 0
// EMMC2 base clock reported by the VideoCore mailbox; 0 until init step 1b.
var base_clock_hz u32 = 0
// Arasan SDHCI core inside the BCM2711 EMMC2 has a clock-domain-crossing
// bugette (Linux drivers/mmc/host/sdhci-iproc.c §"writel" + the bugette
// comment): successive register writes spaced closer than ~2 SD-card
// clock cycles can be silently dropped. At the ~390 kHz identification
// clock that is ~5 µs; back-to-back CPU writes at 1.5 GHz land
// nanoseconds apart, so ARG1 was being lost between BLKSIZECNT and
// CMDTM — every command with a non-zero argument (CMD8, ACMD41, CMD9,
// CMD17, …) fired with ARG=0 and timed out, while CMD0 looked fine
// because its argument is 0 either way. Linux mitigates by inserting a
// 4-SD-clock delay after every writel while host->clock ≤ 400 kHz; this
// driver does the same via `emmc_write`. The flag flips to `false` in
// init step 10 once the bus moves to ~25 MHz, after which the inter-write
// gap is no longer an issue.
var low_clock bool = true
// 4 SD-clock cycles at the ~390 kHz identification clock ≈ 10.3 µs,
// rounded up. Linux uses the same 4-clock delay in
// drivers/mmc/host/sdhci-iproc.c while host->clock ≤ 400 kHz.
const IDENT_CLOCK_DOMAIN_CROSSING_DELAY_US u32 = 11
// Store `val` to an EMMC2 register. While the bus is still on the
// identification clock (`low_clock` set), follow the store with the
// clock-domain-crossing pause so the next register write is not dropped.
inline fn emmc_write(reg *mut volatile u32, val u32) void {
    reg.* = val
    if (!low_clock) {
        return
    }
    delay_us(IDENT_CLOCK_DOMAIN_CROSSING_DELAY_US)
}
// Emit one fixed init-trace line when the DIAG trace is compiled in.
fn init_trace(msg [*:0]u8) void {
    if (DIAG) {
        main_output(MU, msg)
    }
}
// Bring the EMMC2 controller and the inserted SD card from reset to the
// transfer state, then publish read_block / write_block through
// block_dev.sd_dev.
//
// Returns 0 on success and -1 at the first step that fails (mailbox
// call, reset / clock-stable timeout, command failure, CMD8 echo
// mismatch, ACMD41 never reporting ready, or CSD parse error). On
// success `rca`, `capacity_blocks` and `base_clock_hz` hold the values
// read during this pass and `low_clock` is false.
pub fn init() i32 {
    const r = regs()
    // Every pass starts on the ~400 kHz identification clock, so re-arm
    // the per-write clock-domain-crossing delay. Step 10 clears the flag;
    // without this a second init() after a pass that got that far would
    // issue all of its identification-clock register writes back-to-back.
    low_clock = true
    // Diagnostic dump before any controller poke. Proves the MMIO
    // address is right (SLOTISR_VER reads a sane vendor/version, not
    // 0xFFFFFFFF) and records the controller's pre-init state.
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 diag SLOTISR_VER=0x")
        main_output_u64(MU, reg_at(0xFC).*)
        main_output(MU, " CAPS_LO=0x")
        main_output_u64(MU, reg_at(0x40).*)
        main_output(MU, " CAPS_HI=0x")
        main_output_u64(MU, reg_at(0x44).*)
        main_output(MU, "\n")
        main_output(MU, "[Debug] EMMC2 diag entry ctrl0=0x")
        main_output_u64(MU, r.control0)
        main_output(MU, " ctrl1=0x")
        main_output_u64(MU, r.control1)
        main_output(MU, " ctrl2=0x")
        main_output_u64(MU, r.control2)
        main_output(MU, " status=0x")
        main_output_u64(MU, r.status)
        main_output(MU, " intr=0x")
        main_output_u64(MU, r.interrupt)
        main_output(MU, "\n")
    }
    // 0. Ensure the SD-card power rail is on. Circle's CardInit calls
    //    PROPTAG_SET_POWER_STATE(SD_CARD, ON|WAIT) before any controller
    //    reset on Pi 4. The Pi 4 boot firmware loaded the kernel from
    //    this slot so VDD is normally already on, but matching Circle
    //    defensively rules out a half-powered state where commands
    //    transmit on the wire but the card can't answer.
    init_trace("[Debug] EMMC2 step 0 sd_power_on\n")
    if (!mbox.setPowerState(mailbox.DEVICE_ID_SD_CARD, mailbox.POWER_STATE_ON | mailbox.POWER_STATE_WAIT)) {
        init_trace("[Debug] EMMC2 sd_power_on FAILED\n")
        return -1
    }
    delay_us(2_000)
    // 0a. Select the 3.3 V SD I/O rail (expander line 4 = 0; per
    //     bcm2711-rpi-4-b.dts VDD_SD_IO_SEL: 0 = 3.3 V, 1 = 1.8 V),
    //     matching the controller's 3.3 V drive — the conventional
    //     bring-up assumption. Pi-HW init has been verified end-to-end
    //     from this 3.3 V default; 1.8 V UHS-I switching stays a
    //     future perf concern.
    init_trace("[Debug] EMMC2 step 0a sd_io_3v3\n")
    if (!mbox.setGpioState(mailbox.EXP_GPIO_SD_1V8, 0)) {
        init_trace("[Debug] EMMC2 sd_io_3v3 FAILED\n")
        return -1
    }
    delay_us(5_000)
    // 1. Software reset of the host controller. SRST_HC alone leaves
    //    the CMD/DAT sub-state machines in limbo — cmdtm writes have
    //    no effect on real hardware after SRST_HC alone. Triple-reset
    //    (SRST_HC | SRST_CMD | SRST_DAT) matches Linux's
    //    drivers/mmc/host/sdhci.c sdhci_reset(host, SDHCI_RESET_ALL).
    init_trace("[Debug] EMMC2 step 1 SRST_ALL\n")
    emmc_write(&r.control1, r.control1 | CTRL1_SRST_ALL)
    if (!busy_wait_clear(&r.control1, CTRL1_SRST_ALL, 100_000)) {
        return -1
    }
    // 1a. Bring the SD bus up before the clock. Circle's Pi 4 EMMC
    //     reset path powers VDD and clears CONTROL2 before configuring
    //     SDCLK; SRST_HC zeroes both. POWER_ON = bit 8, BUS_VOLTAGE =
    //     bits 11:9 (0b111 = 3.3 V). Let the rail settle before the
    //     clock is brought up.
    init_trace("[Debug] EMMC2 step 1a bus_power\n")
    emmc_write(&r.control2, 0)
    emmc_write(&r.control0, (#as(u32, 1) << 8) | (#as(u32, 0b111) << 9))
    // SD spec PLSS §6.4.1: ≥1 ms after VDD reaches stable level before
    // first command. Pi 4 firmware can leave BUS_POWER cleared (entry
    // ctrl0=0x00800000 has bit 8 = 0), so this write may be the actual
    // VDD power-on edge for the card — be generous to cover both
    // power-cycle (cold rise) and pure-controller-toggle paths.
    delay_us(10_000)
    // 1b. Resolve the EMMC2 base clock from the VideoCore firmware.
    //     The SDHCI divider is derived from this; the CAP register's
    //     base-clock field is unreliable on the BCM2711, so the
    //     firmware value is the only sound source. A reported rate of
    //     0 is treated as a failed query.
    init_trace("[Debug] EMMC2 step 1b base_clock\n")
    base_clock_hz = mbox.getClockRate(mailbox.CLOCK_ID_EMMC2)
    if (base_clock_hz == 0) {
        init_trace("[Debug] EMMC2 mailbox clock query FAILED\n")
        return -1
    }
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 base clock=0x")
        main_output_u64(MU, base_clock_hz)
        main_output(MU, "\n")
    }
    // 2. Internal clock + identification-mode divider (~400 kHz). The
    //    divisor is a power of two derived from the firmware base
    //    clock (the BCM2711 EMMC2 only accepts power-of-two dividers).
    //    The delays around CLK_EN mirror Circle's reset path — real
    //    hardware wants the internal clock to settle before the card
    //    clock is gated on, and again before the first command.
    //    TOUNIT = 0xC matches Circle's Pi 4 data-timeout choice.
    init_trace("[Debug] EMMC2 step 2 CLK_STABLE\n")
    const id_div = sdhci.clockDivisor(base_clock_hz, 400_000)
    emmc_write(&r.control1, CTRL1_CLK_INTLEN | sdhci.control1ClockBits(id_div) | (#as(u32, 0xC) << 16))
    if (!busy_wait_set(&r.control1, CTRL1_CLK_STABLE, 100_000)) {
        return -1
    }
    delay_us(2_000)
    emmc_write(&r.control1, r.control1 | CTRL1_CLK_EN)
    delay_us(2_000)
    // 2a. Enable interrupt-status latching. SRST zeroes IRPT_MASK
    //     (0x34, the SDHCI Normal+Error Interrupt Status Enable
    //     register); while it reads 0 the INTERRUPT register (0x30)
    //     never latches a single event, so the polled send_cmd loop
    //     spins out every command. IRPT_MASK gates 0x30 latching;
    //     IRPT_EN (0x38) is the physical-IRQ signal enable and stays
    //     clear — send_cmd is polled and no EMMC line is wired into
    //     the GIC. The explicit IRPT_EN=0 write matches Circle's
    //     CardReset (defensive against firmware that left it non-zero).
    emmc_write(&r.irpt_en, 0)
    emmc_write(&r.interrupt, 0xFFFF_FFFF)
    emmc_write(&r.irpt_mask, 0xFFFF_FFFF)
    delay_us(2_000)
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 pre-CMD0 status=0x")
        main_output_u64(MU, r.status)
        main_output(MU, " ctrl0=0x")
        main_output_u64(MU, r.control0)
        main_output(MU, " ctrl1=0x")
        main_output_u64(MU, r.control1)
        main_output(MU, " ctrl2=0x")
        main_output_u64(MU, r.control2)
        main_output(MU, " mask=0x")
        main_output_u64(MU, r.irpt_mask)
        main_output(MU, "\n")
    }
    // 3. CMD0 — GO_IDLE_STATE. No response; the card transitions to idle.
    //    Triple-issue with 5 ms gaps. Pi 4 firmware can hand off with
    //    the card in Stand-by or Transfer state (RCA assigned, last
    //    block read complete) rather than the cold-POR Idle state every
    //    other bare-metal driver assumes. A single CMD0 with no inter-
    //    command settle is not guaranteed to traverse the state machine
    //    back to Idle when the card was warm-handed-off. Three sends
    //    with 5 ms gaps gives the card-side state machine time to
    //    transition, per SD PLSS §4.4 NCC + post-reset settle.
    init_trace("[Debug] EMMC2 step 3 CMD0 (x3)\n")
    var cmd0_try u32 = 0
    while (cmd0_try < 3) {
        if (send_cmd(sdhci.CMD0_GO_IDLE, 0, BLKSIZECNT_NONE) < 0) {
            return -1
        }
        delay_us(5_000)
        cmd0_try += 1
    }
    if (DIAG) {
        main_output(MU, "[Debug] EMMC2 post-CMD0 status=0x")
        main_output_u64(MU, r.status)
        main_output(MU, " intr=0x")
        main_output_u64(MU, r.interrupt)
        main_output(MU, "\n")
    }
    // Extra settle after CMD0 burst, before CMD8 — covers post-state-
    // transition NCC plus internal card-clock domain crossing.
    delay_us(5_000)
    // 4. CMD8 — SEND_IF_COND. Echo the 0xAA check pattern back in R7;
    //    mismatch means pre-v2.0 card or out-of-range voltage rail.
    init_trace("[Debug] EMMC2 step 4 CMD8\n")
    if (send_cmd(sdhci.CMD8_SEND_IF_COND, sdhci.CMD8_ARG_VHS_27_36_CHECK_AA, BLKSIZECNT_NONE) < 0) {
        // CMD8 timeout = no card present or unreadable card. Fail
        // cleanly; kernel.zig logs `EMMC2 init FAILED` and degrades
        // to the initramfs path.
        return -1
    }
    if ((r.resp0 & 0xFF) != 0xAA) {
        init_trace("[Debug] EMMC2 step 4 CMD8 echo mismatch\n")
        return -1
    }
    // 5. ACMD41 — SD_SEND_OP_COND with HCS. Repeated until bit 31 of
    //    OCR (resp0) is set, indicating card power-up complete. Each
    //    ACMD requires a preceding CMD55 (APP_CMD); failures inside
    //    the loop are tolerated because the next pass re-issues both.
    //    Up to 100 passes, 10 ms apart.
    //    NOTE(review): resp0 is sampled regardless of the two return
    //    codes — confirm a failed pass cannot leave bit 31 set from an
    //    earlier response.
    init_trace("[Debug] EMMC2 step 5 ACMD41\n")
    var tries u32 = 0
    while (tries < 100) {
        _ = send_cmd(sdhci.CMD55_APP_CMD, 0, BLKSIZECNT_NONE)
        _ = send_cmd(sdhci.ACMD41_SD_SEND_OP_COND, sdhci.ACMD41_ARG_HCS_AND_VOLT, BLKSIZECNT_NONE)
        if ((r.resp0 & (#as(u32, 1) << 31)) != 0) {
            break
        }
        delay_us(10_000)
        tries += 1
    }
    // `tries` only reaches 100 when no pass hit the `break` above.
    if (tries == 100) {
        return -1
    }
    // 6. CMD2 — ALL_SEND_CID. R2 lands in resp0..resp3; the CID is
    //    not consumed past init, but the card must transition through
    //    this state to accept CMD3.
    init_trace("[Debug] EMMC2 step 6 CMD2\n")
    if (send_cmd(sdhci.CMD2_ALL_SEND_CID, 0, BLKSIZECNT_NONE) < 0) {
        return -1
    }
    // 7. CMD3 — SEND_REL_ADDR. R6: RCA in resp0[31:16]. Subsequent
    //    addressed commands (CMD7, CMD9) use this in arg[31:16], so the
    //    value is stored already shifted, with the low half masked off.
    init_trace("[Debug] EMMC2 step 7 CMD3\n")
    if (send_cmd(sdhci.CMD3_SEND_REL_ADDR, 0, BLKSIZECNT_NONE) < 0) {
        return -1
    }
    rca = r.resp0 & 0xFFFF_0000
    // 8. CMD9 — SEND_CSD. R2 again; parseCsdV2 rejects pre-SDHC v1.0
    //    cards (CSD_STRUCTURE = 0) which this driver does not
    //    support.
    init_trace("[Debug] EMMC2 step 8 CMD9\n")
    if (send_cmd(sdhci.CMD9_SEND_CSD, rca, BLKSIZECNT_NONE) < 0) {
        return -1
    }
    const csd = sdhci.parseCsdV2(.{ r.resp0, r.resp1, r.resp2, r.resp3 }) catch {
        init_trace("[Debug] EMMC2 step 8 CSD parse failed (v1 card?)\n")
        return -1
    }
    capacity_blocks = csd.capacity_blocks
    // 9. CMD7 — SELECT_CARD. Moves the card into the transfer state so
    //    CMD17 / CMD24 are legal.
    init_trace("[Debug] EMMC2 step 9 CMD7\n")
    if (send_cmd(sdhci.CMD7_SELECT_CARD, rca, BLKSIZECNT_NONE) < 0) {
        return -1
    }
    // 10. Transfer-mode clock (~25 MHz). Divisor derived from the same
    //     firmware base clock as the identification divider. The PIO
    //     polled-wait loop dominates throughput, so default-speed SD
    //     (25 MHz) is fine; future perf can pick high-speed via CAP1.
    //     The card clock is gated off, the divider swapped, and the
    //     clock gated back on once the controller reports it stable.
    //     All of these writes still go out with the per-write delay;
    //     `low_clock` is cleared only after the new clock is running,
    //     mirroring Linux, which delays only while host->clock ≤ 400 kHz.
    init_trace("[Debug] EMMC2 step 10 switch_clk\n")
    const tx_div = sdhci.clockDivisor(base_clock_hz, 25_000_000)
    var c1 u32 = r.control1
    c1 &= ~CTRL1_CLK_EN
    emmc_write(&r.control1, c1)
    c1 &= ~#as(u32, 0xFFC0) // clear SDCLK freq select [15:6]
    c1 |= sdhci.control1ClockBits(tx_div)
    emmc_write(&r.control1, c1)
    if (!busy_wait_set(&r.control1, CTRL1_CLK_STABLE, 100_000)) {
        return -1
    }
    emmc_write(&r.control1, r.control1 | CTRL1_CLK_EN)
    low_clock = false
    // Wire the BlockDev vtable now the controller is in transfer state.
    // The FAT32 backend reads + writes through block_dev.sd_dev;
    // Acceptance #7 checks the slot is populated post-init.
    block_dev.sd_dev = .{ .read_fn = read_block, .write_fn = write_block }
    return 0
}
// Programmed into BLKSIZECNT for non-data commands. Circle writes
// BLKSIZECNT before *every* command (m_block_size | (m_blocks_to_transfer
// << 16); both fields are 0 outside a data transfer); this driver
// follows defensively — some BCM2711 EMMC2 firmware revisions
// reportedly hang CMD8 when stale BLKSIZECNT bits leak in from a
// prior data op.
const BLKSIZECNT_NONE u32 = 0
// One 512-byte block: block count 1 in the upper half, block size 512 in
// the low bits. Used by the single-block CMD17 / CMD24 paths.
const BLKSIZECNT_512x1 u32 = (#as(u32, 1) << 16) | 512
// Issue one SD command and poll for its completion.
//
// `cmdtm` is the ready-made CMDTM word, `arg` is written to ARG1 and
// `blksizecnt` to BLKSIZECNT. Returns 0 once CMD_DONE latches, and -1
// when CMD_INHIBIT never clears, any INTERRUPT_ERR_MASK bit latches, or
// SPIN_CMD polls pass without either. Callers read the response from
// RESP0..RESP3 afterwards.
fn send_cmd(cmdtm u32, arg u32, blksizecnt u32) i32 {
    const hw = regs()
    const cmd_line_free = busy_wait_clear(&hw.status, STATUS_CMD_INHIBIT, SPIN_CMD)
    if (!cmd_line_free) {
        if (DIAG) {
            main_output(MU, "[Debug] send_cmd CMD_INHIBIT stuck\n")
        }
        return -1
    }
    // Drop stale CMD_DONE / error status from the previous command first.
    // This store goes through emmc_write like the three that follow: at
    // the identification clock every register write needs the inter-write
    // gap, or BLKSIZECNT / ARG1 can be silently lost.
    emmc_write(&hw.interrupt, INTERRUPT_CMD_DONE | INTERRUPT_ERR_MASK)
    emmc_write(&hw.blksizecnt, blksizecnt)
    emmc_write(&hw.arg1, arg)
    // Writing CMDTM is what actually launches the command.
    emmc_write(&hw.cmdtm, cmdtm)
    var polls_left u32 = SPIN_CMD
    while (polls_left > 0) {
        // One snapshot per poll; errors take priority over completion.
        const pending = hw.interrupt
        if ((pending & INTERRUPT_ERR_MASK) != 0) {
            if (DIAG) {
                main_output(MU, "[Debug] send_cmd ERR_MASK irpt=0x")
                main_output_u64(MU, pending)
                main_output(MU, " status=0x")
                main_output_u64(MU, hw.status)
                main_output(MU, " resp0=0x")
                main_output_u64(MU, hw.resp0)
                main_output(MU, " resp1=0x")
                main_output_u64(MU, hw.resp1)
                main_output(MU, "\n")
            }
            emmc_write(&hw.interrupt, INTERRUPT_ERR_MASK)
            if (DIAG) {
                main_output(MU, "[Debug] send_cmd post-clear intr=0x")
                main_output_u64(MU, hw.interrupt)
                main_output(MU, "\n")
            }
            return -1
        }
        if ((pending & INTERRUPT_CMD_DONE) != 0) {
            // Acknowledge completion so the next command starts clean.
            emmc_write(&hw.interrupt, INTERRUPT_CMD_DONE)
            return 0
        }
        polls_left -= 1
    }
    // Neither completion nor an error showed up within SPIN_CMD polls.
    if (DIAG) {
        main_output(MU, "[Debug] send_cmd CMD_DONE timeout status=0x")
        main_output_u64(MU, hw.status)
        main_output(MU, " irpt=0x")
        main_output_u64(MU, hw.interrupt)
        main_output(MU, "\n")
    }
    return -1
}
// Read one 512-byte block into `buf` with CMD17 and polled PIO.
//
// `lba` is passed unchanged as the CMD17 argument. Returns 0 once all
// 128 words have been drained and DATA_DONE has latched; returns -1 on
// an inhibit timeout, a failed CMD17, a READ_RDY / DATA_DONE timeout, or
// any INTERRUPT_ERR_MASK bit. On -1 the contents of `buf` are
// unspecified. Every failure return goes through log_io_fail.
pub fn read_block(lba u32, buf *mut [512]u8) callconv(.c) i32 {
    const r = regs()
    if (!busy_wait_clear(&r.status, STATUS_CMD_INHIBIT | STATUS_DAT_INHIBIT, SPIN_DATA)) {
        log_io_fail("read pre-CMD17 inhibit-clear timeout", 0xFFFFFFFF)
        return -1
    }
    // send_cmd only clears CMD_DONE and the error bits. A transfer that
    // bailed out early can leave READ_RDY / WRITE_RDY / DATA_DONE latched,
    // and the waits below would then be satisfied by that stale event
    // instead of this transfer's. Both inhibits are clear here, so nothing
    // is in flight and the bits are safe to drop.
    emmc_write(&r.interrupt, INTERRUPT_READ_RDY | INTERRUPT_WRITE_RDY | INTERRUPT_DATA_DONE)
    // BLKSIZE = 512 (low 12 bits), BLKCNT = 1 (bits 16..31).
    if (send_cmd(sdhci.CMD17_READ_SINGLE, lba, BLKSIZECNT_512x1) < 0) {
        log_io_fail("read CMD17 failed", 0xFFFFFFFF)
        return -1
    }
    // SDHCI single-block PIO: READ_RDY fires once when the block buffer
    // has the full 512 bytes ready; the host then drains it word-by-word
    // without re-polling. Per-word polling is wrong — the interrupt only
    // re-fires for the next block (this driver issues one).
    // The wait also wakes on an error bit, hence the separate check.
    if (!busy_wait_set(&r.interrupt, INTERRUPT_READ_RDY | INTERRUPT_ERR_MASK, SPIN_DATA)) {
        log_io_fail("read READ_RDY timeout", 0xFFFFFFFF)
        return -1
    }
    if ((r.interrupt & INTERRUPT_ERR_MASK) != 0) {
        log_io_fail("read ERR before READ_RDY", 0xFFFFFFFF)
        emmc_write(&r.interrupt, INTERRUPT_ERR_MASK)
        return -1
    }
    emmc_write(&r.interrupt, INTERRUPT_READ_RDY)
    var i u32 = 0
    while (i < 128) {
        // Exactly one volatile load of the data port per word.
        const w = r.data
        // SD bus is little-endian; the data port hands back the wire
        // order directly, so a verbatim byte copy preserves layout.
        const off = i * 4
        const wbytes = std.mem.asBytes(&w)
        buf[off + 0] = wbytes[0]
        buf[off + 1] = wbytes[1]
        buf[off + 2] = wbytes[2]
        buf[off + 3] = wbytes[3]
        i += 1
    }
    if (!busy_wait_set(&r.interrupt, INTERRUPT_DATA_DONE | INTERRUPT_ERR_MASK, SPIN_DATA)) {
        log_io_fail("read DATA_DONE timeout", 0xFFFFFFFF)
        return -1
    }
    if ((r.interrupt & INTERRUPT_ERR_MASK) != 0) {
        log_io_fail("read ERR before DATA_DONE", 0xFFFFFFFF)
        emmc_write(&r.interrupt, INTERRUPT_ERR_MASK)
        return -1
    }
    emmc_write(&r.interrupt, INTERRUPT_DATA_DONE)
    return 0
}
// Write the 512 bytes in `buf` to one block with CMD24 and polled PIO.
//
// `lba` is passed unchanged as the CMD24 argument. Returns 0 once all
// 128 words have been pushed and DATA_DONE has latched; returns -1 on
// an inhibit timeout, a failed CMD24, a WRITE_RDY / DATA_DONE timeout,
// or any INTERRUPT_ERR_MASK bit. Every failure return goes through
// log_io_fail.
pub fn write_block(lba u32, buf *[512]u8) callconv(.c) i32 {
    const r = regs()
    if (!busy_wait_clear(&r.status, STATUS_CMD_INHIBIT | STATUS_DAT_INHIBIT, SPIN_DATA)) {
        log_io_fail("write pre-CMD24 inhibit-clear timeout", 0xFFFFFFFF)
        return -1
    }
    // send_cmd only clears CMD_DONE and the error bits. A transfer that
    // bailed out early can leave READ_RDY / WRITE_RDY / DATA_DONE latched;
    // a stale DATA_DONE in particular would let this write report success
    // before its own completion event arrives. Both inhibits are clear
    // here, so nothing is in flight and the bits are safe to drop.
    emmc_write(&r.interrupt, INTERRUPT_READ_RDY | INTERRUPT_WRITE_RDY | INTERRUPT_DATA_DONE)
    // BLKSIZE = 512 (low 12 bits), BLKCNT = 1 (bits 16..31).
    if (send_cmd(sdhci.CMD24_WRITE_SINGLE, lba, BLKSIZECNT_512x1) < 0) {
        log_io_fail("write CMD24 failed", 0xFFFFFFFF)
        return -1
    }
    // SDHCI single-block PIO: WRITE_RDY fires once when the block buffer
    // is ready to accept 512 bytes; the host then pushes the full block
    // word-by-word without re-polling. Per-word polling is wrong — the
    // interrupt only re-fires for the next block (this driver issues one).
    // The wait also wakes on an error bit, hence the separate check.
    if (!busy_wait_set(&r.interrupt, INTERRUPT_WRITE_RDY | INTERRUPT_ERR_MASK, SPIN_DATA)) {
        log_io_fail("write WRITE_RDY timeout", 0xFFFFFFFF)
        return -1
    }
    if ((r.interrupt & INTERRUPT_ERR_MASK) != 0) {
        log_io_fail("write ERR before WRITE_RDY", 0xFFFFFFFF)
        emmc_write(&r.interrupt, INTERRUPT_ERR_MASK)
        return -1
    }
    emmc_write(&r.interrupt, INTERRUPT_WRITE_RDY)
    var i u32 = 0
    while (i < 128) {
        const off = i * 4
        // Assemble the word bytewise in memory order (all four bytes are
        // overwritten before use), then hand it to the data port with
        // exactly one volatile store.
        var w u32 = undefined
        const wbytes = std.mem.asBytes(&w)
        wbytes[0] = buf[off + 0]
        wbytes[1] = buf[off + 1]
        wbytes[2] = buf[off + 2]
        wbytes[3] = buf[off + 3]
        r.data = w
        i += 1
    }
    if (!busy_wait_set(&r.interrupt, INTERRUPT_DATA_DONE | INTERRUPT_ERR_MASK, SPIN_DATA)) {
        log_io_fail("write DATA_DONE timeout", 0xFFFFFFFF)
        return -1
    }
    if ((r.interrupt & INTERRUPT_ERR_MASK) != 0) {
        log_io_fail("write ERR before DATA_DONE", 0xFFFFFFFF)
        emmc_write(&r.interrupt, INTERRUPT_ERR_MASK)
        return -1
    }
    emmc_write(&r.interrupt, INTERRUPT_DATA_DONE)
    return 0
}
// Print one diagnostic line for a failed block transfer: `tag`, then the
// word index unless it is the 0xFFFFFFFF "not applicable" sentinel, then
// the current STATUS, INTERRUPT and RESP0 values. Does nothing — and
// touches no register — when the DIAG trace is compiled out.
fn log_io_fail(tag [*:0]u8, word_idx u32) void {
    if (!DIAG) {
        return
    }
    const hw = regs()
    main_output(MU, "[Debug] EMMC2 ")
    main_output(MU, tag)
    const has_word_idx = word_idx != 0xFFFFFFFF
    if (has_word_idx) {
        main_output(MU, " word=0x")
        main_output_u64(MU, word_idx)
    }
    main_output(MU, " status=0x")
    main_output_u64(MU, hw.status)
    main_output(MU, " intr=0x")
    main_output_u64(MU, hw.interrupt)
    main_output(MU, " resp0=0x")
    main_output_u64(MU, hw.resp0)
    main_output(MU, "\n")
}
// Poll `reg` until at least one bit of `mask` reads as 1.
//
// Performs at most `max_spin` volatile reads. Returns true as soon as a
// read shows any masked bit set, false if none did (including when
// `max_spin` is 0, in which case the register is never read). Callers
// turn a false result into their own -1 return.
fn busy_wait_set(reg *mut volatile u32, mask u32, max_spin u32) bool {
    var polls_left u32 = max_spin
    while (polls_left > 0) {
        const snapshot = reg.*
        if ((snapshot & mask) != 0) {
            return true
        }
        polls_left -= 1
    }
    return false
}
// Poll `reg` until every bit of `mask` reads as 0.
//
// Performs at most `max_spin` volatile reads. Returns true as soon as a
// read shows all masked bits clear, false if that never happened
// (including when `max_spin` is 0, in which case the register is never
// read).
fn busy_wait_clear(reg *mut volatile u32, mask u32, max_spin u32) bool {
    var polls_left u32 = max_spin
    while (polls_left > 0) {
        const snapshot = reg.*
        if ((snapshot & mask) == 0) {
            return true
        }
        polls_left -= 1
    }
    return false
}
// Coarse busy-wait delay. Used for every settle time in this driver:
// the per-write clock-domain-crossing gap in emmc_write, the power /
// clock / CMD0 settles in init, and the ACMD41 retry interval. A real
// driver would use the generic timer's udelay; dragging that in at this
// layer would force a new named-module dependency for a microsecond
// pause that is only applied during init (after init, emmc_write skips
// it). A future perf pass can swap. The 100×us multiplier
// is a back-of-envelope match for a 1.5 GHz core with the spin body
// being a single `nop`; QEMU executes faster but the only effect is
// quicker init, which is fine.
// NOTE(review): the wait is `us * 100` loop iterations, not measured
// time — the real duration depends on the core clock and on how the
// loop is compiled. Confirm against a timer before relying on it at a
// different optimisation level.
fn delay_us(us u32) void {
// Widen before multiplying so the product cannot overflow u32.
var i u64 = #as(u64, us) * 100
while (i > 0) {
// `asm volatile` keeps the otherwise empty loop body from being removed.
asm volatile ("nop")
i -= 1
}
}