ajhahn.de
← FlashOS
Flash 1071 lines
// BCM2711 DWC2 USB-OTG device (gadget) driver — CDC-ACM console.
//
// Brings the Synopsys DWC2 core up as a Full-Speed USB device and enumerates
// as a CDC-ACM serial function so macOS binds AppleUSBCDCACM and creates a
// /dev/tty.usbmodem node. Layered bottom-up: core bring-up (MMIO / reset /
// EP0 / the SET_ADDRESS quirk), the CDC descriptor set + class control
// requests (SET/GET_LINE_CODING, SET_CONTROL_LINE_STATE) on EP0, then the
// data path: on SET_CONFIGURATION it hardware-configures the
// CDC endpoints — EP1 IN (interrupt notify, activated but never queued), EP2
// OUT + EP2 IN (bulk) — and partitions a per-EP TX FIFO for each IN endpoint
// inside the core SPRAM. Bulk-OUT bytes drain from the shared RX FIFO straight
// into console.console_push (the same ring fsh reads). Bulk-IN rides a bounded
// preempt-guarded TX ring (cdc_tx → serviceTxRing); backpressure is a brief
// bounded spin then drop, so the kernel never blocks on a host that stopped
// reading. The console mux that routes fsh output through cdc_tx lives in
// sys.zig (console_tx).
//
// Design constraints:
//   * Full-Speed (DCFG.DevSpd = FS) — skips HS chirp + the qualifier descs.
//   * Polled — poll() reads GINTSTS from the PID-0 idle loop; no GIC/IRQ.
//   * Slave/PIO (GAHBCFG.DMAEn = 0) — CPU copies via the FIFO window.
//   * MMIO at 0xFE980000 is already device-mapped by boot.S, so this needs
//     no page allocator; all buffers are static (EP0 FS max packet = 64 B).
//   * Deferred connect — the gadget stays electrically detached until the
//     PID-0 idle loop services poll() at µs rate (sustained idle); see the
//     "Connection manager" section. Attaching any earlier guarantees a
//     failed enumeration (the boot harness starves the idle loop).
//
// QEMU `raspi4b` does NOT emulate the DWC2 *device* path, so this cannot be
// brought up in emulation. Two CI-safety invariants keep `zig build
// test-rpi4b` green there: (1) every wait loop is BOUNDED and usb_init
// fails soft with -1 (kernel logs + degrades, like emmc2.init()); (2)
// poll() is a single bounded pass and a no-op until `inited` is set. A dead
// MMIO read (GSNPSID == 0 / 0xFFFFFFFF) bails before any bring-up.
//
// The driver is debugged from the device-side trace UART — here the
// Mini-UART (TRACE = MU), the single adapter on the bench. macOS
// is near-silent on a failed enum, so every GINTSTS event + every SETUP is
// traced. Trace is event-gated (poll() prints only when a bit is actually
// handled) so an idle bus stays silent and the console remains readable.

const usb_desc = #import("usb_descriptors") // pure: descriptors + SETUP decode
const usb_tx_ring = #import("usb_tx_ring") //   pure: bulk-IN TX byte-ring (host-tested)
const mailbox = #import("mailbox") //          pure: DEVICE_ID_USB_HCD, POWER_STATE_*
const mbox = #import("rpi4b_mailbox") //        board: VideoCore MMIO doorbell
const console = #import("console") //           board-agnostic console RX ring

extern fn main_output(interface i32, str [*:0]u8) void
extern fn main_output_u64(interface i32, n u64) void
// UP mutual exclusion for the TX ring. The ring has TWO producers (kernel
// main_output and the user sys_writeConsole path via console_tx) and one
// consumer (serviceTxRing, from the poll loop), so a plain lock-free SPSC ring
// is not enough — a producer preempted mid-enqueue would corrupt head.
// preempt_disable is the single-core lock (SMP → a real spinlock is future
// work, exactly as src/console.zig documents for the RX side).
extern fn preempt_disable() void
extern fn preempt_enable() void

// Trace sink. MU (interface 0) = Mini-UART, the existing bench adapter.
// Flip to 1 (PL011/UART4, GPIO8-9) if a second adapter is wired and you
// want USB trace off the console cable.
const TRACE i32 = 0
const TRACE_VERBOSE bool = false // gate the [usb] bring-up dump; flip to true to debug USB enumeration

// Per-packet bulk trace (EP2 OUT byte counts, EP2 IN chunk sizes). Off by
// default so normal operation leaves the MU trace readable; flip on for HW
// bring-up to watch the data path move bytes.
const TRACE_BULK bool = false

// ---------------------------------------------------------------------------
// MMIO base + register access
// ---------------------------------------------------------------------------

const LINEAR_MAP_BASE u64 = 0xFFFF000000000000
const DWC2_BASE u64 = 0xFE980000 + LINEAR_MAP_BASE

fn reg_at(off u32) *mut volatile u32 {
    return #as(*mut volatile u32, #ptrFromInt(DWC2_BASE + off))
}

// Stock Synopsys DWC2 register offsets (same layout as TinyUSB `dwc2_regs.h`
// and Linux drivers/usb/dwc2/hw.h). Global core block @ 0x000, device block
// @ 0x800, per-EP IN @ 0x900, per-EP OUT @ 0xB00, FIFO windows @ 0x1000.
const GOTGCTL u32 = 0x000
const GAHBCFG u32 = 0x008
const GUSBCFG u32 = 0x00C
const GRSTCTL u32 = 0x010
const GINTSTS u32 = 0x014
const GINTMSK u32 = 0x018
const GRXSTSP u32 = 0x020 // RX status read + POP (the SETUP/OUT decode)
const GRXFSIZ u32 = 0x024
const GNPTXFSIZ u32 = 0x028 // == DIEPTXF0 in device mode (EP0 IN FIFO)
const GSNPSID u32 = 0x040 // core ID — "OT2"/"OT3" signature for the dead-MMIO gate
const GHWCFG3 u32 = 0x04C // [31:16] = total DFIFO SPRAM depth (words)
const DCFG u32 = 0x800
const DCTL u32 = 0x804
const DSTS u32 = 0x808
const DIEPMSK u32 = 0x810
const DOEPMSK u32 = 0x814
const DAINTMSK u32 = 0x81C
const DIEPCTL0 u32 = 0x900
const DIEPINT0 u32 = 0x908
const DIEPTSIZ0 u32 = 0x910
const DTXFSTS0 u32 = 0x918 // EP0 IN TX-FIFO space available (words)
const DOEPCTL0 u32 = 0xB00
const DOEPINT0 u32 = 0xB08
const DOEPTSIZ0 u32 = 0xB10
const DFIFO0 u32 = 0x1000 // EP0 / non-periodic FIFO push-pop window

// Per-EP IN/OUT register stride is 0x20. EP1 IN = notify, EP2 IN/OUT = bulk.
const DIEPCTL1 u32 = 0x920
const DIEPCTL2 u32 = 0x940
const DIEPINT2 u32 = 0x948
const DIEPTSIZ2 u32 = 0x950
const DTXFSTS2 u32 = 0x958 // EP2 IN TX-FIFO space available (words)
const DOEPCTL2 u32 = 0xB40
const DOEPINT2 u32 = 0xB48
const DOEPTSIZ2 u32 = 0xB50
// Dedicated IN-EP TX-FIFO size/start registers (words): [31:16]=depth,
// [15:0]=start. DIEPTXF0 is GNPTXFSIZ (EP0 IN); DIEPTXFn @ 0x104 + (n-1)*4.
const DIEPTXF1 u32 = 0x104
const DIEPTXF2 u32 = 0x108
// Slave-mode FIFO access windows (0x1000 stride). OUT data is always read
// through window 0 (the shared RX FIFO); each IN endpoint is pushed through
// its own window: EP0 IN → 0x1000, EP2 IN → 0x3000.
const DFIFO2 u32 = 0x3000 // EP2 IN push window

// --- Bit fields (transcribed from stock DWC2; UPPERCASE hex per hygiene). ---
const GAHBCFG_GLBL_INTR_MSK u32 = 1 << 0 // 0 = no IRQ to GIC (we poll)
const GAHBCFG_DMA_EN u32 = 1 << 5 // 0 = slave/PIO (locked)
const GAHBCFG_TXF_EMP_LVL u32 = 1 << 7

const GUSBCFG_PHYSEL u32 = 1 << 6 // 1 = dedicated FS serial PHY; 0 = USB2.0 (HS) PHY
const GUSBCFG_FORCE_HST u32 = 1 << 29
const GUSBCFG_FORCE_DEV u32 = 1 << 30

const GRSTCTL_CSFTRST u32 = 1 << 0
const GRSTCTL_RXFFLSH u32 = 1 << 4
const GRSTCTL_TXFFLSH u32 = 1 << 5
const GRSTCTL_TXFNUM_ALL u32 = 0x10 << 6 // TxFNum = 0x10 → flush all TX FIFOs
const GRSTCTL_AHBIDLE u32 = 1 << 31

const GINTSTS_CURMOD u32 = 1 << 0
const GINTSTS_SOF u32 = 1 << 3 // never unmask — floods the trace at 1 kHz
const GINTSTS_RXFLVL u32 = 1 << 4
const GINTSTS_USBSUSP u32 = 1 << 11
const GINTSTS_USBRST u32 = 1 << 12
const GINTSTS_ENUMDONE u32 = 1 << 13
const GINTSTS_IEPINT u32 = 1 << 18
const GINTSTS_OEPINT u32 = 1 << 19

// GRXSTSP PktSts field [20:17].
const PKTSTS_OUT_DATA u32 = 2
const PKTSTS_SETUP_DATA u32 = 6

// DCFG.DevSpd [1:0]. The wired-PHY choice is the biggest BCM2711 unknown:
// 0b01 = FS on the integrated USB-2.0 (HS) PHY (the expected BCM2711 path);
// 0b11 = FS on a dedicated FS serial transceiver. Paired with PHYSEL below.
const DCFG_DEVSPD_FS_HS_PHY u32 = 0x1
const DCFG_DEVSPD_FS_DEDICATED u32 = 0x3
const DCFG_DEVSPD_MASK u32 = 0x3
const DCFG_DEVADDR_MASK u32 = 0x7F << 4 // DevAddr [10:4]

const DCTL_SFT_DISCON u32 = 1 << 1 // 1 = D+ pull-up OFF; clear LAST
const DCTL_CGNPINNAK u32 = 1 << 8
const DCTL_CGOUTNAK u32 = 1 << 10

const GOTGCTL_BVALOEN u32 = 1 << 6
const GOTGCTL_BVALOVAL u32 = 1 << 7

const DXEPINT_XFERCOMPL u32 = 1 << 0
const DOEPINT_SETUP u32 = 1 << 3
const DIEPINT_TIMEOUT u32 = 1 << 3

const DXEPCTL_CNAK u32 = 1 << 26
const DXEPCTL_STALL u32 = 1 << 21
const DXEPCTL_EPENA u32 = 1 << 31
const DXEPCTL_MPS_MASK u32 = 0x3 // EP0 MPS is an enum (00=64): NOT a byte count

// Non-control-EP control bits. Unlike EP0, MPS [10:0] is a literal byte count.
const DXEPCTL_USBACTEP u32 = 1 << 15 // endpoint is active in the current config
const DXEPCTL_SETD0PID u32 = 1 << 28 // force the data toggle to DATA0 (core then auto-toggles)
const DXEPCTL_EPTYPE_BULK u32 = 2 << 18 // EPType [19:18]: 10 = bulk
const DXEPCTL_EPTYPE_INTR u32 = 3 << 18 // EPType [19:18]: 11 = interrupt
const DXEPCTL_TXFNUM_1 u32 = 1 << 22 // TxFNum [25:22] → dedicated TX FIFO #1
const DXEPCTL_TXFNUM_2 u32 = 2 << 22 // TxFNum [25:22] → dedicated TX FIFO #2
const EP_BULK_MPS u32 = 64 // FS bulk max packet (bytes)
const EP_NOTIFY_MPS u32 = 16 // CDC interrupt notify max packet (bytes)

const DOEPTSIZ0_SUPCNT_3 u32 = 0x3 << 29 // accept up to 3 back-to-back SETUPs
const DXEPTSIZ_PKTCNT_1 u32 = 1 << 19
const DXEPTSIZ_PKTCNT_SHIFT u5 = 19 // EP0 PktCnt is [20:19] (max 3 packets)
const EP0_MPS u32 = 64 // Full-Speed EP0 max packet (bytes)

// HW-probe knob: if enumeration never reaches USBRST on hardware, flip this
// (and the paired DevSpd above) to try the dedicated FS serial PHY path.
const USE_FS_SERIAL_PHY bool = false

// Bounded-wait iteration caps. Each iteration is one MMIO read; 1M reads is
// trivial on real silicon and on QEMU (where the bit may never set, so the
// loop must terminate to keep the watchdog from hanging).
const SPIN u32 = 1_000_000

// ---------------------------------------------------------------------------
// Static state (no page allocator; FS EP0 max packet = 64 B)
// ---------------------------------------------------------------------------

var inited bool = false
var enumerated_flag bool = false
// Tracks the CDC DTR control line (SET_CONTROL_LINE_STATE wValue bit0). A host
// asserts DTR when it opens the tty (screen / piconnect attach); the 0→1 edge
// is our "operator just connected" signal, used to re-emit the login prompt
// (see dispatchSetup). Cleared on bus reset so a re-attach re-fires.
var dtr_asserted bool = false
var current_config u8 = 0
var setup_packet [8]u8 align(4) = [_]u8{0} ** 8

// CDC line coding (115200 8N1 default). GET_LINE_CODING returns it;
// SET_LINE_CODING captures the host's 7-byte OUT data stage into it. Cosmetic
// over USB but macOS round-trips it on port open.
var line_coding [7]u8 = usb_desc.line_coding_default
// A control-OUT write (SET_LINE_CODING) is mid-flight: its 7-byte data stage
// is arriving on EP0 OUT; the IN ZLP status is sent on the OUT XFRC.
var ep0_out_pending bool = false

const EnumState = enum { reset, default_state, addressed, configured }
var enum_state EnumState = .reset

// --- Bulk data path state ---
// Set on SET_CONFIGURATION(>=1) once EP1/EP2 are hardware-configured; cleared
// on USBRST / SET_CONFIGURATION(0). cdc_tx and the bulk-OUT route gate on it.
var data_configured bool = false
// An EP2 IN (bulk) transfer is in flight: EPENA is set and the host has not yet
// ACKed (DIEPINT2.XferCompl). serviceTxRing must not start a new transfer until
// this clears, or it would overwrite the in-flight FIFO contents.
var ep2_in_busy bool = false

// Bulk-IN TX ring. The bounded byte-ring arithmetic (monotone u64 head/tail,
// modulo indexing, overflow→false, peek-then-advance) lives in the pure
// usb_tx_ring module so it is host-unit-tested (same discipline as
// console.zig / pipe.zig). 512 B absorbs interactive bursts; sustained
// overflow past the bounded spin drops (policy: never block on the host).
// Each ring op below is bracketed in preempt_disable — the single-core lock
// between cdc_tx (producer) and serviceTxRing (consumer).
const TX_RING_SIZE u64 = 512
var tx_ring usb_tx_ring.ByteRing(TX_RING_SIZE) = .{}

// Backpressure spin bound (per dropped byte). Sized to cover one bulk-IN packet
// draining at Full-Speed when the host IS reading; a host that stopped reading
// leaves EP2 NAKing so the spin expires and the byte drops — bounded, never a
// kernel stall.
const TX_SPIN u32 = 2_000

// ---------------------------------------------------------------------------
// Timing — accurate delay off the ARM generic-timer counter (self-contained;
// readable from reset, no kernel timer dependency, stays ~real-time on QEMU).
// ---------------------------------------------------------------------------

fn readCntfrq() u64 {
    return asm volatile ("mrs %[v], cntfrq_el0"
        : [v] "=r" (-> u64),
    )
}
fn readCntpct() u64 {
    return asm volatile ("mrs %[v], cntpct_el0"
        : [v] "=r" (-> u64),
    )
}
fn delay_us(us u64) void {
    const freq = readCntfrq()
    if (freq == 0) {
        return // firmware left CNTFRQ unset — skip rather than spin
    }
    const start = readCntpct()
    const ticks = (freq * us) / 1_000_000
    while ((readCntpct() -% start) < ticks) {}
}

// ---------------------------------------------------------------------------
// Trace helpers (all to TRACE / Mini-UART; main_output_u64 prints 16 hex digits)
// ---------------------------------------------------------------------------

fn trace(s [*:0]u8) void {
    if (!TRACE_VERBOSE) {
        return
    }
    main_output(TRACE, s)
}
fn traceHex(s [*:0]u8, v u64) void {
    if (!TRACE_VERBOSE) {
        return
    }
    main_output(TRACE, s)
    main_output_u64(TRACE, v)
    main_output(TRACE, "\n")
}

// ---------------------------------------------------------------------------
// Bounded MMIO waits
// ---------------------------------------------------------------------------

fn waitSet(off u32, mask u32) bool {
    var i u32 = 0
    while (i < SPIN) {
        if ((reg_at(off).* & mask) != 0) {
            return true
        }
        i += 1
    }
    return false
}
fn waitClear(off u32, mask u32) bool {
    var i u32 = 0
    while (i < SPIN) {
        if ((reg_at(off).* & mask) == 0) {
            return true
        }
        i += 1
    }
    return false
}

fn flushTxFifos() void {
    reg_at(GRSTCTL).* = GRSTCTL_TXFNUM_ALL | GRSTCTL_TXFFLSH
    _ = waitClear(GRSTCTL, GRSTCTL_TXFFLSH)
}
fn flushRxFifo() void {
    reg_at(GRSTCTL).* = GRSTCTL_RXFFLSH
    _ = waitClear(GRSTCTL, GRSTCTL_RXFFLSH)
}

// ---------------------------------------------------------------------------
// EP0 control plumbing
// ---------------------------------------------------------------------------

// Arm OUT-EP0 to receive the next SETUP (and the OUT ZLP status of an IN
// transfer). MANDATORY after every control transfer — forgetting it means
// the second SETUP (e.g. SET_ADDRESS after GET_DESCRIPTOR) never arrives.
fn armOutSetup() void {
    reg_at(DOEPTSIZ0).* = DOEPTSIZ0_SUPCNT_3 | DXEPTSIZ_PKTCNT_1 | (8 * 1)
    reg_at(DOEPCTL0).* = reg_at(DOEPCTL0).* | DXEPCTL_EPENA | DXEPCTL_CNAK
}

// Send an EP0 IN transfer. The CDC config descriptor is 67 B > the 64-B FS
// EP0 max packet, so this packetizes: PktCnt = ceil(len / 64) (EP0 PktCnt is
// 2 bits → max 3 packets / 192 B; our largest transfer is the 67-B config).
// All bytes fit the 128-B EP0 TX FIFO at once, so slave-mode PIO pushes the
// whole transfer in one shot and the core splits it into max-packet chunks.
// len == 0 sends the status-stage ZLP. A short final packet (len not a
// multiple of 64 — true for every descriptor we serve) terminates the
// transfer, so no explicit ZLP is needed.
fn ep0SendData(data []u8) void {
    const len u32 = #intCast(data.len)
    const pktcnt u32 = if (len == 0) 1 else (len + EP0_MPS - 1) / EP0_MPS
    reg_at(DIEPTSIZ0).* = (pktcnt << DXEPTSIZ_PKTCNT_SHIFT) | len // PktCnt, XferSize=len
    reg_at(DIEPCTL0).* = reg_at(DIEPCTL0).* | DXEPCTL_EPENA | DXEPCTL_CNAK
    const words = (len + 3) / 4
    if (words == 0) {
        return // ZLP — EPENA alone sends the zero-length packet
    }
    var space u32 = 0
    var i u32 = 0
    while (i < SPIN) { // bounded wait for TX-FIFO space
        space = reg_at(DTXFSTS0).* & 0xFFFF
        if (space >= words) {
            break
        }
        i += 1
    }
    if (space < words) {
        trace("[usb] EP0 IN: TX-FIFO space timeout\n")
        return
    }
    var w u32 = 0
    while (w < words) {
        var word u32 = 0
        var b u32 = 0
        while (b < 4) {
            const idx = w * 4 + b
            if (idx < len) {
                word |= #as(u32, data[idx]) << #as(u5, #intCast(b * 8))
            }
            b += 1
        }
        reg_at(DFIFO0).* = word
        w += 1
    }
}

fn stallEp0() void {
    reg_at(DIEPCTL0).* = reg_at(DIEPCTL0).* | DXEPCTL_STALL
    reg_at(DOEPCTL0).* = reg_at(DOEPCTL0).* | DXEPCTL_STALL
    armOutSetup()
}

// ---------------------------------------------------------------------------
// Bulk + notify endpoint plumbing
// ---------------------------------------------------------------------------

// Arm EP2 OUT to receive one bulk packet. PktCnt=1 / XferSize=MPS: the host's
// next bulk-OUT lands in the shared RX FIFO (drained by onRxFifoLevel), then
// DOEPINT2.XferCompl fires and onOutEpInt re-arms. One packet per arming keeps
// the slave-mode loop simple; a console's OUT rate is human typing.
fn armEp2Out() void {
    reg_at(DOEPTSIZ2).* = DXEPTSIZ_PKTCNT_1 | EP_BULK_MPS
    reg_at(DOEPCTL2).* = reg_at(DOEPCTL2).* | DXEPCTL_EPENA | DXEPCTL_CNAK
}

// Hardware-configure the CDC data + notify endpoints. Called on
// SET_CONFIGURATION(>=1); the TX FIFO partitions were laid down in usb_init.
//   * EP1 IN  — interrupt notify, MPS 16, TX FIFO #1. Activated for a
//     well-formed config but never queued (CDC SERIAL_STATE is optional), so
//     it simply NAKs the host's interrupt polls.
//   * EP2 IN  — bulk, MPS 64, TX FIFO #2. Driven by serviceTxRing.
//   * EP2 OUT — bulk, MPS 64. Armed here; bytes route to console_push.
// SetD0PID starts each toggle at DATA0; the core auto-toggles afterwards.
fn configureDataEndpoints() void {
    reg_at(DIEPCTL1).* = DXEPCTL_USBACTEP | DXEPCTL_EPTYPE_INTR |
        DXEPCTL_TXFNUM_1 | DXEPCTL_SETD0PID | EP_NOTIFY_MPS
    reg_at(DIEPCTL2).* = DXEPCTL_USBACTEP | DXEPCTL_EPTYPE_BULK |
        DXEPCTL_TXFNUM_2 | DXEPCTL_SETD0PID | EP_BULK_MPS
    reg_at(DOEPCTL2).* = DXEPCTL_USBACTEP | DXEPCTL_EPTYPE_BULK |
        DXEPCTL_SETD0PID | EP_BULK_MPS
    // Aggregate EP2 OUT completion into GINTSTS.OEPINT. EP2 IN completion is
    // polled directly off DIEPINT2 in serviceTxRing (the per-EP status bit
    // latches independently of DAINTMSK), so it stays out of the mask.
    reg_at(DAINTMSK).* = reg_at(DAINTMSK).* | (1 << 18)
    ep2_in_busy = false
    data_configured = true
    armEp2Out()
    trace("[usb] data EPs configured (EP1 notify, EP2 bulk in/out)\n")
}

// SET_CONFIGURATION(0): tear the data path back down to the addressed state.
fn deconfigureDataEndpoints() void {
    data_configured = false
    ep2_in_busy = false
    reg_at(DAINTMSK).* = reg_at(DAINTMSK).* & ~#as(u32, 1 << 18)
    tx_ring.clear()
}

// ---------------------------------------------------------------------------
// SETUP decode + standard-request dispatch
// ---------------------------------------------------------------------------

fn dispatchSetup() void {
    const s = usb_desc.decodeSetup(&setup_packet)
    main_output(TRACE, "[usb] SETUP bmRT=")
    main_output_u64(TRACE, s.bmRequestType)
    main_output(TRACE, " bReq=")
    main_output_u64(TRACE, s.bRequest)
    main_output(TRACE, " wVal=")
    main_output_u64(TRACE, s.wValue)
    main_output(TRACE, " wLen=")
    main_output_u64(TRACE, s.wLength)
    main_output(TRACE, "\n")

    switch s.bRequest {
        usb_desc.REQ_GET_DESCRIPTOR => {
            if usb_desc.getDescriptor(s.descType(), s.descIndex()) |d| {
                const n u16 = #min(#as(u16, #intCast(d.len)), s.wLength)
                ep0SendData(d[0..n])
            } else {
                trace("[usb] GET_DESCRIPTOR unknown -> STALL\n")
                stallEp0()
            }
        },
        usb_desc.REQ_SET_ADDRESS => {
            // DWC2 quirk: program DCFG.DevAddr NOW (after decode, before the
            // status-stage ZLP) — the core latches it at status completion.
            const addr = s.address()
            var dcfg u32 = reg_at(DCFG).*
            dcfg &= ~DCFG_DEVADDR_MASK
            dcfg |= #as(u32, addr) << 4
            reg_at(DCFG).* = dcfg
            ep0SendData(&[_]u8{}) // ZLP status
            enum_state = .addressed
            traceHex("[usb] SET_ADDRESS=", addr)
        },
        usb_desc.REQ_SET_CONFIGURATION => {
            current_config = #truncate(s.wValue)
            // Bring the bulk + notify endpoints up (or tear them down on
            // config 0) before acking, so the host can stream immediately.
            if (current_config >= 1) {
                configureDataEndpoints()
            } else {
                deconfigureDataEndpoints()
            }
            ep0SendData(&[_]u8{}) // ZLP status
            enum_state = .configured
            enumerated_flag = (current_config >= 1)
            trace("[usb] SET_CONFIGURATION -> enumerated\n")
        },
        usb_desc.REQ_GET_CONFIGURATION => {
            ep0SendData(&[_]u8{current_config})
        },
        usb_desc.REQ_GET_STATUS => {
            ep0SendData(&[_]u8{ 0x00, 0x00 })
        },
        usb_desc.REQ_SET_FEATURE, usb_desc.REQ_CLEAR_FEATURE => {
            ep0SendData(&[_]u8{}) // ack, no-op
        },
        // --- CDC-ACM class requests (macOS sends these on tty open) ---
        usb_desc.REQ_GET_LINE_CODING => {
            ep0SendData(line_coding[0..]) // 7-byte line coding (control read)
        },
        usb_desc.REQ_SET_LINE_CODING => {
            // H2D with a 7-byte data stage. Defer the IN ZLP status to the OUT
            // XFRC (onOutEpInt); armOutSetup below receives the line coding.
            ep0_out_pending = true
        },
        usb_desc.REQ_SET_CONTROL_LINE_STATE => {
            // wValue bit0 = DTR. The host raises it when a terminal opens the
            // tty. The boot's first `login:` prompt is emitted before any
            // terminal is attached, so it never reaches the operator; on the
            // DTR rising edge we push one newline into the console RX ring so
            // the waiting login (or a running shell) re-emits a fresh prompt —
            // the operator sees `login:` the instant they connect instead of
            // typing the username blind. Rising-edge only, so a host that
            // re-asserts DTR cannot spam prompts. console_push here is the same
            // RX-ring entry the bulk-OUT data path uses below, same context.
            const dtr = (s.wValue & 0x0001) != 0
            if (dtr && !dtr_asserted) {
                console.console_push('\n')
            }
            dtr_asserted = dtr
            ep0SendData(&[_]u8{}) // wLength=0 → ZLP status
        },
        usb_desc.REQ_SEND_BREAK => {
            ep0SendData(&[_]u8{}) // wLength=0 → ZLP status (break ignored)
        },
        else => {
            trace("[usb] unhandled bReq -> STALL\n")
            stallEp0()
        },
    }
    armOutSetup() // ready for the OUT status / the data stage / the next SETUP
}

// ---------------------------------------------------------------------------
// GINTSTS event handlers
// ---------------------------------------------------------------------------

fn onUsbReset() void {
    reg_at(DCTL).* = reg_at(DCTL).* | DCTL_CGNPINNAK | DCTL_CGOUTNAK
    flushTxFifos()
    flushRxFifo()
    var dcfg u32 = reg_at(DCFG).*
    dcfg &= ~DCFG_DEVADDR_MASK // reset always returns to address 0
    reg_at(DCFG).* = dcfg
    armOutSetup()
    enum_state = .reset
    enumerated_flag = false
    dtr_asserted = false // re-attach after reset must re-fire the prompt nudge
    // A reset voids any in-flight bulk transfer and the host session;
    // drop the configured state + buffered TX so re-enumeration starts clean
    // (the TX FIFOs were just flushed above).
    data_configured = false
    ep2_in_busy = false
    tx_ring.clear()
    trace("[usb] USBRST: addr=0, EP0 re-armed\n")
}

fn onEnumDone() void {
    const spd = (reg_at(DSTS).* >> 1) & 0x3
    // EP0 max packet = 64 (FS) → MPS[1:0] = 00 on both control EPs.
    reg_at(DIEPCTL0).* = reg_at(DIEPCTL0).* & ~DXEPCTL_MPS_MASK
    reg_at(DOEPCTL0).* = reg_at(DOEPCTL0).* & ~DXEPCTL_MPS_MASK
    enum_state = .default_state
    traceHex("[usb] ENUMDONE speed=", spd)
}

// Save DAIF, then mask IRQs; the prior DAIF is handed back to irqRestore.
// Save/restore — NOT a blind irq_enable: onRxFifoLevel (below) runs from the
// idle-loop poll (IRQs on) AND the pre-enum timer-tick poll (IRQs already
// masked), so we must never unmask a mask the caller already held. "memory"
// clobber keeps the compiler from hoisting the ring RMW out of the region
// (the same full barrier a `bl irq_disable` call would have implied).
fn irqSave() u64 {
    const daif = asm volatile ("mrs %[v], daif"
        : [v] "=r" (-> u64),
    )
    asm volatile ("msr daifset, #2"
        :
        :
        : .{ .memory = true })
    return daif
}
fn irqRestore(daif u64) void {
    asm volatile ("msr daif, %[v]"
        :
        : [v] "r" (daif),
        : .{ .memory = true })
}

// RX FIFO non-empty: pop GRXSTSP ONCE and drain its data words. Every packet
// MUST be fully drained from DFIFO0 (even discarded ones) or the FIFO never
// empties and RXFLVL stays asserted forever. IRQs are masked across the whole
// drain (irqSave/irqRestore): the EP2 console_push below shares the console RX
// ring with the AUX mini-UART RX IRQ handler (board/rpi4b/irq.zig), so a nested
// console_push would race rx_head and drop/duplicate a byte. The window is one
// GRXSTSP packet — bounded and short.
fn onRxFifoLevel() void {
    const daif = irqSave()
    defer irqRestore(daif)
    const sts = reg_at(GRXSTSP).*
    const epnum = sts & 0xF
    const pktsts = (sts >> 17) & 0xF
    const bcnt = (sts >> 4) & 0x7FF
    const words = (bcnt + 3) / 4
    switch pktsts {
        PKTSTS_SETUP_DATA => {
            var captured u32 = 0
            var i u32 = 0
            while (i < words) {
                const word = reg_at(DFIFO0).*
                var b u32 = 0
                while (b < 4) {
                    if (captured < 8) {
                        setup_packet[captured] = #truncate(word >> #as(u5, #intCast(b * 8)))
                        captured += 1
                    }
                    b += 1
                }
                i += 1
            }
        },
        PKTSTS_OUT_DATA => {
            // Drain every word (or RXFLVL stays asserted). EP2 = CDC bulk OUT →
            // push each real byte into the console RX ring (the same ring fsh
            // reads). EP0 = a pending control-OUT write (SET_LINE_CODING) → keep
            // the first 7 bytes as the line coding.
            var captured u32 = 0
            var i u32 = 0
            while (i < words) {
                const word = reg_at(DFIFO0).*
                var b u32 = 0
                while (b < 4) {
                    const byte u8 = #truncate(word >> #as(u5, #intCast(b * 8)))
                    if (captured < bcnt) {
                        if (epnum == 2) {
                            console.console_push(byte)
                        } else if (ep0_out_pending && captured < line_coding.len) {
                            line_coding[captured] = byte
                        }
                    }
                    captured += 1
                    b += 1
                }
                i += 1
            }
            if (TRACE_BULK && epnum == 2) {
                traceHex("[usb] OUT2 bytes=", bcnt)
            }
        },
        else => {}, // SETUP_COMP / OUT_COMP / GOUT_NAK carry no data words
    }
}

fn onOutEpInt() void {
    const doepint = reg_at(DOEPINT0).*
    if ((doepint & DOEPINT_SETUP) != 0) {
        reg_at(DOEPINT0).* = DOEPINT_SETUP // write-1-clear
        dispatchSetup() // SETUP-complete is the decode trigger (setup_packet already captured)
    }
    if ((doepint & DXEPINT_XFERCOMPL) != 0) {
        reg_at(DOEPINT0).* = DXEPINT_XFERCOMPL
        if (ep0_out_pending) {
            // SET_LINE_CODING data stage done → send the IN ZLP status that
            // finishes the control-OUT write.
            ep0_out_pending = false
            ep0SendData(&[_]u8{})
            trace("[usb] SET_LINE_CODING\n")
        }
        armOutSetup() // OUT status / data complete → re-arm for the next SETUP
    }

    // EP2 bulk OUT transfer complete (data already drained by onRxFifoLevel) →
    // re-arm for the next packet.
    if (data_configured) {
        const doepint2 = reg_at(DOEPINT2).*
        if ((doepint2 & DXEPINT_XFERCOMPL) != 0) {
            reg_at(DOEPINT2).* = DXEPINT_XFERCOMPL
            armEp2Out()
        }
    }
}

fn onInEpInt() void {
    const diepint = reg_at(DIEPINT0).*
    if ((diepint & DXEPINT_XFERCOMPL) != 0) {
        reg_at(DIEPINT0).* = DXEPINT_XFERCOMPL
    }
    if ((diepint & DIEPINT_TIMEOUT) != 0) {
        reg_at(DIEPINT0).* = DIEPINT_TIMEOUT
    }
}

// ---------------------------------------------------------------------------
// Bulk-IN TX path (EP2)
// ---------------------------------------------------------------------------

// Push pending TX-ring bytes onto EP2 bulk IN, one max-packet (64 B) chunk per
// call. Self-contained: it retires a finished prior transfer by polling
// DIEPINT2.XferCompl directly (the per-EP status bit latches independent of
// DAINTMSK), so it makes progress whether driven from the poll loop (idle) or
// opportunistically from cdc_tx (a producer). preempt_disable makes the whole
// body the consumer critical section — mutually exclusive on this single core
// with cdc_tx's enqueue.
fn serviceTxRing() void {
    if (!data_configured) {
        return
    }
    preempt_disable()
    // Retire a completed transfer so the next chunk can launch.
    if (ep2_in_busy && (reg_at(DIEPINT2).* & DXEPINT_XFERCOMPL) != 0) {
        reg_at(DIEPINT2).* = DXEPINT_XFERCOMPL
        ep2_in_busy = false
    }
    if (!ep2_in_busy) {
        // Peek one max-packet chunk WITHOUT consuming it — advance only once
        // the TX FIFO has actually taken the bytes (peek is read-only, so a
        // FIFO-full bail leaves the chunk queued for the next pass).
        var chunk_buf [EP_BULK_MPS]u8 = undefined
        const chunk u32 = #intCast(tx_ring.peek(chunk_buf[0..]))
        if (chunk > 0) {
            const words u32 = (chunk + 3) / 4
            // Only launch if the EP2 TX FIFO can take the whole chunk now.
            if ((reg_at(DTXFSTS2).* & 0xFFFF) >= words) {
                reg_at(DIEPTSIZ2).* = DXEPTSIZ_PKTCNT_1 | chunk // PktCnt=1, XferSize=chunk
                reg_at(DIEPCTL2).* = reg_at(DIEPCTL2).* | DXEPCTL_EPENA | DXEPCTL_CNAK
                var w u32 = 0
                while (w < words) {
                    var word u32 = 0
                    var b u32 = 0
                    while (b < 4) {
                        const idx = w * 4 + b
                        if (idx < chunk) {
                            word |= #as(u32, chunk_buf[idx]) << #as(u5, #intCast(b * 8))
                        }
                        b += 1
                    }
                    reg_at(DFIFO2).* = word
                    w += 1
                }
                tx_ring.advance(chunk)
                ep2_in_busy = true
                if (TRACE_BULK) {
                    traceHex("[usb] IN2 chunk=", chunk)
                }
            }
        }
    }
    preempt_enable()
}

// Push one byte into the TX ring with the bounded spin-then-drop backpressure
// policy: when the ring is full, spin briefly draining the hardware to make
// room, then drop — the kernel must never block on a host that stopped
// reading.
fn txPushByte(byte u8) void {
    var tries u32 = 0
    while true {
        preempt_disable()
        const ok = tx_ring.push(byte)
        preempt_enable()
        if (ok) {
            return
        }
        serviceTxRing() // ring full → drain a chunk to the FIFO, then retry
        tries += 1
        if (tries >= TX_SPIN) {
            return // host not draining → drop this byte
        }
    }
}

// Queue console bytes for the host over EP2 bulk IN. Called by the console mux
// (sys.zig console_tx). data_configured gates it (before enumeration the
// caller falls back to the UART).
pub fn cdc_tx(data []u8) void {
    if (!data_configured) {
        return
    }
    for byte in data {
        // Terminals need CRLF; the kernel writes LF-only. Mirror the
        // Mini-UART driver's newline translation (uart.zig does the same) so
        // both console transports render identically.
        if (byte == '\n') {
            txPushByte('\r')
        }
        txPushByte(byte)
    }
    serviceTxRing() // kick: push what we just queued without waiting for idle
}

// ---------------------------------------------------------------------------
// Connection manager — when to be electrically visible
//
// A USB bus reset hardware-disarms EP0 OUT (DOEPTSIZ0.SUPCNT / DOEPCTL0.EPENA
// do not survive USBRST — Linux dwc2 and TinyUSB re-arm on every reset for
// exactly this reason). The host sends its first SETUP ~20 ms after the reset
// ends, so a SETUP is only ACKed if software re-arms EP0 inside that window.
// The PID-0 idle loop (µs-rate polls) can; the 1 Hz timer-tick backstop
// (board irq.zig) never can. During the boot harness the idle loop is starved,
// so every enumeration attempt the host makes in that window is doomed — and
// macOS permanently disables the port after ~4 failed attempts (~20 s),
// recoverable only by a fresh D+ attach event. Asserting the pull-up inside
// usb_init therefore guarantees a dead console (HW-diagnosed 2026-06-01:
// USBRST/ENUMDONE pairs processed, zero SETUPs ever seen).
//
// Policy: stay detached (DCTL.SftDiscon = 1) until poll() has been arriving
// at idle-loop rate for a sustained window — only then assert the pull-up.
// If the host then fails to enumerate (it gave up before we attached, or the
// system went busy mid-enumeration), pulse a detach long enough for the host
// to register it and re-attach: electrically identical to a physical replug,
// which clears macOS's port-disable state. All timing is wall-clock off
// CNTPCT (the same clock delay_us uses), never iteration counts.
// ---------------------------------------------------------------------------

const ConnState = enum { detached, attaching, attached, pulsing }
var conn_state ConnState = .detached
var last_poll_ts u64 = 0 // CNTPCT at the previous poll() — the gap detector
var idle_streak_ts u64 = 0 // CNTPCT when the current gap-free streak began
var conn_state_ts u64 = 0 // CNTPCT when conn_state last changed

// A poll-to-poll gap above this means the idle loop does NOT own the CPU
// (boot harness / long user command): EP0 re-arm latency would exceed the
// host's SETUP window, so being attached is pointless.
const MAX_POLL_GAP_MS u64 = 10
// Gap-free polling for this long ⇒ sustained idle ⇒ safe to become visible.
const CONNECT_IDLE_MS u64 = 2_000
// Attached but not enumerated for this long ⇒ the host is not talking to us
// (it gave up before we attached, or went away) ⇒ force a fresh attach.
const ENUM_TIMEOUT_MS u64 = 10_000
// D+ released for this long during the re-attach pulse. Must clear the host's
// connect debounce (~100 ms, USB 2.0 §7.1.7.3) with a wide margin.
const DETACH_PULSE_MS u64 = 1_000

inline fn msToTicks(freq u64, ms u64) u64 {
    return (freq * ms) / 1000
}

fn serviceConnection() void {
    const freq = readCntfrq()
    if (freq == 0) {
        return // no counter clock → stay detached (bring-up is dead anyway)
    }
    const now = readCntpct()
    if ((now -% last_poll_ts) > msToTicks(freq, MAX_POLL_GAP_MS)) {
        idle_streak_ts = now // gap → restart the sustained-idle streak
    }
    last_poll_ts = now
    switch conn_state {
        .detached => {
            if ((now -% idle_streak_ts) > msToTicks(freq, CONNECT_IDLE_MS)) {
                reg_at(DCTL).* = reg_at(DCTL).* & ~DCTL_SFT_DISCON
                conn_state = .attaching
                conn_state_ts = now
                // Connection-state traces kept for bring-up but commented out:
                // with no host attached, the attach/timeout/pulse cycle below
                // repeats forever and every transition would land on the fsh
                // console (same MU). Uncomment all three to debug enumeration.
                // trace("[usb] soft-connect (pull-up on)\n");
            }
        },
        .attaching => {
            if (enumerated_flag) {
                conn_state = .attached
                conn_state_ts = now
            } else if ((now -% conn_state_ts) > msToTicks(freq, ENUM_TIMEOUT_MS)) {
                reg_at(DCTL).* = reg_at(DCTL).* | DCTL_SFT_DISCON
                conn_state = .pulsing
                conn_state_ts = now
                // trace("[usb] no enumeration -> detach pulse\n");
            }
        },
        .attached => {
            // onUsbReset cleared enumerated_flag → the host is re-enumerating
            // (host-side replug / port reset). Give it the standard window;
            // if it goes silent instead, .attaching times out into a pulse.
            if (!enumerated_flag) {
                conn_state = .attaching
                conn_state_ts = now
            }
        },
        .pulsing => {
            if ((now -% conn_state_ts) > msToTicks(freq, DETACH_PULSE_MS)) {
                reg_at(DCTL).* = reg_at(DCTL).* & ~DCTL_SFT_DISCON
                conn_state = .attaching
                conn_state_ts = now
                // trace("[usb] re-attach (pull-up on)\n");
            }
        },
    }
}

// ---------------------------------------------------------------------------
// Public surface
// ---------------------------------------------------------------------------

// Re-entrancy guard: poll() is reachable from BOTH the PID-0 idle loop
// (kernel.zig) and — until enumeration completes — the timer IRQ (board
// irq.zig, the 1 Hz service backstop). On a single core the IRQ can
// fire mid-poll; the guard turns the nested call into a no-op so the
// GRXSTSP/FIFO read sequences never interleave.
var in_poll bool = false

// One GINTSTS service pass; driven from the PID-0 idle loop (kernel.zig) and,
// until the gadget enumerates, from the timer tick (board irq.zig).
// No-op until a successful usb_init. Single bounded pass — no internal loop —
// so it can never hang the QEMU watchdog. Event-gated trace (idle = silent).
pub fn poll() void {
    if (!inited) {
        return
    }
    if (in_poll) {
        return
    }
    in_poll = true
    defer in_poll = false
    // Connection management first, INSIDE the in_poll guard: nested timer-IRQ
    // calls must neither pollute the idle-gap measurement nor race the state
    // machine.
    serviceConnection()
    const g = reg_at(GINTSTS).*
    if ((g & GINTSTS_USBRST) != 0) {
        onUsbReset()
        reg_at(GINTSTS).* = GINTSTS_USBRST
    }
    if ((g & GINTSTS_ENUMDONE) != 0) {
        onEnumDone()
        reg_at(GINTSTS).* = GINTSTS_ENUMDONE
    }
    if ((g & GINTSTS_RXFLVL) != 0) {
        onRxFifoLevel() // self-clears via GRXSTSP
    }
    if ((g & GINTSTS_OEPINT) != 0) {
        onOutEpInt()
    }
    if ((g & GINTSTS_IEPINT) != 0) {
        onInEpInt()
    }
    serviceTxRing() // drain queued bulk-IN bytes (no-op until data_configured)
}

pub fn enumerated() bool {
    return enumerated_flag
}

// Bring the OTG core up as a polled Full-Speed device. Returns 0 on success,
// -1 on any bounded-wait timeout / absent core (kernel logs + degrades).
pub fn usb_init() i32 {
    // 1. Power the USB HCD domain (defensive; firmware usually pre-powers).
    _ = mbox.setPowerState(mailbox.DEVICE_ID_USB_HCD, mailbox.POWER_STATE_ON | mailbox.POWER_STATE_WAIT)
    delay_us(2_000)

    // 0. Diagnostic dump + dead-MMIO gate. A live DWC2 core answers GSNPSID
    //    with an "OT" signature; QEMU (no device path) reads 0 / 0xFFFFFFFF.
    const snpsid = reg_at(GSNPSID).*
    traceHex("[usb] GSNPSID=", snpsid)
    traceHex("[usb] GHWCFG3=", reg_at(GHWCFG3).*)
    if (snpsid == 0 || snpsid == 0xFFFFFFFF) {
        trace("[usb] no DWC2 core (dead MMIO) -> skip\n")
        return -1
    }

    // 2. Wait for AHB idle, then 3. core soft-reset.
    if (!waitSet(GRSTCTL, GRSTCTL_AHBIDLE)) {
        trace("[usb] AHBIDLE timeout\n")
        return -1
    }
    reg_at(GRSTCTL).* = reg_at(GRSTCTL).* | GRSTCTL_CSFTRST
    if (!waitClear(GRSTCTL, GRSTCTL_CSFTRST) || !waitSet(GRSTCTL, GRSTCTL_AHBIDLE)) {
        trace("[usb] CSFTRST timeout\n")
        return -1
    }
    trace("[usb] core soft-reset done\n")

    // 3a. Stay electrically detached (D+ released) through the rest of
    //     bring-up AND the rest of OS boot — the connection manager
    //     (serviceConnection) asserts the pull-up only once the idle loop
    //     demonstrably owns the CPU. Explicit write because the CSFTRST
    //     reset value of SftDiscon differs across core versions.
    reg_at(DCTL).* = reg_at(DCTL).* | DCTL_SFT_DISCON

    // 4. PHY / mode select: force device mode, clear host mode, pick PHY.
    var gusbcfg u32 = reg_at(GUSBCFG).*
    gusbcfg &= ~GUSBCFG_FORCE_HST
    gusbcfg |= GUSBCFG_FORCE_DEV
    if (USE_FS_SERIAL_PHY) {
        gusbcfg |= GUSBCFG_PHYSEL
    } else {
        gusbcfg &= ~GUSBCFG_PHYSEL
    }
    reg_at(GUSBCFG).* = gusbcfg
    traceHex("[usb] GUSBCFG=", gusbcfg)

    // 5. ~25 ms settle after ForceDevMode (DWC2 programming-guide requirement;
    //    skipping it silently wedges bring-up).
    delay_us(25_000)
    trace("[usb] post-forcedev settle\n")

    // 6. Slave/PIO + polled: DMAEn=0, GlblIntrMsk=0 (no IRQ to the GIC).
    reg_at(GAHBCFG).* = (reg_at(GAHBCFG).* & ~GAHBCFG_DMA_EN & ~GAHBCFG_GLBL_INTR_MSK) | GAHBCFG_TXF_EMP_LVL

    // 7. Full-Speed.
    var dcfg u32 = reg_at(DCFG).*
    dcfg &= ~DCFG_DEVSPD_MASK
    dcfg |= if (USE_FS_SERIAL_PHY) DCFG_DEVSPD_FS_DEDICATED else DCFG_DEVSPD_FS_HS_PHY
    dcfg &= ~DCFG_DEVADDR_MASK
    reg_at(DCFG).* = dcfg
    traceHex("[usb] DCFG=", dcfg)

    // 8. FIFO partition (words), all inside the core SPRAM (GHWCFG3[31:16]):
    //      RX (shared OUT)      64 @ 0
    //      EP0 IN  (GNPTXFSIZ)  32 @ 64
    //      EP1 IN  (DIEPTXF1)   16 @ 96    (CDC notify)
    //      EP2 IN  (DIEPTXF2)   64 @ 112   (CDC bulk)
    //    Write GRXFSIZ before the TX partitions. BCM2711 SPRAM is 4080 words,
    //    so 176 words used leaves vast headroom. The EP1/EP2 partitions are
    //    laid down here (static) so configureDataEndpoints only flips DIEPCTL.
    const spram = reg_at(GHWCFG3).* >> 16
    const rxfsiz u32 = 64
    const nptx_depth u32 = 32
    const ep1_start u32 = rxfsiz + nptx_depth // 96
    const ep1_depth u32 = 16
    const ep2_start u32 = ep1_start + ep1_depth // 112
    const ep2_depth u32 = 64
    if (spram != 0 && (ep2_start + ep2_depth) > spram) {
        trace("[usb] FIFO partition > SPRAM\n")
        return -1
    }
    reg_at(GRXFSIZ).* = rxfsiz
    reg_at(GNPTXFSIZ).* = (nptx_depth << 16) | rxfsiz
    reg_at(DIEPTXF1).* = (ep1_depth << 16) | ep1_start
    reg_at(DIEPTXF2).* = (ep2_depth << 16) | ep2_start
    traceHex("[usb] GRXFSIZ=", reg_at(GRXFSIZ).*)
    traceHex("[usb] GNPTXFSIZ=", reg_at(GNPTXFSIZ).*)
    traceHex("[usb] DIEPTXF2=", reg_at(DIEPTXF2).*)

    // 9. Unmask the device interrupts we poll on (never SOF), clear stale.
    reg_at(GINTMSK).* = GINTSTS_USBRST | GINTSTS_ENUMDONE | GINTSTS_RXFLVL |
        GINTSTS_IEPINT | GINTSTS_OEPINT | GINTSTS_USBSUSP
    reg_at(DOEPMSK).* = DXEPINT_XFERCOMPL | DOEPINT_SETUP
    reg_at(DIEPMSK).* = DXEPINT_XFERCOMPL | DIEPINT_TIMEOUT
    reg_at(DAINTMSK).* = (1 << 0) | (1 << 16) // IN-EP0 + OUT-EP0
    reg_at(GINTSTS).* = 0xFFFFFFFF // write-1-clear all stale bits

    // 10. Force B-session valid (Mac sources VBUS; external power may leave
    //     session sense unreliable).
    reg_at(GOTGCTL).* = reg_at(GOTGCTL).* | GOTGCTL_BVALOEN | GOTGCTL_BVALOVAL

    // Arm OUT-EP0 for the first SETUP before the host can enumerate.
    armOutSetup()

    // 11. Do NOT soft-connect here. Becoming host-visible before the kernel
    //     can answer SETUPs inside the host's timing guarantees a failed —
    //     and on macOS permanently abandoned — enumeration. The connection
    //     manager above asserts the pull-up once sustained idle is reached.
    conn_state = .detached
    last_poll_ts = 0
    idle_streak_ts = 0
    conn_state_ts = 0
    enum_state = .reset
    enumerated_flag = false
    inited = true
    trace("[usb] init done (detached); connect deferred to idle\n")
    return 0
}