FlashOS/tools/grep_match.flash

Flash 150 lines
// grep match core — the pure substring matcher /bin/grep drives.
//
// A tool-local seam in the spirit of fsh's tokenize.flash and flibc's pager:
// the one piece of grep worth host-testing in isolation, kept apart from the
// driver half (argv parsing, open/read, line assembly) that lives in
// tools/grep.flash. Pure by construction — no allocator, no module state, no
// SVC, no flibc dependency — so it compiles and runs on the host under
// `zig build test` exactly as it does on the device.
//
// The match is a plain windowed substring scan, optionally case-folded over
// ASCII: grep needs no regex for its first cut (the product vision asks only
// for literal-pattern line search). An empty pattern matches every line, the
// GNU grep convention a `grep '' FILE` relies on.

/// True when `line` contains `pat` as a contiguous substring. With
/// `ignore_case`, A–Z fold to a–z on both sides before comparing (ASCII only;
/// bytes >= 0x80 are matched verbatim). An empty `pat` matches every line; a
/// `pat` longer than `line` never matches. O(line.len * pat.len) — fine for the
/// short lines a serial-console grep sees.
pub fn lineContains(line []u8, pat []u8, ignore_case bool) bool {
    if pat.len == 0 {
        return true
    }
    if pat.len > line.len {
        return false
    }
    var i usize = 0
    // Last start offset that still leaves room for the whole pattern.
    while i + pat.len <= line.len {
        if windowEq(line[i .. i + pat.len], pat, ignore_case) {
            return true
        }
        i += 1
    }
    return false
}

/// Byte-equality over two equal-length slices, folding case when asked. The
/// caller guarantees `a.len == pat.len`, so the scan walks `pat.len` bytes.
fn windowEq(a []u8, pat []u8, ignore_case bool) bool {
    var j usize = 0
    while j < pat.len {
        var x = a[j]
        var y = pat[j]
        if ignore_case {
            x = lower(x)
            y = lower(y)
        }
        if x != y {
            return false
        }
        j += 1
    }
    return true
}

/// ASCII lowercase fold: A–Z (0x41–0x5A) map to a–z by +0x20; every other byte
/// is returned unchanged. The guard keeps the add in range (max 'Z'+32 == 'z'),
/// so no overflow even with ReleaseSmall's traps compiled out.
fn lower(c u8) u8 {
    return if (c >= 'A' && c <= 'Z') c + 32 else c
}

/// First offset >= `from` at which `needle` occurs in `hay`, or null if there is
/// none. The offset-returning sibling of lineContains, for /bin/edit's ctrl-W
/// search-from-cursor (grep itself stays on the bool form). Case-sensitive —
/// this search does not fold case. An empty `needle` returns null so a blank search
/// is an inert no-op rather than a match that never advances the cursor. Same
/// windowed scan as lineContains; O((hay.len - from) * needle.len).
pub fn find(hay []u8, needle []u8, from usize) ?usize {
    if needle.len == 0 || from >= hay.len {
        return null
    }
    var i usize = from
    while i + needle.len <= hay.len {
        if windowEq(hay[i .. i + needle.len], needle, false) {
            return i
        }
        i += 1
    }
    return null
}

// ---- host tests ------------------------------------------------------------

const std = #import("std")
const testing = std.testing

test "substring hit anywhere in the line" {
    try testing.expect(lineContains("hello world", "world", false))
    try testing.expect(lineContains("hello world", "hello", false))
    try testing.expect(lineContains("hello world", "lo wo", false))
}

test "no match returns false" {
    try testing.expect(!lineContains("hello world", "xyz", false))
    try testing.expect(!lineContains("abc", "abcd", false)) // pat longer than line
}

test "empty pattern matches every line, even empty" {
    try testing.expect(lineContains("anything", "", false))
    try testing.expect(lineContains("", "", false))
}

test "empty line matches only the empty pattern" {
    try testing.expect(!lineContains("", "a", false))
}

test "case-sensitive by default" {
    try testing.expect(!lineContains("Hello", "hello", false))
    try testing.expect(lineContains("Hello", "Hello", false))
}

test "ignore_case folds both sides over ASCII" {
    try testing.expect(lineContains("Hello World", "hello", true))
    try testing.expect(lineContains("hello world", "WORLD", true))
    try testing.expect(lineContains("MiXeD", "mixed", true))
}

test "ignore_case leaves non-letters and high bytes alone" {
    try testing.expect(lineContains("a1b2c3", "1B2", true))
    // 0x80 has no case; it must match itself, not fold.
    const hi = [_]u8{ 'x', 0x80, 'y' }
    try testing.expect(lineContains(&hi, &hi, true))
}

test "match at the very start and very end" {
    try testing.expect(lineContains("abcdef", "ab", false))
    try testing.expect(lineContains("abcdef", "ef", false))
    try testing.expect(lineContains("abcdef", "abcdef", false))
}

test "find returns the first offset at or after from" {
    try testing.expectEqual(#as(?usize, 0), find("abcabc", "abc", 0))
    try testing.expectEqual(#as(?usize, 3), find("abcabc", "abc", 1)) // skip the first hit
    try testing.expectEqual(#as(?usize, 2), find("xxneedle", "needle", 0))
}

test "find reports no match" {
    try testing.expectEqual(#as(?usize, null), find("abc", "xyz", 0))
    try testing.expectEqual(#as(?usize, null), find("abc", "abcd", 0)) // needle longer than hay
    try testing.expectEqual(#as(?usize, null), find("abcabc", "abc", 4)) // no hit past offset 4
}

test "find with from past the end or an empty needle yields null" {
    try testing.expectEqual(#as(?usize, null), find("abc", "a", 3))
    try testing.expectEqual(#as(?usize, null), find("abc", "a", 99))
    try testing.expectEqual(#as(?usize, null), find("abc", "", 0)) // blank search is inert
}
raw view on GitHub →