All files / src/lib/scan regex.ts

97.61% Statements 82/84
95.65% Branches 44/46
100% Functions 8/8
97.61% Lines 82/84

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277                                                                          169x     169x                                 117x 117x 102x   15x 15x 15x   15x   9x             6x 6x     1x   5x                                 6x 6x 6x 6x 37x   37x       37x 5x 1x   5x 5x   32x 1x 1x 1x   31x 1x 1x 1x   30x 6x 6x   5x     25x   1x     169x 169x 169x 169x 169x               15x 15x 18x   1x 17x 17x       15x 18x                                                                         145x 41x   104x 104x 104x 5x   104x 6x   104x 96x   104x 104x 104x   4x                                     4x 1x   3x                             3x 1x   2x               216x 216x 216x 31x   185x 185x 169x   185x 19x   185x    
/**
 * Minimal regex translation for user-supplied grep patterns.
 *
 * The init wizard's Mastra server sends regex sources written for
 * ripgrep (Rust regex syntax). JS `RegExp` covers almost everything
 * rg's default mode supports, with one real gap: **inline flag groups**
 * like `(?i)foo`. JS requires flags at RegExp construction time; it
 * can't flip them mid-pattern.
 *
 * This module bridges that gap by recognizing a leading `(?[imsU]+)`
 * or `(?[imsU]+:…)` and translating it to JS flags. Everything else
 * is passed to `new RegExp` unchanged — if it's not valid JS regex,
 * `ValidationError` is thrown with the engine's error message.
 *
 * ### Scope
 *
 * - Leading-only. `foo(?i)bar` (mid-pattern flag) stays as-is, which
 *   will typically fail to compile under JS and raise ValidationError.
 * - Flag mapping: `i` → `i`, `m` → `m`, `s` → `s`, `U` → `s` (rg's
 *   `U` == multiline-dotall is modeled by JS's `/s` flag).
 * - The scoped form `(?i:foo)bar` is translated as
 *   `{ cleaned: "foobar", flags: "i" }` — we widen the flag to the
 *   whole pattern because JS can't scope flags to a group. This is a
 *   documented limitation.
 */
 
import { ValidationError } from "../errors.js";
 
/**
 * Matches a leading inline-flag group at position 0 of a regex source.
 *
 * - Group 1 captures the flag letters (one or more of `i`, `m`, `s`, `U`).
 * - Group 2 captures the terminator: `:` for the scoped `(?i:...)` form,
 *   `)` for the bare `(?i)` form.
 *
 * Negated flags (e.g. rg's `(?-i)`) are not supported: `-` is not in the
 * character class, so such a group does not match here and the pattern is
 * handed to `new RegExp` untouched, which is left to accept or reject it.
 * They are rare and harder to translate cleanly.
 */
const INLINE_FLAG_RE = /^\(\?([imsU]+)(:|\))/;
 
/**
 * Allow-list of JS RegExp flag letters that `translateFlags` may emit,
 * written in alphabetical order. `translateFlags` filters its result
 * against this string. `u` is listed even though no translation in this
 * module currently produces it.
 */
const VALID_JS_FLAGS = "imsu";
 
/**
 * Split a leading inline-flag group off `source`.
 *
 * Two forms are recognized, both only at position 0:
 *
 * - `(?i)pattern` — the flag group is removed outright.
 * - `(?i:pattern)tail` — handled by `unwrapScopedGroup`, which locates the
 *   matching `)` and produces the flag-free source. The flag ends up
 *   applying to the whole pattern because the result carries a single
 *   flag string.
 *
 * @param source - Regex source as written by the user.
 * @returns `cleaned` (the source without the inline flag letters) and
 *   `flags` (the equivalent JS flag letters). Callers merge `flags` with
 *   their own options (e.g. `caseSensitive: false` → force `i`) and pass
 *   both to `new RegExp`. When there is no leading flag group, or the
 *   scoped group has no matching `)`, the result is
 *   `{ cleaned: source, flags: "" }` and the rest of the pattern is not
 *   inspected further.
 */
export function extractInlineFlags(source: string): {
  cleaned: string;
  flags: string;
} {
  const hit = INLINE_FLAG_RE.exec(source);
  if (hit === null) {
    return { cleaned: source, flags: "" };
  }

  const prefixLength = hit[0].length;
  const jsFlags = translateFlags(hit[1] as string);
  const isScoped = (hit[2] as ":" | ")") === ":";

  if (!isScoped) {
    // Bare `(?i)` prefix: everything after it is the pattern.
    return { cleaned: source.slice(prefixLength), flags: jsFlags };
  }

  // Scoped `(?i:` prefix: the helper walks the source from the first
  // character after `:` to the matching `)`, tracking nested parens,
  // escapes and `[...]` classes (a `)` inside a class closes nothing).
  const rewritten = unwrapScopedGroup(source, prefixLength);

  // `null` means no matching `)` was found. Hand the source back as-is
  // with no flags so that `new RegExp` downstream reports the problem.
  return rewritten === null
    ? { cleaned: source, flags: "" }
    : { cleaned: rewritten, flags: jsFlags };
}
 
// Character codes compared against `charCodeAt` results in
// `unwrapScopedGroup`.
const CHAR_BACKSLASH = "\\".charCodeAt(0);
const CHAR_OPEN_PAREN = "(".charCodeAt(0);
const CHAR_CLOSE_PAREN = ")".charCodeAt(0);
const CHAR_OPEN_BRACKET = "[".charCodeAt(0);
const CHAR_CLOSE_BRACKET = "]".charCodeAt(0);

/**
 * Rewrite a leading scoped flag group `(?i:foo)bar` as `(?:foo)bar`.
 *
 * The flag letters are dropped (the caller returns them separately as
 * whole-pattern flags), but the group itself is kept as a non-capturing
 * group. Removing the parentheses outright would change what the pattern
 * matches whenever the group contains an alternation or is followed by a
 * quantifier: `(?i:a|b)c` would become `a|bc`, and `(?i:ab)+` would
 * become `ab+`. A non-capturing group keeps capture numbering unchanged.
 *
 * The scan tracks three pieces of state — pending escape, inside a
 * `[...]` class, and paren depth — and each needs its own branch, hence
 * the branchy control flow.
 *
 * @param source - Full pattern source, beginning with the `(?flags:` prefix.
 * @param openIndex - Index one past the `:` (first character of the
 *   group body).
 * @returns The rewritten source, or `null` when no matching `)` exists
 *   (unterminated group, or the scan runs off the end), so the caller can
 *   fall back to "no translation".
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: regex tokenizer is inherently branchy
function unwrapScopedGroup(source: string, openIndex: number): string | null {
  let depth = 1;
  let i = openIndex;
  let inClass = false;
  while (i < source.length) {
    const ch = source.charCodeAt(i);
    // Backslash escapes the next char regardless of context.
    if (ch === CHAR_BACKSLASH) {
      i += 2;
      continue;
    }
    if (inClass) {
      // Inside `[...]` only `]` is structural; parens are literals.
      if (ch === CHAR_CLOSE_BRACKET) {
        inClass = false;
      }
      i += 1;
      continue;
    }
    if (ch === CHAR_OPEN_BRACKET) {
      inClass = true;
      i += 1;
      continue;
    }
    if (ch === CHAR_OPEN_PAREN) {
      depth += 1;
      i += 1;
      continue;
    }
    if (ch === CHAR_CLOSE_PAREN) {
      depth -= 1;
      if (depth === 0) {
        // Body is source[openIndex..i]; tail is source[i+1..]. Keep the
        // body grouped so alternation and quantifier scope survive.
        return `(?:${source.slice(openIndex, i)})${source.slice(i + 1)}`;
      }
    }
    i += 1;
  }
  return null;
}
 
/**
 * Translate rg-style inline flag letters to JS RegExp flag letters.
 *
 * Mapping: `i` → `i`, `m` → `m`, `s` → `s`, `U` → `s`. Duplicates
 * collapse, and any other letter is dropped silently (the guard regex
 * already restricts the input to `[imsU]+`).
 *
 * NOTE(review): the Rust `regex` syntax reference lists inline `U` as the
 * "swap greedy/lazy" flag and `s` as dotall. The `U` → `s` mapping below
 * is kept as-is; confirm it is what the pattern authors intend.
 *
 * @param raw - Flag letters captured from the inline group, e.g. `"iU"`.
 * @returns The JS flag letters, de-duplicated and sorted, e.g. `"is"`.
 */
function translateFlags(raw: string): string {
  const seen = new Set<string>();
  for (const letter of raw) {
    if (letter === "U") {
      // Modeled with JS /s (see the review note in the doc comment).
      seen.add("s");
    } else if (letter === "i" || letter === "m" || letter === "s") {
      seen.add(letter);
    }
  }
  // Deterministic, RegExp-accepted order.
  return [...seen]
    .filter((f) => VALID_JS_FLAGS.includes(f))
    .sort()
    .join("");
}
 
/**
 * Options for `compilePattern`. Both fields are optional; leaving one
 * out means `compilePattern` adds no flag on its behalf.
 */
export type CompilePatternOptions = {
  /**
   * When explicitly `false`, forces the `i` flag regardless of inline
   * flags. Omitted or `true` leaves matching case-sensitive (matching
   * `rg`'s default) unless the pattern itself carries an inline `i`.
   */
  caseSensitive?: boolean;
  /** When truthy, forces the `m` flag. Default: false. */
  multiline?: boolean;
};
 
/**
 * Turn a user-supplied grep pattern into a JS `RegExp`.
 *
 * - A `RegExp` argument is trusted and handed back untouched; `opts` is
 *   ignored for it, so a caller wanting `caseSensitive: false` on an
 *   existing RegExp has to rebuild it.
 * - A string argument is run through `extractInlineFlags`, the inline
 *   flags are merged with the flags requested via `opts`, and the result
 *   is compiled with `new RegExp`. No `g` flag is ever added here: grep
 *   tests one line at a time, and `g`'s `lastIndex` state is a foot-gun
 *   in that usage. Callers needing a `matchAll`-style regex add it
 *   themselves.
 *
 * @param pattern - Regex source string, or an already compiled RegExp.
 * @param opts - Extra flags to force; see `CompilePatternOptions`.
 * @returns The compiled (or passed-through) RegExp.
 * @throws ValidationError when `new RegExp` rejects the pattern; the
 *   engine's message is kept for user-facing diagnostics.
 */
export function compilePattern(
  pattern: string | RegExp,
  opts: CompilePatternOptions = {}
): RegExp {
  if (pattern instanceof RegExp) {
    return pattern;
  }

  const extracted = extractInlineFlags(pattern);

  // Start from the inline flags, then layer the option-driven ones on top.
  const wanted = new Set<string>(extracted.flags);
  if (opts.caseSensitive === false) {
    wanted.add("i");
  }
  if (opts.multiline) {
    wanted.add("m");
  }
  const flagString = Array.from(wanted).sort().join("");

  try {
    return new RegExp(extracted.cleaned, flagString);
  } catch (error) {
    const reason = (error as Error).message;
    throw new ValidationError(`Invalid grep pattern: ${reason}`, "pattern");
  }
}
 
/**
 * Guarantee the `g` flag on a RegExp.
 *
 * A regex that already carries `g` is returned as the same object; any
 * other regex is cloned from its `source` with `g` appended to its
 * existing flags.
 *
 * Whole-buffer iteration — `content.matchAll(regex)`, or
 * `regex.exec(content)` with manual `lastIndex` handling — requires `g`,
 * and `compilePattern` never sets it, so callers route their regex
 * through this helper before iterating a whole file buffer.
 *
 * @param regex - Regex to check.
 * @returns `regex` itself, or a clone with `g` added.
 */
export function ensureGlobalFlag(regex: RegExp): RegExp {
  const { flags, source } = regex;
  return flags.includes("g") ? regex : new RegExp(source, `${flags}g`);
}
 
/**
 * Guarantee the `m` (multiline) flag on a RegExp, so `^` and `$` match
 * at line boundaries inside a multi-line buffer.
 *
 * A regex that already carries `m` is returned as the same object; any
 * other regex is cloned from its `source` with `m` appended to its
 * existing flags.
 *
 * Background: grep used to split content on `\n` and test each line on
 * its own, so `^` matched at every line start simply because each line
 * was a separate string. With whole-buffer iteration via `matchAll`, a
 * pattern like `^foo` needs `m` to behave the same way — without it, `^`
 * anchors to the buffer start and can only match on the first line.
 *
 * @param regex - Regex to check.
 * @returns `regex` itself, or a clone with `m` added.
 */
export function ensureMultilineFlag(regex: RegExp): RegExp {
  const { flags, source } = regex;
  return flags.includes("m") ? regex : new RegExp(source, `${flags}m`);
}
 
/**
 * Guarantee both the `g` and `m` flags on a RegExp with at most one clone.
 *
 * Equivalent to applying `ensureGlobalFlag` and `ensureMultilineFlag` in
 * turn, but without building a throwaway intermediate RegExp. A regex
 * that already carries both flags is returned as the same object.
 *
 * @param regex - Regex to check.
 * @returns `regex` itself, or a clone with the missing flag(s) added.
 */
export function ensureGlobalMultilineFlags(regex: RegExp): RegExp {
  // Collect whichever of the two flags is absent, in `g`, `m` order.
  const missing = ["g", "m"]
    .filter((flag) => !regex.flags.includes(flag))
    .join("");
  return missing === ""
    ? regex
    : new RegExp(regex.source, regex.flags + missing);
}