scan regex.ts

97.61% Statements 82/84
95.65% Branches 44/46
100% Functions 8/8
97.61% Lines 82/84
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169x
 
 
169x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117x
117x
102x
 
15x
15x
15x
 
15x
 
9x
 
 
 
 
 
 
6x
6x
 
 
1x
 
5x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6x
6x
6x
6x
37x
 
37x
 
 
 
37x
5x
1x
 
5x
5x
 
32x
1x
1x
1x
 
31x
1x
1x
1x
 
30x
6x
6x
 
5x
 
 
25x
 
1x
 
 
169x
169x
169x
169x
169x
 
 
 
 
 
 
 
15x
15x
18x
 
1x
17x
17x
 
 
 
15x
18x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145x
41x
 
104x
104x
104x
5x
 
104x
6x
 
104x
96x
 
104x
104x
104x
 
4x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4x
1x
 
3x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3x
1x
 
2x
 
 
 
 
 
 
 
216x
216x
216x
31x
 
185x
185x
169x
 
185x
19x
 
185x
 
  /**
 * Minimal regex translation for user-supplied grep patterns.
 *
 * The init wizard's Mastra server sends regex sources written for
 * ripgrep (Rust regex syntax). JS `RegExp` covers almost everything
 * rg's default mode supports, with one real gap: **inline flag groups**
 * like `(?i)foo`. JS requires flags at RegExp construction time; it
 * can't flip them mid-pattern.
 *
 * This module bridges that gap by recognizing a leading `(?[imsU]+)`
 * or `(?[imsU]+:…)` and translating it to JS flags. Everything else
 * is passed to `new RegExp` unchanged — if it's not valid JS regex,
 * `ValidationError` is thrown with the engine's error message.
 *
 * ### Scope
 *
 * - Leading-only. `foo(?i)bar` (mid-pattern flag) stays as-is, which
 *   will typically fail to compile under JS and raise ValidationError.
 * - Flag mapping: `i` → `i`, `m` → `m`, `s` → `s`, `U` → `s` (this
 *   module treats `U` as rg's multiline-dotall and models it with JS's
 *   `/s` flag).
 * - The scoped form `(?i:foo)bar` is translated as
 *   `{ cleaned: "foobar", flags: "i" }` — we widen the flag to the
 *   whole pattern because JS can't scope flags to a group. This is a
 *   documented limitation.
 */
 
import { ValidationError } from "../errors.js";
 
/**
 * Matches a leading inline-flag group at position 0 of a regex source.
 * Group 1 captures the flag letters (one or more of `i`, `m`, `s`, `U`).
 * Group 2 captures the terminator: `:` for the scoped `(?i:...)` form,
 * `)` for the bare `(?i)` form.
 *
 * Negated flags (e.g. rg's `(?-i)`) are not recognized: `-` is not in
 * the accepted letter set, so such a pattern simply doesn't match here
 * and is handed to `new RegExp` untouched by `compilePattern`.
 */
const INLINE_FLAG_RE = /^\(\?([imsU]+)(:|\))/;
 
/**
 * Allow-list of JS RegExp flag letters that `translateFlags` filters its
 * output against, written in alphabetical order. `u` is listed here but
 * `translateFlags` never produces it.
 */
const VALID_JS_FLAGS = "imsu";
 
/**
 * Split a leading inline-flag group off `source`.
 *
 * Two leading forms are recognized (see `INLINE_FLAG_RE`):
 *
 * - `(?i)rest` — the flag group is removed and `rest` is returned.
 * - `(?i:body)tail` — the group is rewritten by `unwrapScopedGroup` and
 *   the flag is applied to the whole returned pattern, because the
 *   flags are handed back separately from the pattern text.
 *
 * @param source Raw regex source as supplied by the user.
 * @returns `{ cleaned, flags }` where `cleaned` is the pattern without
 *   the flag prefix and `flags` is a JS flag string for `new RegExp`.
 *   Callers merge `flags` with their own options (e.g.
 *   `caseSensitive: false` → force `i`).
 *
 * If `source` has no leading flag group, or the scoped group never
 * closes, the result is `{ cleaned: source, flags: "" }` — the pattern
 * is handed back untouched.
 */
export function extractInlineFlags(source: string): {
  cleaned: string;
  flags: string;
} {
  const leading = INLINE_FLAG_RE.exec(source);
  if (leading === null) {
    return { cleaned: source, flags: "" };
  }

  const flags = translateFlags(leading[1] as string);
  const terminator = leading[2] as ":" | ")";
  // Index of the first character after `(?i)` / `(?i:`.
  const bodyStart = leading[0].length;

  if (terminator === ":") {
    // Scoped form: the helper locates the matching `)` (honouring
    // escapes, nested groups and character classes) and returns the
    // rewritten pattern, or null when the group is never closed.
    const rewritten = unwrapScopedGroup(source, bodyStart);
    return rewritten === null
      ? // Malformed group — hand the source back as-is with no flags.
        { cleaned: source, flags: "" }
      : { cleaned: rewritten, flags };
  }

  // Bare form `(?i)pattern`: drop the flag group entirely.
  return { cleaned: source.slice(bodyStart), flags };
}
 
/** Char codes of the metacharacters the group scanner has to recognize. */
const CHAR_BACKSLASH = "\\".charCodeAt(0);
const CHAR_OPEN_PAREN = "(".charCodeAt(0);
const CHAR_CLOSE_PAREN = ")".charCodeAt(0);
const CHAR_OPEN_BRACKET = "[".charCodeAt(0);
const CHAR_CLOSE_BRACKET = "]".charCodeAt(0);

/**
 * Rewrite a leading scoped flag group: `(?i:foo)bar` → `(?:foo)bar`.
 *
 * The flag letters are dropped but the grouping is kept as a
 * non-capturing group. Splicing the body straight into the tail would
 * change the pattern's meaning whenever the body contains a top-level
 * alternation (`(?i:a|b)c` is not `a|bc`) or the group is followed by
 * a quantifier (`(?i:ab)+` is not `ab+`). A non-capturing group leaves
 * capture-group numbering untouched.
 *
 * @param source Full regex source, starting with the `(?flags:` prefix.
 * @param openIndex Index one past the `:` (first character of the group
 *   body).
 * @returns The rewritten source, or null when the group is never closed
 *   so the caller can fall back to "no translation".
 *
 * The branchy control flow is inherent to a tiny regex-syntax
 * tokenizer — we track three states (char class, paren depth, escape)
 * and each needs its own branch.
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: regex tokenizer is inherently branchy
function unwrapScopedGroup(source: string, openIndex: number): string | null {
  // We start inside the scoped group itself, hence depth 1.
  let depth = 1;
  let inClass = false;
  let i = openIndex;
  while (i < source.length) {
    const ch = source.charCodeAt(i);
    // Backslash escapes the next char regardless of context, so skip
    // both. A trailing lone backslash simply runs off the end → null.
    if (ch === CHAR_BACKSLASH) {
      i += 2;
      continue;
    }
    if (inClass) {
      // Inside `[...]` parentheses are literals; only `]` matters.
      if (ch === CHAR_CLOSE_BRACKET) {
        inClass = false;
      }
    } else if (ch === CHAR_OPEN_BRACKET) {
      inClass = true;
    } else if (ch === CHAR_OPEN_PAREN) {
      depth += 1;
    } else if (ch === CHAR_CLOSE_PAREN) {
      depth -= 1;
      if (depth === 0) {
        // Body is source[openIndex..i); tail is source[i+1..].
        const body = source.slice(openIndex, i);
        const tail = source.slice(i + 1);
        return `(?:${body})${tail}`;
      }
    }
    i += 1;
  }
  // Ran off the end without finding the matching `)`.
  return null;
}
 
/**
 * Translate rg-style inline flag letters to JS RegExp flag letters.
 *
 * `i`, `m` and `s` map to themselves; `U` maps to `s`. Any other letter
 * is dropped silently (the guard regex already restricts the input to
 * `[imsU]+`, so that path is defensive only). Duplicates collapse.
 *
 * NOTE(review): the Rust `regex` crate documents inline `U` as the
 * "swap greediness" flag rather than dot-all — confirm that mapping it
 * to `s` is really what callers expect.
 *
 * @param raw Flag letters captured from the inline group, e.g. `"im"`.
 * @returns The JS flag letters, de-duplicated and sorted
 *   alphabetically; possibly empty.
 */
function translateFlags(raw: string): string {
  const seen = new Set<string>();
  for (const letter of raw) {
    if (letter === "U") {
      // This module models rg's U as dotall, i.e. JS /s.
      seen.add("s");
    } else if (letter === "i" || letter === "m" || letter === "s") {
      seen.add(letter);
    }
  }
  // Deterministic, RegExp-accepted order.
  return [...seen]
    .filter((f) => VALID_JS_FLAGS.includes(f))
    .sort()
    .join("");
}
 
/**
 * Options for `compilePattern`. Both fields are optional; leaving one
 * out means the corresponding flag is not forced.
 */
export type CompilePatternOptions = {
  /**
   * When false, forces the `i` flag regardless of inline flags.
   * Default: true (case-sensitive, matching `rg`'s default).
   */
  caseSensitive?: boolean;
  /** Force the `m` flag. Default: false. */
  multiline?: boolean;
};
 
/**
 * Turn a user-supplied grep pattern into a JS `RegExp`.
 *
 * - A `RegExp` argument is trusted and handed back untouched; `opts`
 *   is ignored for it, so callers wanting `caseSensitive: false` on an
 *   existing RegExp must rebuild it themselves.
 * - A string argument is run through `extractInlineFlags`, the inline
 *   flags are merged with the flags requested in `opts`, and the result
 *   is passed to `new RegExp`. The `g` flag is never added here: grep
 *   tests one line at a time and `g`'s `lastIndex` state is a foot-gun
 *   in that usage. Callers needing a `matchAll`-style regex should
 *   build their own.
 *
 * @param pattern Regex source string, or an already-compiled RegExp.
 * @param opts Optional flag overrides; see `CompilePatternOptions`.
 * @returns The compiled (or passed-through) RegExp.
 * @throws ValidationError when the engine rejects the pattern; the
 *   engine's message is embedded for user-facing diagnostics.
 */
export function compilePattern(
  pattern: string | RegExp,
  opts: CompilePatternOptions = {}
): RegExp {
  if (pattern instanceof RegExp) {
    return pattern;
  }

  const extracted = extractInlineFlags(pattern);

  // Start from the inline flags, then layer the caller's options on top.
  const flagSet = new Set<string>(extracted.flags);
  if (opts.caseSensitive === false) {
    flagSet.add("i");
  }
  if (opts.multiline) {
    flagSet.add("m");
  }
  const jsFlags = Array.from(flagSet).sort().join("");

  try {
    return new RegExp(extracted.cleaned, jsFlags);
  } catch (error) {
    const reason = (error as Error).message;
    throw new ValidationError(`Invalid grep pattern: ${reason}`, "pattern");
  }
}
 
/**
 * Guarantee that a RegExp carries the `g` flag.
 *
 * `content.matchAll(regex)` and manual `lastIndex`-driven
 * `regex.exec(content)` loops both need `/g`. `compilePattern` does not
 * add `g` (historically grep tested one line at a time), so callers
 * must route a regex through this helper before iterating matches over
 * a whole buffer.
 *
 * @param regex The regex to check.
 * @returns `regex` itself when it is already global; otherwise a new
 *   RegExp with the same source and the existing flags plus `g`.
 */
export function ensureGlobalFlag(regex: RegExp): RegExp {
  const currentFlags = regex.flags;
  return currentFlags.includes("g")
    ? regex
    : new RegExp(regex.source, `${currentFlags}g`);
}
 
/**
 * Guarantee that a RegExp carries the `m` (multiline) flag, so `^` and
 * `$` match at line boundaries inside a multi-line buffer.
 *
 * Background: grep used to split content on `\n` and test each line on
 * its own, so `^` matched the start of every line simply because each
 * line was a separate string. With whole-buffer iteration via
 * `matchAll`, a pattern like `^foo` needs `m` to keep those semantics —
 * without it `^` anchors to the buffer start and only the first line
 * can match.
 *
 * @param regex The regex to check.
 * @returns `regex` itself when it is already multiline; otherwise a new
 *   RegExp with the same source and the existing flags plus `m`.
 */
export function ensureMultilineFlag(regex: RegExp): RegExp {
  const currentFlags = regex.flags;
  return currentFlags.includes("m")
    ? regex
    : new RegExp(regex.source, `${currentFlags}m`);
}
 
/**
 * Apply `ensureGlobalFlag` and `ensureMultilineFlag` in a single step,
 * cloning at most once instead of building a throwaway intermediate
 * RegExp.
 *
 * @param regex The regex to check.
 * @returns `regex` itself when it already has both `g` and `m`;
 *   otherwise a new RegExp with the same source and whichever of the
 *   two flags was missing appended to the existing flags.
 */
export function ensureGlobalMultilineFlags(regex: RegExp): RegExp {
  const present = regex.flags;
  // Collect whichever of the two required flags is absent, in g-then-m order.
  const missing = ["g", "m"].filter((flag) => !present.includes(flag)).join("");
  if (missing === "") {
    return regex;
  }
  return new RegExp(regex.source, present + missing);
}