All files / src/lib/scan binary.ts

100% Statements 22/22
90% Branches 9/10
100% Functions 3/3
100% Lines 21/21

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199                                                                        173x                                                                                                                                                                                                   488x 488x 186102x 66x     422x                                       501x 501x 15x   486x 255x   231x 5x   226x                                 231x 231x 231x 231x   231x 231x   231x      
/**
 * Binary-file detection for the scan module.
 *
 * Uses the standard NUL-byte heuristic: a file is considered binary if
 * any byte within its first 8 KB is 0x00. This matches rg, git, grep, and
 * file(1). It is deliberately coarse — UTF-16-encoded text is
 * misclassified as binary because its ASCII-range code units produce
 * NUL bytes; callers that care can add a UTF-16 BOM check on top.
 *
 * Two entry points:
 *
 * - `classifyByExtension` — O(1) fast path. Classifies known text
 *   and known-binary extensions without a disk read. Returns `null`
 *   when the extension is ambiguous so the caller falls through to
 *   the NUL-sniff path.
 * - `readHeadAndSniff` — opens the file, reads the first 8 KB via
 *   `fs.promises.open` + `handle.read`, runs the sniff, returns the head
 *   buffer alongside the classification.
 */
 
import { open } from "node:fs/promises";
import { extname } from "node:path";
import { BINARY_SNIFF_BYTES } from "./constants.js";
 
/**
 * Extensions whose file-format specification mandates non-text content.
 *
 * `classifyByExtension` answers `{ isBinary: true }` for every entry
 * here without touching the disk (reported as a 60-80ms win on
 * fixtures full of build outputs such as `.png`/`.woff2`/`.pdf`).
 *
 * Only list an extension when it can never hold text. Anything
 * ambiguous (`.log`, `.lock`, `.map`, `.bin`, `.dat`, `.dump`, `.obj`)
 * is left out on purpose so it reaches the NUL-sniff — classifying it
 * as binary would silently drop text matches it may contain. `.svg`,
 * `.json` and `.yaml` are text and are likewise NOT included.
 *
 * Entries are lowercase and carry the leading dot, which is the shape
 * `extname(path).toLowerCase()` produces.
 */
export const BINARY_EXTENSIONS: ReadonlySet<string> = new Set(
  // The category keys exist purely for readability; only the values
  // end up in the set, flattened in declaration order.
  Object.values({
    // Raster / bitmap images.
    images: [
      ".png",
      ".jpg",
      ".jpeg",
      ".gif",
      ".webp",
      ".ico",
      ".bmp",
      ".tiff",
      ".tif",
      ".avif",
      ".heic",
      ".heif",
    ],
    fonts: [".woff", ".woff2", ".ttf", ".otf", ".eot"],
    archives: [
      ".zip",
      ".gz",
      ".bz2",
      ".xz",
      ".7z",
      ".rar",
      ".tar",
      ".tgz",
      ".tbz2",
      ".txz",
    ],
    media: [
      ".mp3",
      ".mp4",
      ".wav",
      ".ogg",
      ".oga",
      ".ogv",
      ".webm",
      ".flac",
      ".m4a",
      ".m4v",
      ".avi",
      ".mov",
      ".mkv",
      ".opus",
    ],
    // Binary office formats.
    documents: [
      ".pdf",
      ".doc",
      ".docx",
      ".xls",
      ".xlsx",
      ".ppt",
      ".pptx",
      ".odt",
      ".ods",
      ".odp",
    ],
    // Executables and compiled artifacts.
    // NOTE: `.obj` is deliberately EXCLUDED — it is shared with the
    // Wavefront OBJ 3D model format (plain-text ASCII), common in
    // game-dev / AR / 3D-printing repos. MSVC `.obj` outputs land in
    // `build/`/`target/` dirs which DEFAULT_SKIP_DIRS prunes anyway.
    executables: [
      ".exe",
      ".dll",
      ".so",
      ".dylib",
      ".wasm",
      ".class",
      ".o",
      ".a",
      ".pyc",
      ".pyo",
      ".node",
    ],
    // Binary-format SQLite / Access databases.
    databases: [".db", ".sqlite", ".sqlite3", ".mdb"],
    // Unambiguously-binary disk images.
    // NOTE: generic blob extensions are deliberately EXCLUDED:
    //   - `.bin`  — used for both firmware/raw data AND arbitrary
    //     text dumps; no format spec.
    //   - `.dat`  — countless text data formats use this.
    //   - `.dump` — frequently plain-text SQL (`pg_dump`, `mysqldump`
    //     default to text).
    // All three fall back to the NUL-sniff, which classifies them
    // correctly by content.
    diskImages: [".iso", ".img"],
  }).flat()
);
 
/**
 * Report whether `head` contains a NUL byte (0x00) inside its first
 * `BINARY_SNIFF_BYTES` bytes (8 KB per the module docs).
 *
 * Bytes beyond that window are never inspected, so a longer buffer
 * may be passed as-is. A zero-length buffer — i.e. a zero-byte file —
 * has nothing to find and is therefore reported as text.
 *
 * @param head - Leading bytes of the file being classified.
 * @returns `true` when a NUL byte occurs within the sniff window.
 */
export function isLikelyBinary(head: Uint8Array): boolean {
  // `subarray` clamps its end index to `head.length` and yields a view
  // rather than a copy, so short and empty inputs need no special case.
  return head.subarray(0, BINARY_SNIFF_BYTES).includes(0);
}
 
/**
 * Fast-path classification that looks only at the file extension.
 *
 * The extension is taken from `extname(absPath)` and lowercased, so
 * both sets are expected to hold lowercase, dot-prefixed entries.
 * `textExtensions` is consulted first: an extension present in both
 * sets is reported as text.
 *
 * This is a performance hint, not a safety guarantee — a file with a
 * known-text extension could still contain NUL bytes. Paths for which
 * `extname` yields an empty string (`.sentryclirc`, `Makefile`,
 * `README`) and extensions found in neither set produce `null`.
 *
 * @param absPath - Path of the file to classify; never read from disk.
 * @param textExtensions - Extensions the caller treats as text.
 * @returns `{ isBinary: false }` for a known text extension,
 *   `{ isBinary: true }` for a member of `BINARY_EXTENSIONS`, or
 *   `null` when the caller must fall through to `readHeadAndSniff`.
 */
export function classifyByExtension(
  absPath: string,
  textExtensions: ReadonlySet<string>
): { isBinary: boolean } | null {
  const suffix = extname(absPath).toLowerCase();
  if (suffix === "") {
    return null;
  }
  // Short-circuit keeps the original precedence: the binary table is
  // only consulted when the caller's text set does not claim the suffix.
  const knownText = textExtensions.has(suffix);
  if (knownText || BINARY_EXTENSIONS.has(suffix)) {
    return { isBinary: !knownText };
  }
  return null;
}
 
/**
 * Read the leading bytes of `absPath` and run the NUL-byte sniff on them.
 *
 * The file is opened read-only, a single read of up to
 * `BINARY_SNIFF_BYTES` bytes (8 KB per the module docs) is issued at
 * offset 0, and the handle is closed before the promise settles —
 * including when the read fails.
 *
 * `head` is the read buffer itself, or a `subarray` view over it when
 * fewer than `BINARY_SNIFF_BYTES` bytes were read; no copy is made.
 * NOTE(review): earlier docs asked callers not to retain `head` past
 * the current stack frame. This function allocates a new buffer on
 * every call and does not reuse it, so confirm whether that
 * restriction is still intended.
 *
 * @param absPath - Path of the file to inspect.
 * @returns The bytes read and whether they look binary.
 * @throws Any error raised while opening, reading, or closing the
 *   file is propagated; wrap the call in try/catch to swallow fs errors.
 */
export async function readHeadAndSniff(
  absPath: string
): Promise<{ head: Uint8Array; isBinary: boolean }> {
  const file = await open(absPath, "r");
  try {
    const scratch = new Uint8Array(BINARY_SNIFF_BYTES);
    const result = await file.read(scratch, 0, BINARY_SNIFF_BYTES, 0);
    // Trim to what was actually read: the untouched tail of a fresh
    // Uint8Array is zero-filled and would otherwise be sniffed as NULs.
    let head = scratch;
    if (result.bytesRead !== BINARY_SNIFF_BYTES) {
      head = scratch.subarray(0, result.bytesRead);
    }
    return { head, isBinary: isLikelyBinary(head) };
  } finally {
    await file.close();
  }
}