fix(rar-reader): use unrar lt (technical) so file listings actually work

Diagnosed from production: all 4,380 RAR packages in the database have fileCount = 0. The old parser used `unrar l -v` and a regex that expected an 8-column `Attributes Size Packed Ratio% Date Time CRC32 Name` output. unrar 6.21's actual `l -v` output is 5 columns: `Attributes Size Date Time Name` — no Packed, no Ratio, no CRC32. So every RAR silently parsed to zero entries.

Switch to `unrar lt` (list technical), which emits one block per file with key:value lines:

    Name: Lost Kingdom 2023 01 January/Nagas/NagaCaptainBody.stl
    Type: File
    Size: 22503584
    Packed size: 21430123
    CRC32: A1B2C3D4
    ...

The new parser splits the output into blocks on blank lines and matches "key: value" lines within each block. It handles multi-word keys ("Packed size", "Host OS") and gracefully skips Directory entries and the archive header block. It also tolerates BLAKE2sp checksums for newer RAR archives.

Verified against a live 644MB RAR with 201 entries (194 files, 7 dirs); the parser returns 194 entries with correct paths, sizes, and CRC32s.

Future RAR ingestions will populate fileCount and PackageFile rows correctly. Backfilling the existing 4,380 packages requires a separate pass — added in a follow-up commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-25 08:13:04 +00:00 · 2026-05-24 00:38:46 +02:00
parent 901f32ff41
commit 0bdd4ba0cc
1 changed file with 95 additions and 56 deletions
--- a/worker/src/archive/rar-reader.ts
+++ b/worker/src/archive/rar-reader.ts
@@ -8,83 +8,122 @@ const execFileAsync = promisify(execFile);
 const log = childLogger("rar-reader");

 /**
- * Parse output of `unrar l -v <file>` to extract file metadata.
- * unrar automatically discovers sibling parts when they're co-located.
+ * Parse output of `unrar lt <file>` to extract file metadata.
+ *
+ * `lt` (list technical) emits one block per archived file with key:value
+ * lines — far more reliable than the column-based default `l -v` output,
+ * which has changed format twice across unrar versions.
+ *
+ * unrar automatically discovers sibling multipart files when they're
+ * co-located (e.g. *.part1.rar + *.part2.rar in the same directory).
+ *
+ * Returns [] on any failure (best-effort: ingestion still succeeds with
+ * an empty file list rather than failing the whole archive).
 */
 export async function readRarContents(
  firstPartPath: string
 ): Promise<FileEntry[]> {
  try {
-    const { stdout } = await execFileAsync("unrar", ["l", "-v", firstPartPath], {
-      timeout: 30000,
-      maxBuffer: 10 * 1024 * 1024, // 10MB for very large archives
+    const { stdout } = await execFileAsync("unrar", ["lt", firstPartPath], {
+      timeout: 60_000,
+      maxBuffer: 50 * 1024 * 1024, // 50MB for archives with very many files
    });

-    return parseUnrarOutput(stdout);
+    const entries = parseUnrarTechnical(stdout);
+    if (entries.length === 0) {
+      // Log a sample of the output so we can diagnose format changes
+      log.warn(
+        { file: firstPartPath, sample: stdout.slice(0, 500) },
+        "unrar lt returned no parseable entries"
+      );
+    }
+    return entries;
  } catch (err) {
    log.warn({ err, file: firstPartPath }, "Failed to read RAR contents");
-    return []; // Fallback: return empty on error
+    return [];
  }
 }

 /**
- * Parse the tabular output of `unrar l -v`.
+ * Parse `unrar lt` output: header followed by per-file key:value blocks
+ * separated by blank lines.
 *
- * Example output format:
- *  Archive: test.rar
- *  Details: RAR 5
+ * Example block:
 *
- *   Attributes      Size     Packed Ratio   Date   Time   CRC-32  Name
- *  ----------- ---------  --------- ----- -------- ----- --------  ----
- *   ...A....      12345      10234  83%  2024-01-15 10:30 DEADBEEF  folder/file.stl
- *  ----------- ---------  --------- ----- -------- ----- --------  ----
+ *         Name: folder/file.stl
+ *         Type: File
+ *         Size: 12345
+ *  Packed size: 10234
+ *        Ratio: 83%
+ *        mtime: 2024-01-15 10:30:00,000000000
+ *   Attributes: ..A....
+ *        CRC32: DEADBEEF
+ *      Host OS: Windows
+ *  Compression: RAR 5.0(v50) -m3 -md=32M
 */
-function parseUnrarOutput(output: string): FileEntry[] {
+function parseUnrarTechnical(output: string): FileEntry[] {
  const entries: FileEntry[] = [];
-  const lines = output.split("\n");
+  // Split into blocks on blank lines, then on each block read key:value pairs.
+  const blocks = output.split(/\r?\n\s*\r?\n/);

-  let inFileList = false;
-  let separatorCount = 0;
+  for (const block of blocks) {
+    const fields = parseBlock(block);
+    if (!fields) continue;

-  for (const line of lines) {
-    const trimmed = line.trim();
+    // Only File entries (skip Directory, and anything missing the basics)
+    if (fields.type && fields.type.toLowerCase() !== "file") continue;
+    if (!fields.name || fields.size === undefined) continue;

-    // Detect separator lines (------- pattern)
-    if (/^-{5,}/.test(trimmed)) {
-      separatorCount++;
-      if (separatorCount === 1) {
-        inFileList = true;
-      } else if (separatorCount >= 2) {
-        inFileList = false;
-      }
-      continue;
-    }
-
-    if (!inFileList) continue;
-
-    // Parse file entry line
-    // Format: Attributes Size Packed Ratio Date Time CRC Name
-    const match = trimmed.match(
-      /^\S+\s+(\d+)\s+(\d+)\s+\d+%\s+\S+\s+\S+\s+([0-9A-Fa-f]+)\s+(.+)$/
-    );
-
-    if (match) {
-      const [, uncompressedStr, compressedStr, crc32, filePath] = match;
-
-      // Skip directory entries (typically end with / or have size 0 with dir attributes)
-      if (filePath.endsWith("/") || filePath.endsWith("\\")) continue;
-
-      const ext = path.extname(filePath).toLowerCase();
-      entries.push({
-        path: filePath,
-        fileName: path.basename(filePath),
-        extension: ext ? ext.slice(1) : null,
-        compressedSize: BigInt(compressedStr),
-        uncompressedSize: BigInt(uncompressedStr),
-        crc32: crc32.toLowerCase(),
-      });
-    }
+    const filePath = fields.name;
+    const ext = path.extname(filePath).toLowerCase();
+    entries.push({
+      path: filePath,
+      fileName: path.basename(filePath),
+      extension: ext ? ext.slice(1) : null,
+      uncompressedSize: BigInt(fields.size),
+      compressedSize: fields.packedSize !== undefined
+        ? BigInt(fields.packedSize)
+        : BigInt(fields.size),
+      crc32: fields.crc32 ? fields.crc32.toLowerCase() : null,
+    });
  }

  return entries;
 }
+
+interface BlockFields {
+  name?: string;
+  type?: string;
+  size?: string;
+  packedSize?: string;
+  crc32?: string;
+}
+
+function parseBlock(block: string): BlockFields | null {
+  // Skip the archive-header block (contains "Archive:" / "Details:" lines
+  // and lacks a Name field).
+  if (!/^\s*Name:/m.test(block)) return null;
+
+  const fields: BlockFields = {};
+  const lines = block.split(/\r?\n/);
+
+  for (const line of lines) {
+    // Match "  key: value" with arbitrary leading whitespace and a multi-word
+    // key (e.g. "Packed size", "Host OS").
+    const m = line.match(/^\s*([A-Za-z][A-Za-z0-9 ]*?)\s*:\s*(.*)$/);
+    if (!m) continue;
+    const key = m[1].trim().toLowerCase();
+    const value = m[2].trim();
+    if (key === "name") fields.name = value;
+    else if (key === "type") fields.type = value;
+    else if (key === "size") fields.size = value;
+    else if (key === "packed size") fields.packedSize = value;
+    else if (key === "crc32" || key === "blake2sp" || key === "checksum") {
+      // unrar may report BLAKE2sp for newer archives instead of CRC32.
+      // Either way we just store it as a hex string in our crc32 field.
+      fields.crc32 = value;
+    }
+  }
+
+  return fields;
+}