fix(rar-reader): use unrar lt (technical) so file listings actually work

Diagnosed from production: all 4,380 RAR packages in the database have
fileCount = 0. The old parser used `unrar l -v` and a regex that
expected an 8-column `Attributes Size Packed Ratio% Date Time CRC32 Name`
output. unrar 6.21's actual `l -v` output is 5 columns: `Attributes
Size Date Time Name` — no Packed, no Ratio, no CRC32. So every RAR
silently parsed to zero entries.

Switch to `unrar lt` (list technical), which emits one block per file
with key:value lines:

         Name: Lost Kingdom 2023 01 January/Nagas/NagaCaptainBody.stl
         Type: File
         Size: 22503584
  Packed size: 21430123
        CRC32: A1B2C3D4
         ...

The new parser tokenizes blocks on blank lines and matches "key: value"
lines per block. Handles multi-word keys ("Packed size", "Host OS") and
gracefully skips Directory entries and the archive header block. Also
tolerates BLAKE2sp checksums for newer RAR archives.

Verified against a live 644MB RAR with 201 entries (194 files, 7 dirs);
parser returns 194 entries with correct paths, sizes, and CRC32s.

Future RAR ingestions will populate fileCount and PackageFile rows
correctly. Backfilling existing 4,380 packages requires a separate
pass — added in a follow-up commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-24 00:38:46 +02:00
parent 901f32ff41
commit 0bdd4ba0cc

View File

@@ -8,83 +8,122 @@ const execFileAsync = promisify(execFile);
const log = childLogger("rar-reader");
/**
 * List the files stored in a RAR archive by running `unrar lt <file>`.
 *
 * `lt` (list technical) emits one block of `key: value` lines per archived
 * entry, which is far more reliable to parse than the column-based `l -v`
 * output, whose layout differs between unrar versions.
 *
 * unrar automatically discovers sibling multipart files when they're
 * co-located (e.g. *.part1.rar + *.part2.rar in the same directory), so only
 * the first part has to be passed in.
 *
 * This is best-effort: any failure (unrar missing, non-zero exit, timeout,
 * output larger than `maxBuffer`, parser error) is logged and reported as an
 * empty list, so ingestion still succeeds with no file list rather than
 * failing the whole archive.
 *
 * @param firstPartPath - Path to the archive, or to the first part of a
 *   multipart set.
 * @returns The parsed file entries, or `[]` when nothing could be read.
 */
export async function readRarContents(
  firstPartPath: string
): Promise<FileEntry[]> {
  try {
    const { stdout } = await execFileAsync("unrar", ["lt", firstPartPath], {
      timeout: 60_000,
      maxBuffer: 50 * 1024 * 1024, // 50MB for archives with very many files
    });

    const entries = parseUnrarTechnical(stdout);
    if (entries.length === 0) {
      // Log a sample of the output so we can diagnose format changes.
      log.warn(
        { file: firstPartPath, sample: stdout.slice(0, 500) },
        "unrar lt returned no parseable entries"
      );
    }
    return entries;
  } catch (err) {
    log.warn({ err, file: firstPartPath }, "Failed to read RAR contents");
    return [];
  }
}
/**
 * Parse `unrar lt` output: a header followed by per-entry `key: value`
 * blocks separated by blank lines.
 *
 * Example block:
 *
 *         Name: folder/file.stl
 *         Type: File
 *         Size: 12345
 *  Packed size: 10234
 *        Ratio: 83%
 *        mtime: 2024-01-15 10:30:00,000000000
 *   Attributes: ..A....
 *        CRC32: DEADBEEF
 *      Host OS: Windows
 *  Compression: RAR 5.0(v50) -m3 -md=32M
 *
 * Only entries whose `Type` is `File` (or that carry no `Type` line at all)
 * are returned. Blocks without a `Name` line — such as the archive header —
 * and blocks without a usable decimal `Size` are skipped.
 *
 * @param output - Raw stdout of `unrar lt`.
 * @returns One entry per archived file, in the order unrar listed them.
 */
function parseUnrarTechnical(output: string): FileEntry[] {
  const entries: FileEntry[] = [];
  const decimal = /^\d+$/;

  // Split into blocks on blank lines, then read key:value pairs per block.
  const blocks = output.split(/\r?\n\s*\r?\n/);

  for (const block of blocks) {
    const fields = parseBlock(block);
    if (!fields) continue;

    // Only File entries (skip Directory, and anything missing the basics).
    if (fields.type && fields.type.toLowerCase() !== "file") continue;
    if (!fields.name || fields.size === undefined) continue;

    // BigInt() throws on anything that is not a plain number; letting that
    // escape would discard the whole listing through the caller's catch, so
    // skip just the offending entry instead.
    if (!decimal.test(fields.size)) continue;
    const uncompressedSize = BigInt(fields.size);

    // Fall back to the uncompressed size when no usable packed size exists.
    const compressedSize =
      fields.packedSize !== undefined && decimal.test(fields.packedSize)
        ? BigInt(fields.packedSize)
        : uncompressedSize;

    const filePath = fields.name;
    const ext = path.extname(filePath).toLowerCase();

    entries.push({
      path: filePath,
      fileName: path.basename(filePath),
      extension: ext ? ext.slice(1) : null,
      uncompressedSize,
      compressedSize,
      crc32: fields.crc32 ? fields.crc32.toLowerCase() : null,
    });
  }

  return entries;
}
/**
 * Raw string fields picked out of one `unrar lt` block. Every field is
 * optional because a block may omit any of the lines.
 */
interface BlockFields {
  /** Value of the `Name` line (archive-relative path). */
  name?: string;
  /** Value of the `Type` line, e.g. `File` or `Directory`. */
  type?: string;
  /** Value of the `Size` line; only set when it is a plain decimal number. */
  size?: string;
  /** Value of the `Packed size` line; only set when it is a plain decimal number. */
  packedSize?: string;
  /** Checksum text as printed (CRC32 or BLAKE2), case preserved. */
  crc32?: string;
}

// Hoisted so they are compiled once rather than per block / per line.
const RAR_LT_NAME_LINE = /^\s*Name:/m;
// " key: value" with arbitrary leading whitespace and a possibly multi-word
// key (e.g. "Packed size", "Host OS"). The key is matched lazily so that only
// the FIRST colon separates key from value; later colons stay in the value.
const RAR_LT_KEY_VALUE = /^\s*([A-Za-z][A-Za-z0-9 ]*?)\s*:\s*(.*)$/;
const RAR_LT_DECIMAL = /^\d+$/;

/**
 * Extract the fields we care about from a single blank-line-delimited block
 * of `unrar lt` output.
 *
 * Keys are compared case-insensitively; values are trimmed. Lines that are
 * not `key: value` shaped, and keys we do not use, are ignored. When a key
 * appears more than once the last occurrence wins.
 *
 * `Size` and `Packed size` are kept only when they are plain decimal numbers,
 * so callers can pass them to `BigInt()` without risking a `SyntaxError`;
 * any other value leaves the field `undefined`.
 *
 * @param block - Text of one block (LF or CRLF line endings).
 * @returns The collected fields, or `null` when the block has no `Name:` line
 *   (e.g. the archive-header block with "Archive:" / "Details:" lines).
 */
function parseBlock(block: string): BlockFields | null {
  if (!RAR_LT_NAME_LINE.test(block)) return null;

  const fields: BlockFields = {};

  for (const line of block.split(/\r?\n/)) {
    const m = RAR_LT_KEY_VALUE.exec(line);
    if (!m) continue;

    const [, rawKey = "", rawValue = ""] = m;
    const key = rawKey.trim().toLowerCase();
    const value = rawValue.trim();

    switch (key) {
      case "name":
        fields.name = value;
        break;
      case "type":
        fields.type = value;
        break;
      case "size":
        if (RAR_LT_DECIMAL.test(value)) fields.size = value;
        break;
      case "packed size":
        if (RAR_LT_DECIMAL.test(value)) fields.packedSize = value;
        break;
      case "crc32":
      case "blake2":
      case "blake2sp":
      case "checksum":
        // Newer archives may carry a BLAKE2 hash instead of CRC32. Either way
        // we just store the hex string in our crc32 field.
        // NOTE(review): the exact label ("BLAKE2" vs "BLAKE2sp") should be
        // confirmed against the unrar version in production; both are accepted.
        fields.crc32 = value;
        break;
      default:
        break;
    }
  }

  return fields;
}