mirror of
https://github.com/xCyanGrizzly/DragonsStash.git
synced 2026-06-09 18:51:16 +00:00
fix(rar-reader): use unrar lt (technical) so file listings actually work
Diagnosed from production: all 4,380 RAR packages in the database have
fileCount = 0. The old parser used `unrar l -v` and a regex that
expected an 8-column `Attributes Size Packed Ratio% Date Time CRC32 Name`
output. unrar 6.21's actual `l -v` output is 5 columns: `Attributes
Size Date Time Name` — no Packed, no Ratio, no CRC32. So every RAR
silently parsed to zero entries.
Switch to `unrar lt` (list technical), which emits one block per file
with key:value lines:
Name: Lost Kingdom 2023 01 January/Nagas/NagaCaptainBody.stl
Type: File
Size: 22503584
Packed size: 21430123
CRC32: A1B2C3D4
...
The new parser tokenizes blocks on blank lines and matches "key: value"
lines per block. Handles multi-word keys ("Packed size", "Host OS") and
gracefully skips Directory entries and the archive header block. Also
tolerates BLAKE2sp checksums for newer RAR archives.
Verified against a live 644MB RAR with 201 entries (194 files, 7 dirs);
parser returns 194 entries with correct paths, sizes, and CRC32s.
Future RAR ingestions will populate fileCount and PackageFile rows
correctly. Backfilling existing 4,380 packages requires a separate
pass — added in a follow-up commit.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -8,83 +8,122 @@ const execFileAsync = promisify(execFile);
|
||||
const log = childLogger("rar-reader");
|
||||
|
||||
/**
|
||||
* Parse output of `unrar l -v <file>` to extract file metadata.
|
||||
* unrar automatically discovers sibling parts when they're co-located.
|
||||
* Parse output of `unrar lt <file>` to extract file metadata.
|
||||
*
|
||||
* `lt` (list technical) emits one block per archived file with key:value
|
||||
* lines — far more reliable than the column-based default `l -v` output,
|
||||
* which has changed format twice across unrar versions.
|
||||
*
|
||||
* unrar automatically discovers sibling multipart files when they're
|
||||
* co-located (e.g. *.part1.rar + *.part2.rar in the same directory).
|
||||
*
|
||||
* Returns [] on any failure (best-effort: ingestion still succeeds with
|
||||
* an empty file list rather than failing the whole archive).
|
||||
*/
|
||||
export async function readRarContents(
|
||||
firstPartPath: string
|
||||
): Promise<FileEntry[]> {
|
||||
// Runs the external `unrar` binary on `firstPartPath`, parses its stdout and
// returns the resulting entries; any failure is logged and reported as [].
// This span is a rendered diff: where two near-identical statements appear
// back to back, the first is the removed line and the second its replacement.
try {
|
||||
// Removed call: column listing (`l -v`), timeout 30000, 10MB output buffer.
const { stdout } = await execFileAsync("unrar", ["l", "-v", firstPartPath], {
|
||||
timeout: 30000,
|
||||
maxBuffer: 10 * 1024 * 1024, // 10MB for very large archives
|
||||
// Added call: technical listing (`lt`), timeout 60_000, 50MB output buffer.
// NOTE(review): the removed call passed `-v` and this one does not. RAR's
// manual describes `-v` on list commands as "list all volumes" — confirm that
// files stored in parts after the first are still listed for multipart sets.
const { stdout } = await execFileAsync("unrar", ["lt", firstPartPath], {
|
||||
timeout: 60_000,
|
||||
maxBuffer: 50 * 1024 * 1024, // 50MB for archives with very many files
|
||||
});
|
||||
|
||||
// Removed: the parser result was returned directly.
return parseUnrarOutput(stdout);
|
||||
// Added: keep the result so an empty listing can be reported before returning.
const entries = parseUnrarTechnical(stdout);
|
||||
if (entries.length === 0) {
|
||||
// Log a sample of the output so we can diagnose format changes
|
||||
// Only the first 500 characters of stdout are attached to the warning.
log.warn(
|
||||
{ file: firstPartPath, sample: stdout.slice(0, 500) },
|
||||
"unrar lt returned no parseable entries"
|
||||
);
|
||||
}
|
||||
return entries;
|
||||
// Any rejection from execFileAsync or exception thrown by the parser lands
// here: it is logged at warn level and the caller receives an empty list.
} catch (err) {
|
||||
log.warn({ err, file: firstPartPath }, "Failed to read RAR contents");
|
||||
return []; // Fallback: return empty on error
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse the tabular output of `unrar l -v`.
|
||||
* Parse `unrar lt` output: header followed by per-file key:value blocks
|
||||
* separated by blank lines.
|
||||
*
|
||||
* Example output format:
|
||||
* Archive: test.rar
|
||||
* Details: RAR 5
|
||||
* Example block:
|
||||
*
|
||||
* Attributes Size Packed Ratio Date Time CRC-32 Name
|
||||
* ----------- --------- --------- ----- -------- ----- -------- ----
|
||||
* ...A.... 12345 10234 83% 2024-01-15 10:30 DEADBEEF folder/file.stl
|
||||
* ----------- --------- --------- ----- -------- ----- -------- ----
|
||||
* Name: folder/file.stl
|
||||
* Type: File
|
||||
* Size: 12345
|
||||
* Packed size: 10234
|
||||
* Ratio: 83%
|
||||
* mtime: 2024-01-15 10:30:00,000000000
|
||||
* Attributes: ..A....
|
||||
* CRC32: DEADBEEF
|
||||
* Host OS: Windows
|
||||
* Compression: RAR 5.0(v50) -m3 -md=32M
|
||||
*/
|
||||
function parseUnrarOutput(output: string): FileEntry[] {
|
||||
function parseUnrarTechnical(output: string): FileEntry[] {
|
||||
// Rendered diff of the parser rewrite: statements of the removed column-based
// `parseUnrarOutput` are interleaved with those of the added block-based
// `parseUnrarTechnical`. Both take the raw listing text and return FileEntry[].
const entries: FileEntry[] = [];
|
||||
// Removed parser: worked line by line.
const lines = output.split("\n");
|
||||
// Split into blocks on blank lines, then on each block read key:value pairs.
|
||||
// A separator is a newline, optional whitespace, then another newline; the
// optional `\r` means both LF and CRLF output are accepted.
const blocks = output.split(/\r?\n\s*\r?\n/);
|
||||
|
||||
// Removed parser state: whether the current line lies between the first and
// second dashed separator, and how many separators have been seen.
let inFileList = false;
|
||||
let separatorCount = 0;
|
||||
// Added parser: one iteration per block; parseBlock returns null for blocks
// that should be ignored.
for (const block of blocks) {
|
||||
const fields = parseBlock(block);
|
||||
if (!fields) continue;
|
||||
|
||||
// Removed parser loop header.
for (const line of lines) {
|
||||
const trimmed = line.trim();
|
||||
// Only File entries (skip Directory, and anything missing the basics)
|
||||
// A block without a Type line is not rejected by this check — only a Type
// whose lower-cased value differs from "file" is.
if (fields.type && fields.type.toLowerCase() !== "file") continue;
|
||||
// An empty name or an absent size skips the block.
if (!fields.name || fields.size === undefined) continue;
|
||||
|
||||
// Detect separator lines (------- pattern)
|
||||
// Removed parser: the first separator opens the file list, any later one
// closes it; separator lines themselves are never parsed as entries.
if (/^-{5,}/.test(trimmed)) {
|
||||
separatorCount++;
|
||||
if (separatorCount === 1) {
|
||||
inFileList = true;
|
||||
} else if (separatorCount >= 2) {
|
||||
inFileList = false;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!inFileList) continue;
|
||||
|
||||
// Parse file entry line
|
||||
// Format: Attributes Size Packed Ratio Date Time CRC Name
|
||||
// Removed parser: capture groups are size, packed size, hex checksum and
// name; a line lacking any of the eight columns does not match.
const match = trimmed.match(
|
||||
/^\S+\s+(\d+)\s+(\d+)\s+\d+%\s+\S+\s+\S+\s+([0-9A-Fa-f]+)\s+(.+)$/
|
||||
);
|
||||
|
||||
if (match) {
|
||||
const [, uncompressedStr, compressedStr, crc32, filePath] = match;
|
||||
|
||||
// Skip directory entries (typically end with / or have size 0 with dir attributes)
|
||||
if (filePath.endsWith("/") || filePath.endsWith("\\")) continue;
|
||||
|
||||
const ext = path.extname(filePath).toLowerCase();
|
||||
entries.push({
|
||||
path: filePath,
|
||||
fileName: path.basename(filePath),
|
||||
extension: ext ? ext.slice(1) : null,
|
||||
compressedSize: BigInt(compressedStr),
|
||||
uncompressedSize: BigInt(uncompressedStr),
|
||||
crc32: crc32.toLowerCase(),
|
||||
});
|
||||
}
|
||||
// Added parser: build the entry from the block's fields. The extension is the
// lower-cased suffix without its leading dot, or null when there is none.
const filePath = fields.name;
|
||||
const ext = path.extname(filePath).toLowerCase();
|
||||
entries.push({
|
||||
path: filePath,
|
||||
fileName: path.basename(filePath),
|
||||
extension: ext ? ext.slice(1) : null,
|
||||
// NOTE(review): BigInt() throws a SyntaxError when the text is not an integer
// literal (an empty string converts to 0n). `fields.size` and
// `fields.packedSize` are the trimmed text after the colon, unvalidated, so a
// single malformed block makes this function throw; readRarContents catches
// that and returns [] for the whole archive.
uncompressedSize: BigInt(fields.size),
|
||||
// Falls back to the unpacked size when the block has no "Packed size" line.
compressedSize: fields.packedSize !== undefined
|
||||
? BigInt(fields.packedSize)
|
||||
: BigInt(fields.size),
|
||||
// Lower-cased checksum text; null when the field is absent or empty.
crc32: fields.crc32 ? fields.crc32.toLowerCase() : null,
|
||||
});
|
||||
}
|
||||
|
||||
return entries;
|
||||
}
|
||||
|
||||
/**
 * Raw, still-unvalidated text values lifted from one `unrar lt` block.
 * Every field is optional: a field is only set when the block contained a
 * line with the corresponding key.
 */
interface BlockFields {
  /** Value of the `Name` line. */
  name?: string;
  /** Value of the `Type` line (e.g. "File", "Directory"). */
  type?: string;
  /** Value of the `Size` line, as text. */
  size?: string;
  /** Value of the `Packed size` line, as text. */
  packedSize?: string;
  /** Checksum text; holds whichever checksum line the block carried. */
  crc32?: string;
}

/**
 * One "key: value" line: optional leading whitespace, a key made of letters,
 * digits and spaces (so "Packed size" and "Host OS" match), a colon, then the
 * value. The key is matched lazily, so only the FIRST colon separates key
 * from value and a value may itself contain colons. Keys containing any other
 * character (a hyphen, for instance) do not match and are ignored.
 */
const BLOCK_LINE_PATTERN = /^\s*([A-Za-z][A-Za-z0-9 ]*?)\s*:\s*(.*)$/;

/**
 * Lower-cased keys whose value is stored in `BlockFields.crc32`.
 * unrar's technical listing labels a BLAKE2sp digest line `BLAKE2`, so that
 * key must be accepted for BLAKE2 archives to get a checksum at all;
 * `blake2sp` and `checksum` are kept for backward compatibility.
 */
const CHECKSUM_KEYS: ReadonlySet<string> = new Set([
  "crc32",
  "blake2",
  "blake2sp",
  "checksum",
]);

/**
 * Extract the fields of interest from one blank-line-delimited block of
 * `unrar lt` output.
 *
 * @param block - Text of a single block; lines may end in LF or CRLF.
 * @returns The recognised fields (keys compared case-insensitively, values
 *   trimmed), or `null` when the block has no line starting with `Name:` —
 *   which is how the archive header block ("Archive:" / "Details:") is skipped.
 *   If a key occurs more than once, the last occurrence wins.
 */
function parseBlock(block: string): BlockFields | null {
  // Skip the archive-header block (contains "Archive:" / "Details:" lines
  // and lacks a Name field).
  if (!/^\s*Name:/m.test(block)) return null;

  const fields: BlockFields = {};

  for (const line of block.split(/\r?\n/)) {
    const m = BLOCK_LINE_PATTERN.exec(line);
    if (!m) continue;

    const [, rawKey = "", rawValue = ""] = m;
    const key = rawKey.trim().toLowerCase();
    const value = rawValue.trim();

    if (key === "name") fields.name = value;
    else if (key === "type") fields.type = value;
    else if (key === "size") fields.size = value;
    else if (key === "packed size") fields.packedSize = value;
    else if (CHECKSUM_KEYS.has(key)) {
      // Either checksum flavour is stored as a hex string in the crc32 field.
      fields.crc32 = value;
    }
  }

  return fields;
}
|
||||
|
||||
Reference in New Issue
Block a user