feat: add Telegram integration with forum topic support and creator tracking

Adds full Telegram ZIP ingestion pipeline: TDLib worker service scans source
channels for archive files, deduplicates by content hash, extracts metadata,
uploads to archive channel, and indexes in Postgres. Forum supergroups are
scanned per-topic with topic names used as creator. Filename-based creator
extraction (e.g. "Mammoth Factory - 2026-01.zip") serves as fallback.

Includes admin UI for managing accounts/channels, simplified account setup
(API credentials via env vars), auth code/password submission dialog,
package browser with creator column, and live ingestion activity tracking.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
xCyanGrizzly
2026-02-24 16:02:06 +01:00
parent beb9cfb312
commit b427193d17
70 changed files with 8627 additions and 2 deletions

View File

@@ -0,0 +1,21 @@
/**
 * Derive a creator name from archive filenames shaped like
 * "Creator - Anything.ext", splitting on the first ` - `.
 *
 * Priority in the worker: topic name > filename extraction.
 * This is the fallback when no forum topic name is available.
 *
 * Examples:
 *   "Mammoth Factory - 2026-01.zip"         → "Mammoth Factory"
 *   "Artist Name - Pack Title.part01.rar"   → "Artist Name"
 *   "some_random_file.zip"                  → null
 */
export function extractCreatorFromFileName(fileName: string): string | null {
  // Drop trailing archive extensions (.zip, .rar, .part01.rar, .z01, …)
  const stripped = fileName.replace(/(\.(part\d+\.rar|z\d{2}|zip|rar))+$/i, "");
  const separatorAt = stripped.indexOf(" - ");
  if (separatorAt > 0) {
    const candidate = stripped.slice(0, separatorAt).trim();
    if (candidate.length > 0) {
      return candidate;
    }
  }
  // No separator, separator at position 0, or whitespace-only prefix.
  return null;
}

View File

@@ -0,0 +1,96 @@
/** Archive container formats this pipeline understands. */
export type ArchiveFormat = "ZIP" | "RAR";
/** Result of filename-based archive detection (produced by detectArchive). */
export interface MultipartInfo {
  // Shared stem that groups parts of the same archive set.
  baseName: string;
  // Part index parsed from the filename; -1 for SINGLE, meaning
  // "standalone archive or the final piece of a legacy split".
  partNumber: number;
  format: ArchiveFormat;
  // Naming scheme that matched: ".zip.001" (ZIP_NUMBERED), ".z01" (ZIP_LEGACY),
  // ".partN.rar" (RAR_PART), ".r00" (RAR_LEGACY), or a bare ".zip"/".rar" (SINGLE).
  pattern: "ZIP_NUMBERED" | "ZIP_LEGACY" | "RAR_PART" | "RAR_LEGACY" | "SINGLE";
}
// Multipart naming schemes, tried in declaration order. Every scheme puts the
// base name in capture group 1 and the numeric part index in capture group 2,
// so detection below can treat them uniformly.
const multipartMatchers: {
  regex: RegExp;
  format: ArchiveFormat;
  pattern: MultipartInfo["pattern"];
}[] = [
  // pack.zip.001, pack.zip.002
  { regex: /^(.+\.zip)\.(\d{3,})$/i, format: "ZIP", pattern: "ZIP_NUMBERED" },
  // pack.z01, pack.z02 (legacy split — final part is pack.zip)
  { regex: /^(.+)\.z(\d{2,})$/i, format: "ZIP", pattern: "ZIP_LEGACY" },
  // pack.part1.rar, pack.part2.rar
  { regex: /^(.+)\.part(\d+)\.rar$/i, format: "RAR", pattern: "RAR_PART" },
  // pack.r00, pack.r01 (legacy split — final part is pack.rar)
  { regex: /^(.+)\.r(\d{2,})$/i, format: "RAR", pattern: "RAR_LEGACY" },
];
/**
 * Classify a filename as an archive (or not) and extract multipart info.
 * Returns null for filenames that are not recognizable archives.
 */
export function detectArchive(fileName: string): MultipartInfo | null {
  // Multipart naming schemes take precedence over plain .zip/.rar endings.
  for (const matcher of multipartMatchers) {
    const m = matcher.regex.exec(fileName);
    if (m) {
      return {
        baseName: m[1],
        partNumber: parseInt(m[2], 10),
        format: matcher.format,
        pattern: matcher.pattern,
      };
    }
  }
  // Bare .zip — standalone, or the final piece of a ZIP_LEGACY split.
  if (/\.zip$/i.test(fileName)) {
    return {
      baseName: fileName.replace(/\.zip$/i, ""),
      partNumber: -1, // -1 signals "could be single or final legacy part"
      format: "ZIP",
      pattern: "SINGLE",
    };
  }
  // Bare .rar — standalone, or the final piece of a RAR_LEGACY split.
  if (/\.rar$/i.test(fileName)) {
    return {
      baseName: fileName.replace(/\.rar$/i, ""),
      partNumber: -1,
      format: "RAR",
      pattern: "SINGLE",
    };
  }
  return null;
}
/**
 * Convenience predicate: does this filename match any archive naming
 * scheme that detectArchive recognizes?
 */
export function isArchiveAttachment(fileName: string): boolean {
  return Boolean(detectArchive(fileName));
}

View File

@@ -0,0 +1,25 @@
import { createReadStream } from "fs";
import { createHash } from "crypto";
import { pipeline } from "stream/promises";
import { PassThrough } from "stream";
/**
 * Stream-hash the concatenation of one or more files with SHA-256,
 * returning the digest as lowercase hex.
 *
 * Parts are consumed strictly in the order given, so for multipart
 * archives the caller must sort the paths by part number first.
 * Memory stays O(1): each chunk is folded into the digest and dropped.
 */
export async function hashParts(filePaths: string[]): Promise<string> {
  const digest = createHash("sha256");
  for (const filePath of filePaths) {
    // Async-iterate the read stream; rejects/throws on stream errors.
    for await (const chunk of createReadStream(filePath)) {
      digest.update(chunk as Buffer);
    }
  }
  return digest.digest("hex");
}

View File

@@ -0,0 +1,100 @@
import { detectArchive, type ArchiveFormat, type MultipartInfo } from "./detect.js";
import { config } from "../util/config.js";
import { childLogger } from "../util/logger.js";
const log = childLogger("multipart");
/** Minimal view of a Telegram message that carries a file attachment. */
export interface TelegramMessage {
  id: bigint;       // Telegram message id
  fileName: string; // attachment filename as it appears in the message
  fileId: string;   // opaque file identifier (presumably TDLib's remote file id — confirm)
  fileSize: bigint; // attachment size in bytes
  date: Date;       // message timestamp; used for the multipart time-span check
}
/** One logical archive: a standalone file, or an ordered multipart group. */
export interface ArchiveSet {
  type: ArchiveFormat;
  baseName: string;         // shared stem identifying the set
  parts: TelegramMessage[]; // sorted by part number when isMultipart is true
  isMultipart: boolean;
}
/**
 * Partition archive messages into archive sets: standalone files and
 * multipart groups keyed by (format, case-insensitive base name).
 * Messages should be pre-filtered to archive attachments; names that
 * detectArchive cannot classify are silently dropped.
 */
export function groupArchiveSets(messages: TelegramMessage[]): ArchiveSet[] {
  type Annotated = { msg: TelegramMessage; info: MultipartInfo };

  // Pair each message with its detection result.
  const detected: Annotated[] = [];
  for (const msg of messages) {
    const info = detectArchive(msg.fileName);
    if (info) {
      detected.push({ msg, info });
    }
  }

  // Bucket by format + lowercased base name.
  const buckets = new Map<string, Annotated[]>();
  for (const item of detected) {
    const key = `${item.info.format}:${item.info.baseName.toLowerCase()}`;
    const bucket = buckets.get(key);
    if (bucket) {
      bucket.push(item);
    } else {
      buckets.set(key, [item]);
    }
  }

  const sets: ArchiveSet[] = [];
  for (const bucket of buckets.values()) {
    const first = bucket[0].info;
    const explicitParts = bucket.filter((e) => e.info.pattern !== "SINGLE");
    const singles = bucket.filter((e) => e.info.pattern === "SINGLE");

    if (explicitParts.length === 0) {
      // Only standalone archives in this bucket — each is its own set.
      for (const single of singles) {
        sets.push({
          type: first.format,
          baseName: single.info.baseName,
          parts: [single.msg],
          isMultipart: false,
        });
      }
      continue;
    }

    // Multipart set. A SINGLE sharing the base name is treated as the
    // final piece of a legacy split and ordered after the numbered parts.
    const combined = [...explicitParts, ...singles];

    // Reject sets whose parts were posted too far apart in time.
    const stamps = combined.map((e) => e.msg.date.getTime());
    const spanMs = Math.max(...stamps) - Math.min(...stamps);
    const maxSpanMs = config.multipartTimeoutHours * 60 * 60 * 1000;
    if (spanMs > maxSpanMs) {
      log.warn(
        { baseName: first.baseName, format: first.format, span: spanMs / 3600000 },
        "Multipart set spans too long, skipping"
      );
      continue;
    }

    // Sort by part number; SINGLE entries (partNumber -1) rank last.
    const rank = (e: Annotated) =>
      e.info.partNumber === -1 ? 999999 : e.info.partNumber;
    combined.sort((a, b) => rank(a) - rank(b));

    sets.push({
      type: first.format,
      baseName: first.baseName,
      parts: combined.map((e) => e.msg),
      isMultipart: true,
    });
  }
  return sets;
}

View File

@@ -0,0 +1,90 @@
import { execFile } from "child_process";
import { promisify } from "util";
import path from "path";
import { childLogger } from "../util/logger.js";
import type { FileEntry } from "./zip-reader.js";
const execFileAsync = promisify(execFile);
const log = childLogger("rar-reader");
/**
 * List a RAR archive's contents via the `unrar` CLI without extracting.
 * unrar automatically discovers sibling parts when they're co-located.
 * Any failure (missing binary, timeout, corrupt archive) is logged and
 * yields an empty list rather than a rejection.
 */
export async function readRarContents(
  firstPartPath: string
): Promise<FileEntry[]> {
  const unrarArgs = ["l", "-v", firstPartPath];
  const execOptions = {
    timeout: 30000, // don't hang forever on a wedged binary
    maxBuffer: 10 * 1024 * 1024, // 10MB of listing output for very large archives
  };
  try {
    const result = await execFileAsync("unrar", unrarArgs, execOptions);
    return parseUnrarOutput(result.stdout);
  } catch (err) {
    log.warn({ err, file: firstPartPath }, "Failed to read RAR contents");
    return []; // Fallback: empty on error
  }
}
/**
 * Parse the tabular output of `unrar l -v`.
 *
 * Example output format:
 *   Archive: test.rar
 *   Details: RAR 5
 *
 *    Attributes      Size    Packed Ratio   Date    Time   CRC-32  Name
 *   ----------- --------- --------- ----- -------- -----  -------- ----
 *       ...A....     12345     10234  83%  2024-01-15 10:30 DEADBEEF folder/file.stl
 *   ----------- --------- --------- ----- -------- -----  -------- ----
 *
 * The file list sits between the first and second separator rows; the
 * totals line after the second separator is ignored.
 */
function parseUnrarOutput(output: string): FileEntry[] {
  // Columns: Attributes Size Packed Ratio Date Time CRC Name.
  // Hoisted so the regex is not re-created on every line.
  const entryRegex =
    /^(\S+)\s+(\d+)\s+(\d+)\s+\d+%\s+\S+\s+\S+\s+([0-9A-Fa-f]+)\s+(.+)$/;
  const entries: FileEntry[] = [];
  let inFileList = false;
  let separatorCount = 0;
  for (const line of output.split("\n")) {
    const trimmed = line.trim();
    // Separator rows ("-----…") bracket the file list.
    if (/^-{5,}/.test(trimmed)) {
      separatorCount++;
      inFileList = separatorCount === 1;
      continue;
    }
    if (!inFileList) continue;
    const match = trimmed.match(entryRegex);
    if (!match) continue;
    const [, attributes, uncompressedStr, compressedStr, crc32, filePath] = match;
    // Skip directory entries: either a trailing path separator, or a
    // 'D'/'d' flag in the attribute column (Windows-style "...D..." and
    // unix-style "drwxr-xr-x" listings — 'd' appears only for directories).
    if (filePath.endsWith("/") || filePath.endsWith("\\")) continue;
    if (/d/i.test(attributes)) continue;
    const ext = path.extname(filePath).toLowerCase();
    entries.push({
      path: filePath,
      fileName: path.basename(filePath),
      extension: ext ? ext.slice(1) : null, // drop the leading dot
      compressedSize: BigInt(compressedStr),
      uncompressedSize: BigInt(uncompressedStr),
      crc32: crc32.toLowerCase(),
    });
  }
  return entries;
}

View File

@@ -0,0 +1,48 @@
import { createReadStream, createWriteStream } from "fs";
import { stat } from "fs/promises";
import path from "path";
import { pipeline } from "stream/promises";
import { childLogger } from "../util/logger.js";
const log = childLogger("split");
/** Telegram caps uploads at 2GB per file. */
const MAX_PART_SIZE = 2n * 1024n * 1024n * 1024n;
/**
 * Split an oversized file into sequentially numbered byte-range parts
 * ("name.001", "name.002", …), each at most 2GB.
 * A file already within the limit is returned untouched as a single-element list.
 */
export async function byteLevelSplit(filePath: string): Promise<string[]> {
  const stats = await stat(filePath);
  const totalBytes = BigInt(stats.size);
  if (totalBytes <= MAX_PART_SIZE) {
    return [filePath];
  }
  const outDir = path.dirname(filePath);
  const stem = path.basename(filePath);
  // Safe to go through Number: file sizes are far below 2^53.
  const chunkBytes = Number(MAX_PART_SIZE);
  const partCount = Math.ceil(Number(totalBytes) / chunkBytes);
  const written: string[] = [];
  log.info({ filePath, fileSize: Number(totalBytes), totalParts: partCount }, "Splitting file");
  for (let index = 0; index < partCount; index++) {
    const suffix = String(index + 1).padStart(3, "0");
    const target = path.join(outDir, `${stem}.${suffix}`);
    const firstByte = index * chunkBytes;
    // createReadStream's `end` option is inclusive.
    const lastByte = Math.min(firstByte + chunkBytes - 1, Number(totalBytes) - 1);
    await pipeline(
      createReadStream(filePath, { start: firstByte, end: lastByte }),
      createWriteStream(target)
    );
    written.push(target);
  }
  log.info({ filePath, parts: written.length }, "File split complete");
  return written;
}

View File

@@ -0,0 +1,61 @@
import yauzl from "yauzl";
import path from "path";
import { childLogger } from "../util/logger.js";
const log = childLogger("zip-reader");
/** Metadata for a single file stored inside an archive (ZIP or RAR). */
export interface FileEntry {
  path: string;             // full path inside the archive, as stored
  fileName: string;         // basename component of `path`
  extension: string | null; // lowercase extension without the dot, or null if none
  compressedSize: bigint;
  uncompressedSize: bigint;
  crc32: string | null;     // 8-char lowercase hex, or null when not available
}
/**
 * Enumerate a ZIP's entries by reading only the central directory — no file
 * data is extracted. For multipart ZIPs, pass the paths sorted by part order:
 * the central directory lives at the tail, so only the last path is opened.
 * Never rejects — resolves with an empty (or partial) list on failure.
 */
export async function readZipCentralDirectory(
  filePaths: string[]
): Promise<FileEntry[]> {
  const lastPart = filePaths[filePaths.length - 1];
  return new Promise((resolve) => {
    yauzl.open(lastPart, { lazyEntries: true, autoClose: true }, (err, zipFile) => {
      if (err) {
        log.warn({ err, file: lastPart }, "Failed to open ZIP for reading");
        resolve([]); // Fallback: empty on error
        return;
      }
      const collected: FileEntry[] = [];
      zipFile.on("entry", (entry: yauzl.Entry) => {
        // Directory entries carry a trailing slash — skip them.
        if (!entry.fileName.endsWith("/")) {
          const ext = path.extname(entry.fileName).toLowerCase();
          collected.push({
            path: entry.fileName,
            fileName: path.basename(entry.fileName),
            extension: ext ? ext.slice(1) : null, // drop the leading dot
            compressedSize: BigInt(entry.compressedSize),
            uncompressedSize: BigInt(entry.uncompressedSize),
            // A zero CRC is mapped to "not recorded" (null); note this also
            // nulls a genuinely zero CRC — matches the original convention.
            crc32: entry.crc32 !== 0 ? entry.crc32.toString(16).padStart(8, "0") : null,
          });
        }
        zipFile.readEntry(); // pull the next entry (lazyEntries mode)
      });
      zipFile.on("end", () => resolve(collected));
      zipFile.on("error", (error) => {
        log.warn({ error, file: lastPart }, "Error reading ZIP entries");
        resolve(collected); // Return whatever we collected so far
      });
      zipFile.readEntry(); // kick off iteration
    });
  });
}