feat: add Telegram integration with forum topic support and creator tracking

Adds full Telegram ZIP ingestion pipeline: TDLib worker service scans source
channels for archive files, deduplicates by content hash, extracts metadata,
uploads to archive channel, and indexes in Postgres. Forum supergroups are
scanned per-topic with topic names used as creator. Filename-based creator
extraction (e.g. "Mammoth Factory - 2026-01.zip") serves as fallback.

Includes admin UI for managing accounts/channels, simplified account setup
(API credentials via env vars), auth code/password submission dialog,
package browser with creator column, and live ingestion activity tracking.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
xCyanGrizzly
2026-02-24 16:02:06 +01:00
parent beb9cfb312
commit b427193d17
70 changed files with 8627 additions and 2 deletions

View File

@@ -0,0 +1,21 @@
/**
 * Derive a creator name from archive filenames shaped like
 * "Creator - Anything.ext", splitting on the first ` - `.
 *
 * Priority in the worker: topic name > filename extraction.
 * This is the fallback when no forum topic name is available.
 *
 * Examples:
 *   "Mammoth Factory - 2026-01.zip"         → "Mammoth Factory"
 *   "Artist Name - Pack Title.part01.rar"   → "Artist Name"
 *   "some_random_file.zip"                  → null
 */
export function extractCreatorFromFileName(fileName: string): string | null {
  // Drop trailing archive extensions (.zip, .rar, .part01.rar, .z01, …)
  const stripped = fileName.replace(/(\.(part\d+\.rar|z\d{2}|zip|rar))+$/i, "");
  const separatorAt = stripped.indexOf(" - ");
  if (separatorAt > 0) {
    const candidate = stripped.slice(0, separatorAt).trim();
    if (candidate.length > 0) {
      return candidate;
    }
  }
  // No separator, separator at position 0, or whitespace-only prefix.
  return null;
}

View File

@@ -0,0 +1,96 @@
/** Archive container formats this pipeline understands. */
export type ArchiveFormat = "ZIP" | "RAR";
/** Result of filename-based archive detection (produced by detectArchive). */
export interface MultipartInfo {
  // Shared stem that groups parts of the same archive set.
  baseName: string;
  // Part index parsed from the filename; -1 for SINGLE, meaning
  // "standalone archive or the final piece of a legacy split".
  partNumber: number;
  format: ArchiveFormat;
  // Naming scheme that matched: ".zip.001" (ZIP_NUMBERED), ".z01" (ZIP_LEGACY),
  // ".partN.rar" (RAR_PART), ".r00" (RAR_LEGACY), or a bare ".zip"/".rar" (SINGLE).
  pattern: "ZIP_NUMBERED" | "ZIP_LEGACY" | "RAR_PART" | "RAR_LEGACY" | "SINGLE";
}
// Multipart naming schemes, tried in declaration order. Every scheme puts the
// base name in capture group 1 and the numeric part index in capture group 2,
// so detection below can treat them uniformly.
const multipartMatchers: {
  regex: RegExp;
  format: ArchiveFormat;
  pattern: MultipartInfo["pattern"];
}[] = [
  // pack.zip.001, pack.zip.002
  { regex: /^(.+\.zip)\.(\d{3,})$/i, format: "ZIP", pattern: "ZIP_NUMBERED" },
  // pack.z01, pack.z02 (legacy split — final part is pack.zip)
  { regex: /^(.+)\.z(\d{2,})$/i, format: "ZIP", pattern: "ZIP_LEGACY" },
  // pack.part1.rar, pack.part2.rar
  { regex: /^(.+)\.part(\d+)\.rar$/i, format: "RAR", pattern: "RAR_PART" },
  // pack.r00, pack.r01 (legacy split — final part is pack.rar)
  { regex: /^(.+)\.r(\d{2,})$/i, format: "RAR", pattern: "RAR_LEGACY" },
];
/**
 * Classify a filename as an archive (or not) and extract multipart info.
 * Returns null for filenames that are not recognizable archives.
 */
export function detectArchive(fileName: string): MultipartInfo | null {
  // Multipart naming schemes take precedence over plain .zip/.rar endings.
  for (const matcher of multipartMatchers) {
    const m = matcher.regex.exec(fileName);
    if (m) {
      return {
        baseName: m[1],
        partNumber: parseInt(m[2], 10),
        format: matcher.format,
        pattern: matcher.pattern,
      };
    }
  }
  // Bare .zip — standalone, or the final piece of a ZIP_LEGACY split.
  if (/\.zip$/i.test(fileName)) {
    return {
      baseName: fileName.replace(/\.zip$/i, ""),
      partNumber: -1, // -1 signals "could be single or final legacy part"
      format: "ZIP",
      pattern: "SINGLE",
    };
  }
  // Bare .rar — standalone, or the final piece of a RAR_LEGACY split.
  if (/\.rar$/i.test(fileName)) {
    return {
      baseName: fileName.replace(/\.rar$/i, ""),
      partNumber: -1,
      format: "RAR",
      pattern: "SINGLE",
    };
  }
  return null;
}
/**
 * Convenience predicate: does this filename match any archive naming
 * scheme that detectArchive recognizes?
 */
export function isArchiveAttachment(fileName: string): boolean {
  return Boolean(detectArchive(fileName));
}

View File

@@ -0,0 +1,25 @@
import { createReadStream } from "fs";
import { createHash } from "crypto";
import { pipeline } from "stream/promises";
import { PassThrough } from "stream";
/**
 * Stream-hash the concatenation of one or more files with SHA-256,
 * returning the digest as lowercase hex.
 *
 * Parts are consumed strictly in the order given, so for multipart
 * archives the caller must sort the paths by part number first.
 * Memory stays O(1): each chunk is folded into the digest and dropped.
 */
export async function hashParts(filePaths: string[]): Promise<string> {
  const digest = createHash("sha256");
  for (const filePath of filePaths) {
    // Async-iterate the read stream; rejects/throws on stream errors.
    for await (const chunk of createReadStream(filePath)) {
      digest.update(chunk as Buffer);
    }
  }
  return digest.digest("hex");
}

View File

@@ -0,0 +1,100 @@
import { detectArchive, type ArchiveFormat, type MultipartInfo } from "./detect.js";
import { config } from "../util/config.js";
import { childLogger } from "../util/logger.js";
const log = childLogger("multipart");
/** Minimal view of a Telegram message that carries a file attachment. */
export interface TelegramMessage {
  id: bigint;       // Telegram message id
  fileName: string; // attachment filename as it appears in the message
  fileId: string;   // opaque file identifier (presumably TDLib's remote file id — confirm)
  fileSize: bigint; // attachment size in bytes
  date: Date;       // message timestamp; used for the multipart time-span check
}
/** One logical archive: a standalone file, or an ordered multipart group. */
export interface ArchiveSet {
  type: ArchiveFormat;
  baseName: string;         // shared stem identifying the set
  parts: TelegramMessage[]; // sorted by part number when isMultipart is true
  isMultipart: boolean;
}
/**
 * Partition archive messages into archive sets: standalone files and
 * multipart groups keyed by (format, case-insensitive base name).
 * Messages should be pre-filtered to archive attachments; names that
 * detectArchive cannot classify are silently dropped.
 */
export function groupArchiveSets(messages: TelegramMessage[]): ArchiveSet[] {
  type Annotated = { msg: TelegramMessage; info: MultipartInfo };

  // Pair each message with its detection result.
  const detected: Annotated[] = [];
  for (const msg of messages) {
    const info = detectArchive(msg.fileName);
    if (info) {
      detected.push({ msg, info });
    }
  }

  // Bucket by format + lowercased base name.
  const buckets = new Map<string, Annotated[]>();
  for (const item of detected) {
    const key = `${item.info.format}:${item.info.baseName.toLowerCase()}`;
    const bucket = buckets.get(key);
    if (bucket) {
      bucket.push(item);
    } else {
      buckets.set(key, [item]);
    }
  }

  const sets: ArchiveSet[] = [];
  for (const bucket of buckets.values()) {
    const first = bucket[0].info;
    const explicitParts = bucket.filter((e) => e.info.pattern !== "SINGLE");
    const singles = bucket.filter((e) => e.info.pattern === "SINGLE");

    if (explicitParts.length === 0) {
      // Only standalone archives in this bucket — each is its own set.
      for (const single of singles) {
        sets.push({
          type: first.format,
          baseName: single.info.baseName,
          parts: [single.msg],
          isMultipart: false,
        });
      }
      continue;
    }

    // Multipart set. A SINGLE sharing the base name is treated as the
    // final piece of a legacy split and ordered after the numbered parts.
    const combined = [...explicitParts, ...singles];

    // Reject sets whose parts were posted too far apart in time.
    const stamps = combined.map((e) => e.msg.date.getTime());
    const spanMs = Math.max(...stamps) - Math.min(...stamps);
    const maxSpanMs = config.multipartTimeoutHours * 60 * 60 * 1000;
    if (spanMs > maxSpanMs) {
      log.warn(
        { baseName: first.baseName, format: first.format, span: spanMs / 3600000 },
        "Multipart set spans too long, skipping"
      );
      continue;
    }

    // Sort by part number; SINGLE entries (partNumber -1) rank last.
    const rank = (e: Annotated) =>
      e.info.partNumber === -1 ? 999999 : e.info.partNumber;
    combined.sort((a, b) => rank(a) - rank(b));

    sets.push({
      type: first.format,
      baseName: first.baseName,
      parts: combined.map((e) => e.msg),
      isMultipart: true,
    });
  }
  return sets;
}

View File

@@ -0,0 +1,90 @@
import { execFile } from "child_process";
import { promisify } from "util";
import path from "path";
import { childLogger } from "../util/logger.js";
import type { FileEntry } from "./zip-reader.js";
const execFileAsync = promisify(execFile);
const log = childLogger("rar-reader");
/**
 * List a RAR archive's contents via the `unrar` CLI without extracting.
 * unrar automatically discovers sibling parts when they're co-located.
 * Any failure (missing binary, timeout, corrupt archive) is logged and
 * yields an empty list rather than a rejection.
 */
export async function readRarContents(
  firstPartPath: string
): Promise<FileEntry[]> {
  const unrarArgs = ["l", "-v", firstPartPath];
  const execOptions = {
    timeout: 30000, // don't hang forever on a wedged binary
    maxBuffer: 10 * 1024 * 1024, // 10MB of listing output for very large archives
  };
  try {
    const result = await execFileAsync("unrar", unrarArgs, execOptions);
    return parseUnrarOutput(result.stdout);
  } catch (err) {
    log.warn({ err, file: firstPartPath }, "Failed to read RAR contents");
    return []; // Fallback: empty on error
  }
}
/**
 * Parse the tabular output of `unrar l -v`.
 *
 * Example output format:
 *   Archive: test.rar
 *   Details: RAR 5
 *
 *    Attributes      Size    Packed Ratio   Date    Time   CRC-32  Name
 *   ----------- --------- --------- ----- -------- -----  -------- ----
 *       ...A....     12345     10234  83%  2024-01-15 10:30 DEADBEEF folder/file.stl
 *   ----------- --------- --------- ----- -------- -----  -------- ----
 *
 * The file list sits between the first and second separator rows; the
 * totals line after the second separator is ignored.
 */
function parseUnrarOutput(output: string): FileEntry[] {
  // Columns: Attributes Size Packed Ratio Date Time CRC Name.
  // Hoisted so the regex is not re-created on every line.
  const entryRegex =
    /^(\S+)\s+(\d+)\s+(\d+)\s+\d+%\s+\S+\s+\S+\s+([0-9A-Fa-f]+)\s+(.+)$/;
  const entries: FileEntry[] = [];
  let inFileList = false;
  let separatorCount = 0;
  for (const line of output.split("\n")) {
    const trimmed = line.trim();
    // Separator rows ("-----…") bracket the file list.
    if (/^-{5,}/.test(trimmed)) {
      separatorCount++;
      inFileList = separatorCount === 1;
      continue;
    }
    if (!inFileList) continue;
    const match = trimmed.match(entryRegex);
    if (!match) continue;
    const [, attributes, uncompressedStr, compressedStr, crc32, filePath] = match;
    // Skip directory entries: either a trailing path separator, or a
    // 'D'/'d' flag in the attribute column (Windows-style "...D..." and
    // unix-style "drwxr-xr-x" listings — 'd' appears only for directories).
    if (filePath.endsWith("/") || filePath.endsWith("\\")) continue;
    if (/d/i.test(attributes)) continue;
    const ext = path.extname(filePath).toLowerCase();
    entries.push({
      path: filePath,
      fileName: path.basename(filePath),
      extension: ext ? ext.slice(1) : null, // drop the leading dot
      compressedSize: BigInt(compressedStr),
      uncompressedSize: BigInt(uncompressedStr),
      crc32: crc32.toLowerCase(),
    });
  }
  return entries;
}

View File

@@ -0,0 +1,48 @@
import { createReadStream, createWriteStream } from "fs";
import { stat } from "fs/promises";
import path from "path";
import { pipeline } from "stream/promises";
import { childLogger } from "../util/logger.js";
const log = childLogger("split");
/** Telegram caps uploads at 2GB per file. */
const MAX_PART_SIZE = 2n * 1024n * 1024n * 1024n;
/**
 * Split an oversized file into sequentially numbered byte-range parts
 * ("name.001", "name.002", …), each at most 2GB.
 * A file already within the limit is returned untouched as a single-element list.
 */
export async function byteLevelSplit(filePath: string): Promise<string[]> {
  const stats = await stat(filePath);
  const totalBytes = BigInt(stats.size);
  if (totalBytes <= MAX_PART_SIZE) {
    return [filePath];
  }
  const outDir = path.dirname(filePath);
  const stem = path.basename(filePath);
  // Safe to go through Number: file sizes are far below 2^53.
  const chunkBytes = Number(MAX_PART_SIZE);
  const partCount = Math.ceil(Number(totalBytes) / chunkBytes);
  const written: string[] = [];
  log.info({ filePath, fileSize: Number(totalBytes), totalParts: partCount }, "Splitting file");
  for (let index = 0; index < partCount; index++) {
    const suffix = String(index + 1).padStart(3, "0");
    const target = path.join(outDir, `${stem}.${suffix}`);
    const firstByte = index * chunkBytes;
    // createReadStream's `end` option is inclusive.
    const lastByte = Math.min(firstByte + chunkBytes - 1, Number(totalBytes) - 1);
    await pipeline(
      createReadStream(filePath, { start: firstByte, end: lastByte }),
      createWriteStream(target)
    );
    written.push(target);
  }
  log.info({ filePath, parts: written.length }, "File split complete");
  return written;
}

View File

@@ -0,0 +1,61 @@
import yauzl from "yauzl";
import path from "path";
import { childLogger } from "../util/logger.js";
const log = childLogger("zip-reader");
/** Metadata for a single file stored inside an archive (ZIP or RAR). */
export interface FileEntry {
  path: string;             // full path inside the archive, as stored
  fileName: string;         // basename component of `path`
  extension: string | null; // lowercase extension without the dot, or null if none
  compressedSize: bigint;
  uncompressedSize: bigint;
  crc32: string | null;     // 8-char lowercase hex, or null when not available
}
/**
 * Enumerate a ZIP's entries by reading only the central directory — no file
 * data is extracted. For multipart ZIPs, pass the paths sorted by part order:
 * the central directory lives at the tail, so only the last path is opened.
 * Never rejects — resolves with an empty (or partial) list on failure.
 */
export async function readZipCentralDirectory(
  filePaths: string[]
): Promise<FileEntry[]> {
  const lastPart = filePaths[filePaths.length - 1];
  return new Promise((resolve) => {
    yauzl.open(lastPart, { lazyEntries: true, autoClose: true }, (err, zipFile) => {
      if (err) {
        log.warn({ err, file: lastPart }, "Failed to open ZIP for reading");
        resolve([]); // Fallback: empty on error
        return;
      }
      const collected: FileEntry[] = [];
      zipFile.on("entry", (entry: yauzl.Entry) => {
        // Directory entries carry a trailing slash — skip them.
        if (!entry.fileName.endsWith("/")) {
          const ext = path.extname(entry.fileName).toLowerCase();
          collected.push({
            path: entry.fileName,
            fileName: path.basename(entry.fileName),
            extension: ext ? ext.slice(1) : null, // drop the leading dot
            compressedSize: BigInt(entry.compressedSize),
            uncompressedSize: BigInt(entry.uncompressedSize),
            // A zero CRC is mapped to "not recorded" (null); note this also
            // nulls a genuinely zero CRC — matches the original convention.
            crc32: entry.crc32 !== 0 ? entry.crc32.toString(16).padStart(8, "0") : null,
          });
        }
        zipFile.readEntry(); // pull the next entry (lazyEntries mode)
      });
      zipFile.on("end", () => resolve(collected));
      zipFile.on("error", (error) => {
        log.warn({ error, file: lastPart }, "Error reading ZIP entries");
        resolve(collected); // Return whatever we collected so far
      });
      zipFile.readEntry(); // kick off iteration
    });
  });
}