feat: add Telegram integration with forum topic support and creator tracking

Adds full Telegram ZIP ingestion pipeline: TDLib worker service scans source channels for archive files, deduplicates by content hash, extracts metadata, uploads to archive channel, and indexes in Postgres. Forum supergroups are scanned per-topic with topic names used as creator. Filename-based creator extraction (e.g. "Mammoth Factory - 2026-01.zip") serves as fallback. Includes admin UI for managing accounts/channels, simplified account setup (API credentials via env vars), auth code/password submission dialog, package browser with creator column, and live ingestion activity tracking. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-11 14:21:15 +00:00 · 2026-02-24 16:02:06 +01:00
parent beb9cfb312
commit b427193d17
70 changed files with 8627 additions and 2 deletions
--- a/worker/src/preview/match.ts
+++ b/worker/src/preview/match.ts
@@ -0,0 +1,86 @@
+import { childLogger } from "../util/logger.js";
+
+const log = childLogger("preview-match");
+
+/**
+ * A photo message that is a candidate preview image for an archive.
+ */
+export interface TelegramPhoto {
+  /** Identifier of the photo; logged as `photoId` when a match is found. */
+  id: bigint;
+  /** Timestamp compared against an archive's first-message date when matching. */
+  date: Date;
+  /** Caption text attached to the photo message, if it has one. */
+  caption: string;
+  /** File identifier of the smallest available photo size, used as the thumbnail. */
+  fileId: string;
+  /** NOTE(review): presumably the size of that file in bytes — confirm against the producer. */
+  fileSize: number;
+}
+
+/**
+ * The parts of an archive's identity that preview matching works with.
+ */
+export interface ArchiveRef {
+  /** Archive name compared against photo captions; also the key of the result map. */
+  baseName: string;
+  /** NOTE(review): presumably the id of the archive's first message — not read by `matchPreviewToArchive`. */
+  firstMessageId: bigint;
+  /** Centre of the time window in which preview photos are searched. */
+  firstMessageDate: Date;
+}
+
+/**
+ * Pair each archive with the preview photo that most plausibly belongs to it.
+ *
+ * A photo is a candidate for an archive when both of the following hold:
+ * 1. It was posted within ±6 hours of the archive's first message.
+ * 2. After normalization (see `normalizeForMatch`) the caption contains the
+ *    archive base name, or the base name contains the caption.
+ *
+ * Among the candidates the one closest in time wins; on an exact tie the photo
+ * that appears first in `photos` is kept. Message-id distance is NOT taken
+ * into account.
+ *
+ * @param photos - Candidate photo messages. Photos whose caption normalizes to
+ *   an empty string are never matched.
+ * @param archives - Archives to find previews for. Archives whose base name
+ *   normalizes to an empty string are skipped.
+ * @returns A map from `archive.baseName` to its best photo. Archives without a
+ *   match simply have no entry (the function never returns null). If two
+ *   archives share a base name, the later one overwrites the earlier entry.
+ */
+export function matchPreviewToArchive(
+  photos: TelegramPhoto[],
+  archives: ArchiveRef[]
+): Map<string, TelegramPhoto> {
+  const TIME_WINDOW_MS = 6 * 60 * 60 * 1000; // 6 hours
+  const results = new Map<string, TelegramPhoto>();
+
+  // Caption normalization does not depend on the archive, so do it once per
+  // photo instead of once per (archive, photo) pair. Photos with an empty
+  // normalized caption can never match and are dropped here.
+  const candidates = photos
+    .map((photo) => ({
+      photo,
+      caption: normalizeForMatch(photo.caption),
+      postedAt: photo.date.getTime(),
+    }))
+    .filter((candidate) => candidate.caption !== "");
+
+  for (const archive of archives) {
+    const normalizedBase = normalizeForMatch(archive.baseName);
+    if (!normalizedBase) continue;
+
+    const archiveTime = archive.firstMessageDate.getTime();
+    let bestMatch: TelegramPhoto | null = null;
+    let bestTimeDiff = Infinity;
+
+    for (const { photo, caption, postedAt } of candidates) {
+      const timeDiff = Math.abs(postedAt - archiveTime);
+
+      // Must be within the time window.
+      if (timeDiff > TIME_WINDOW_MS) continue;
+
+      // NOTE(review): the check is bidirectional, so a very short caption
+      // (e.g. a single word) matches any archive whose name contains it.
+      const matches =
+        caption.includes(normalizedBase) || normalizedBase.includes(caption);
+
+      // Strict `<` keeps the earliest-listed photo when two are equally close.
+      if (matches && timeDiff < bestTimeDiff) {
+        bestMatch = photo;
+        bestTimeDiff = timeDiff;
+      }
+    }
+
+    if (bestMatch) {
+      log.debug(
+        { baseName: archive.baseName, photoId: bestMatch.id.toString() },
+        "Matched preview photo to archive"
+      );
+      results.set(archive.baseName, bestMatch);
+    }
+  }
+
+  return results;
+}
+
+/**
+ * Reduce a file name or caption to a canonical form for fuzzy comparison.
+ *
+ * Steps, in order: trim surrounding whitespace, lower-case, drop one trailing
+ * extension-like suffix (a dot followed by 1–5 letters/digits), collapse runs
+ * of `_`, `-`, `.` and whitespace into a single space, and trim again.
+ * Other punctuation (parentheses, commas, …) is left untouched.
+ *
+ * NOTE(review): the suffix rule cannot tell a real extension from text such as
+ * a version number, so "pack v1.5" loses its ".5".
+ *
+ * @param input - Archive base name or photo caption.
+ * @returns The normalized string; empty when nothing but separators remains.
+ */
+function normalizeForMatch(input: string): string {
+  return input
+    .trim() // trailing whitespace/newline would otherwise defeat the `$` anchor below
+    .toLowerCase()
+    .replace(/\.[a-z0-9]{1,5}$/i, "") // strip extension
+    .replace(/[_\-.\s]+/g, " ") // normalize separators
+    .trim();
+}